Commit 344079bb authored by feichtinger's avatar feichtinger
Browse files

dataset cleaning for zeroes in PEPP-avg. First new runs

parent 352d7326
This diff is collapsed.
......@@ -15,7 +15,7 @@
# + {"toc": true, "cell_type": "markdown"}
# <h1>Table of Contents<span class="tocSkip"></span></h1>
# <div class="toc"><ul class="toc-item"><li><span><a href="#Configuration" data-toc-modified-id="Configuration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configuration</a></span></li><li><span><a href="#Support-Routines" data-toc-modified-id="Support-Routines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Support Routines</a></span><ul class="toc-item"><li><span><a href="#Visualization" data-toc-modified-id="Visualization-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Visualization</a></span></li></ul></li><li><span><a href="#Dataset-creation" data-toc-modified-id="Dataset-creation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Dataset creation</a></span><ul class="toc-item"><li><span><a href="#Dataset-reading-and-preprocessing-definition" data-toc-modified-id="Dataset-reading-and-preprocessing-definition-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Dataset reading and preprocessing definition</a></span></li><li><span><a href="#Make-dataset" data-toc-modified-id="Make-dataset-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Make dataset</a></span></li><li><span><a href="#Training/Test-Split" data-toc-modified-id="Training/Test-Split-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Training/Test Split</a></span></li><li><span><a href="#Data-scaling-for-DNN-training" data-toc-modified-id="Data-scaling-for-DNN-training-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Data scaling for DNN training</a></span></li></ul></li><li><span><a href="#DNN-Model-definitions" data-toc-modified-id="DNN-Model-definitions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>DNN Model definitions</a></span><ul class="toc-item"><li><span><a href="#L2reg-and-gaussian-noise" data-toc-modified-id="L2reg-and-gaussian-noise-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>L2reg and gaussian noise</a></span></li><li><span><a href="#Model-with-Dropout" data-toc-modified-id="Model-with-Dropout-4.2"><span 
class="toc-item-num">4.2&nbsp;&nbsp;</span>Model with Dropout</a></span></li></ul></li><li><span><a href="#DNN-Training-runs" data-toc-modified-id="DNN-Training-runs-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>DNN Training runs</a></span><ul class="toc-item"><li><span><a href="#Andi's-initial-DNN-using-gn" data-toc-modified-id="Andi's-initial-DNN-using-gn-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Andi's initial DNN using gn</a></span></li><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>without any regularization</a></span><ul class="toc-item"><li><span><a href="#Investigation-of-model-performance-errors" data-toc-modified-id="Investigation-of-model-performance-errors-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>Investigation of model performance errors</a></span></li></ul></li><li><span><a href="#Trying-to-reproduce-best-hyperscan-run" data-toc-modified-id="Trying-to-reproduce-best-hyperscan-run-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Trying to reproduce best hyperscan run</a></span></li><li><span><a href="#Try-out-DNN-with-dropout" data-toc-modified-id="Try-out-DNN-with-dropout-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Try out DNN with dropout</a></span></li></ul></li><li><span><a href="#Hyperparameter-scans" data-toc-modified-id="Hyperparameter-scans-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Hyperparameter scans</a></span><ul class="toc-item"><li><span><a href="#Test:-Make-a-hyperparameter-scan-A" data-toc-modified-id="Test:-Make-a-hyperparameter-scan-A-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Test: Make a hyperparameter scan A</a></span></li><li><span><a href="#Offline-batch-parameter-scan" data-toc-modified-id="Offline-batch-parameter-scan-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Offline batch parameter scan</a></span></li><li><span><a href="#TODO-ModelB" 
data-toc-modified-id="TODO-ModelB-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>TODO ModelB</a></span></li><li><span><a href="#TODO:-Model-C:-scan-regulatisation-and-noise" data-toc-modified-id="TODO:-Model-C:-scan-regulatisation-and-noise-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>TODO: Model C: scan regulatisation and noise</a></span></li></ul></li><li><span><a href="#SVM-to-see-what-a-linear-model-can-do" data-toc-modified-id="SVM-to-see-what-a-linear-model-can-do-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>SVM to see what a linear model can do</a></span></li></ul></div>
# <div class="toc"><ul class="toc-item"><li><span><a href="#Configuration" data-toc-modified-id="Configuration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configuration</a></span></li><li><span><a href="#Support-Routines" data-toc-modified-id="Support-Routines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Support Routines</a></span><ul class="toc-item"><li><span><a href="#Visualization" data-toc-modified-id="Visualization-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Visualization</a></span></li></ul></li><li><span><a href="#Dataset-creation" data-toc-modified-id="Dataset-creation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Dataset creation</a></span><ul class="toc-item"><li><span><a href="#Dataset-reading-and-preprocessing-definition" data-toc-modified-id="Dataset-reading-and-preprocessing-definition-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Dataset reading and preprocessing definition</a></span></li><li><span><a href="#Make-dataset" data-toc-modified-id="Make-dataset-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Make dataset</a></span></li><li><span><a href="#Training/Test-Split" data-toc-modified-id="Training/Test-Split-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Training/Test Split</a></span></li><li><span><a href="#Data-scaling-for-DNN-training" data-toc-modified-id="Data-scaling-for-DNN-training-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Data scaling for DNN training</a></span></li></ul></li><li><span><a href="#DNN-Model-definitions" data-toc-modified-id="DNN-Model-definitions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>DNN Model definitions</a></span><ul class="toc-item"><li><span><a href="#L2reg-and-gaussian-noise" data-toc-modified-id="L2reg-and-gaussian-noise-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>L2reg and gaussian noise</a></span></li><li><span><a href="#Model-with-Dropout" data-toc-modified-id="Model-with-Dropout-4.2"><span 
class="toc-item-num">4.2&nbsp;&nbsp;</span>Model with Dropout</a></span></li></ul></li><li><span><a href="#DNN-Training-runs-(data-not-corrected-for-zeroes)" data-toc-modified-id="DNN-Training-runs-(data-not-corrected-for-zeroes)-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>DNN Training runs (data not corrected for zeroes)</a></span><ul class="toc-item"><li><span><a href="#Andi's-initial-DNN-using-gn" data-toc-modified-id="Andi's-initial-DNN-using-gn-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Andi's initial DNN using gn</a></span></li><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>without any regularization</a></span><ul class="toc-item"><li><span><a href="#Investigation-of-model-performance-errors" data-toc-modified-id="Investigation-of-model-performance-errors-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>Investigation of model performance errors</a></span></li><li><span><a href="#outlier-investigation-after-answer-by-Jochem/Pavle" data-toc-modified-id="outlier-investigation-after-answer-by-Jochem/Pavle-5.2.2"><span class="toc-item-num">5.2.2&nbsp;&nbsp;</span>outlier investigation after answer by Jochem/Pavle</a></span></li></ul></li><li><span><a href="#Trying-to-reproduce-best-hyperscan-run" data-toc-modified-id="Trying-to-reproduce-best-hyperscan-run-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Trying to reproduce best hyperscan run</a></span></li><li><span><a href="#Try-out-DNN-with-dropout" data-toc-modified-id="Try-out-DNN-with-dropout-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Try out DNN with dropout</a></span></li></ul></li><li><span><a href="#Hyperparameter-scans-(data-not-corrected-for-zeroes)" data-toc-modified-id="Hyperparameter-scans-(data-not-corrected-for-zeroes)-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Hyperparameter scans (data not corrected for zeroes)</a></span><ul 
class="toc-item"><li><span><a href="#Test:-Make-a-hyperparameter-scan-A" data-toc-modified-id="Test:-Make-a-hyperparameter-scan-A-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Test: Make a hyperparameter scan A</a></span></li><li><span><a href="#Offline-batch-parameter-scan" data-toc-modified-id="Offline-batch-parameter-scan-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Offline batch parameter scan</a></span></li><li><span><a href="#TODO-ModelB" data-toc-modified-id="TODO-ModelB-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>TODO ModelB</a></span></li><li><span><a href="#TODO:-Model-C:-scan-regulatisation-and-noise" data-toc-modified-id="TODO:-Model-C:-scan-regulatisation-and-noise-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>TODO: Model C: scan regulatisation and noise</a></span></li></ul></li><li><span><a href="#DNN-runs-with-data-set-cleaned-for-zero-energy-measurements" data-toc-modified-id="DNN-runs-with-data-set-cleaned-for-zero-energy-measurements-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>DNN runs with data set cleaned for zero energy measurements</a></span><ul class="toc-item"><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>without any regularization</a></span></li></ul></li><li><span><a href="#SVM-to-see-what-a-linear-model-can-do" data-toc-modified-id="SVM-to-see-what-a-linear-model-can-do-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>SVM to see what a linear model can do</a></span></li></ul></div>
# +
import os
......@@ -49,6 +49,7 @@ import talos as ta
# -
import itertools
import glob
# # Configuration
......@@ -294,26 +295,82 @@ def makeDataSetInterpolated(directory, excelFn, doInterpolate=True, dropBadPulse
data.dropna()
return data
# The initial DNN regression runs led to the identification of measurement artifacts where the measured average pulse energy was stored as zero, heavily influencing the interpolated pulse energies in their neighborhood.
#
# In the following data cleaning, the zeros are replaced by NaN and the interpolation is done based on the two surrounding values. The correctness of this procedure is unclear: if there really was no beam in the vicinity of such a measurement, one should perhaps not use the points from those regions at all.
def makeDataSetInterpolated2(directory, excelFn, doInterpolate=True, dropBadPulses=True, verbose=False,
                             CALCTthreshold=-50, CALCSthreshold=-50):
    """Build the combined dataset from all ``dp*-nomeans.csv`` files in `directory`.

    Unlike makeDataSetInterpolated, zero readings in the PEPP-avg channel
    (measurement artifacts) are replaced by NaN *before* interpolation, so the
    interpolation is based on the two surrounding non-zero values.

    Parameters
    ----------
    directory : str
        Directory containing the ``dpNN-nomeans.csv`` raw files.
    excelFn : str
        Excel sheet indexed by experiment number, providing 'XeMultVoltag'.
    doInterpolate : bool
        Linearly interpolate NaN gaps in the PEPP-avg column.
    dropBadPulses : bool
        Keep only rows with CALCT/CALCS below the given thresholds.
    verbose : bool
        Print per-file row counts.
    CALCTthreshold, CALCSthreshold : float
        Bad-pulse cut thresholds.

    Returns
    -------
    pd.DataFrame
        Concatenated, NaN-free dataset with the original row index preserved
        in an 'index' column (reset_index).
    """
    raw_cols = ['SARFE10-PBIG050-EVR0:CALCT.value',
                'SARFE10-PBIG050-EVR0:CALCS.value',
                'SARFE10-PSSS059:SPECTRUM_CENTER.value',
                'SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-AVG.value']
    short_cols = ['CALCT', 'CALCS', 'SPECTRUM_CENTER', 'PHOTON-ENERGY-PER-PULSE']

    excel_df = None  # lazily read ONCE (the original re-read the sheet per file)
    frames = []      # collect per-file frames; single concat at the end (avoids O(n^2) growth)
    for filename in sorted(glob.glob(os.path.join(directory, 'dp*-nomeans.csv'))):
        base = os.path.basename(filename)
        mat = re.match(r'^dp(\d+).*', base)
        if mat is None:
            print(f"WARNING: Could not parse number part of file {base}")
            continue
        expNumber = int(mat.group(1))
        if excel_df is None:
            excel_df = pd.read_excel(excelFn)
        multVoltag = excel_df.iloc[expNumber]['XeMultVoltag']
        try:
            dp = pd.read_csv(filename, sep=";")
        except Exception:  # malformed / unreadable CSV: skip, consistent with original best-effort
            print(f"WARNING: Can not read {base}. skipping...")
            continue
        dp = dp[raw_cols]
        dp.columns = short_cols
        # Replace zeroes by NaN: a stored 0.0 PEPP-avg is an artifact, not a measurement.
        zero_mask = dp['PHOTON-ENERGY-PER-PULSE'] == 0.0
        if zero_mask.any():
            print(f"zeroes found in PEPP-avg of {base} at {dp.index[zero_mask].values}")
        dp.loc[zero_mask, 'PHOTON-ENERGY-PER-PULSE'] = np.nan
        if doInterpolate:
            # Plain assignment instead of inplace=True on a column selection,
            # which is chained-assignment-prone and may not modify dp.
            dp['PHOTON-ENERGY-PER-PULSE'] = dp['PHOTON-ENERGY-PER-PULSE'].interpolate(
                method='linear', limit_direction='forward', axis=0)
        dp = dp.dropna()
        # condition for bad pulse
        if dropBadPulses:
            dp = dp.query('CALCT < @CALCTthreshold & CALCS < @CALCSthreshold')
        dp['XeMultVoltag'] = multVoltag
        dp['rawDataFile'] = base
        frames.append(dp)
        if verbose:
            print("Datapoint", expNumber, "gave", len(dp), "values")
    data = pd.concat(frames) if frames else pd.DataFrame()
    # BUG FIX: the original called data.dropna() without assigning the result (a no-op).
    data = data.dropna()
    data.reset_index(inplace=True)
    return data
# ## Make dataset
#
# (Can not read /psi/home/adelmann/SwissFEL-Gas-1/cleaned/dp41-nomeans.csv is ok)

# Use only the zero-cleaned reader. The previous extra call to
# makeDataSetInterpolated() was redundant: its (expensive) result was
# immediately discarded and overwritten by makeDataSetInterpolated2().
data = makeDataSetInterpolated2(directory,xlsxFn,CALCTthreshold=-50,CALCSthreshold=-50,verbose=False)
# Quick structural overview of the combined dataset (dtypes, non-null counts).
data.info()
# Last few rows as a spot check of the concatenated frame.
data.tail()
#nr files actually used
data['rawDataFile'].nunique()
# Sanity histograms (100 bins) of the model input and target distributions.
# +
#data['SPECTRUM_CENTER'].hist(bins=100)
data['SPECTRUM_CENTER'].hist(bins=100)
# +
#data['PHOTON-ENERGY-PER-PULSE'].hist(bins=100)
data['PHOTON-ENERGY-PER-PULSE'].hist(bins=100)
# +
#data['XeMultVoltag'].hist(bins=100)
# -
data['XeMultVoltag'].hist(bins=100)
# XePhotEnergyL = 6000. # Ev
# XePhotEnergyH = 12500.
......@@ -542,7 +599,7 @@ def build_ff_mdl_smallDropOut(in_dim = 2, out_dim = 1, l1 = 8, l2 = 6, l3 = 4, l
return model
# # DNN Training runs
# # DNN Training runs (data not corrected for zeroes)
# ## Andi's initial DNN using gn
......@@ -685,6 +742,50 @@ tst_df = test[cutfilter]
tst_df['PEPP_predict'] = y_predict[cutfilter]
modelPerfPerFile(tst_df)
# ### outlier investigation after answer by Jochem/Pavle

# Rows of the combined dataset where the averaged pulse energy was stored as zero.
data[data['PHOTON-ENERGY-PER-PULSE'] == 0.0]

# Inspect one affected raw file (dp30) directly.
dftmp = pd.read_csv('cleaned/dp30-nomeans.csv', sep=";")
dftmp.head()

# Re-read so this cell is independent of the previous one, then reduce to the
# four channels of interest with the short column names used downstream.
dftmp = pd.read_csv('cleaned/dp30-nomeans.csv', sep=";")
dftmp = dftmp[['SARFE10-PBIG050-EVR0:CALCT.value',
               'SARFE10-PBIG050-EVR0:CALCS.value',
               'SARFE10-PSSS059:SPECTRUM_CENTER.value',
               'SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-AVG.value']]
dftmp.columns = ['CALCT','CALCS','SPECTRUM_CENTER','PHOTON-ENERGY-PER-PULSE']
dftmp.info()

# Plot the non-NaN PEPP-avg trace of this file.
dftmp[~dftmp['PHOTON-ENERGY-PER-PULSE'].isna()].plot(y='PHOTON-ENERGY-PER-PULSE')

# Check whether any exact zeroes remain among the non-NaN values.
dftmp2 = dftmp[~dftmp['PHOTON-ENERGY-PER-PULSE'].isna()]
dftmp2[dftmp2['PHOTON-ENERGY-PER-PULSE'] == 0.0]

# Scan every raw file for zero PEPP-avg entries.
# use sorted file list for reproducibility
flist = sorted(glob.glob(os.path.join(directory, 'dp*-nomeans.csv')))

zero_frames = []
for fname in flist:
    try:
        dftmp = pd.read_csv(fname, sep=";")
    except Exception:  # unreadable CSV: report and continue scanning
        print(f"Failure CSV parsing of {fname}")
        continue
    dftmp = dftmp[['SARFE10-PBIG050-EVR0:CALCT.value',
                   'SARFE10-PBIG050-EVR0:CALCS.value',
                   'SARFE10-PSSS059:SPECTRUM_CENTER.value',
                   'SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-AVG.value']]
    dftmp.columns = ['CALCT','CALCS','SPECTRUM_CENTER','PHOTON-ENERGY-PER-PULSE']
    # os.path.basename replaces re.sub('.*\/', ...), whose '\/' is an invalid escape.
    dftmp['filename'] = os.path.basename(fname)
    zero_frames.append(dftmp[dftmp['PHOTON-ENERGY-PER-PULSE'] == 0.0])

# Single concat instead of growing the DataFrame inside the loop (quadratic).
dffull = pd.concat(zero_frames) if zero_frames else pd.DataFrame()
dffull
# ## Trying to reproduce best hyperscan run
# Based on offline batch run from the Hyperparameter scan chapter.
# Scan is still running. Best result as of Mon Jan 20 10:24h
......@@ -789,7 +890,7 @@ model_dropout_m4_hist.plot(y=['loss', 'val_loss'])
print_model_err(model_dropout_m4, x_test, y_test)
# # Hyperparameter scans
# # Hyperparameter scans (data not corrected for zeroes)
# ## Test: Make a hyperparameter scan A
......@@ -981,15 +1082,75 @@ data['SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-US.value'].plot()
data['SARFE10-PBPG050:ENERGY.value'].plot()
# # DNN runs with data set cleaned for zero energy measurements

# Make sure that the zero measurements are no longer in the data set
data[ data['PHOTON-ENERGY-PER-PULSE'] == 0.0]

# ## without any regularization

# %%time
model_0_noreg_c2 = build_ff_mdl_small(in_dim = x_train.shape[1], out_dim = y_train.shape[1],
                                      l2reg=0.0, gn=0.0)
#mc = keras.callbacks.ModelCheckpoint('best_model_1.h5', monitor='val_loss', mode='min', save_best_only=True)
# Stop once val_loss has not improved for 100 epochs.
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
# BUG FIX: shuffle/verbose were passed as the *strings* 'true'/'false'.
# Any non-empty string is truthy, so verbose='false' actually ENABLED
# per-epoch logging. Use a real boolean and integer instead.
hist_0_noreg_c2 = model_0_noreg_c2.fit(x=x_train, y=y_train,
                                       validation_data=(x_validate,y_validate),
                                       batch_size=250, shuffle=True, epochs=2000,
                                       verbose=0, callbacks=[plot_losses,es])
model_0_noreg_c2.save('model_0_noreg_c2.h5')

print_model_err(model_0_noreg_c2, x_test, y_test)

# %%time
plotModelPerf2(model_0_noreg_c2, 'Model0 No Regularization', x_test, y_test, test[var_dep].columns, transformer_y, '.')
# %%time
# Same no-regularization run with a 4x wider network (m4).
model_0_noreg_m4_c2 = build_ff_mdl_small(in_dim = x_train.shape[1], out_dim = y_train.shape[1],
                                         l1 = 4*8, l2 = 4*6, l3 = 4*4, l4= 4*4,
                                         l2reg=0.0, gn=0.0)
#mc = keras.callbacks.ModelCheckpoint('best_model_1.h5', monitor='val_loss', mode='min', save_best_only=True)
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
# BUG FIX: use boolean True / integer 0 instead of the strings 'true'/'false'
# (non-empty strings are truthy, so the old verbose='false' was actually verbose).
hist_0_noreg_m4_c2 = model_0_noreg_m4_c2.fit(x=x_train, y=y_train,
                                             validation_data=(x_validate,y_validate),
                                             batch_size=250, shuffle=True, epochs=2000,
                                             verbose=0, callbacks=[plot_losses,es])
model_0_noreg_m4_c2.save('model_0_noreg_m4_c2.h5')
# Persist the training history for later offline comparison.
pd.DataFrame(hist_0_noreg_m4_c2.history).to_csv('model_0_noreg_m4_c2-hist.csv')

print_model_err(model_0_noreg_m4_c2, x_test, y_test)

# Reload the saved model (sanity check that the .h5 file round-trips).
model_0_noreg_m4_c2 = keras.models.load_model('model_0_noreg_m4_c2.h5')

# %%time
plotModelPerf2(model_0_noreg_m4_c2, 'Model0 m4 No Regularization', x_test, y_test, test[var_dep].columns, transformer_y, '.')

# %%time
plotModelPerf2dens(model_0_noreg_m4_c2, 'Model0 m4 No Regularization', x_test, y_test, test[var_dep].columns, transformer_y, '.')
# # SVM to see what a linear model can do
# +
#clf = svm.SVR()
#clf.fit(x_train, y_train)
#y_pred=clf.predict(x_test).ravel()
#print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
#print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
#print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# -
# Fit a support-vector regressor as a simple baseline for comparison with the DNNs.
clf = svm.SVR()
clf.fit(x_train, y_train)
y_pred_svm=clf.predict(x_test).ravel()
# Standard regression error metrics on the (scaled) test split.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_svm))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_svm))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_svm)))
# Shape inspection: predict() returns a 1-D array, while the scaler's
# inverse_transform expects a 2-D (n, 1) array — hence the reshape below.
y_test.shape
y_pred_svm.shape
y_pred_svm.reshape(-1,1).shape
# Predicted vs. measured PEPP in physical units (inverse-transformed).
fig,ax = plt.subplots(figsize=(12,8))
ax.plot(transformer_y.inverse_transform(y_pred_svm.reshape(-1,1)),
transformer_y.inverse_transform(y_test), linestyle='', marker='o', alpha=0.05)
ax.set_xlabel('PEPP from model')
ax.set_ylabel('PEPP')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment