Commit 780ff9a2 authored by feichtinger's avatar feichtinger
Browse files

added test demonstrating that the data in each file are strongly correlated

Each file contains points which cover a very small part of the
parameter space. It's more like having only one averaged measurement
per file. If the fitting is allowed to include points from all files,
the fit is trivially good, since the variation of parameters per file
is minimal. Leaving files out in training leads to miserable fits
for the unseen data.
parent 9db95c9d
This diff is collapsed.
......@@ -15,7 +15,7 @@
# + {"toc": true, "cell_type": "markdown"}
# <h1>Table of Contents<span class="tocSkip"></span></h1>
# <div class="toc"><ul class="toc-item"><li><span><a href="#Configuration" data-toc-modified-id="Configuration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configuration</a></span></li><li><span><a href="#Support-Routines" data-toc-modified-id="Support-Routines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Support Routines</a></span><ul class="toc-item"><li><span><a href="#Visualization" data-toc-modified-id="Visualization-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Visualization</a></span></li></ul></li><li><span><a href="#Dataset-creation" data-toc-modified-id="Dataset-creation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Dataset creation</a></span><ul class="toc-item"><li><span><a href="#Dataset-reading-and-preprocessing-definition" data-toc-modified-id="Dataset-reading-and-preprocessing-definition-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Dataset reading and preprocessing definition</a></span></li><li><span><a href="#Make-dataset" data-toc-modified-id="Make-dataset-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Make dataset</a></span></li><li><span><a href="#Examining-data" data-toc-modified-id="Examining-data-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Examining data</a></span></li><li><span><a href="#Training/Test-Split" data-toc-modified-id="Training/Test-Split-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Training/Test Split</a></span></li><li><span><a href="#Data-scaling-for-DNN-training" data-toc-modified-id="Data-scaling-for-DNN-training-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Data scaling for DNN training</a></span></li></ul></li><li><span><a href="#DNN-Model-definitions" data-toc-modified-id="DNN-Model-definitions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>DNN Model definitions</a></span><ul class="toc-item"><li><span><a href="#L2reg-and-gaussian-noise" data-toc-modified-id="L2reg-and-gaussian-noise-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>L2reg and gaussian 
noise</a></span></li><li><span><a href="#Model-with-Dropout" data-toc-modified-id="Model-with-Dropout-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Model with Dropout</a></span></li></ul></li><li><span><a href="#DNN-Training-runs-(data-not-corrected-for-zeroes)" data-toc-modified-id="DNN-Training-runs-(data-not-corrected-for-zeroes)-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>DNN Training runs (data not corrected for zeroes)</a></span><ul class="toc-item"><li><span><a href="#Andi's-initial-DNN-using-gn" data-toc-modified-id="Andi's-initial-DNN-using-gn-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Andi's initial DNN using gn</a></span></li><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>without any regularization</a></span><ul class="toc-item"><li><span><a href="#Investigation-of-model-performance-errors" data-toc-modified-id="Investigation-of-model-performance-errors-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>Investigation of model performance errors</a></span></li><li><span><a href="#outlier-investigation-after-answer-by-Jochem/Pavle" data-toc-modified-id="outlier-investigation-after-answer-by-Jochem/Pavle-5.2.2"><span class="toc-item-num">5.2.2&nbsp;&nbsp;</span>outlier investigation after answer by Jochem/Pavle</a></span></li></ul></li><li><span><a href="#Trying-to-reproduce-best-hyperscan-run" data-toc-modified-id="Trying-to-reproduce-best-hyperscan-run-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Trying to reproduce best hyperscan run</a></span></li><li><span><a href="#Try-out-DNN-with-dropout" data-toc-modified-id="Try-out-DNN-with-dropout-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Try out DNN with dropout</a></span></li></ul></li><li><span><a href="#Hyperparameter-scans-(data-not-corrected-for-zeroes)" data-toc-modified-id="Hyperparameter-scans-(data-not-corrected-for-zeroes)-6"><span 
class="toc-item-num">6&nbsp;&nbsp;</span>Hyperparameter scans (data not corrected for zeroes)</a></span><ul class="toc-item"><li><span><a href="#Test:-Make-a-hyperparameter-scan-A" data-toc-modified-id="Test:-Make-a-hyperparameter-scan-A-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Test: Make a hyperparameter scan A</a></span></li><li><span><a href="#Offline-batch-parameter-scan" data-toc-modified-id="Offline-batch-parameter-scan-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Offline batch parameter scan</a></span></li><li><span><a href="#TODO-ModelB" data-toc-modified-id="TODO-ModelB-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>TODO ModelB</a></span></li><li><span><a href="#TODO:-Model-C:-scan-regulatisation-and-noise" data-toc-modified-id="TODO:-Model-C:-scan-regulatisation-and-noise-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>TODO: Model C: scan regulatisation and noise</a></span></li></ul></li><li><span><a href="#DNN-runs-with-data-set-cleaned-for-zero-energy-measurements" data-toc-modified-id="DNN-runs-with-data-set-cleaned-for-zero-energy-measurements-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>DNN runs with data set cleaned for zero energy measurements</a></span><ul class="toc-item"><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>without any regularization</a></span></li></ul></li><li><span><a href="#SVM-to-see-what-a-linear-model-can-do" data-toc-modified-id="SVM-to-see-what-a-linear-model-can-do-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>SVM to see what a linear model can do</a></span></li></ul></div>
# <div class="toc"><ul class="toc-item"><li><span><a href="#Configuration" data-toc-modified-id="Configuration-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configuration</a></span></li><li><span><a href="#Support-Routines" data-toc-modified-id="Support-Routines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Support Routines</a></span><ul class="toc-item"><li><span><a href="#Visualization" data-toc-modified-id="Visualization-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Visualization</a></span></li></ul></li><li><span><a href="#Dataset-creation" data-toc-modified-id="Dataset-creation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Dataset creation</a></span><ul class="toc-item"><li><span><a href="#Dataset-reading-and-preprocessing-definition" data-toc-modified-id="Dataset-reading-and-preprocessing-definition-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Dataset reading and preprocessing definition</a></span></li><li><span><a href="#Make-dataset" data-toc-modified-id="Make-dataset-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Make dataset</a></span></li><li><span><a href="#Examining-data" data-toc-modified-id="Examining-data-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Examining data</a></span></li><li><span><a href="#Training/Test-Split" data-toc-modified-id="Training/Test-Split-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Training/Test Split</a></span></li><li><span><a href="#Data-scaling-for-DNN-training" data-toc-modified-id="Data-scaling-for-DNN-training-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Data scaling for DNN training</a></span></li></ul></li><li><span><a href="#DNN-Model-definitions" data-toc-modified-id="DNN-Model-definitions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>DNN Model definitions</a></span><ul class="toc-item"><li><span><a href="#L2reg-and-gaussian-noise" data-toc-modified-id="L2reg-and-gaussian-noise-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>L2reg and gaussian 
noise</a></span></li><li><span><a href="#Model-with-Dropout" data-toc-modified-id="Model-with-Dropout-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Model with Dropout</a></span></li></ul></li><li><span><a href="#DNN-Training-runs-(data-not-corrected-for-zeroes)" data-toc-modified-id="DNN-Training-runs-(data-not-corrected-for-zeroes)-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>DNN Training runs (data not corrected for zeroes)</a></span><ul class="toc-item"><li><span><a href="#Andi's-initial-DNN-using-gn" data-toc-modified-id="Andi's-initial-DNN-using-gn-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Andi's initial DNN using gn</a></span></li><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>without any regularization</a></span><ul class="toc-item"><li><span><a href="#Investigation-of-model-performance-errors" data-toc-modified-id="Investigation-of-model-performance-errors-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>Investigation of model performance errors</a></span></li><li><span><a href="#outlier-investigation-after-answer-by-Jochem/Pavle" data-toc-modified-id="outlier-investigation-after-answer-by-Jochem/Pavle-5.2.2"><span class="toc-item-num">5.2.2&nbsp;&nbsp;</span>outlier investigation after answer by Jochem/Pavle</a></span></li></ul></li><li><span><a href="#Trying-to-reproduce-best-hyperscan-run" data-toc-modified-id="Trying-to-reproduce-best-hyperscan-run-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Trying to reproduce best hyperscan run</a></span></li><li><span><a href="#Try-out-DNN-with-dropout" data-toc-modified-id="Try-out-DNN-with-dropout-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Try out DNN with dropout</a></span></li></ul></li><li><span><a href="#Hyperparameter-scans-(data-not-corrected-for-zeroes)" data-toc-modified-id="Hyperparameter-scans-(data-not-corrected-for-zeroes)-6"><span 
class="toc-item-num">6&nbsp;&nbsp;</span>Hyperparameter scans (data not corrected for zeroes)</a></span><ul class="toc-item"><li><span><a href="#Test:-Make-a-hyperparameter-scan-A" data-toc-modified-id="Test:-Make-a-hyperparameter-scan-A-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Test: Make a hyperparameter scan A</a></span></li><li><span><a href="#Offline-batch-parameter-scan" data-toc-modified-id="Offline-batch-parameter-scan-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Offline batch parameter scan</a></span></li><li><span><a href="#TODO-ModelB" data-toc-modified-id="TODO-ModelB-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>TODO ModelB</a></span></li><li><span><a href="#TODO:-Model-C:-scan-regulatisation-and-noise" data-toc-modified-id="TODO:-Model-C:-scan-regulatisation-and-noise-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>TODO: Model C: scan regulatisation and noise</a></span></li></ul></li><li><span><a href="#DNN-runs-with-data-set-cleaned-for-zero-energy-measurements" data-toc-modified-id="DNN-runs-with-data-set-cleaned-for-zero-energy-measurements-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>DNN runs with data set cleaned for zero energy measurements</a></span><ul class="toc-item"><li><span><a href="#without-any-regularization" data-toc-modified-id="without-any-regularization-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>without any regularization</a></span></li></ul></li><li><span><a href="#SVM-to-see-what-a-linear-model-can-do" data-toc-modified-id="SVM-to-see-what-a-linear-model-can-do-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>SVM to see what a linear model can do</a></span></li><li><span><a href="#Test-for-leaving-out-some-files-from-the-training-set" data-toc-modified-id="Test-for-leaving-out-some-files-from-the-training-set-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Test for leaving out some files from the training set</a></span></li></ul></div>
# +
import os
......@@ -1192,5 +1192,59 @@ ax.plot(transformer_y.inverse_transform(y_pred_svm.reshape(-1,1)),
ax.set_xlabel('PEPP from model')
ax.set_ylabel('PEPP')
# # Test for leaving out some files from the training set
# By examining the source data, I get the impression that each file covers only a very small subspace of the total parameter space. So it is more as if we had 58 measurement points with some confidence intervals, rather than 15000 independent measurements.
# +
# Comma-separated numbers of the raw data files to withhold from training.
fchoice = "33,34"
# One "rawDataFile != ..." clause per withheld file; AND-ing them keeps only
# rows that come from none of the withheld files.
flist = [f'rawDataFile != "dp{fnum}-nomeans.csv"' for fnum in fchoice.split(",")]
# Shuffle the remaining rows before cutting them into the three splits.
data_keptB = data.query(" & ".join(flist)).sample(frac=1)
# The 60/20/20 cut points must refer to the *filtered* frame. Using
# len(data) here would place them too far to the right and shrink the test
# split by the number of rows that were filtered out.
n_keptB = len(data_keptB)
cut_trainB = int(.6 * n_keptB)
cut_validateB = int(.8 * n_keptB)
trainB = data_keptB.iloc[:cut_trainB]
validateB = data_keptB.iloc[cut_trainB:cut_validateB]
testB = data_keptB.iloc[cut_validateB:]
# Independent (x) and dependent (y) columns of every split.
x_trainB = trainB[var_indep]
y_trainB = trainB[var_dep]
x_validateB = validateB[var_indep]
y_validateB = validateB[var_dep]
x_testB = testB[var_indep]
y_testB = testB[var_dep]
# +
# Rows belonging to the files that were withheld from training
# (OR over one "rawDataFile == ..." clause per withheld file).
flist = [f'rawDataFile == "dp{fnum}-nomeans.csv"' for fnum in fchoice.split(",")]
data_ignf = data.query(" | ".join(flist))
# Extended test set = regular test split + all rows of the withheld files.
# It is built from the unscaled `testB` frame rather than from x_testB /
# y_testB, because those names are overwritten with scaled arrays further
# down; re-running this cell afterwards would otherwise scale the test rows
# twice.
x_testB_ignf = np.concatenate((testB[var_indep].to_numpy(),
                               data_ignf[var_indep].to_numpy()))
y_testB_ignf = np.concatenate((testB[var_dep].to_numpy(),
                               data_ignf[var_dep].to_numpy()))
# Apply the same scaling as for the other splits.
x_testB_ignf = transformer_x.transform(x_testB_ignf)
y_testB_ignf = transformer_y.transform(y_testB_ignf)
# +
# Scale all three splits with transformer_x / transformer_y. Only
# .transform() is called here, so the transformers themselves are not
# re-fitted on the reduced data set.
x_trainB, x_validateB, x_testB = (
    transformer_x.transform(part)
    for part in (x_trainB, x_validateB, x_testB)
)
y_trainB, y_validateB, y_testB = (
    transformer_y.transform(part)
    for part in (y_trainB, y_validateB, y_testB)
)
# -
(x_trainB.shape, x_validateB.shape)
# %%time
# Build the small feed-forward model for the reduced training set.
# l2reg=0.0 and gn=0.0 switch both regularisation knobs off
# (presumably L2 weight penalty and Gaussian noise -- confirm against
# build_ff_mdl_small); l1..l4 are presumably the layer widths.
model_0_noreg_m4_B = build_ff_mdl_small(
    in_dim=x_trainB.shape[1],
    out_dim=y_trainB.shape[1],
    l1=4 * 8, l2=4 * 6, l3=4 * 4, l4=4 * 4,
    l2reg=0.0,
    gn=0.0,
)
# Stop once the validation loss has not improved for 100 epochs.
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
# shuffle and verbose are passed as a real bool / int: the strings 'true'
# and 'false' are both truthy, so verbose='false' did not silence the
# progress output and is not a valid verbosity mode.
hist_0_noreg_m4_B = model_0_noreg_m4_B.fit(
    x=x_trainB, y=y_trainB,
    validation_data=(x_validateB, y_validateB),
    batch_size=250, shuffle=True, epochs=300,
    verbose=0, callbacks=[plot_losses, es],
)
# %%time
# Model performance on the test split, i.e. on rows taken from the same
# files that also contributed to the training set.
plotModelPerf2(
    model_0_noreg_m4_B,
    'Model0 m4 No Regularization',
    x_testB,
    y_testB,
    testB[var_dep].columns,
    transformer_y,
    '.',
)
# Now I visualize the performance including the samples from the unseen files
# %%time
# Same model, evaluated on the test split plus all rows of the files that
# were withheld from training. The title differs from the previous plot so
# the two figures can be told apart.
# NOTE(review): if plotModelPerf2 derives an output file name from the
# title, this also keeps the second figure from replacing the first -- confirm.
plotModelPerf2(
    model_0_noreg_m4_B,
    'Model0 m4 No Regularization with unseen files',
    x_testB_ignf,
    y_testB_ignf,
    testB[var_dep].columns,
    transformer_y,
    '.',
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment