Commit 13b71006 authored by feichtinger's avatar feichtinger
Browse files

working on new hdf5 based data from Pavle

parent e6e4ee04
This diff is collapsed.
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.4'
# jupytext_version: 1.2.4
# kernelspec:
# display_name: Python [conda env:datascience_py37]
# language: python
# name: conda-env-datascience_py37-py
# ---
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
import os
# +
# Input location: directory holding the HDF5 files and the file to analyse.
datadir = 'pavledata'
# Alternative input file, kept for quick switching:
# filename = 'test1.h5'
filename = 'MLData_20200520_1.h5'
# -
# # hdf5 data set loading
# Open the HDF5 file read-only. The handle is deliberately left open: the
# following cells keep reading groups and datasets through `f`.
h5_path = os.path.join(datadir, filename)
f = h5py.File(h5_path, 'r')
# First look at the top-level layout and at the members of one channel group.
f.items()
f.keys()
list(f['SARFE10-PBIG050-EVR0:CALCI'])
# +
#list(f['SARFE10-PBIG050-EVR0:CALCI']['pulse_id'])
# -
# Fetch the CALCI group object, then convert the whole group with np.array()
# to see what such a conversion yields.
calci_name = 'SARFE10-PBIG050-EVR0:CALCI'
calci = f.get(calci_name)
calci
calci = np.array(f.get(calci_name))
calci.shape
calci
# Top-level names once more, via the keys view and via plain iteration.
f.keys()
list(f)
# Short column aliases for the long channel names found in the HDF5 file.
# Listed as (channel name, alias) pairs; the order is preserved in varmap.
_channel_aliases = (
    ('SARFE10-PBIG050-EVR0:CALCI', 'CALCI'),
    ('SARFE10-PBIG050-EVR0:CALCS', 'CALCS'),
    ('SARFE10-PBIG050-EVR0:CALCT', 'CALCT'),
    ('SARFE10-PBPG050:FELPHOTENE', 'Ephot'),
    ('SARFE10-PBPG050:HAMP-HV-DS', 'Voltage_t'),
    ('SARFE10-PBPG050:HAMP-HV-US', 'Voltage_s'),
    ('SARFE10-PBPG050:MKS-PRESSURE', 'pressure'),
    ('SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-AVG', 'PEPavg'),
    ('pulse_id', 'pulse_id'),
)
varmap = dict(_channel_aliases)
# List the members of the CALCI group, first as keys, then as (name, object) pairs.
calci_group = f.get('SARFE10-PBIG050-EVR0:CALCI')
calci_group.keys()
for item in calci_group.items():
    print(item)
# A dataset can also be addressed directly with a "group/dataset" path.
test = f.get('SARFE10-PBIG050-EVR0:CALCI/data')
test
data = f.get('SARFE10-PBPG050:HAMP-HV-DS/data')
# Indexing with the empty tuple reads the complete dataset into a numpy ndarray.
data[()]
# Copy the 'data' dataset of every top-level group into one DataFrame column.
# The top-level 'pulse_id' entry is skipped.
df = pd.DataFrame()
for grp in f:
    if grp != 'pulse_id':
        channel = f.get(grp)
        values = f.get(f'{grp}/data')
        print(channel.keys())
        # [()] reads the full dataset into memory as an ndarray.
        df[grp] = values[()]
        print(' ', grp, values)
# Swap the long channel names for the short aliases defined in varmap.
df = df.rename(columns=varmap)
# # a look at the raw data
df.describe()
# Pavle:
#
# CALCI is, as you surmised, the average of the CALCS and CALCT. The response is not in uJ -- it's just a number that we wish to get in uJ eventually. FELPHOTONENE is the photon energy (wavelength) of the FEL beam in eV. The -HV channels are voltages, as you surmised. They are associated with CALCS and CALCT. CALCS is with -HV-US, CALCT is with -HV-DS.
#
# One stacked panel per DataFrame column, 6 inches of height each.
# Points are drawn as markers only (no connecting line).
n_panels = len(df.columns)
fig = plt.figure(figsize=(12, 6 * n_panels))
for idx, col in enumerate(df.columns, start=1):
    ax = fig.add_subplot(n_panels, 1, idx)
    ax.plot(df[col], linestyle='', marker='o')
    ax.set_ylabel(col)
# # dataset cleaning
# Set values which are undefined or below a threshold to NAN
# CALCS/CALCT readings above this value are discarded below.
# NOTE(review): the CALC* signals appear to be negative-going, so
# "> calc_thresh" would select the weak readings -- confirm with the data owner.
calc_thresh = -50.0
# Number of CALCS samples that pass the threshold.
(df.CALCS < calc_thresh).sum()
# A reading of exactly 0.0 in these channels is treated as "undefined".
for col in ['Ephot', 'Voltage_s', 'Voltage_t', 'PEPavg', 'pressure']:
    is_zero = df[col] == 0.0
    df.loc[is_zero, col] = np.nan
# CALCI is not used further on and is therefore left uncleaned.
for col in ['CALCS', 'CALCT']:
    above_thresh = df[col] > calc_thresh
    df.loc[above_thresh, col] = np.nan
# NaN count per column and overall frame size after cleaning.
df.isna().sum()
df.shape
# +
# Keep only the rows in which both CALCS and CALCT are still defined, i.e.
# drop every row where the cleaning step set one of them to NaN.
# The explicit .copy() makes the result an independent frame: later cells
# assign to its columns, and doing that on a filtered slice of the original
# frame triggers pandas' SettingWithCopyWarning.
df = df.dropna(subset=['CALCS', 'CALCT']).copy()
# Remaining NaN count per column (CALCS and CALCT must now be 0).
df.isna().sum()
# -
# ## transforms (units, neg->pos)
# we want to have Ephot in eV, not in keV
# Scale Ephot by 1000 (keV -> eV according to the note above this cell).
# NOTE(review): Pavle's description states the channel is already in eV --
# confirm the raw unit. This cell is not idempotent: re-running it scales
# and flips the columns again.
df['Ephot'] = df['Ephot'] * 1000
# The earlier data sets carried the voltage as a positive number, so flip
# the sign of both voltage columns to match.
for volt_col in ('Voltage_s', 'Voltage_t'):
    df[volt_col] = -df[volt_col]
# Same stacked per-column overview as for the raw data, now with small,
# strongly transparent markers.
n_panels = len(df.columns)
fig = plt.figure(figsize=(12, 6 * n_panels))
for idx, col in enumerate(df.columns, start=1):
    ax = fig.add_subplot(n_panels, 1, idx)
    ax.plot(df[col], linestyle='', marker='.', alpha=0.05)
    ax.set_ylabel(col)
# The missing values for some of the measurements of voltage, etc. must be filled in. Forward fill, backward fill... I think all would be equally correct. I go for simple interpolation assuming equally spaced index
# Fill the NaN gaps of these columns by linear interpolation between the
# neighbouring valid samples (rows are treated as equally spaced).
for col in ['Ephot', 'Voltage_s', 'Voltage_t', 'PEPavg', 'pressure']:
    df[col] = df[col].interpolate(method='linear')
# Number of NaNs still left per column. (.count() on the boolean isna() frame
# would only return the row count, so .sum() is the meaningful check.)
# NOTE(review): with pandas' default fill direction, NaNs located before the
# first valid sample of a column may remain -- check this output.
df.isna().sum()
# +
#sns.pairplot(df[['CALCT', 'CALCS', 'Ephot','Voltage_s','Voltage_t','PEPavg']])
# -
# function factory for these models
def mk_pep_pred_fn2(A, B, C):
    """Return a prediction function for fixed model parameters A, B and C.

    The returned callable takes ``(calcs, voltage, ebeam)`` and evaluates

        (1/6.241509e12) * ebeam * calcs / (1 + C * ebeam) / (A * voltage**B)

    using numpy, so scalars, arrays and pandas Series are all accepted.
    """
    def predict(calcs, voltage, ebeam):
        # 6.241509e12 is presumably the number of eV per microjoule -- TODO confirm.
        scaled_signal = (1/6.241509e12) * ebeam * calcs
        energy_correction = 1 + C * ebeam
        voltage_gain = A * np.power(voltage, B)
        return scaled_signal / energy_correction / voltage_gain

    return predict
# Model parameters in the order (A, B, C) expected by mk_pep_pred_fn2.
# NOTE(review): presumably the result of an earlier fit -- confirm origin.
popt2 = [-1.11290561e-29, 6.97740548e+00, -7.66814987e-05]
pep_pred_fn2 = mk_pep_pred_fn2(*popt2)
# Evaluate the model once on the CALCS / Voltage_s / Ephot columns and reuse
# the result for both the summary statistics and the plot.
pep_pred = pep_pred_fn2(df.CALCS, df.Voltage_s, df.Ephot)
pep_pred.describe()
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(pep_pred)
df.head()
df.describe()
# Inspect the rows holding the largest and the smallest CALCS value.
inspect_cols = ['CALCS', 'PEPavg', 'Voltage_s', 'Ephot']
df.loc[df.CALCS == df.CALCS.max(), inspect_cols]
df.loc[df.CALCS == df.CALCS.min(), inspect_cols]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment