Commit f5e5f724 authored by feichtinger's avatar feichtinger
Browse files

start notebook with nicer functions for dealing with HDF5 data sets

parent 181785c8
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SwissFEL Gas-monitor HDF5"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import h5py\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def sfel_gasmon_h5read(fname):\n",
" varmap = { 'SARFE10-PBIG050-EVR0:CALCI': 'CALCI',\n",
" 'SARFE10-PBIG050-EVR0:CALCS': 'CALCS',\n",
" 'SARFE10-PBIG050-EVR0:CALCT': 'CALCT',\n",
" 'SARFE10-PBPG050:FELPHOTENE': 'Ephot',\n",
" 'SARFE10-PBPG050:HAMP-HV-DS': 'Voltage_t',\n",
" 'SARFE10-PBPG050:HAMP-HV-US': 'Voltage_s',\n",
" 'SARFE10-PBPG050:MKS-PRESSURE': 'pressure',\n",
" 'SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-AVG': 'PEPavg',\n",
" 'pulse_id': 'pulse_id'}\n",
" \n",
" f = h5py.File(fname,'r')\n",
" df = pd.DataFrame()\n",
" for grp in f.keys():\n",
" if grp == 'pulse_id':\n",
" continue\n",
" df[grp] = f.get(grp + '/data')[()]\n",
" f.close()\n",
" return df.rename(columns = varmap)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def sfel_gasmon_h5preproc(df, calc_thresh=50.0, interpolate=True):\n",
" \n",
" # first mark all missing data as NaN\n",
" for col in ['Ephot', 'Voltage_s', 'Voltage_t', 'PEPavg','pressure']:\n",
" df.loc[df.loc[:,col] == 0.0, col] = np.nan\n",
" \n",
" if interpolate:\n",
" for col in ['Ephot', 'Voltage_s', 'Voltage_t', 'PEPavg','pressure']:\n",
" df[col] = df[col].interpolate(method='linear')\n",
" \n",
" # CALCS and CALCT are negative in raw data\n",
" # TODO: base all calculations on positive CALC... I leave it for now\n",
" #\n",
" # enforce threshold\n",
" for col in ['CALCS', 'CALCT']:\n",
" df.loc[df.loc[:,col] > -calc_thresh, col] = np.nan\n",
" # remove all missing CALC\n",
" df = df[~(df.CALCS.isna() | df.CALCT.isna()) ]\n",
" \n",
" # in the original data sets I was given, the voltage was a positive number\n",
" # in the raw data Voltages are negative\n",
" df.Voltage_s = -df.Voltage_s\n",
" df.Voltage_t = -df.Voltage_t\n",
" \n",
" # we want to have Ephot in eV, not in keV\n",
" df.Ephot *= 1000\n",
" \n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"datadir = 'pavledata'\n",
"\n",
"#filename='test1.h5'\n",
"#filename='MLData_20200520_1.h5' # problematic - scale seems off\n",
"#filename='MLData_20200527_1.h5'\n",
"#filename='MLData_20200603_1.h5'\n",
"filename='MLData_20200608_1.h5'"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df = sfel_gasmon_h5read(os.path.join(datadir,filename))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CALCI</th>\n",
" <th>CALCS</th>\n",
" <th>CALCT</th>\n",
" <th>Ephot</th>\n",
" <th>Voltage_t</th>\n",
" <th>Voltage_s</th>\n",
" <th>pressure</th>\n",
" <th>PEPavg</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>count</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" <td>100000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean</td>\n",
" <td>2241.292320</td>\n",
" <td>-2212.723284</td>\n",
" <td>-2269.861355</td>\n",
" <td>4.000393</td>\n",
" <td>-710.996154</td>\n",
" <td>-735.890275</td>\n",
" <td>0.000086</td>\n",
" <td>198.913625</td>\n",
" </tr>\n",
" <tr>\n",
" <td>std</td>\n",
" <td>185.227916</td>\n",
" <td>183.186651</td>\n",
" <td>187.537923</td>\n",
" <td>4.000333</td>\n",
" <td>710.985490</td>\n",
" <td>735.879237</td>\n",
" <td>0.000086</td>\n",
" <td>199.010794</td>\n",
" </tr>\n",
" <tr>\n",
" <td>min</td>\n",
" <td>0.000000</td>\n",
" <td>-2612.543566</td>\n",
" <td>-2659.088429</td>\n",
" <td>0.000000</td>\n",
" <td>-1422.120000</td>\n",
" <td>-1471.870000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25%</td>\n",
" <td>2142.731802</td>\n",
" <td>-2324.908259</td>\n",
" <td>-2384.816265</td>\n",
" <td>0.000000</td>\n",
" <td>-1421.970000</td>\n",
" <td>-1471.750000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50%</td>\n",
" <td>2262.659235</td>\n",
" <td>-2233.433017</td>\n",
" <td>-2292.049543</td>\n",
" <td>7.999976</td>\n",
" <td>-1421.770000</td>\n",
" <td>-1471.610000</td>\n",
" <td>0.000167</td>\n",
" <td>378.640406</td>\n",
" </tr>\n",
" <tr>\n",
" <td>75%</td>\n",
" <td>2354.635714</td>\n",
" <td>-2115.424661</td>\n",
" <td>-2170.009429</td>\n",
" <td>8.000626</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000171</td>\n",
" <td>397.783755</td>\n",
" </tr>\n",
" <tr>\n",
" <td>max</td>\n",
" <td>2627.878956</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>8.001325</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000175</td>\n",
" <td>417.950804</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CALCI CALCS CALCT Ephot \\\n",
"count 100000.000000 100000.000000 100000.000000 100000.000000 \n",
"mean 2241.292320 -2212.723284 -2269.861355 4.000393 \n",
"std 185.227916 183.186651 187.537923 4.000333 \n",
"min 0.000000 -2612.543566 -2659.088429 0.000000 \n",
"25% 2142.731802 -2324.908259 -2384.816265 0.000000 \n",
"50% 2262.659235 -2233.433017 -2292.049543 7.999976 \n",
"75% 2354.635714 -2115.424661 -2170.009429 8.000626 \n",
"max 2627.878956 0.000000 0.000000 8.001325 \n",
"\n",
" Voltage_t Voltage_s pressure PEPavg \n",
"count 100000.000000 100000.000000 100000.000000 100000.000000 \n",
"mean -710.996154 -735.890275 0.000086 198.913625 \n",
"std 710.985490 735.879237 0.000086 199.010794 \n",
"min -1422.120000 -1471.870000 0.000000 0.000000 \n",
"25% -1421.970000 -1471.750000 0.000000 0.000000 \n",
"50% -1421.770000 -1471.610000 0.000167 378.640406 \n",
"75% 0.000000 0.000000 0.000171 397.783755 \n",
"max 0.000000 0.000000 0.000175 417.950804 "
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"df2 = sfel_gasmon_h5preproc(df)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CALCI</th>\n",
" <th>CALCS</th>\n",
" <th>CALCT</th>\n",
" <th>Ephot</th>\n",
" <th>Voltage_t</th>\n",
" <th>Voltage_s</th>\n",
" <th>pressure</th>\n",
" <th>PEPavg</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>count</td>\n",
" <td>99702.000000</td>\n",
" <td>99702.000000</td>\n",
" <td>99702.000000</td>\n",
" <td>99701.000000</td>\n",
" <td>99701.000000</td>\n",
" <td>99701.000000</td>\n",
" <td>99701.000000</td>\n",
" <td>99701.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>mean</td>\n",
" <td>2247.991334</td>\n",
" <td>-2219.336908</td>\n",
" <td>-2276.645760</td>\n",
" <td>8000.626362</td>\n",
" <td>1421.963873</td>\n",
" <td>1471.751120</td>\n",
" <td>0.000171</td>\n",
" <td>397.821243</td>\n",
" </tr>\n",
" <tr>\n",
" <td>std</td>\n",
" <td>139.113093</td>\n",
" <td>137.766624</td>\n",
" <td>140.818162</td>\n",
" <td>0.226868</td>\n",
" <td>0.060998</td>\n",
" <td>0.056986</td>\n",
" <td>0.000003</td>\n",
" <td>8.928335</td>\n",
" </tr>\n",
" <tr>\n",
" <td>min</td>\n",
" <td>1186.657053</td>\n",
" <td>-2612.543566</td>\n",
" <td>-2659.088429</td>\n",
" <td>7999.976115</td>\n",
" <td>1421.770000</td>\n",
" <td>1471.610000</td>\n",
" <td>0.000167</td>\n",
" <td>378.640406</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25%</td>\n",
" <td>2144.033670</td>\n",
" <td>-2325.189739</td>\n",
" <td>-2385.116015</td>\n",
" <td>8000.503320</td>\n",
" <td>1421.920000</td>\n",
" <td>1471.700000</td>\n",
" <td>0.000169</td>\n",
" <td>391.143357</td>\n",
" </tr>\n",
" <tr>\n",
" <td>50%</td>\n",
" <td>2263.320249</td>\n",
" <td>-2233.988859</td>\n",
" <td>-2292.645454</td>\n",
" <td>8000.625928</td>\n",
" <td>1421.970000</td>\n",
" <td>1471.750000</td>\n",
" <td>0.000171</td>\n",
" <td>397.790291</td>\n",
" </tr>\n",
" <tr>\n",
" <td>75%</td>\n",
" <td>2354.923350</td>\n",
" <td>-2116.626917</td>\n",
" <td>-2171.158907</td>\n",
" <td>8000.748537</td>\n",
" <td>1422.010000</td>\n",
" <td>1471.800000</td>\n",
" <td>0.000174</td>\n",
" <td>404.896668</td>\n",
" </tr>\n",
" <tr>\n",
" <td>max</td>\n",
" <td>2627.878956</td>\n",
" <td>-1217.230589</td>\n",
" <td>-1156.083516</td>\n",
" <td>8001.324812</td>\n",
" <td>1422.120000</td>\n",
" <td>1471.870000</td>\n",
" <td>0.000175</td>\n",
" <td>417.950804</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CALCI CALCS CALCT Ephot Voltage_t \\\n",
"count 99702.000000 99702.000000 99702.000000 99701.000000 99701.000000 \n",
"mean 2247.991334 -2219.336908 -2276.645760 8000.626362 1421.963873 \n",
"std 139.113093 137.766624 140.818162 0.226868 0.060998 \n",
"min 1186.657053 -2612.543566 -2659.088429 7999.976115 1421.770000 \n",
"25% 2144.033670 -2325.189739 -2385.116015 8000.503320 1421.920000 \n",
"50% 2263.320249 -2233.988859 -2292.645454 8000.625928 1421.970000 \n",
"75% 2354.923350 -2116.626917 -2171.158907 8000.748537 1422.010000 \n",
"max 2627.878956 -1217.230589 -1156.083516 8001.324812 1422.120000 \n",
"\n",
" Voltage_s pressure PEPavg \n",
"count 99701.000000 99701.000000 99701.000000 \n",
"mean 1471.751120 0.000171 397.821243 \n",
"std 0.056986 0.000003 8.928335 \n",
"min 1471.610000 0.000167 378.640406 \n",
"25% 1471.700000 0.000169 391.143357 \n",
"50% 1471.750000 0.000171 397.790291 \n",
"75% 1471.800000 0.000174 404.896668 \n",
"max 1471.870000 0.000175 417.950804 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.describe()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"99995 False\n",
"99996 False\n",
"99997 False\n",
"99998 False\n",
"99999 False\n",
"Name: Voltage_s, Length: 100000, dtype: bool"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TODO: it seems that the first row(s) of the non-CALC columns sometimes contain NaN. This is then\n",
"# not interpolated.... need to reverse interpolate or backwards fill\n",
"df.Voltage_s.isna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"jupytext": {
"formats": "ipynb,py:light"
},
"kernelspec": {
"display_name": "Python [conda env:datascience_py37]",
"language": "python",
"name": "conda-env-datascience_py37-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.4'
# jupytext_version: 1.2.4
# kernelspec:
# display_name: Python [conda env:datascience_py37]
# language: python
# name: conda-env-datascience_py37-py
# ---
# # SwissFEL Gas-monitor HDF5
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
import os
def sfel_gasmon_h5read(fname):
    """Read a SwissFEL gas-monitor HDF5 file into a pandas DataFrame.

    Every top-level entry of the file except ``pulse_id`` is read from its
    ``data`` dataset and stored as one column. The long channel names are
    then replaced by the short aliases listed in ``varmap``; entries that
    are not in ``varmap`` keep their original name.

    Parameters
    ----------
    fname : str
        Path of the HDF5 file; passed unchanged to ``h5py.File``.

    Returns
    -------
    pandas.DataFrame
        One column per top-level entry of the file (``pulse_id`` excluded).

    Raises
    ------
    KeyError
        If a top-level entry has no ``data`` dataset.
    """
    varmap = {
        'SARFE10-PBIG050-EVR0:CALCI': 'CALCI',
        'SARFE10-PBIG050-EVR0:CALCS': 'CALCS',
        'SARFE10-PBIG050-EVR0:CALCT': 'CALCT',
        'SARFE10-PBPG050:FELPHOTENE': 'Ephot',
        'SARFE10-PBPG050:HAMP-HV-DS': 'Voltage_t',
        'SARFE10-PBPG050:HAMP-HV-US': 'Voltage_s',
        'SARFE10-PBPG050:MKS-PRESSURE': 'pressure',
        'SARFE10-PBPG050:PHOTON-ENERGY-PER-PULSE-AVG': 'PEPavg',
        # identity entry; pulse_id is currently skipped by the loop below
        'pulse_id': 'pulse_id',
    }

    # The context manager guarantees the file is closed even when reading a
    # dataset or assigning a column raises.
    with h5py.File(fname, 'r') as f:
        df = pd.DataFrame()
        for grp in f.keys():
            if grp == 'pulse_id':
                continue
            # Direct indexing raises a descriptive KeyError when the dataset
            # is missing; [()] reads the whole dataset into memory.
            df[grp] = f[grp + '/data'][()]

    return df.rename(columns=varmap)
def sfel_gasmon_h5preproc(df, calc_thresh=50.0, interpolate=True):
    """Clean a gas-monitor DataFrame and return the result as a new frame.

    The input frame is left untouched. Processing steps, in order:

    1. In ``Ephot``, ``Voltage_s``, ``Voltage_t``, ``PEPavg`` and
       ``pressure`` a value of exactly 0.0 is treated as missing and
       replaced by NaN.
    2. If ``interpolate`` is true, those columns are linearly interpolated.
       Gaps at the very start or end of a column are filled with the
       nearest valid value, so leading NaNs do not survive.
    3. ``CALCS`` and ``CALCT`` (negative in the raw data) are set to NaN
       wherever they are greater than ``-calc_thresh``; rows in which
       either of the two is NaN are dropped.
    4. The sign of ``Voltage_s`` and ``Voltage_t`` is flipped (the raw
       voltages are negative) and ``Ephot`` is multiplied by 1000
       (keV -> eV).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named above.
    calc_thresh : float, optional
        Minimum magnitude a CALCS/CALCT reading must have to be kept.
    interpolate : bool, optional
        Whether missing values of the non-CALC columns are interpolated.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows; the original index labels
        are preserved.
    """
    slow_cols = ['Ephot', 'Voltage_s', 'Voltage_t', 'PEPavg', 'pressure']
    calc_cols = ['CALCS', 'CALCT']

    # Work on a private copy so the caller's frame is never modified.
    out = df.copy()

    # first mark all missing data as NaN
    out[slow_cols] = out[slow_cols].mask(out[slow_cols] == 0.0)

    if interpolate:
        for col in slow_cols:
            # limit_direction='both' also fills NaNs before the first valid
            # sample, which a plain forward interpolation leaves untouched.
            out[col] = out[col].interpolate(method='linear',
                                            limit_direction='both')

    # CALCS and CALCT are negative in raw data
    # TODO: base all calculations on positive CALC... I leave it for now
    #
    # enforce threshold: keep only readings at or below -calc_thresh
    for col in calc_cols:
        out[col] = out[col].where(out[col] <= -calc_thresh)

    # remove all rows with a missing CALC; the explicit copy makes the
    # column assignments below operate on an independent frame
    out = out.dropna(subset=calc_cols).copy()

    # in the original data sets I was given, the voltage was a positive number
    # in the raw data Voltages are negative
    out['Voltage_s'] = -out['Voltage_s']
    out['Voltage_t'] = -out['Voltage_t']

    # we want to have Ephot in eV, not in keV
    out['Ephot'] = out['Ephot'] * 1000

    return out
# +
# Directory holding the HDF5 input files, relative to the working directory.
datadir = 'pavledata'
# Data set to analyse; the alternatives are kept commented out so that one
# can switch between files quickly.
#filename='test1.h5'
#filename='MLData_20200520_1.h5' # problematic - scale seems off
#filename='MLData_20200527_1.h5'
#filename='MLData_20200603_1.h5'
filename='MLData_20200608_1.h5'
# -
# Load the selected file into a DataFrame with short column names.
df = sfel_gasmon_h5read(os.path.join(datadir,filename))
# Summary statistics of the data as read from the file.
df.describe()
# Run the preprocessing with its default threshold and interpolation settings.
df2 = sfel_gasmon_h5preproc(df)
# Summary statistics after preprocessing.
df2.describe()
# TODO: it seems that the first row(s) of the non-CALC columns sometimes contain NaN. This is then
# not interpolated.... need to reverse interpolate or backwards fill
# NOTE: this inspects `df` (the frame passed to the preprocessing call), not `df2`.
df.Voltage_s.isna()