#!/bin/bash
# requesting the number of nodes needed
#SBATCH --partition=hourly
#SBATCH --time=1:00:00
#SBATCH --ntasks=40
# do not touch these unless you know what you are doing!
#SBATCH --clusters=merlin6
#SBATCH --output=meg/slurm-%j.out
#SBATCH --input=meg/menu.txt

echo "Started as $0 $*"

# Path under which this script re-invokes itself as a runner (see run()).
this=./submit.sh

# Runner mode: when the first argument is a *.conf / *.tconf file, this
# process executes exactly one job and exits. Arguments:
#   $1 config file (sourced)
#   $2 seed
#   $3 xi cut
#   $4 which_piece
#   $5 flavour
#   $6 cuts flag (optional)
#   $7 del cut (optional; falls back to the xi cut)
# Any other first argument only prints a notice and falls through to the
# scheduler code below.
if [ $# -gt 0 ]; then
  if [[ $1 == *.conf || $1 == *.tconf ]]; then
    # We are a runner

    echo "[$(date)] Runner start as $0 $*"
    hostname

    seed=$2
    xi=$3
    part=$4
    flavour=$5
    cuts=$6
    del=${7:-$xi}
    binary="mcmule"

    # Sourced after the default above, so the config file may override
    # `binary`. Presumably it also provides STAT and folder (and, for .tconf
    # files, containerid) -- TODO confirm against an actual config file.
    source "$1"

    olddir=$(pwd)

    # Input fed to the binary on stdin: the STAT entry for this piece followed
    # by the run parameters, one per line (echo -e expands the \n escapes).
    runstring="${STAT[$part]}"
    runstring="${runstring}\n$seed\n$xi\n$del\n$part\n$flavour\n$cuts"

    if [ -z "$containerid" ]; then
      # Native run: hash the local binary and source tree, then run inside
      # the output folder.
      binary="$(pwd)/$binary"
      sha1=$(sha1sum < "$binary")
      sha2=$(make hash)
      # Abort instead of running the binary in the wrong directory.
      cd "$folder" || { echo "Cannot change into folder '$folder'"; exit 1; }
    else
      # Container run: hash inside the container and mount the output folder.
      sha1=$(udocker run "$containerid" sh -c "cat /monte-carlo/$binary | sha1sum")
      sha2=$(udocker run "$containerid" sh -c "cd /monte-carlo/ && make hash")
      binary="stdbuf -oL -eL udocker run --volume=$(pwd)/$folder/out/:/root/out $containerid /monte-carlo/$binary"
    fi
    echo "[$(date)] Runner start as $0 $*"
    echo "[$(date)] Hash of mcmule: $sha1"
    echo "[$(date)] SHA of source: $sha2"
    echo -e "$runstring"
    # $binary is deliberately left unquoted: in container mode it holds a
    # complete command line that must be word-split.
    echo -e "$runstring" | $binary

    echo "[$(date)] Runner $0 $* finishes"

    cd "$olddir"
    exit 0
  else
    echo "Not a valid config file. skipping."
  fi
fi

# SIGINT handler used on non-merlin systems.
#
# Prints a warning banner on the saved terminal descriptor (fd 6), waits three
# seconds, then removes the SIGINT trap for five seconds so that a second ^C
# really interrupts the script. If no second SIGINT arrives, the traps are
# re-installed and stdout is pointed back at the log.
#
# Globals:  reads/closes/re-creates fd 6; re-installs the SIGINT and SIGCHLD
#           traps; calls list_descendants and wakeup.
function nonmerlintrap() {
    local i

    # Reset SIGCHLD handling while the banner is shown.
    trap - SIGCHLD
    # Send stdout to the descriptor saved in fd 6 and close fd 6.
    exec 1>&6 6>&-
    cat <<'EOF'
+----------------------------------------------------------+
|  __          __              _                           |
|  \ \        / /             (_)                          |
|   \ \  /\  / /_ _ _ __ _ __  _ _ __   __ _               |
|    \ \/  \/ / _` |  __|  _ \| | '_ \ / _` |              |
|     \  /\  / (_| | |  | | | | | | | | (_| |              |
|      \/  \/ \__,_|_|  |_| |_|_|_| |_|\__, |              |
|                                       __/ |              |
|                                      |___/               |
+----------------------------------------------------------+

 You just send SIGINT (most likely through ^C). For you
 own good, this is blocked. I'm unblocking it now for five
 seconds. If you are sure, hit it again in three seconds.

------------------------------------------------------------
EOF
    for i in {3..1} ; do
        echo -ne "\rUnblocking in in ${i}s\r"
        sleep 1
    done
    echo "Trap removed!               "
    trap - SIGINT
    # Job control is switched off for the duration of the unblocked window.
    set +m
    for i in {5..1} ; do
        echo -ne "\rBlocking in in ${i}s\r"
        # sleep fails when it is interrupted; give up in that case.
        sleep 1 || exit 1
    done
    set -m
    echo "Trap added!                 "
    # Unquoted on purpose: collapses the PID list onto a single line.
    echo $(list_descendants $$)
    trap 'nonmerlintrap' SIGINT
    # Save the terminal in fd 6 again AND point stdout back at the log.
    # stderr was never touched by this handler, so it still refers to the
    # log; without the second redirection all later output would stay on the
    # terminal after the first ^C.
    exec 6>&1 1>&2
    trap 'wakeup' SIGCHLD
    wakeup
}
# Print the PIDs of all descendants of a process, deepest generations first.
#
# Arguments: $1 - PID whose descendants are listed
# Outputs:   PIDs on stdout as printed by ps, separated by whitespace and
#            newlines; nothing at all for a process without children.
#            Callers are expected to word-split the result.
function list_descendants () {
    # Both variables are local so the recursion does not leak or clobber
    # globals; the declaration is kept separate from the assignment so that
    # `local` does not mask the status of ps.
    local children pid
    children=$(ps -o pid= --ppid "$1")
    # Intentionally unquoted: one word per child PID.
    for pid in $children
    do
        list_descendants "$pid"
    done
    if [ -n "$children" ]; then
        echo "$children"
    fi
}
# Terminate the scheduler: kill all descendant processes (only when srun is
# not available, i.e. on non-merlin systems), remove the udocker container if
# one was created, and exit.
#
# Globals:  containerid (read)
# Returns:  never returns; exits with the status of the last command run.
function diegracefully () {
    local pids

    echo "Killing $$ (self) and all children"
    if [ ! -f "/usr/bin/srun" ]; then
        pids=$(list_descendants $$)
        # kill without arguments only prints a usage error, so skip it when
        # there is nothing to kill.
        if [ -n "$pids" ]; then
            # Unquoted on purpose: one argument per PID.
            kill $pids
        fi
    fi
    if [ -n "$containerid" ]; then
        echo "Removing docker container $containerid"
        udocker unprotect "$containerid"
        udocker rm "$containerid"
    fi
    exit
}

spid=1
# SIGCHLD handler: if the process recorded in $spid is the scheduler's
# "sleep infinity" placeholder, kill it and reset spid to 1.
#
# Globals:  spid (read, written), psout (written)
wakeup() {
    psout=$(ps -o pid= -o cmd= -p $spid)
    case "$psout" in
        *"sleep infinity"*)
            echo "Killing $spid"
            kill $spid
            spid=1
            ;;
    esac
}

# Environment setup. The presence of /usr/bin/srun decides whether this is
# treated as a "merlin" (Slurm) system.
if [ -f "/usr/bin/srun" ]; then
    echo "This is a merlin system"
    # Very large limit for the scheduler's own throttling; presumably srun
    # limits concurrency on this path -- TODO confirm.
    maxjobs=100000
    trap 'diegracefully' TERM
else
    # Without Slurm: emulate the pieces the rest of the script relies on.
    # Job control is needed for `jobs` and the SIGCHLD-driven wakeup.
    set -m
    # Fake job id: seconds since the epoch.
    SLURM_JOB_ID=$(date +%s)
    export SLURM_JOB_ID
    # Reuse the #SBATCH directives of this very file as configuration.
    maxjobs=$( grep "^#SBATCH --ntasks" "$0" | cut -d'=' -f2 )
    menufile=$( grep "^#SBATCH --input" "$0" | cut -d'=' -f2 )
    outputfile=$(dirname "$0")/nm-$SLURM_JOB_ID.out
    echo "This is *not* a merlin system"
    echo "Redirecting all output to $outputfile and reading from $menufile"
    # Keep a handle on the original stderr in fd 6; nonmerlintrap uses it to
    # write its warning banner.
    exec 6>&2
    # redirect everything
    exec < "$menufile"
    exec >> "$outputfile"
    exec 2>&1
    # Repeat the notice so it also ends up in the log file.
    echo "This is *not* a merlin system"
    echo "Redirecting all output to $outputfile and reading from $menufile"
    echo "Running with $maxjobs jobs"
    trap 'nonmerlintrap' SIGINT
    trap 'diegracefully' EXIT
    trap 'wakeup' SIGCHLD
fi

# Join all arguments into one underscore-separated string on stdout.
joinname() {
    local -a parts=("$@")
    # IFS is local, so the separator change does not escape the function.
    local IFS="_"
    echo "${parts[*]}"
}

# https://lunarc-documentation.readthedocs.io/en/latest/batch_system/#running-multiple-serial-jobs-within-a-single-job-submission
#
# Launch one runner in the background and return after a one-second pause.
#
# Arguments are:
#  1. config file
#  2. seed
#  3. xi cut
#  4. which_piece
#  5. flavour
#  6. cuts flag (opt)
#  7. delcut (opt) if not present xi cut is used
# Globals:  this, folder, SLURM_JOB_ID (read). folder is presumably set by
#           the sourced config file -- TODO confirm.
# Outputs:  log lines on stdout; the runner's stdout and stderr go to
#           $folder/worker_<args joined by _>_<job id>.
function run (){
  local conf outfile

  conf=$1 ; shift
  source "$conf"
  echo "[$(date)] Executing job $this $conf $*"

  outfile=$folder/worker_$(joinname "$@")_${SLURM_JOB_ID}
  echo "srun --exclusive -n 1 -c 1 $this $conf $* &> $outfile &"
  if [ -f "/usr/bin/srun" ]; then
      # --exclusive guarantees that each job gets its own CPU core
      # -n 1: one task
      # -c 1: one CPU per task
      srun --exclusive -n 1 -c 1 $this "$conf" "$@" &> "$outfile" &
  else
      # No Slurm: start the runner directly, line-buffered so its log is
      # written promptly.
      stdbuf -oL -eL $this "$conf" "$@" &> "$outfile" &
  fi
  sleep 1
}

# Self-tracking: record which revision of the code is being run. When the
# working tree has uncommitted changes, a gzipped diff is stored next to the
# script; a gzipped copy of user.f95 is stored as well if that file exists.
# Redirection order matters: stdout must go to /dev/null first so that
# 2>&1 silences git's error message as well.
if ! git ls-files > /dev/null 2>&1 ; then
  echo "Warning! not a git repository, no self-tracking"
else
  gitrev=$(git rev-parse --short HEAD)
  echo "Git rev is $gitrev (branch $(git rev-parse --abbrev-ref HEAD))"
  if [ -z "$(git status --porcelain)" ]; then
    echo "Working directory clean"
  else
    patchpath=$(dirname "$this")/diff-${gitrev}-${SLURM_JOB_ID}.patch.gz
    git diff | gzip > "$patchpath"
    patchsize=$(git diff | wc -c)
    # Read via stdin so wc prints only the byte count, not "<count> <file>".
    csize=$(wc -c < "$patchpath")
    echo "There are uncommitted changes. I have created"
    echo "   $patchpath ($patchsize byte > $csize)"
    echo "with the differences to $gitrev."
  fi
  if test -f "user.f95"; then
    userpath=$(dirname "$this")/user-${gitrev}-${SLURM_JOB_ID}.f95.gz
    gzip < user.f95 > "$userpath"
    echo "I have create a copy of the user file at $userpath"
  else
    echo "Warning! I cannot find the user file, no self-tracking"
  fi

fi

# Main scheduler: read the menu from stdin line by line, dispatch each
# command, then wait for all background jobs. The whole group is timed.
#
# Menu commands (first word of a line):
#   image <img> [userfile]  create a udocker container from <img>
#   conf <file>             select the config file for following runs
#   run <args...>           start one job; <args> are passed on to run():
#                             $1 seed, $2 xicut, $3 part, $4 flavour,
#                             $5 cuts (opt), $6 delcut (opt)
#                           (the runner sees them shifted by one, after the
#                           config file)
#   #...                    comment, echoed to the log
time {
if [ ! -f "/usr/bin/srun" ]; then
    # Non-merlin: job control and the SIGCHLD wakeup are required for the
    # throttling below.
    set -o monitor
    trap 'wakeup' SIGCHLD
fi

config=""
while IFS=" " read -r command args
do
    [ -z "$command" ] && continue
    # Intentionally unquoted: split the rest of the line into $1, $2, ...
    set -- $args

    # Throttle: do not read further while maxjobs background jobs exist.
    runningjobs=$(jobs | wc -l)
    if [ "$runningjobs" -ge "$maxjobs" ]; then
        echo "[$(date)] $runningjobs running. Standing by"
        jobs
        while [ "$runningjobs" -ge "$maxjobs" ] ; do
            # Park on a placeholder process; the SIGCHLD trap (wakeup) kills
            # it, which makes the wait below return.
            sleep infinity & spid=$!
            echo "Falling asleep with spid=$spid"
            wait $spid
            echo "Awake!"
            runningjobs=$(jobs | wc -l)
        done
        echo "[$(date)] A job returned. Continue reading"
        jobs
    fi

    case "$command" in
        image)
            if [ -n "$containerid" ]; then
                echo "Only one image specification is allowed"
                diegracefully
                # Not reached if diegracefully exits; kept as a safety net.
                exit 1
            fi
            img=$1
            echo "Using Docker image $img"
            # Assign first, export afterwards, so export does not mask the
            # status of udocker create.
            containerid=$(udocker create "$img")
            export containerid
            echo "Created container $containerid"
            udocker protect "$containerid"
            if [ -n "$2" ]; then
                # A user file was given: copy it into the container, rebuild.
                userpath=$2
                echo "Copy user file $userpath"
                udocker run "$containerid" sh -c "cat > /monte-carlo/src/user.f95" < "$userpath"
                udocker run "$containerid" sh -c "cd /monte-carlo && touch src/mat_el.f95 && make" || exit
            else
                # No user file: archive the one shipped in the container.
                userpath=$(dirname "$this")/user-$img-${SLURM_JOB_ID}.f95.gz
                echo udocker run "$containerid" cat /monte-carlo/src/user.f95
                udocker run "$containerid" cat /monte-carlo/src/user.f95 | gzip > "$userpath"
            fi

            # Archive the container's uncommitted source changes.
            patchpath=$(dirname "$this")/diff-$img-${SLURM_JOB_ID}.patch.gz
            echo udocker run "$containerid" sh -c "cd /monte-carlo && git diff"
            udocker run "$containerid" sh -c "cd /monte-carlo && git diff" | gzip > "$patchpath"
            ;;
        run)
            echo "running with $args"
            run "$config" "$@"
            ;;
        conf)
            echo "loading config file $1"
            config=$1
            if [ -n "$containerid" ]; then
                # With a container, hand the runners a per-job copy of the
                # config that additionally defines containerid.
                tconf=${config%.conf}.$SLURM_JOB_ID.tconf
                cat "$config" > "$tconf"
                echo "containerid=\"$containerid\"" >> "$tconf"
                config=$tconf
            fi
            ;;
        [#]* )
            echo "Comment $command $args"
            ;;
        *)
            echo "Unknown command $command"
            ;;
    esac
done

echo "[$(date)] All jobs scheduled. Standing by for result"

# Repeat the wait whenever it returns non-zero, which presumably happens
# when the SIGCHLD trap interrupts it -- TODO confirm.
until wait ; do : ; done

}

echo "[$(date)] All jobs returned"
# Normal completion: drop the EXIT trap (diegracefully on non-merlin systems)
# and the SIGCHLD trap so they do not fire during shutdown.
trap - EXIT
trap - SIGCHLD
# Remove the udocker container if one was created by an `image` command.
if [ -n "$containerid" ]; then
    udocker ps
    udocker unprotect "$containerid"
    udocker rm "$containerid"
fi