"""
Simulation class handles batch job related things

@author: Andreas Adelmann <andreas.adelmann@psi.ch>
@author: Yves Ineichen
@version: 0.1
"""

import sys
import os
import shutil
import subprocess


# Helper methods
def isInDirectory(filepath, directory):
    # From https://stackoverflow.com/questions/3812849/how-to-check-whether-a-directory-is-a-sub-directory-of-another-directory
    ''' Check if filepath is inside directory '''
    return os.path.realpath(filepath).startswith(os.path.realpath(directory) + os.sep)
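
# Example (illustrative): with directory='/home/user', the path
# '/home/user/run1/maps' lies inside it, while '/home/user2/maps' does not.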

def linkDirectory(path, name=''):
    '''Make files available in working directory with recursive symbolic links'''
    # Check for recursiveness
    if isInDirectory(os.getcwd(), path):
        print(name + ' directory is a subdirectory of the working directory! runOPAL cannot handle this... bye!')
        sys.exit()
    # lndir; if it fails, fall back to cp -rs
    if os.system('lndir ' + path) != 0:
        print("lndir failed (possibly doesn't exist on this system), using cp -rs...")
        if os.listdir(path):
            os.system('cp -rs ' + path + '/* .')
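
# Example (illustrative): linkDirectory('../fieldmaps', 'Fieldmap') fills the
# working directory with symbolic links to everything under ../fieldmaps.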


def linkFile(path, name):
    '''Make a file available in working directory with a symbolic link'''
    path = os.path.join(path, name)
    if not os.path.isfile(path):
        print(f'{name} cannot be found')
        sys.exit()
    os.system('ln -s ' + path + ' .')


def extractStr(line, name):
    '''Extract the quoted value following name in line, or None if absent'''
    zero = line.find(name)
    if zero < 0:
        return None
    start = min(x for x in [line.find('"', zero), line.find("'", zero)] if x > 0) + 1
    end = min(x for x in [line.find('"', start), line.find("'", start)] if x > 0)
    return line[start:end]
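
# Example (illustrative):
#   extractStr('OPTIMIZE, INPUT="templates/opt.tmpl";', 'INPUT')
# returns 'templates/opt.tmpl'.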


class Simulation:
    def __init__(self, opaldict):
        self.opaldict = opaldict
        self.dirname = ""

    @staticmethod
    def createDirectory(dirname, doKeep, quiet):
        '''
        Helper method to create a directory.

        Parameters
        ==========
        dirname: str
            directory to create
        doKeep: bool
            if True, keep the directory if it already exists, else delete it
        quiet: bool
            if True, do not print output

        Returns
        =======
        bool
            Whether a new directory was created
        '''
        # If there's already a directory remove it...
        if os.path.isdir(dirname):
            if doKeep:
                print('KEEP existing directory {}'.format(dirname))
                return False
            else:
                if not quiet:
                    print('REMOVE existing directory {}'.format(dirname))
                shutil.rmtree(dirname)

        # create directory
        os.mkdir(dirname)
        return True

    def run(self, N, baseFileName, inputfilePath, tmplFile, oinpFile,
            doTest, doKeep, doNobatch, doOptimize, info, queue, hypert, quiet):
        '''
        Run an OPAL simulation.

        Support for batch systems is available.

        Parameters
        ==========
        N: int
            A running number.
            Useful when multiple output directories are needed
            Has no effect if it is < 0.
        baseFileName: str
            The base name of the simulation.
            If the template file is called mysim.tmpl,
            then the data file must be called mysim.data
            and the base name is mysim
        inputfilePath: str
            Path to the directory that contains the .in file
            (if running an optimisation).
            If running a single simulation: Path to the directory that
            contains the .tmpl file(s)
        tmplFile: str
            Path to the .tmpl file
        oinpFile: str
            Path to the .in file (the OPAL input file)
        doTest: bool
            if True, does everything but submitting the job
        doKeep: bool
            if True:
                if same simulation has been run before, keep old data and abort
        doNobatch: bool
            run OPAL locally not using the batch system
            and wait until the job is done
        doOptimize: bool
            use optimization template (if any)
        info: int
            Steers the std-output of OPAL.
            The range is 0 < num < 6 (default), from minimum to maximum output
        queue: str
            Defines in which queue the job goes.
        hypert: int
            Defines the number of Hyper-Threads used. Default: 0
        quiet: bool
            suppress debug printout
        '''
        # make directory name indicating changed values
        self.dirname = baseFileName
        if N >= 0:
            self.dirname += str(N)
        self.dirname += self.opaldict.generateDirectoryName()
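        # Example (hypothetical values): baseFileName='mysim' and N=0 give
        # 'mysim0' plus whatever suffix generateDirectoryName() derives from
        # the changed parameters.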

        try:
            CORES = self.opaldict['CORES']
        except KeyError:
            print("CORES not set, bye bye")
            sys.exit(1)

        if not self.createDirectory(self.dirname, doKeep, quiet):
            print("Simulation results already exist")
            return
        os.chdir(self.dirname)

        # Linking magnet and RF files
        if (os.environ.get('FIELDMAPS')):
            fieldmapPath = os.environ.get('FIELDMAPS')
        else:
            fieldmapPath = '../fieldmaps'
            if not (os.path.isdir(fieldmapPath)):
                print('Fieldmap directory unknown, exiting ...')
                sys.exit()
        linkDirectory(fieldmapPath, 'Fieldmap')

        # Link distribution directory if present
        if (os.environ.get('DISTRIBUTIONS')):
            distributionPath = os.environ.get('DISTRIBUTIONS')
            if os.path.isdir(distributionPath):
                linkDirectory(distributionPath, 'Distribution')

        # Read in the template file
        filedata = None
        with open(tmplFile, 'r') as file:
            filedata = file.read()
        # do the replacements in the template file
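        # Example (illustrative): with an opaldict entry CORES=4, every
        # occurrence of the token _CORES_ in the template becomes 4.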
        for s, value in self.opaldict.items():
            # Replace the target string
            filedata = filedata.replace('_'+s+'_', str(value))
        # Write the file out again
        with open(oinpFile, 'w') as file:
            file.write(filedata)

        # NOTE:
        # What's the best place to link the tmpl file?
        # $TEMPLATES, _TEMPLATEDIR_, or parsing?
        if doOptimize:
            flag = False
            tmplDir = None
            tmplIn = None
            templateFile = open(oinpFile, 'r')
            for line in templateFile:
                if not line.startswith('//'):
                    if 'OPTIMIZE' in line:
                        flag = True
                    if flag and not tmplDir:
                        tmplDir = extractStr(line, 'TEMPLATEDIR')
                    if flag and not tmplIn:
                        # guard: the line may not carry an INPUT field
                        inputStr = extractStr(line, 'INPUT')
                        if inputStr:
                            tmplIn = inputStr.split('/')[-1]
            templateFile.close()
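            # Example (illustrative) of the line parsed above:
            #   OPTIMIZE, INPUT="templates/opt.tmpl", TEMPLATEDIR="templates";
            # which yields tmplDir='templates' and tmplIn='opt.tmpl'.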

            linkFile('..', tmplIn[:-5]+'.data')
            os.mkdir(tmplDir)
            os.chdir(tmplDir)
            linkFile(os.path.join('../..', tmplDir), tmplIn)
            os.chdir('..')

        if os.environ.get('OPAL_EXE_PATH'):
            if doNobatch:
                opalexe = os.environ.get('OPAL_EXE_PATH') + '/opal'
            else:
                opalexe = '$OPAL_EXE_PATH/opal'
        else:
            opalexe = 'opal'
        if not quiet:
            print(f'Simulation directory is {self.dirname} using OPAL at {os.environ.get("OPAL_EXE_PATH")}')
            print('Using template file at ' + inputfilePath)
            print('Using fieldmaps at    ' + fieldmapPath)
            print('Parameters set in ' + oinpFile + ' are:')
            for s, value in sorted(self.opaldict.items()):
                print(' :::: ' + s + ' = ' + str(value))

        if not doNobatch:
            hostname = (subprocess.check_output('hostname').decode('utf-8')).strip()
            if not quiet:
                print("On host {}".format(hostname))

            if os.getenv("SGE_TIME"):
                print("You are using the deprecated environment variable SGE_TIME. Please use TIME in the future")
                time = os.getenv("SGE_TIME")
            else:
                #print('You did not set a time limit. Using default: s_rt=23:59:00,h_rt=24:00:00')
                time = os.getenv("TIME", "s_rt=23:59:00,h_rt=24:00:00")

            if os.getenv("SGE_RAM"):
                print("You are using the deprecated environment variable SGE_RAM. Please use RAM in the future")
                ram = os.getenv("SGE_RAM")
            else:
                ram = os.getenv("RAM", "4")
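            # Example (illustrative): export TIME="s_rt=11:59:00,h_rt=12:00:00"
            # and export RAM="8" request a 12 h wall time and 8 GB of memory.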

            if not queue:
                queue = os.environ.get('QUEUE', os.getenv("SGE_QUEUE", "prime_bd.q"))

            # Merlin6
            if (hostname.startswith("merlin-l")):
                batchsys = 'SLURM'
                runfile = 'run.merlin6'
                time = os.getenv("SLURM_TIME", "23:59:59")
                ram = os.getenv("SLURM_RAM", "36")
                partition = os.getenv("SLURM_PARTITION", "daily")
                self.WriteMerlin6(opalexe, oinpFile, CORES, time,
                                  ram, info, runfile, partition)

            # ANL theta.alcf.anl.gov
            elif (hostname.startswith("theta")):
                batchsys = 'COBALT'
                runfile = 'run.sh'
                self.WriteTheta(opalexe, oinpFile, CORES, time,
                                ram, info, queue, hypert)

            # ANL blues.lcrc.anl.gov
            elif (hostname.startswith("blogin")):
                batchsys = 'PBS'
                runfile = 'run.blues'
                self.WritePBSBlues(opalexe, oinpFile, CORES, time,
                                   ram, info, queue)

            # ANL Bebop
            elif (hostname.startswith("bebop")
                  or hostname.startswith("bdw")
                  or hostname.startswith("knl")):
                batchsys = 'SLURM'
                runfile = 'run.bebop'
                time = os.getenv("TIME", time)
                self.WriteBebop(opalexe, oinpFile, CORES, time,
                                ram, info, runfile, queue, hypert, quiet)

            # NERSC Cori Haswell
            elif (hostname.startswith("cori")):
                batchsys = 'SLURM'
                runfile = 'run.cori'
                self.WriteCori(opalexe, oinpFile, CORES, time,
                               ram, info, runfile)

            # NERSC Edison
            elif (hostname.startswith("edison")):
                batchsys = 'SLURM'
                runfile = 'run.edison'
                self.WriteEdison(opalexe, oinpFile, CORES, time,
                                 ram, info, runfile)

            # CSCS Piz-Daint
            elif (hostname.startswith("daint")):
                batchsys = 'SLURM'
                runfile = 'run.daint'
                time = os.getenv("SLURM_TIME", "00:01:00")
                ram = os.getenv("SLURM_RAM", "36")
                partition = os.getenv("SLURM_PARTITION", "normal")
                account = os.getenv("SLURM_ACCOUNT", "psi07")
                self.WritePizDaint(opalexe, oinpFile, CORES, time,
                                   ram, info, runfile, partition, account)

            elif (hostname.startswith("eofe")):
                batchsys = 'SLURM'
                runfile = 'run.engaging'
                time = os.getenv("SLURM_TIME", "24:00:00")
                ram = os.getenv("SLURM_RAM", "120")
                self.WriteEngaging(opalexe, oinpFile, CORES, time,
                                   ram, info, runfile)

            else:
                print("Hostname not known, bye bye")
                sys.exit(1)

        qid = -1

        if doTest:
            if not quiet:
                print('Done with setup of the OPAL simulation but not submitting the job (--test) \n\n\n')

        elif doNobatch:
            if not quiet:
                print(f'Done with setup of the OPAL simulation and executing the job on {CORES} cores...\n\n\n')
            ofn, fileExtension = os.path.splitext(oinpFile)
            if not quiet:
                print('STD output is written to {}.out'.format(ofn))
            #execommand = 'mpirun -np ' + str(CORES)  + ' ' + opalexe + ' ' + oinpFile + '  2>&1 | tee ' + ofn + '.out'
            outfileName = ofn + '.out'
            # Currently not writing to screen anymore
            # There is a solution described at https://stackoverflow.com/questions/15535240/python-popen-write-to-stdout-and-log-file-simultaneously
            with open(outfileName, 'w') as outfile:
                qid = subprocess.call(['mpirun', '-np', str(CORES), opalexe, oinpFile],
                                      stdout=outfile,
                                      stderr=outfile)

        else:
            if batchsys == 'SLURM' or batchsys == 'COBALT':
                if batchsys == 'SLURM':
                    command = 'sbatch'
                elif batchsys == 'COBALT':
                    command = 'qsub'

                # Note: subprocess.call returns the exit status, not the job id;
                # a shell pipe ('| awk ...') cannot be passed as extra list arguments.
                qid = subprocess.call([command, runfile])
                if not quiet:
                    print(f'Done with setup of the OPAL simulation and submitting the job with {CORES} cores \n\n\n')
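
                # A sketch (not used here) of how one could capture the actual
                # job id instead, assuming the scheduler prints it as the last
                # word (e.g. 'Submitted batch job <id>' for sbatch):
                #   out = subprocess.check_output([command, runfile]).decode('utf-8')
                #   qid = int(out.strip().split()[-1])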

            elif batchsys == 'PBS':
                if not quiet:
                    print('Done with setup of the OPAL simulation, please submit the job yourself')

            else:
                print("Batch system", batchsys, "not known!")

        os.chdir('..')
        return qid

    # Write run files for specific hosts
    def WriteCori(self, opalexe, oinpFile, cores, time, ram, info, name):
        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH -p regular \n"
        s1 += "#SBATCH -N 1 \n"
        s1 += "#SBATCH -t " + time + "\n"
        s1 += "#SBATCH -J " + title + "\n"
        s1 += "#SBATCH --qos=premium \n"
        # NOTE: the actual srun command line is still a stub
        s1 += "srun -n 1 .... \n"
        myfile.write(s1)
        myfile.close()

    def WriteEngaging(self, opalexe, oinpFile, cores, time, ram, info, name):
        print("Writing SLURM run file for Engaging cluster at MIT")

        cores = int(cores)
        coresPerNode = 32
        partition = os.getenv("SLURM_PARTITION", "sched_mit_psfc")

        if (cores % coresPerNode) == 0:
            nodes = int(cores/coresPerNode)
        else:
            nodes = int(cores/coresPerNode) + 1
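        # Example (illustrative): cores=48, coresPerNode=32 -> 48 % 32 != 0,
        # so nodes = int(48/32) + 1 = 2.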

        with open(name, 'w') as outfile:
            outfile.write("#!/bin/bash\n"
                          "# submit with sbatch {}\n"
                          "# commandline arguments may instead be supplied with #SBATCH <flag> <value>\n"
                          "# commandline arguments override these values\n"
                          "\n"
                          "# Number of nodes\n".format(name))
            outfile.write("#SBATCH -N {}\n".format(nodes))
            outfile.write("# Number of total processor cores \n")
            outfile.write("#SBATCH -n {}\n".format(cores))
            outfile.write("# Memory (MB) \n")
            outfile.write("#SBATCH --mem {}\n".format(int(ram) * 1000))
            outfile.write("# specify how long your job needs.\n")
            outfile.write("#SBATCH --time={}\n".format(time))
            outfile.write("# which partition or queue the jobs runs in\n")
            outfile.write("#SBATCH -p {}\n".format(partition))
            outfile.write("#customize the name of the stderr/stdout file. %j is the job number\n")
            outfile.write("#SBATCH -o {}.o%j".format(os.path.splitext(oinpFile)[0]))
            outfile.write("\n")
#            outfile.write("#load default system modules\n")
#            outfile.write(". /etc/profile.d/modules.sh")
#            outfile.write("\n")
#            outfile.write("#load modules your job depends on.\n")
#            outfile.write("#better here than in your $HOME/.bashrc to make "
#                         "debugging and requirements easier to track.\n")
#            outfile.write("module load gcc/4.8.4\n")
#            outfile.write("module load engaging/openmpi/1.8.8\n")
#            outfile.write("module load engaging/cmake/3.5.2\n")
#            outfile.write("module load engaging/boost/1.56.0\n")
#            outfile.write("module load engaging/gsl/2.2.1\n")
#            outfile.write("\n")
            outfile.write("####################################################\n")
            outfile.write("# BEGIN DEBUG\n")
            outfile.write("# Print the SLURM environment on master host: \n")
            outfile.write("####################################################\n")
            outfile.write("echo '=== Slurm job  JOB_NAME=$JOB_NAME  JOB_ID=$JOB_ID'\n")
            outfile.write("####################################################\n")
            outfile.write("echo DATE=`date`\n")
            outfile.write("echo HOSTNAME=`hostname`\n")
            outfile.write("echo PWD=`pwd`\n")
            outfile.write("####################################################\n")
            outfile.write("echo 'Running environment:' \n")
            outfile.write("env \n")
            outfile.write("####################################################\n")
            outfile.write("echo 'Loaded environment modules:' \n")
            outfile.write("module list 2>&1\n")
            outfile.write("echo \n")
            outfile.write("# END DEBUG\n")
            outfile.write("####################################################\n")
            outfile.write("\n")
            outfile.write("#Finally, the command to execute.\n")
            outfile.write("#The job starts in the directory it was submitted from.\n")
            outfile.write("#Note that mpirun knows from SLURM how many processors we have\n")
            outfile.write(f"mpirun {opalexe} {oinpFile} --info {info} --warn 6\n")

    def WriteEdison(self, opalexe, oinpFile, cores, time, ram, info, name):
        title = oinpFile.partition(".")[0]

        coresPerNode = 24
        cores = int(cores)

        if cores % coresPerNode == 0:
            nodes = int(cores / coresPerNode)
        else:
            nodes = int(cores / coresPerNode) + 1

        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH -q regular \n"
        s1 += "#SBATCH -N " + str(nodes) + " \n"
        s1 += "#SBATCH -t " + time + "\n"
        s1 += "#SBATCH -J " + title + "\n"
        s1 += "#SBATCH -o " + title + ".o%j\n"
        s1 += "#SBATCH -L SCRATCH \n"
        s1 += "srun -n " + str(cores) + " " + opalexe + " " + oinpFile + "\n"

        myfile = open(name, 'w')
        myfile.write(s1)
        myfile.close()

    def WriteMerlin6(self, opalexe, oinpFile, cores, time,
                     ram, info, name, partition):
        # ADA this is for the new PSI Merlin6
        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH --job-name=" + title + "\n"
        s1 += "#SBATCH --output=" + title + ".o%j\n"
        s1 += "#SBATCH --time=" + time + "\n"
        s1 += "#SBATCH --ntasks=" + str(cores) + "\n"
        s1 += "#SBATCH --ntasks-per-core=1 \n"
        # s1 += "#SBATCH --constraint=mc \n"
        # Discussed in https://gitlab.psi.ch/OPAL/runOPAL/issues/7:
        #if (int(cores) > 22):
        #    s1 += "#SBATCH --ntasks-per-node=16 \n"
        #else:
        #    s1 += "#SBATCH --nodes=1 \n"
        s1 += "#SBATCH --partition=" + str(partition) + " \n"
        # s1 += "#SBATCH --exclude=merlin-c-001 \n"
        s1 += "#SBATCH --cores-per-socket=22 \n"
        s1 += "#SBATCH --sockets-per-node=2 \n"
        s1 += f"mpirun {opalexe} {oinpFile} --info {str(info)}\n"
        myfile.write(s1)
        myfile.close()

    def WritePizDaint(self, opalexe, oinpFile, cores, time,
                      ram, info, name, partition, account):
        # XC40 Compute Nodes
        # Intel Xeon E5-2696 v4 @ 2.10GHz (2x18 cores, 64/128 GB RAM)
        # http://user.cscs.ch/computing_systems/piz_daint/index.html
        coresPerNode = 36
        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH --job-name=" + title + "\n"
        s1 += "#SBATCH --time=" + time + "\n"
        s1 += "#SBATCH --ntasks=" + str(cores) + "\n"
        s1 += "#SBATCH --ntasks-per-node=" + str(coresPerNode) + " \n"
        s1 += "#SBATCH --ntasks-per-core=1 \n"
        s1 += "#SBATCH --cpus-per-task=1 \n"
        s1 += "#SBATCH --constraint=mc \n"
        s1 += "#SBATCH --mem=" + str(ram) + "GB \n"
        s1 += "#SBATCH --partition=" + str(partition) + " \n"
        s1 += "#SBATCH --account=" + str(account) + " \n"
        s1 += "export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \n"
        s1 += "module load daint-mc \n"
        s1 += "srun " + opalexe + " " + oinpFile + "\n"
        myfile.write(s1)
        myfile.close()

    def WritePBSBlues(self, opalexe, oinpFile, cores, time, ram, info, queue):
        # time  <- export SGE_TIME="walltime=0:20:00"
        # cores <- export CORES="nodes=1:ppn=16"
        title = oinpFile.partition(".")[0]
        myfile = open('run.pbs', 'w')
        s1 = "#!/bin/sh \n"
        s1 += "#PBS -o " + title + "_log  \n"
        s1 += "#PBS -r n \n"
        s1 += "#PBS -j oe \n"
        s1 += "#PBS -N " + title + "\n"
        s1 += "#PBS -m aeb \n"
        s1 += "#PBS -M nneveu@anl.gov \n"
        s1 += "#PBS -l " + time + " \n"
        s1 += "#PBS -l " + cores + " \n"
        s1 += "#PBS -q " + queue + " \n"
        try:
            v = os.environ["OPAL_EXE_PATH"]
        except KeyError:
            print("OPAL_EXE_PATH not set bye bye")
            sys.exit(1)
        s1 += "cd $PBS_O_WORKDIR \n"
        s1 += "####################################################\n"
        s1 += "echo DATE=`date`\n"
        s1 += "echo HOSTNAME=`hostname` \n"
        s1 += "echo PWD=`pwd`\n"
        s1 += "cat $PBS_NODEFILE\n"
        s1 += "NSLOTS=$(wc -l < $PBS_NODEFILE)\n"
        s1 += "####################################################\n"
        s1 += "CMD=$OPAL_EXE_PATH/opal \n"
        s1 += "echo $CMD\n"
        s1 += f'ARGS="{oinpFile} --info {str(info)} --warn 6 "\n'
        s1 += "####################################################\n"
        s1 += "MPICMD=\"mpirun -np $NSLOTS $CMD $ARGS\" \n"
        s1 += "echo $MPICMD\n"
        s1 += "$MPICMD \n"
        s1 += "####################################################\n"
        myfile.write(s1)
        myfile.close()

    def WriteBebop(self, opalexe, oinpFile, cores, time,
                   ram, info, name, queue, hypert, quiet):
        # BDW and KNL Compute Nodes at ANL
        # http://www.lcrc.anl.gov/for-users/using-lcrc/running-jobs/running-jobs-on-bebop/
        cores = int(cores)
        # Checking that a valid queue is selected
        # Adjusting number of cores for specified queue
        if (queue == 'bdw' or queue == 'bdwall' or queue == 'bdwd'):
            if not quiet:
                print('Running on BDW')
            coresPerNode = 36 * (hypert+1)     # hypert == 0 -> no hyper threading
        elif (queue == 'knl' or queue == 'knlall' or queue == 'knld'):
            if not quiet:
                print('Running on KNL')
            coresPerNode = 64 * (hypert+1)
        else:
            print('You have picked an invalid queue!! Your run will fail!!')

        # Calculating # of nodes needed, and # of tasks per node
        #  Only calc tasks per node if total core number
        #  is not evenly divisible by # of nodes
        if (cores % coresPerNode) == 0:
            if cores < coresPerNode:
                nodes = 1
            else:
                nodes = cores // coresPerNode
                tasks_per_node = cores // nodes
        else:
            while (cores % coresPerNode) != 0:
                coresPerNode -= 1
                nodes = cores // coresPerNode

            tasks_per_node = cores // nodes
            #print(nodes,cores, tasks_per_node)
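        # Example (illustrative): cores=60 on BDW (coresPerNode=36): the loop
        # lowers coresPerNode to 30, giving nodes=2 and tasks_per_node=30.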

        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')

        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH --job-name=" + title + "\n"
        s1 += "#SBATCH -o " + title + ".%j.%N.out \n"
        s1 += "#SBATCH -e " + title + ".%j.%N.error \n"
        s1 += "#SBATCH -p " + queue + " \n"
        s1 += "#SBATCH --time=" + time + "\n"
        s1 += "#SBATCH --ntasks=" + str(cores) + "\n"
        s1 += "#SBATCH --ntasks-per-node=" + str(coresPerNode) + "\n"
        # all #SBATCH directives must precede the first shell command,
        # otherwise sbatch ignores them
        if (queue == 'knl' or queue == 'knlall' or queue == 'knld'):
            s1 += "#SBATCH -C knl,quad,cache \n"
        if int(nodes) > 1:
            # overrides the --ntasks-per-node value written above
            s1 += f"#SBATCH --ntasks-per-node={str(tasks_per_node)}\n"
        s1 += "cd $SLURM_SUBMIT_DIR \n"
        #s1 += "export I_MPI_SLURM_EXT=0 \n"
        s1 += "export I_MPI_FABRICS=shm:tmi \n"
        s1 += f"mpirun -n $SLURM_NTASKS {opalexe} {oinpFile}\n"
        #s1 += "#SBATCH --mem=" + ram + "GB \n"
        #s1 += "export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \n"
        #s1 += "--hint=nomultithread " + opalexe + " " + oinpFile + "\n"

        myfile.write(s1)
        myfile.close()

    def WriteTheta(self, opalexe, oinpFile, cores, time,
                   ram, info, queue, hypert):
        # queue = default, debug-cache-quad, debug-flat-quad
        # cores = min of 8 nodes for default queue
        try:
            v = os.environ["OPAL_EXE_PATH"]
        except KeyError:
            print("OPAL_EXE_PATH not set, bye bye")
            sys.exit(1)

        cores = int(cores)
        coresPerNode = 64 * (hypert+1)

        if (cores % coresPerNode) == 0:
            if cores < coresPerNode:
                nodes = 1
            else:
                nodes = int(cores / coresPerNode)
                tasks_per_node = int(cores/nodes)
        else:
            while (cores % coresPerNode) != 0:
                coresPerNode -= 1
                nodes = int(cores/coresPerNode)

            tasks_per_node = int(cores/nodes)
            #print(nodes,cores, tasks_per_node)

        if cores < 512:
            queue = 'debug-cache-quad'
            time = '00:59:00'
        #elif cores > 512:
        #nodes = np.ceil(cores/64)

        total_mpi_ranks = int(nodes*coresPerNode)
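        # Example (illustrative): cores=512, hypert=0 -> coresPerNode=64,
        # nodes=8, total_mpi_ranks=512.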

        title = oinpFile.partition(".")[0]
        myfile = open('run.sh', 'w')
        s1 = "#!/bin/bash  \n"
        s1 += "#COBALT -t " + time + " \n"
        s1 += "#COBALT -n " + str(nodes) + " \n"
        s1 += "#COBALT -q " + queue + " \n"
        s1 += "#COBALT --attrs mcdram=cache:numa=quad \n"
        s1 += "#COBALT -A awa \n"
        s1 += 'echo "Starting Cobalt job script"\n'
        s1 += "export n_nodes=$COBALT_JOBSIZE \n"
        s1 += f"export n_mpi_ranks_per_node={str(coresPerNode)}\n"
        s1 += f"export n_mpi_ranks={str(total_mpi_ranks)}\n"
        #s1 += "export n_openmp_threads_per_rank=4"
        if hypert > 0:
            s1 += "export n_hyperthreads_per_core=2 \n"
        #s1 += "export n_hyperthreads_skipped_between_ranks=4"
        s1 += "####################################################\n"
        s1 += f'ARGS="{oinpFile} --info {str(info)} --warn 6 "\n'
        s1 += "CMD=$OPAL_EXE_PATH/opal \n"
        if hypert > 0:
            s1 += "MPICMD=\"aprun -n $n_mpi_ranks -N $n_mpi_ranks_per_node -j $n_hyperthreads_per_core $CMD $ARGS\" \n"
        else:
            s1 += "MPICMD=\"aprun -n $n_mpi_ranks -N $n_mpi_ranks_per_node $CMD $ARGS\" \n"
        s1 += "echo $MPICMD\n"
        s1 += "$MPICMD \n"
        s1 += "####################################################\n"
        myfile.write(s1)
        myfile.close()
        os.chmod("run.sh", 0o775)