"""
The Simulation class handles batch-job related tasks.

@author: Andreas Adelmann <andreas.adelmann@psi.ch>
@author: Yves Ineichen
@version: 0.1
"""

import sys, os, shutil, subprocess
#import numpy as np


### Helper methods
def isInDirectory(filepath, directory):
    # From https://stackoverflow.com/questions/3812849/how-to-check-whether-a-directory-is-a-sub-directory-of-another-directory
    ''' Check if filepath is inside directory '''
    return os.path.realpath(filepath).startswith(os.path.realpath(directory) + os.sep)
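
# Illustration (added comment; the paths are hypothetical): the comparison is done
# on resolved real paths, so isInDirectory('/home/user/run/sub/file.dat', '/home/user/run')
# is True, while isInDirectory('/home/user/run', '/home/user/run') is False because
# the file path must lie strictly below the directory.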

def linkDirectory(path, name=''):
    '''Make files available in working directory with recursive symbolic links'''
    # Check for recursiveness
    if isInDirectory(os.getcwd(), path):
        print(name + ' directory is a subdirectory of the working directory! runOPAL cannot handle this.. bye!')
        sys.exit()
    # try lndir; if it fails, fall back to cp -rs
    if os.system('lndir ' + path) != 0:
        print("lndir failed (possibly doesn't exist on this system), using cp -rs... \n")
        if os.listdir(path):
            os.system('cp -rs ' + path + '/* .')

def linkFile(path, name):
    '''Make a file available in working directory with a symbolic link'''
    path = os.path.join(path, name)
    if not os.path.isfile(path):
        print(name + ' cannot be found')
        sys.exit()
    os.system('ln -s ' + path + ' .')

def extractStr(line, name):
    zero = line.find(name)
    if zero < 0:
        return None
    start = min(x for x in [line.find('"',zero ), line.find("'", zero )] if x > 0) +1
    end   = min(x for x in [line.find('"',start), line.find("'", start)] if x > 0)
    return line[start:end]
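
# Illustration (added comment; the line content is hypothetical): for
#   line = 'OPTIMIZE INPUT="mytemplate.tmpl", TEMPLATEDIR="tmpl";'
# extractStr(line, 'INPUT') returns 'mytemplate.tmpl', i.e. the text between the
# first pair of quotes found after the keyword, and extractStr(line, 'MISSING')
# returns None because the keyword is not present.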


class Simulation:
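    '''
    Wraps a single OPAL run: directory setup, template substitution, batch-script
    generation and submission (added docstring). The opaldict argument is expected
    to be a dict-like object that additionally provides generateDirectoryName().
    '''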
    def __init__(self, opaldict):
        self.opaldict = opaldict
        self.dirname = ""

    def createDirectory(self, dirname, doKeep, quiet):
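        '''
        Prepare the simulation directory (added docstring). Note that this acts on
        self.dirname (set by run()); the dirname argument is currently unused.
        Returns False if the directory exists and doKeep is set, True once a
        fresh directory has been created.
        '''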
        # If there's already a directory, remove it ...
        if os.path.isdir(self.dirname):
            if doKeep:
                print('KEEP existing directory {}'.format(self.dirname))
                print(self.dirname)
                return False
            else:
                if not quiet:
                    print('REMOVE existing directory {}'.format(self.dirname))
                shutil.rmtree(self.dirname)

        # create directory
        os.mkdir(self.dirname)
        return True

    def run(self, N, baseFileName, inputfilePath, tmplFile, oinpFile, doTest, doKeep, doNobatch, doOptimize, info, queue, hypert, quiet):
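        '''
        Set up and optionally submit a single OPAL simulation (added docstring).

        Creates the run directory, substitutes the opaldict values into the
        template file, writes a batch script for the detected host and either
        submits it, runs OPAL directly (doNobatch), or stops after the setup
        (doTest). Returns the return code of the batch-submit or mpirun call as
        qid, -1 if no job was started, or None if an existing result directory
        was kept.
        '''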
        # make directory name indicating changed values
        self.dirname = baseFileName
        if N >= 0:
            self.dirname += str(N)
        self.dirname += self.opaldict.generateDirectoryName()

        try:
            CORES = self.opaldict['CORES']
        except KeyError:
            print("CORES not set, bye bye")
            sys.exit(1)

        if not self.createDirectory(self.dirname, doKeep, quiet):
            print("Simulation results already exist")
            return
        os.chdir(self.dirname)

        # Linking magnet and RF files
        if os.environ.get('FIELDMAPS'):
            fieldmapPath = os.environ.get('FIELDMAPS')
        else:
            fieldmapPath = '../fieldmaps'
            if not os.path.isdir(fieldmapPath):
                print('Fieldmap directory unknown, exiting ...')
                sys.exit()
        linkDirectory(fieldmapPath, 'Fieldmap')

        # Link distribution directory if present
        if os.environ.get('DISTRIBUTIONS'):
            distributionPath = os.environ.get('DISTRIBUTIONS')
            if os.path.isdir(distributionPath):
                linkDirectory(distributionPath, 'Distribution')

        # Read in the template file
        filedata = None
        with open(tmplFile, 'r') as file:
            filedata = file.read()
        # do the replacements in the template file
        for s, value in self.opaldict.items():
            # Replace the target string
            filedata = filedata.replace('_' + s + '_', str(value))
        # Write the file out again
        with open(oinpFile, 'w') as file:
            file.write(filedata)
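
        # Illustration (added comment; the key is hypothetical): each key KEY in
        # opaldict replaces the token _KEY_ in the template, e.g. an entry
        # CORES=4 turns every occurrence of _CORES_ in the template into 4.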

        # NOTE: What's the best place to link the tmpl file? $TEMPLATES, _TEMPLATEDIR_, or parsing?
        if doOptimize:
            flag = False
            tmplDir = None
            tmplIn  = None
            templateFile = open(oinpFile, 'r')
            for line in templateFile:
                if not line.startswith('//'):
                    if 'OPTIMIZE' in line:
                        flag = True
                    if flag and not tmplDir:
                        tmplDir = extractStr(line, 'TEMPLATEDIR')
                    if flag and not tmplIn:
                        inputStr = extractStr(line, 'INPUT')
                        if inputStr:
                            tmplIn = inputStr.split('/')[-1]
            templateFile.close()

            linkFile('..', tmplIn[:-5] + '.data')
            os.mkdir(tmplDir)
            os.chdir(tmplDir)
            linkFile(os.path.join('../..', tmplDir), tmplIn)
            os.chdir('..')
        
        if os.environ.get('OPAL_EXE_PATH'):
            if doNobatch:
                opalexe = os.environ.get('OPAL_EXE_PATH') + '/opal'
            else:
                opalexe = '$OPAL_EXE_PATH/opal'
        else:
            opalexe = 'opal'
        if not quiet:
            print('Simulation directory is {} using OPAL at {}'.format(self.dirname, os.environ.get('OPAL_EXE_PATH')))
            print('Using templatefile at ' + inputfilePath)
            print('Using fieldmaps at    ' + fieldmapPath)
            print('Parameters set in ' + oinpFile + ' are:')
            for s, value in sorted(self.opaldict.items()):
                print(' :::: ' + s + ' = ' + str(value))

        if not doNobatch:
            #hostname = commands.getoutput("hostname")
            hostname = (subprocess.check_output('hostname').decode('utf-8')).strip()
            if not quiet:
                print("On host {}".format(hostname))

            if os.getenv("SGE_TIME"):
                print("You are using the deprecated environment variable SGE_TIME. Please use TIME in the future.")
                time = os.getenv("SGE_TIME")
            else:
                #print('You did not set a time limit. Using default: s_rt=23:59:00,h_rt=24:00:00')
                time = os.getenv("TIME", "s_rt=23:59:00,h_rt=24:00:00")

            if os.getenv("SGE_RAM"):
                print("You are using the deprecated environment variable SGE_RAM. Please use RAM in the future.")
                ram = os.getenv("SGE_RAM")
            else:
                ram = os.getenv("RAM", "4")

            if not queue:
                # os.environ.get never raises, so fall back explicitly if QUEUE is unset
                queue = os.environ.get('QUEUE')
                if not queue:
                    queue = os.getenv("SGE_QUEUE", "prime_bd.q")

            # Merlin6
            if hostname.startswith("merlin-l"):
                batchsys  = 'SLURM'
                runfile   = 'run.merlin6'
                time      = os.getenv("SLURM_TIME", "24:00:00")
                ram       = os.getenv("SLURM_RAM",  "36")
                partition = os.getenv("SLURM_PARTITION", "general")
                self.WriteMerlin6(opalexe, oinpFile, CORES, time, ram, info, runfile, partition)

            # ANL theta.alcf.anl.gov
            elif hostname.startswith("theta"):
                batchsys = 'COBALT'
                runfile  = 'run.sh'
                self.WriteTheta(opalexe, oinpFile, CORES, time, ram, info, queue, hypert)

            # ANL blues.lcrc.anl.gov
            elif hostname.startswith("blogin"):
                batchsys = 'PBS'
                runfile  = 'run.blues'
                self.WritePBSBlues(opalexe, oinpFile, CORES, time, ram, info, queue)

            # ANL Bebop
            elif hostname.startswith("bebop") or hostname.startswith("bdw") or hostname.startswith("knl"):
                batchsys = 'SLURM'
                runfile  = 'run.bebop'
                time     = os.environ["TIME"]  # TIME must be set for Bebop
                self.WriteBebop(opalexe, oinpFile, CORES, time, ram, info, runfile, queue, hypert, quiet)

            # NERSC Cori Haswell
            elif hostname.startswith("cori"):
                batchsys = 'SLURM'
                runfile  = 'run.cori'
                self.WriteCori(opalexe, oinpFile, CORES, time, ram, info, runfile)

            # NERSC Edison
            elif hostname.startswith("edison"):
                batchsys = 'SLURM'
                runfile  = 'run.edison'
                self.WriteEdison(opalexe, oinpFile, CORES, time, ram, info, runfile)

            # CSCS Piz-Daint
            elif hostname.startswith("daint"):
                batchsys = 'SLURM'
                runfile  = 'run.daint'
                time = os.getenv("SLURM_TIME", "00:01:00")
                ram  = os.getenv("SLURM_RAM", "36")
                partition = os.getenv("SLURM_PARTITION", "normal")
                account = os.getenv("SLURM_ACCOUNT", "psi07")
                self.WritePizDaint(opalexe, oinpFile, CORES, time, ram, info, runfile, partition, account)

            # MIT Engaging
            elif hostname.startswith("eofe"):
                batchsys = 'SLURM'
                runfile = 'run.engaging'
                time = os.getenv("SLURM_TIME", "24:00:00")
                ram  = os.getenv("SLURM_RAM", "120")
                self.WriteEngaging(opalexe, oinpFile, CORES, time, ram, info, runfile)

            else:
                print("Hostname not known, bye bye")
                sys.exit(1)

        qid = -1

        if doTest:
            if not quiet:
                print('Done with setup of the OPAL simulation but not submitting the job (--test) \n\n\n')

        elif doNobatch:
            if not quiet:
                print('Done with setup of the OPAL simulation and executing the job on {} cores...\n\n\n'.format(CORES))
            ofn, fileExtension = os.path.splitext(oinpFile)
            if not quiet:
                print('STD output is written to {}.out'.format(ofn))
            #execommand = 'mpirun -np ' + str(CORES)  + ' ' + opalexe + ' ' + oinpFile + '  2>&1 | tee ' + ofn + '.out'
            outfileName = ofn + '.out'
            # Currently not writing to screen anymore
            # There is a solution described at https://stackoverflow.com/questions/15535240/python-popen-write-to-stdout-and-log-file-simultaneously
            with open(outfileName, 'w') as outfile:
                qid = subprocess.call(['mpirun', '-np', str(CORES), opalexe, oinpFile], stdout=outfile, stderr=outfile)

        else:
            if batchsys == 'SLURM' or batchsys == 'COBALT':
                if batchsys == 'SLURM':
                    command = 'sbatch'
                elif batchsys == 'COBALT':
                    command = 'qsub'

                # subprocess.call does not interpret shell pipes, so qid is the
                # return code of the submit command rather than a job id
                qid = subprocess.call([command, runfile])
                if not quiet:
                    print('Done with setup of the OPAL simulation and submitting the job with {} cores \n\n\n'.format(CORES))

            elif batchsys == 'PBS':
                if not quiet:
                    print('Done with setup of the OPAL simulation, please submit the job yourself')

            else:
                print("Batch system", batchsys, "not known!")

        os.chdir('..')
        return qid


    ### Write for host
    def WriteCori(self, opalexe, oinpFile, cores, time, ram, info, name):
        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH -p regular \n"
        s1 += "#SBATCH -N 1 \n"
        s1 += "#SBATCH -t " + time + "\n"
        s1 += "#SBATCH -J " + title + "\n"
        s1 += "#SBATCH --qos=premium \n"
        s1 += "srun -n 1 .... \n"
        myfile.write(s1)
        myfile.close()


    def WriteEngaging(self, opalexe, oinpFile, cores, time, ram, info, name):
        print("Writing SLURM run file for Engaging cluster at MIT")

        cores = int(cores)
        coresPerNode = 32
        partition = os.getenv("SLURM_PARTITION", "sched_mit_psfc")

        if (cores % coresPerNode) == 0:
            nodes = int(cores/coresPerNode)
        else:
            nodes = int(cores/coresPerNode) + 1

        with open(name, 'w') as outfile:
            outfile.write("#!/bin/bash\n" 
                          "# submit with sbatch {}\n"
                          "# commandline arguments may instead by supplied with #SBATCH <flag> <value>\n"
                          "# commandline arguments override these values\n"
                          "\n"
                          "# Number of nodes\n".format(name))
            outfile.write("#SBATCH -N {}\n".format(nodes))
            outfile.write("# Number of total processor cores \n")
            outfile.write("#SBATCH -n {}\n".format(cores))
            outfile.write("# Memory (MB) \n")
            outfile.write("#SBATCH --mem {}\n".format(int(ram) * 1000))
            outfile.write("# specify how long your job needs.\n")
            outfile.write("#SBATCH --time={}\n".format(time))
            outfile.write("# which partition or queue the jobs runs in\n")
            outfile.write("#SBATCH -p {}\n".format(partition))
            outfile.write("#customize the name of the stderr/stdout file. %j is the job number\n")
            outfile.write("#SBATCH -o {}.o%j".format(os.path.splitext(oinpFile)[0]))
            outfile.write("\n")
#            outfile.write("#load default system modules\n")
#            outfile.write(". /etc/profile.d/modules.sh")
#            outfile.write("\n")
#            outfile.write("#load modules your job depends on.\n")
#            outfile.write("#better here than in your $HOME/.bashrc to make "
#                         "debugging and requirements easier to track.\n")
#            outfile.write("module load gcc/4.8.4\n")
#            outfile.write("module load engaging/openmpi/1.8.8\n")
#            outfile.write("module load engaging/cmake/3.5.2\n")
#            outfile.write("module load engaging/boost/1.56.0\n")
#            outfile.write("module load engaging/gsl/2.2.1\n")
#            outfile.write("\n")
            outfile.write("####################################################\n")
            outfile.write("# BEGIN DEBUG\n")
            outfile.write("# Print the SLURM environment on master host: \n")
            outfile.write("####################################################\n")
            outfile.write("echo '=== Slurm job  JOB_NAME=$JOB_NAME  JOB_ID=$JOB_ID'\n") 
            outfile.write("####################################################\n")
            outfile.write("echo DATE=`date`\n")
            outfile.write("echo HOSTNAME=`hostname`\n") 
            outfile.write("echo PWD=`pwd`\n")
            outfile.write("####################################################\n")
            outfile.write("echo 'Running environment:' \n")
            outfile.write("env \n")
            outfile.write("####################################################\n")
            outfile.write("echo 'Loaded environment modules:' \n")
            outfile.write("module list 2>&1\n") 
            outfile.write("echo \n")
            outfile.write("# END DEBUG\n") 
            outfile.write("####################################################\n")
            outfile.write("\n")
            outfile.write("#Finally, the command to execute.\n")
            outfile.write("#The job starts in the directory it was submitted from.\n")
            outfile.write("#Note that mpirun knows from SLURM how many processor we have\n")
            outfile.write("mpirun {} {} --info {} --warn 6\n".format(opalexe, oinpFile, info))


    def WriteEdison(self, opalexe, oinpFile, cores, time, ram, info, name):
        title = oinpFile.partition(".")[0]

        coresPerNode = 24
        cores = int(cores)

        if cores % coresPerNode == 0:
            nodes = int(cores / coresPerNode)
        else:
            nodes = int(cores / coresPerNode) + 1

        s1 = "#!/bin/bash -l \n"
        s1 += "#SBATCH -q regular \n"
        s1 += "#SBATCH -N " + str(nodes) + " \n"
        s1 += "#SBATCH -t " + time + "\n"
        s1 += "#SBATCH -J " + title + "\n"
        s1 += "#SBATCH -o " + title + ".o%j\n"
        s1 += "#SBATCH -L SCRATCH \n"
        s1 += "srun -n " + str(cores) + " " + opalexe + " " + oinpFile + "\n"

        myfile = open(name, 'w')
        myfile.write(s1)
        myfile.close()

    def WriteMerlin6(self, opalexe, oinpFile, cores, time, ram, info, name, partition):
        # ADA this is for the new PSI Merlin6
        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        s1 =  "#!/bin/bash -l \n"
        s1 += "#SBATCH --job-name=" + title + "\n"
        s1 += "#SBATCH --output="   + title + ".o%j\n"
        s1 += "#SBATCH --time=" + time + "\n"
        s1 += "#SBATCH --ntasks=" + str(cores) + "\n"
        s1 += "#SBATCH --ntasks-per-core=1 \n"
        # s1 += "#SBATCH --constraint=mc \n"
        # Discussed in https://gitlab.psi.ch/OPAL/runOPAL/issues/7:
        #if (int(cores) > 22):
        #    s1 += "#SBATCH --ntasks-per-node=16 \n"
        #else:
        #    s1 += "#SBATCH --nodes=1 \n"
        s1 += "#SBATCH --partition=" + str(partition) + " \n"
        # s1 += "#SBATCH --exclude=merlin-c-001 \n"
        s1 += "#SBATCH --cores-per-socket=22 \n"
        s1 += "#SBATCH --sockets-per-node=2 \n"
        s1 += "mpirun " + opalexe + " " + oinpFile + " --info " + str(info) + "\n"
        myfile.write(s1)
        myfile.close()

    def WritePizDaint(self, opalexe, oinpFile, cores, time, ram, info, name, partition, account):
        # XC40 Compute Nodes
        # Intel Xeon E5-2696 v4 @ 2.10GHz (2x18 cores, 64/128 GB RAM)
        # http://user.cscs.ch/computing_systems/piz_daint/index.html
        coresPerNode = 36
        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        s1 =  "#!/bin/bash -l \n"
        s1 += "#SBATCH --job-name=" + title + "\n"
        s1 += "#SBATCH --time=" + time + "\n"
        s1 += "#SBATCH --ntasks=" + str(cores) + "\n"
        s1 += "#SBATCH --ntasks-per-node=" + str(coresPerNode) + " \n"
        s1 += "#SBATCH --ntasks-per-core=1 \n"
        s1 += "#SBATCH --cpus-per-task=1 \n"
        s1 += "#SBATCH --constraint=mc \n"
        s1 += "#SBATCH --mem=" + str(ram) + "GB \n"
        s1 += "#SBATCH --partition=" + str(partition) + " \n"
        s1 += "#SBATCH --account=" + str(account) + " \n"
        s1 += "export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \n"
        s1 += "module load daint-mc \n"
        s1 += "srun " + opalexe + " " + oinpFile + "\n"
        myfile.write(s1)
        myfile.close()


    def WritePBSBlues(self, opalexe, oinpFile, cores, time, ram, info, queue):
        # time  <- export SGE_TIME="walltime=0:20:00"
        # cores <- export CORES="nodes=1:ppn=16"
        title = oinpFile.partition(".")[0]
        myfile = open('run.pbs', 'w')
        s1 = "#!/bin/sh \n"
        s1 += "#PBS -o " + title + "_log  \n"
        s1 += "#PBS -r n \n"
        s1 += "#PBS -j oe \n"
        s1 += "#PBS -N " + title + "\n"
        s1 += "#PBS -m aeb \n"
        s1 += "#PBS -M nneveu@anl.gov \n"
        s1 += "#PBS -l " + time + " \n"
        s1 += "#PBS -l " + cores + " \n"
        s1 += "#PBS -q " + queue + " \n"
        try:
            v = os.environ["OPAL_EXE_PATH"]
        except KeyError:
            print("OPAL_EXE_PATH not set, bye bye")
            sys.exit(1)
        s1 += "cd $PBS_O_WORKDIR \n"
        s1 += "####################################################\n"
        s1 += "echo DATE=`date`\n"
        s1 += "echo HOSTNAME=`hostname` \n"
        s1 += "echo PWD=`pwd`\n"
        s1 += "cat $PBS_NODEFILE\n"
        s1 += "NSLOTS=$(wc -l < $PBS_NODEFILE)\n"
        s1 += "####################################################\n"
        s1 += "CMD=$OPAL_EXE_PATH/opal \n"
        s1 += "echo $CMD\n"
        s1 += "ARGS=" + "\"" + oinpFile + " --info " + str(info) + " --warn 6 \"\n"
        s1 += "####################################################\n"
        s1 += "MPICMD=\"mpirun -np $NSLOTS $CMD $ARGS\" \n"
        s1 += "echo $MPICMD\n"
        s1 += "$MPICMD \n"
        s1 += "####################################################\n"
        myfile.write(s1)
        myfile.close()

    def WriteBebop(self, opalexe, oinpFile, cores, time, ram, info, name, queue, hypert, quiet):
        # BDW and KNL Compute Nodes at ANL
        # http://www.lcrc.anl.gov/for-users/using-lcrc/running-jobs/running-jobs-on-bebop/
        cores = int(cores)
        # Checking that a valid queue is selected
        # Adjusting number of cores for specified queue
        if queue == 'bdw' or queue == 'bdwall' or queue == 'bdwd':
            if not quiet:
                print('Running on BDW')
            coresPerNode = 36 * (hypert+1)     # hypert == 0 -> no hyper threading
        elif queue == 'knl' or queue == 'knlall' or queue == 'knld':
            if not quiet:
                print('Running on KNL')
            coresPerNode = 64 * (hypert+1)
        else:
            print('You have picked a non-valid queue!! Your run will fail!!')

        # Calculating # of nodes needed, and # of tasks per node
        # Only calc tasks per node if total core number
        # is not evenly divisible by # of nodes
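        # Worked example (added comment, hypothetical numbers): with cores=72 on a
        # bdw queue without hyperthreading (coresPerNode=36), 72 % 36 == 0 gives
        # nodes=2 and tasks_per_node=36; with cores=50, coresPerNode is decremented
        # until it divides cores evenly (here 25), giving nodes=2 and tasks_per_node=25.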
        if (cores % coresPerNode) == 0:
            if (cores < coresPerNode):
                nodes = 1
            else:
                nodes = int(cores/coresPerNode)
                tasks_per_node = int(cores/nodes)
        else:
            while (cores % coresPerNode) != 0:
                coresPerNode -= 1
                nodes = int(cores/coresPerNode)

            tasks_per_node = int(cores/nodes)
            #print(nodes, cores, tasks_per_node)

        title = oinpFile.partition(".")[0]
        myfile = open(name, 'w')
        
        s1 =  "#!/bin/bash -l \n"
        s1 += "#SBATCH --job-name=" + title + "\n"
        s1 += "#SBATCH -o " + title + ".%j.%N.out \n" 
        s1 += "#SBATCH -e " + title + ".%j.%N.error \n"
        s1 += "#SBATCH -p " + queue + " \n"
        s1 += "#SBATCH --time=" + time + "\n"
        s1 += "#SBATCH --ntasks=" + str(cores) + "\n"
        # all #SBATCH directives must come before the first shell command,
        # otherwise SLURM ignores them
        if queue == 'knl' or queue == 'knlall' or queue == 'knld':
            s1 += "#SBATCH -C knl,quad,cache \n"
        if int(nodes) > 1:
            s1 += "#SBATCH --ntasks-per-node=" + str(tasks_per_node) + " \n"
        else:
            s1 += "#SBATCH --ntasks-per-node=" + str(coresPerNode) + "\n"
        s1 += "cd $SLURM_SUBMIT_DIR \n"
        #s1 += "export I_MPI_SLURM_EXT=0 \n"
        s1 += "export I_MPI_FABRICS=shm:tmi \n"
        s1 += "mpirun -n $SLURM_NTASKS " + opalexe + " " + oinpFile + "\n"
        #s1 += "#SBATCH --mem=" + ram + "GB \n"
        #s1 += "export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK \n"
        #s1 += "--hint=nomultithread " + opalexe + " " + oinpFile + "\n"

        myfile.write(s1)
        myfile.close()


    def WriteTheta(self, opalexe, oinpFile, cores, time, ram, info, queue, hypert):
        # queue = default, debug-cache-quad, debug-flat-quad
        # cores = min of 8 nodes for default queue 
        try:
            v = os.environ["OPAL_EXE_PATH"]
        except KeyError:
            print("OPAL_EXE_PATH not set bye bye")
            sys.exit(1)
              
        cores        = int(cores)
        coresPerNode = 64 * (hypert+1)

        if (cores % coresPerNode) == 0:
            if (cores < coresPerNode):
                nodes = 1
            else:
                nodes = int(cores / coresPerNode)
                tasks_per_node = int(cores/nodes)
        else:
            while (cores % coresPerNode) != 0:
                coresPerNode -= 1
                nodes = int(cores/coresPerNode)

            tasks_per_node = int(cores/nodes)
            #print(nodes, cores, tasks_per_node)

        if cores < 512:
            queue = 'debug-cache-quad'
            time  = '00:59:00'
        #elif cores > 512:
        #nodes = np.ceil(cores/64)

        total_mpi_ranks = int(nodes*coresPerNode)

        title = oinpFile.partition(".")[0]
        myfile = open('run.sh', 'w')
        s1 =  "#!/bin/bash  \n"
        s1 += "#COBALT -t " + time + " \n"
        s1 += "#COBALT -n " + str(nodes) + " \n"
        s1 += "#COBALT -q " + queue + " \n"
        s1 += "#COBALT --attrs mcdram=cache:numa=quad \n"
        s1 += "#COBALT -A awa \n"
        s1 += 'echo "Starting Cobalt job script"\n'
        s1 += "export n_nodes=$COBALT_JOBSIZE \n"
        s1 += "export n_mpi_ranks_per_node=" + str(coresPerNode) + " \n"
        s1 += "export n_mpi_ranks=" + str(total_mpi_ranks) + "\n"
        #s1 += "export n_openmp_threads_per_rank=4"
        if hypert > 0:
            s1 += "export n_hyperthreads_per_core=2 \n"
        #s1 += "export n_hyperthreads_skipped_between_ranks=4"
        s1 += "####################################################\n"
        s1 += "ARGS=" + "\"" + oinpFile + " --info " + str(info) + " --warn 6 \"\n"
        s1 += "CMD=$OPAL_EXE_PATH/opal \n"
        if hypert > 0:
            s1 += "MPICMD=\"aprun -n $n_mpi_ranks -N $n_mpi_ranks_per_node -j $n_hyperthreads_per_core $CMD $ARGS\" \n"
        else:
            s1 += "MPICMD=\"aprun -n $n_mpi_ranks -N $n_mpi_ranks_per_node $CMD $ARGS\" \n"
        s1 += "echo $MPICMD\n"
        s1 += "$MPICMD \n"
        s1 += "####################################################\n"
        myfile.write(s1)
        myfile.close()
        os.chmod("run.sh", 0o775)