12 from NovaGridUtils
import warn, fail, sleep
14 recommended_sites=[
"BNL",
# Map from short OS tag to the Fermilab worker-node Singularity image in
# CVMFS. Used as the choices for the --os command-line option, and looked
# up later to populate opts.singularity when --os is given.
# NOTE(review): extraction had cut the closing brace of this literal;
# restored here. Confirm no additional OS entries existed in the original.
known_os_containers = {
    "sl6": "/cvmfs/singularity.opensciencegrid.org/fermilab/fnal-wn-sl6:latest",
    "sl7": "/cvmfs/singularity.opensciencegrid.org/fermilab/fnal-wn-sl7:latest",
    "el8": "/cvmfs/singularity.opensciencegrid.org/fermilab/fnal-wn-el8:latest",
}
43 if '#' not in src:
return src
44 return src[0:src.find(
'#')]
47 loc = os.path.expandvars(l)
48 for bad
in [
'/nova/app/',
'/nova/ana/',
'/nova/data/',
'/grid']:
49 if loc.startswith(bad):
50 txt =
"Location %s cannot be on %s it must be in dCache /pnfs/nova/" % (loc, bad)
51 if loc.startswith(
'/nova/app'):
52 txt =
"Jobs can no longer access BlueArc directly. Test releases will be tarred up and sent to worker nodes, however input files should be moved to dCache." 60 if os.path.isfile(filename):
63 for root, dirs, files
in os.walk(os.path.expandvars(path)):
65 return os.path.join(root, filename)
66 fail(
"Cannot find file "+filename)
70 for testpath
in pathlist:
71 if os.path.basename(filepath) == os.path.basename(testpath):
76 if __name__ ==
'__main__':
80 NovaGridUtils.prog =
'submit_cafana.py' 85 if "-f" in sys.argv
or "--file" in sys.argv:
86 prelim_parser = argparse.ArgumentParser()
88 prelim_parser.add_argument(
'-f',
'--file', type=str, action=
'append')
89 pre_args, unknown = prelim_parser.parse_known_args()
92 for filepath
in pre_args.file:
94 for line
in open(filepath,
'r'): 99 parser = argparse.ArgumentParser(description = 'Submit a CAFAna macro with datasets split between N jobs')
103 parser.add_argument(
'-f',
'--file', type=str,
104 help=
'Text file containing any arguments to this utility',
107 parser.add_argument(
'-n',
'--njobs', type=int, required=
True, metavar=
'N',
108 help=
'Number of grid processes')
110 parser.add_argument(
'-r',
'--rel', required=
True,
111 help=
'Release to use')
113 parser.add_argument(
'-i',
'--input_file', action=
"append",
114 help=
"Copy this input file to work area on worker node")
116 parser.add_argument(
'-o',
'--outdir', required=
True, metavar=
'DIR',
117 help=
'Directory output files will go to')
119 parser.add_argument(
'-ss',
'--snapshot', action=
'store_true',
120 help=
'Use latest snapshot instead of requerying')
122 parser.add_argument(
'-off',
'--offsite', action=
'store_true',
123 help=
'Run this cafana job offsite')
125 parser.add_argument(
'-d',
'--drain', action=
'store_true',
126 help=
'Recover files missing from your output directory')
128 parser.add_argument(
'-x',
'--xrootdebug', action=
'store_true',
129 help=
'Add extra xrootd debugging information',
132 parser.add_argument(
'--numuccinc', action =
'store_true',
133 help =
'Load libraries for specific xsec analysis')
134 parser.add_argument(
'--numubarccinc', action =
'store_true',
135 help =
'Load libraries for specific xsec analysis')
136 parser.add_argument(
'--numucc2p2h', action =
'store_true',
137 help =
'Load libraries for specific xsec analysis')
138 parser.add_argument(
'--numucc0pi', action =
'store_true',
139 help =
'Load libraries for specific xsec analysis')
140 parser.add_argument(
'--nuebarccinc', action =
'store_true',
141 help =
'Load libraries for specific xsec analysis')
142 parser.add_argument(
'--numubarccpi0', action =
'store_true',
143 help =
'Load libraries for specific xsec analysis')
144 parser.add_argument(
'--no3flavor', action =
'store_true',
145 help =
'Do *not* load libraries for 3-flavor analysis')
146 parser.add_argument(
'--nuxana', action=
"store_true",
147 help=
"Load libraries for NuX analysis")
149 parser.add_argument(
'macro.C',
150 help=
'The CAFAna macro to run')
152 parser.add_argument(
'args', nargs=
'*',
153 help=
'Arguments to the macro')
156 job_control_args = parser.add_argument_group(
"Job control options",
"These optional arguments help control where and how your jobs land.")
158 tarball_gp = job_control_args.add_mutually_exclusive_group(required=
False)
159 tarball_gp.add_argument(
'-t',
'--testrel', metavar=
'DIR',
160 help=
'Use a test release at location TESTREL. It will be tarred up, and sent to the worker node. (Conflicts with --user_tarball)',
163 tarball_gp.add_argument(
"--user_tarball",
164 help=
"Use existing test release tarball in specified location rather than having jobsub make one for you (conflicts with --testrel, and is redunant with --reuse_tarball)",
168 job_control_args.add_argument(
'--reuse_tarball',
169 help=
'Do you want to reuse a tarball that is already in resilient space? If using this option avoid trailing slash in --testrel option. (redundant with --user_tarball)',
170 action=
'store_true',default=
False)
172 job_control_args.add_argument(
'--dedicated',
173 help=
'Only run on dedicated nodes on fermigrid (default is to run opportunistically)',
174 action=
'store_true',default=
False)
176 job_control_args.add_argument(
'--site',
177 help=
'Specify allowed offsite locations. Omit to allow running at any offsite location',
178 type=str,action=
'append')
180 job_control_args.add_argument(
'--exclude_site',
181 help=
'Specify an offsite location to exclude.',
183 type=str,action=
'append')
185 job_control_args.add_argument(
'--recommended_sites',
186 help=
'Specify known working offsite locations.',
187 action=
'store_true',default=
False)
189 job_control_args.add_argument(
'--disk',
190 help=
'Local disk space requirement for worker node in MB (default is 2000MB).',
191 type=int, default=2000)
193 job_control_args.add_argument(
'--memory',
194 help=
'Local memory requirement for worker node in MB (default is 1900MB).',
195 type=int, default=1900)
197 job_control_args.add_argument(
'--lifetime',
198 help=
'Expected job lifetime. Valid values are an integer number of seconds. (default is 10800=3h)',
199 type=int, default=
"10800")
201 job_control_args.add_argument(
'--source',
202 help=
'Source script SOURCE:par1:par2:..',
203 type=str, action=
'append')
205 job_control_args.add_argument(
'--ngu_test',
206 help=
'Setup the test version of NovaGridUtils in the grid jobs.',
209 job_control_args.add_argument(
'--ngu_version',
210 help=
'Setup a specific NovaGridUtils version in the grid jobs.', metavar=
'VERSION',
213 job_control_args.add_argument(
'--testrel_ngu',
214 help=
"Must be used with --testrel, with NGU checked out. After unpacking tarball will setup the local version of NGU you are using on the work.",
217 job_control_args.add_argument(
'-ep',
'--extproduct', type=str, metavar=
'PRODUCT:VERSION',
218 help=
'Setup this external product on the worker node in format <product>:<version>',
221 job_control_args.add_argument(
"--mail_always",
222 help=
"Do you want an email whenever every jobs finishes?",
223 default=
False, action=
"store_true")
225 job_control_args.add_argument(
"--mail_on_error",
226 help=
"Do you want an email whenever a job fails on an error?",
227 default=
False, action=
"store_true")
230 sing_group = job_control_args.add_mutually_exclusive_group()
231 sing_group.add_argument(
'--singularity',
232 help=
'Location in CVMFS of a singularity container to launch the job into. ' +\
233 "If you're looking for a stock Scientific Linux image, try --os instead. " +\
234 "(This option is mutually exclusive with --os.)",
237 sing_group.add_argument(
"--os",
238 help=
"Run this job inside a Scientific Linux singularity image with given OS. " +\
239 "(This option is mutually exclusive with --singularity.)",
240 choices=known_os_containers)
242 job_control_args.add_argument(
'--jobfactory',
243 help=
'Use the specified JobFactoryType. Only use with --singularity',
246 job_control_args.add_argument(
"--gpu",
247 help=
"Request a node with a GPU",
248 default=
False, action=
"store_true")
250 job_control_args.add_argument(
"--export",
251 help=
'Export variable EXPORT to jobsub_submit',
252 type=str, action=
'append')
254 job_control_args.add_argument(
'-c',
255 help=
'Append Condor requirements',
256 type=str, action=
'append')
258 job_control_args.add_argument(
'--cpu',
259 help=
'Request worker nodes to have at least NUMBER cpus',
265 debugging_args = parser.add_argument_group(
"Debugging options",
"These optional arguments can help debug your submission.")
267 debugging_args.add_argument(
'--print_jobsub',
268 help=
'Print jobsub command',
269 action=
'store_true',default=
False)
271 debugging_args.add_argument(
'--test',
272 help=
'Do not actually do anything, just run tests and print jobsub cmd',
273 action=
'store_true',default=
False)
275 debugging_args.add_argument(
'--ifdh_debug',
276 help=
'Verbose output for pinning down IFDH/dCache issues',
277 action=
'store_true', default=
False)
279 opts = parser.parse_args()
281 macro = os.path.abspath(
vars(opts)[
'macro.C'])
293 if not os.path.isdir(opts.testrel+
'/lib/'+os.getenv(
'SRT_ARCH')+
'-GCC-maxopt'):
295 if opts.user_tarball:
301 outfiles = os.listdir(opts.outdir)
304 for filename
in outfiles:
306 if filename.endswith(
'.log.txt'):
continue 308 if filename.startswith(
'core.'):
continue 311 regex = re.compile(
".*?(\d+)_of_(\d+)\..*")
312 reresult = regex.match(filename)
314 thisjob =
int(reresult.group(1))
315 thismax =
int(reresult.group(2))
316 completejobs.add(thisjob)
317 if opts.njobs != thismax:
318 print "You specified {} total jobs, but output files have the form x_of_{}. These must be consistent! Bailing.".
format(opts.njobs, thismax)
320 for jobno
in range(1,opts.njobs+1):
321 if jobno
not in completejobs:
322 missingjobs.append(jobno)
323 if len(missingjobs) == 0:
324 print "No jobs to drain. Exiting." 327 print "Found jobs to drain:" 328 print " ",missingjobs
329 if opts.njobs != len(missingjobs):
330 print " Adjusting njobs from %i to %i to match." % (opts.njobs, len(missingjobs))
332 for jobno
in missingjobs:
333 draincmd +=
'-d %i ' % jobno
339 opts.singularity = known_os_containers[opts.os]
342 if not os.path.exists(opts.singularity):
343 fail(
"Requested singularity image cannot be found: %s" % opts.singularity)
345 if opts.gpu
and not opts.singularity:
346 warn(
"Requested GPU, but did not request singularity. This is not likely to succeed.")
348 if opts.gpu
and not opts.offsite:
349 warn(
"GPUs are only available offsite, and you have not chosen --offsite")
351 grid_script = os.getenv(
'NOVAGRIDUTILS_DIR')+
'/bin/cafe_grid_script.sh' 356 for inFile
in opts.input_file:
358 input_files += opts.input_file
363 source_scripts += opts.source
365 for script
in source_scripts:
367 script_path = script.split(
":")[0]
372 if not find_file(os.environ[
"PATH"].
split(os.pathsep), os.path.expandvars(script_path)):
373 fail(
"Script %s does not exist!" % script_path)
379 usage_models=[
"DEDICATED"]
380 if not opts.dedicated:
381 usage_models.append(
"OPPORTUNISTIC")
383 usage_models.append(
"OFFSITE")
384 jobsub_opts +=
" --resource-provides=usage_model=%s \\\n" % (
",".join(usage_models))
388 disk_opt=
" --disk=%sMB \\\n" % (opts.disk)
389 jobsub_opts += disk_opt
392 mem_opt=
" --memory=%sMB \\\n" % (opts.memory)
393 jobsub_opts += mem_opt
397 cpu_opt=
" --cpu=%s \\\n" %(opts.cpu)
398 jobsub_opts += cpu_opt
402 append_condor_opt =
" -c %s \\\n" % (
",".join(opts.c))
403 jobsub_opts += append_condor_opt
407 jobsub_opts +=
" --mail_always \\\n" 408 elif opts.mail_on_error:
409 jobsub_opts +=
" --mail_on_error \\\n" 411 jobsub_opts +=
" --mail_never \\\n" 413 if opts.node_features:
414 jobsub_opts +=
" %s \\\n" % (
421 life_opt=(
" --expected-lifetime=0s \\\n --append_condor_requirements='(((TARGET.GLIDEIN_ToDie-CurrentTime)>%s)||isUndefined(TARGET.GLIDEIN_ToDie))' \\\n" % opts.lifetime)
422 jobsub_opts += life_opt
426 jobsub_opts +=
" -e XrdSecDEBUG=2 \\\n" 430 jobsub_opts +=
" --line='+SingularityImage=\\\"%s\\\"' \\\n" % opts.singularity
431 jobsub_opts +=
" --append_condor_requirements='(TARGET.HAS_SINGULARITY=?=true)' \\\n" 433 jobsub_opts +=
" --line='+JobFactoryType=\\\"%s\\\"' \\\n" % opts.jobfactory
437 jobsub_opts +=
" --line='+RequestGPUs=1' \\\n" 439 if opts.recommended_sites
or opts.site:
442 if opts.recommended_sites:
443 for isite
in recommended_sites:
444 site_opt += isite +
"," 446 for isite
in opts.site:
447 if isite
not in recommended_sites:
448 warn(
"Site "+isite+
" is not known to work. Your jobs may fail at that site. Sleeping for 5 seconds")
450 site_opt += isite +
"," 452 site_opt=site_opt[:-1] +
" \\\n" 453 jobsub_opts += site_opt
455 if opts.exclude_site:
456 for isite
in opts.exclude_site:
457 jobsub_opts +=
" --append_condor_requirements='(TARGET.GLIDEIN_Site\ isnt\ \\\"%s\\\")' \\\n" % isite
461 print_jobsub=opts.print_jobsub
465 warn(
"--test was specified, so all we do is run checks and print jobsub cmd.")
468 if os.path.isfile(
".rootrc"):
469 fail(
"--ifdh_debug creates a temporary .rootrc file, but there is already one in this directory. Submit from somewhere else.")
472 rc.write(
"XNet.UseOldClient: yes\n")
473 rc.write(
"XNet.Debug: 2\n")
474 rc.write(
"XNet.MaxRedirectCount: 255\n")
476 source_scripts += [
'setup_test_product:NovaGridUtils']
477 source_scripts += [
'export_var:IFDH_SILENT=0']
478 source_scripts += [
'export_var:IFDH_DEBUG=1']
483 for export
in opts.export:
484 source_scripts += [
'export_var:%s' % export]
486 cmd =
'jobsub_submit ' 488 cmd +=
'-N '+
str(len(missingjobs))+
' ' 490 cmd +=
'-N '+
str(opts.njobs)+
' ' 491 cmd +=
"\\\n"+jobsub_opts
492 cmd +=
' -f dropbox://%s \\\n' % (macro)
493 for f
in input_files:
494 cmd +=
' -f %s \\\n' % (f)
496 cmd +=
' -f dropbox://%s \\\n' % (os.path.abspath(
".rootrc"))
500 if opts.reuse_tarball:
501 cmd +=
' --tar_file_name dropbox://'+os.path.basename(opts.testrel)+
'.tar' 503 cmd +=
' --tar_file_name tardir://'+opts.testrel+
'\\\n' 504 elif opts.user_tarball:
505 if not os.path.isfile(opts.user_tarball):
506 print "Tarball filename passed to --user_tarball does not exit:", opts.user_tarball
508 cmd +=
' --tar_file_name dropbox://' + opts.user_tarball +
' \\\n' 511 cmd +=
' file://%s \\\n' % (grid_script)
512 for source_script
in source_scripts:
513 cmd +=
' --source %s \\\n' % (source_script)
515 cmd +=
' --release %s \\\n' % (rel)
516 if opts.testrel
or opts.user_tarball:
517 cmd +=
' --tarball \\\n' 521 cmd +=
' --source setup_test_product:NovaGridUtils \\\n' 524 cmd +=
' --source setup_product:NovaGridUtils:%s \\\n' %(opts.ngu_version)
527 cmd +=
' --testrel_ngu \\\n' 529 cmd +=
' --macro %s \\\n' % (os.path.basename(macro))
530 cmd +=
' --outdir %s \\\n' % (opts.outdir)
531 cmd +=
' --njobs %s \\\n' % (
str(opts.njobs))
533 if opts.snapshot: cmd +=
' --snapshot \\\n' 535 if opts.numuccinc: cmd +=
' --numuccinc \\\n' 536 if opts.numubarccinc: cmd +=
' --numubarccinc \\\n' 537 if opts.numucc2p2h: cmd +=
' --numucc2p2h \\\n' 538 if opts.numucc0pi: cmd +=
' --numucc0pi \\\n' 539 if opts.nuebarccinc: cmd +=
' --nuebarccinc \\\n' 540 if opts.numubarccpi0: cmd +=
' --numubarccpi0 \\\n' 541 if opts.no3flavor: cmd +=
' --no3flavor \\\n' 542 if opts.nuxana: cmd +=
' --nuxana \\\n' 545 if opts.extproduct !=
'NONE': cmd +=
' --extproduct %s \\\n' % (opts.extproduct)
548 print "Jobs to drain, but no draining command. This shouldn't happen." 551 for a
in opts.args: cmd +=
' '+a
562 os.remove(
"./.rootrc")
void split(double tt, double *fr)
def check_is_group_writable(fname)
def find_file(paths, filename)
def test_not_dcache(l, warnOnly=False)
def make_jobsub_node_features_arg(features)
std::string format(const int32_t &value, const int &ndigits=8)
procfile open("FD_BRL_v0.txt")
def add_node_features_arg(parser)
def get_credentials(role)
def find_file_in_list(filepath, pathlist)