overlay_prestage_def.py
#!/bin/env python
"""
 overlay_prestage_def.py:

 Create (and snapshot) a SAM definition that contains only those files
 from an overlay definition corresponding to the files actually used
 by the overlay FCLs in the FCL definition(s).
"""
from __future__ import print_function
from __future__ import division

#from builtins import range
from past.utils import old_div
import argparse
import math
import os.path
import re
import sys
import samweb_client

# SAM can't digest arbitrarily large queries, so query terms are grouped
# in bunches of this size (see RecurseOverDefinitions below)
GROUP_SIZE = 100

FILE_PATTERN = re.compile(r".*r(?P<run>\d+).*s(?P<subrun>\d+).*")
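# e.g. a (hypothetical) FCL name like "overlay_r00012345_s67_gen.fcl" yields
# run 12345 and subrun 67 via the named groups above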

def Subdivide(input):
    """Split 'input' into consecutive chunks of at most GROUP_SIZE elements."""
    groups = [input[n*GROUP_SIZE:(n+1)*GROUP_SIZE] for n in range(old_div(len(input), GROUP_SIZE) + 1)]
    return [g for g in groups if g]  # discard the empty trailing chunk that arises when len(input) is an exact multiple of GROUP_SIZE
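# (illustrative: with GROUP_SIZE = 100, a 250-element list is subdivided into
#  chunks of 100, 100, and 50)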

def RecurseOverDefinitions(input, tier_num, n_tiers, overlay_def, output_def_name_base, overwrite_defns=False):
    if len(input) == 0:
        return None

    group_defns = []
    for group_idx, grouping in enumerate(input):

        # fhicl_runs (they're tuples of two values)
        if all(len(g) == 2 and all(isinstance(i, int) for i in g) for g in grouping):
            subset_defn_dims = "defname: %s and (%s)" % (overlay_def, " or ".join("run_number %d.%d" % (run, sub) for run, sub in grouping))
        else:
            subset_defn_dims = " or ".join("dataset_def_name_newest_snapshot %s" % g for g in grouping)

        print(subset_defn_dims)

        if tier_num < n_tiers:
            suffix = "_" + ("Sub" * (n_tiers - tier_num)) + ("set%d" % group_idx)
        else:
            suffix = ""

        output_def_name = output_def_name_base + suffix

        if overwrite_defns:
            try:
                SAM.deleteDefinition(defname=output_def_name)
            except samweb_client.exceptions.Error:  # assumed exception base class; the definition may simply not exist yet
                pass

        SAM.createDefinition(defname=output_def_name, dims=subset_defn_dims)

#        print " " * (n_tiers - tier_num), "new defn:", output_def_name
#        with open(output_def_name, "w") as outf:
#            outf.write(subset_defn_dims)

        # MUST snapshot if we're recursing further
        if tier_num < n_tiers:
            SAM.takeSnapshot(defname=output_def_name)
#            print " " * (n_tiers - tier_num), "snapshot:", output_def_name

        group_defns.append(output_def_name)

    # if it's still too big, go around again...
    if len(group_defns) > 1:
        recurse_defns = Subdivide(group_defns)
        return RecurseOverDefinitions(recurse_defns, tier_num+1, n_tiers, overlay_def, output_def_name_base, overwrite_defns)
    else:
        assert tier_num == n_tiers, "Down to 1 group before all tiers completed?!"
        return group_defns[0]
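# (Illustrative walk-through of the tier structure: with GROUP_SIZE = 100 and
#  25,000 run/subrun pairs, num_tiers = 3.  Tier 1 creates 250 "_SubSubset<N>"
#  definitions of at most 100 pairs each; tier 2 ORs their snapshots into 3
#  "_Subset<N>" definitions; tier 3 combines those into the single final
#  definition, so every individual SAM query stays small.)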


SAM = samweb_client.SAMWebClient(experiment='nova')


parser = argparse.ArgumentParser(description="Make SAM definition consisting only of used overlay files suitable for prestaging")
parser.add_argument("overlay_def",
                    metavar="OVERLAY_DEF",
                    help="The overlay file SAM dataset definition (cosmics/rock/etc.)",
)
parser.add_argument("fcl_defs",
                    metavar="FCL_DEF",
                    nargs="+",
                    help="The SAM dataset definition(s) containing the FCLs to use",
)
parser.add_argument("-o", "--output_def_name", "--output-def-name",
                    metavar="OUTPUT_DEF",
                    help="The name of the output definition (default=<OVERLAY_DEF>_FCLmatched)"
)
parser.add_argument("-ss", "--use_snapshots",
                    action="store_true",
                    default=False,
                    help="Use snapshots for input definitions"
)
parser.add_argument("--no-final-snapshot", "--no_final_snapshot",
                    action="store_true",
                    default=False,
                    help="Don't take a snapshot of the output definition"
)
parser.add_argument("--overwrite_defns",
                    action="store_true",
                    default=False,
                    help="Replace any pre-existing definitions that have the same name as those created by this script")

parser.add_argument("--unmatched_fcls_outfile",
                    metavar="FILENAME",
                    default="$PWD/unmatched_FCLs.txt",
                    help="File where the names of any unmatched FCLs should be written"
)

args = parser.parse_args()

output_def_name = args.output_def_name or args.overlay_def + "_FCLmatched"

fhicl_runs = set()
fhicl_files = set()
for defn in args.fcl_defs:
    print("Considering FCL defn '%s':" % defn)
    if args.use_snapshots:
        file_list = SAM.listFiles(dimensions="dataset_def_name_newest_snapshot %s" % defn)
    else:
        file_list = SAM.listFiles(defname=defn)
    sys.stdout.flush()
    n_files = len(file_list)
    n_files_div10 = max(1, old_div(n_files, 10))  # max() guards against division by zero below when there are fewer than 10 files
    print(" There are %d files... " % n_files, end=' ')
    for n, f in enumerate(file_list):
        if n % n_files_div10 == 0:
            curr_perc = int(float(n+1)/n_files * 100 + 0.5)  # the '+0.5' so as to get rounding to nearest int
            print(" %d%%" % curr_perc, end=' ')
            sys.stdout.flush()

        # getting the metadata file-by-file is pretty slow.
        # try the filename pattern matching first.
        # if that fails, then get it from the metadata
        matches = FILE_PATTERN.match(f)
        if matches:
            fhicl_runs.add( (int(matches.group("run")), int(matches.group("subrun"))) )
        else:
            md = SAM.getMetadata(f)
            fhicl_runs.add( (md["Simulated.firstRun"], md["Simulated.firstSubRun"]) )
        fhicl_files.add(f)

    print()

print("Total number of run+subrun pairs:", len(fhicl_runs))

# SAM can't handle huge queries.
# Recursively group run numbers together in bunches of GROUP_SIZE.
num_tiers = max(1, int(math.ceil(math.log(len(fhicl_runs)) / math.log(GROUP_SIZE))))  # ceil() rounds up without overcounting exact powers of GROUP_SIZE
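# (illustrative arithmetic: 25,000 run/subrun pairs with GROUP_SIZE = 100 gives
#  ceil(log(25000)/log(100)) = ceil(2.2) = 3 tiers)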

if num_tiers > 1:
    print("These will be grouped in", num_tiers, "successive tiers of snapshotted definitions.")
    print("This may take some time.")
output_def_name = RecurseOverDefinitions(Subdivide(list(fhicl_runs)), 1, num_tiers, args.overlay_def, output_def_name, args.overwrite_defns)

print()
print("Created definition '%s'" % output_def_name)

if args.no_final_snapshot:
    files = SAM.listFiles(defname=output_def_name)
else:
    snapid = SAM.takeSnapshot(defname=output_def_name)
    print(" snapshot ID =", snapid)
    files = SAM.listFiles(dimensions="dataset_def_name_newest_snapshot %s" % output_def_name)

n_files = len(files)
print(" This definition contains %d files." % n_files)
n_runs = len(fhicl_runs)
if n_files != n_runs:
    files_unmatched = set(fhicl_files)
    overlay_runs = set()
    for overlay_file in files:
        matches = FILE_PATTERN.match(overlay_file)
        if not matches:  # skip overlay files whose names don't encode run/subrun
            continue
        overlay_runs.add((int(matches.group("run")), int(matches.group("subrun"))))

    for f in fhicl_files:
        matches = FILE_PATTERN.match(f)
        if matches and (int(matches.group("run")), int(matches.group("subrun"))) in overlay_runs:
            files_unmatched.remove(f)

    print(" WARNING: there were %d run,subrun pairs selected from the FCLs, but %d FCL files have no matching overlay file!" % (n_runs, len(files_unmatched)))
    print(" The FCLs corresponding to these files will be saved in %s." % args.unmatched_fcls_outfile)
    with open(os.path.expandvars(args.unmatched_fcls_outfile), "w") as outf:
        for f in files_unmatched:
            outf.write("%s\n" % f)