# overlay_prestage_def.py
1 #!/bin/env python
2 """
3  overlay_prestage_def.py:
4 
5  Create (and snapshot) a SAM definition that contains only those files
6  from an overlay definition corresponding to the files actually used
7  by the overlay FCLs in the FCL definition.
8 """
9 
10 import argparse
11 import math
12 import os.path
13 import re
14 import sys
15 import samweb_client
16 
17 GROUP_SIZE = 100
18 
19 FILE_PATTERN = re.compile(r".*r(?P<run>\d+).*s(?P<subrun>\d+).*")
20 
def Subdivide(input, group_size=None):
    """Split *input* into consecutive chunks of at most *group_size* items.

    input:      a list (or other sliceable sequence)
    group_size: chunk size; defaults to the module-level GROUP_SIZE

    Returns a list of lists.  Fixes two defects in the original:
      * `len(input) / GROUP_SIZE` was Python-2-only integer division
        (a float under Python 3); a stepped range avoids division entirely.
      * When len(input) was an exact multiple of GROUP_SIZE (or zero), the
        `+ 1` produced a spurious empty trailing group, which downstream
        turned into a SAM definition with an empty dimension clause.
    """
    if group_size is None:
        group_size = GROUP_SIZE
    return [input[start:start + group_size] for start in range(0, len(input), group_size)]
23 
def RecurseOverDefinitions(input, tier_num, n_tiers, overlay_def, output_def_name_base, overwrite_defns=False):
    """Recursively build tiers of SAM definitions until one remains.

    input:                list of groups.  On the first tier each group is a
                          list of (run, subrun) int pairs; on later tiers each
                          group is a list of previously created definition names.
    tier_num:             current tier (1-based).
    n_tiers:              total number of tiers expected.
    overlay_def:          the overlay dataset definition to select runs from.
    output_def_name_base: base name for the definitions created here.
    overwrite_defns:      if True, delete any same-named pre-existing definition
                          before creating it.

    Returns the name of the single top-level definition, or None for empty input.
    """
    if len(input) == 0:
        return None

    group_defns = []
    for group_idx, grouping in enumerate(input):

        # fhicl_runs (they're tuples of two values) -> first tier:
        # select the matching run/subrun pairs out of the overlay definition
        if all(len(g) == 2 and all(isinstance(i, int) for i in g) for g in grouping):
            subset_defn_dims = "defname: %s and (%s)" % (overlay_def, " or ".join(("run_number %d.%d" % (run, sub) for run, sub in grouping)))
        else:
            # later tiers: union over the snapshots of the previous tier's defns
            subset_defn_dims = " or ".join("dataset_def_name_newest_snapshot %s" % g for g in grouping)

        # intermediate tiers get a "_SubSub...set<N>" suffix; the final tier
        # uses the base name unadorned
        if tier_num < n_tiers:
            suffix = "_" + ("Sub" * (n_tiers - tier_num)) + ("set%d" % group_idx)
        else:
            suffix = ""

        output_def_name = output_def_name_base + suffix

        if overwrite_defns:
            # best-effort delete: the definition may simply not exist yet.
            # NOTE(review): the original except clause was lost in extraction;
            # a broad Exception preserves the original's best-effort intent —
            # confirm against samweb_client's exception hierarchy.
            try:
                SAM.deleteDefinition(defname=output_def_name)
            except Exception:
                pass

        SAM.createDefinition(defname=output_def_name, dims=subset_defn_dims)

        # MUST snapshot if we're recursing further: the next tier's dimensions
        # reference dataset_def_name_newest_snapshot of these definitions
        if tier_num < n_tiers:
            SAM.takeSnapshot(defname=output_def_name)

        group_defns.append(output_def_name)

    # if it's still too big, go around again...
    if len(group_defns) > 1:
        recurse_defns = Subdivide(group_defns)
        # bug fix: propagate overwrite_defns into the recursion — the original
        # silently dropped it, so only tier-1 definitions were ever overwritten
        return RecurseOverDefinitions(recurse_defns, tier_num + 1, n_tiers, overlay_def, output_def_name_base, overwrite_defns)
    else:
        assert tier_num == n_tiers, "Down to 1 group before all tiers completed?!"
        return group_defns[0]
70 
71 
# Single SAM web client used for every query / definition operation below.
SAM = samweb_client.SAMWebClient(experiment='nova')


# Command-line interface.
arg_parser = argparse.ArgumentParser(
    description="Make SAM definition consisting only of used overlay files suitable for prestaging",
)
arg_parser.add_argument(
    "overlay_def",
    metavar="OVERLAY_DEF",
    help="The overlay file SAM dataset definition (cosmics/rock/etc.)",
)
arg_parser.add_argument(
    "fcl_defs",
    metavar="FCL_DEF",
    nargs="+",
    help="The SAM dataset definition(s) containing the FCLs to use",
)
arg_parser.add_argument(
    "-o", "--output_def_name", "--output-def-name",
    metavar="OUTPUT_DEF",
    help="The name of the output definition (default=<OVERLAY_DEF>_FCLmatched)",
)
arg_parser.add_argument(
    "-ss", "--use_snapshots",
    action="store_true",
    default=False,
    help="Use snapshots for input definitions",
)
arg_parser.add_argument(
    "--no-final-snapshot", "--no_final_snapshot",
    action="store_true",
    default=False,
    help="Don't take a snapshot of the output definition",
)
arg_parser.add_argument(
    "--overwrite_defns",
    action="store_true",
    default=False,
    help="Replace any pre-existing definitions that have the same name as those created by this script",
)
arg_parser.add_argument(
    "--unmatched_fcls_outfile",
    metavar="FILENAME",
    default="$PWD/unmatched_FCLs.txt",
    help="File where the names of any unmatched FCLs should be written",
)

args = arg_parser.parse_args()
111 
112 output_def_name = args.output_def_name or args.overlay_def + "_FCLmatched"
113 
114 fhicl_runs = set()
115 fhicl_files = set()
116 for defn in args.fcl_defs:
117  print "Considering FCL defn '%s':" % defn
118  if args.use_snapshots:
119  file_list = SAM.listFiles(dimensions="dataset_def_name_newest_snapshot %s" % defn)
120  else:
121  file_list = SAM.listFiles(defname=defn)
122  sys.stdout.flush()
123  n_files = len(file_list)
124  n_files_div10 = n_files/10
125  print " There are %d files... " % n_files,
126  for n, f in enumerate(file_list):
127  if n % n_files_div10 == 0:
128  curr_perc = int(float(n+1)/n_files * 100 + 0.5) # the '+0.5' so as to get rounding to nearest int
129  print " %d%%" % curr_perc,
130  sys.stdout.flush()
131 
132  # getting the metadata file-by-file is pretty slow.
133  # try the filename pattern matching first.
134  # if that fails, then get it from the metadata
135  matches = FILE_PATTERN.match(f)
136  if matches:
137  fhicl_runs.add( (int(matches.group("run")), int(matches.group("subrun"))) )
138  else:
139  md = SAM.getMetadata(f)
140  fhicl_runs.add( (md["Simulated.firstRun"], md["Simulated.firstSubRun"]) )
141  fhicl_files.add(f)
142 
143  print
144 
145 print "Total number of run+subrun pairs:", len(fhicl_runs)
146 
147 # SAM can't handle huge queries.
148 # Recursively group run numbers together in bunches of GROUP_SIZE.
149 num_tiers = int(math.log(len(fhicl_runs)) / math.log(GROUP_SIZE) + 1) # +1 to round UP
150 
151 if num_tiers > 1:
152  print "These will be grouped in", num_tiers, "successive tiers of snapshotted definitions."
153  print "This may take some time."
154 output_def_name = RecurseOverDefinitions(Subdivide(list(fhicl_runs)), 1, num_tiers, args.overlay_def, output_def_name, args.overwrite_defns)
155 
156 print
157 print "Created definition '%s'" % output_def_name
158 
159 check_output = True
160 if args.no_final_snapshot:
161  files = SAM.listFiles(defname=output_def_name)
162 else:
163  snapid = SAM.takeSnapshot(defname=output_def_name)
164  print " snapshot ID =", snapid
165  files = SAM.listFiles(dimensions="dataset_def_name_newest_snapshot %s" % output_def_name)
166 
167 n_files = len(files)
168 print " This definition contains %d files." % n_files
169 n_runs = len(fhicl_runs)
170 if n_files != n_runs:
171  files_unmatched = set(fhicl_files)
172  overlay_runs = set()
173  for overlay_file in files:
174  matches = FILE_PATTERN.match(overlay_file)
175  overlay_runs.add((int(matches.group("run")), int(matches.group("subrun"))))
176 
177  for f in fhicl_files:
178  matches = FILE_PATTERN.match(f)
179  if (int(matches.group("run")), int(matches.group("subrun"))) in overlay_runs:
180  files_unmatched.remove(f)
181 
182  print " WARNING: there were %d run,subrun pairs selected from the FCLs. You are missing %d pairs!" % (n_runs, len(files_unmatched))
183  print " the FCLs corresponding to these files will be saved in %s." % args.unmatched_fcls_outfile
184  with open(os.path.expandvars(args.unmatched_fcls_outfile), "w") as outf:
185  for f in files_unmatched:
186  outf.write("%s\n" % f)
187 
188 
# (doxygen cross-reference residue from unrelated files removed)