MetadataUtils.py
#!/bin/env python
import os, sys, stat, pwd
import datetime
from time import sleep
import re
import subprocess
import json
from samweb_client.utility import fileEnstoreChecksum

detectors = dict()
detectors["fd"] = "fardet"
detectors["nd"] = "neardet"
detectors["tb"] = "testbeam"
detectors["ndos"] = "ndos"

# Allow these lists to expand beyond just the generators we use now.
neutrinoGenerators = ["genie", "gibuu"]
neutrinoGenieTunes = ["default", "G1802a0211a", "G1810b0211a", "G1810i00000", "G1801b0211a", "G1802a00000", "G1801a0211a", "G1810a0211a", "G0000b00000", "G1802b0211a", "G1801a00000", "G1801b00000", "G1810a00000", "G1810j00000", "G1802b00000", "G1810b00000", "N1810j0211a"]
cosmicGenerators = ["cry"]

run_pattern = re.compile(r"^.*?_r([0-9]+)_s([0-9]+).+")
sim_pattern = re.compile(r"^.*_(20\d\d\d\d\d\d_\d\d\d\d\d\d)")
stream_pattern = re.compile(r"^.*?_r[0-9]+_s[0-9]+_(.*?)[\.,_].*")

parent_pattern = re.compile(r'(,\n.*)?(?P<parent_str>"parents":.*)', re.MULTILINE)

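# Illustrative behaviour of the patterns above (the file names are hypothetical,
# not taken from this file):
#   >>> run_pattern.match("fardet_r00012345_s06_t02.raw").groups()
#   ('00012345', '06')
#   >>> stream_pattern.match("fardet_r00012345_s06_t02.raw").groups()
#   ('t02',)
#   >>> sim_pattern.match("neardet_genie_20190101_120000_gen.root").groups()
#   ('20190101_120000',)
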
def isNone(value):
    if value == None: return True
    if value == "none": return True
    return False

def appendField(base, extension):

    if isNone(base) and isNone(extension): return "none"
    elif isNone(base): return extension
    elif isNone(extension): return base
    elif base == extension: return base

    return base + "_" + extension

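# Illustrative behaviour (hypothetical values):
#   >>> appendField("goodruns", "nue")
#   'goodruns_nue'
#   >>> appendField(None, "nue")
#   'nue'
#   >>> appendField("none", None)
#   'none'
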
def addMetadataToFCL(fclFile, parName, parValue):
    fclFile.write("\nphysics.analyzers.metadata.params.")
    fclFile.write(parName)
    fclFile.write(": ")
    fclFile.write(parValue)
    fclFile.write(" \n")

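# The writes above append a single FHiCL override line of the form
# (parName and parValue here are placeholders):
#   physics.analyzers.metadata.params.parName: parValue
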
def unCamelCase(s):
    """ Convert CamelCase to camel_case """
    # \B matches an empty string which is NOT at the beginning of a word
    # so requiring this means no _ will be inserted at the start
    return re.sub(r'\B([A-Z])', r'_\1', s).lower()

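# Illustrative conversions:
#   >>> unCamelCase("firstEvent")
#   'first_event'
#   >>> unCamelCase("SubRunNumber")
#   'sub_run_number'
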
def cleanup_metadata(in_md):
    """ Numerous keys are stored in the files in ways that we need to 'translate'
    so that they can be given to SAM. """

    md = {}
    for k, v in in_md.iteritems():
        k = str(k)
        if len(k) == 0:
            continue
        if isinstance(v, basestring) and len(v) == 0:
            v = "none"

        # thanks, sam_metadata_dumper
        if '.' not in k:
            k = unCamelCase(k)
        else:
            k = k.lower()
        v = str(v)

        # fix for poorly formatted metadata from FileReducer pre-r24852
        if k.lower() == "parents" and '","file_name":' in v:
            v = v.replace('","file_name":', '"},{"file_name":')

        # note that for correct JSON serialization,
        # any lists or maps need to be processed into Python lists/maps.
        # if that fails, just fall back to the string we had.
        if isinstance(v, basestring) and any(c in v for c in "[]{}"):
            try:
                v = eval(v)
            except:
                pass

        # specific bugfix for run list duplicates
        # (fixed upstream in r24859)
        if k == "runs":
            v = list(set([tuple(run) for run in v]))

        # sam_metadata_dumper spits out [ run, subrun, event ],
        # but the field itself should just be the event number...
        if k in ('first_event', 'last_event'):
            try:
                v = int(v)
            except TypeError:
                if len(v) == 3:
                    v = v[2]
                else:
                    continue

        # this is historical, but I can't guarantee it's not needed any more,
        # even though every file I can see now has this as an int...
        if k == 'simulated.cycle':
            v = eval(v)

        # more type coercion
        if k in ('start_date', 'end_date'):
            v = long(v)

        # use process_name as the application name if application.name isn't set
        if k == 'process_name' and md.get('application', {}).get('name') is None:
            k = 'application_name'
        elif k == 'stream_name':
            k = 'data_stream'

        if k.startswith('application_'):
            if 'application' not in md:
                md['application'] = {}
            md['application'][k[12:]] = v
        elif k in ('file_format_era', 'file_format_version'):
            pass
        else:
            md[k] = v

    return md

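# Illustrative translation of hypothetical dumper output (key order may vary):
#   >>> cleanup_metadata({"firstEvent": [1, 0, 25],
#   ...                   "applicationVersion": "v1",
#   ...                   "Online.Stream": 0,
#   ...                   "fileFormatEra": "ART"})
#   {'first_event': 25, 'application': {'version': 'v1'}, 'online.stream': '0'}
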
def createMetadata(inFile):
    filesize = os.path.getsize(inFile)
    filename = os.path.basename(inFile)
    if filename.endswith("caf.root"):
        md = {'file_name': filename, 'group': 'nova', 'file_size': filesize, 'file_format': 'unknown'}
        try:
            data = subprocess.check_output(["extractCAFMetadata", str(inFile)])
        except subprocess.CalledProcessError:
            print "extractCAFMetadata failed!"
            return None

        dumperDict = json.loads(data[data.find("{"):])

        md.update(cleanup_metadata(dumperDict))
        md['crc'] = fileEnstoreChecksum(inFile)

        return md
    elif filename.endswith(".h5"):
        md = {'file_name': filename, 'group': 'nova', 'file_size': filesize, 'file_format': 'unknown'}
        try:
            data = subprocess.check_output(["extractHDF5Metadata", str(inFile)])
        except subprocess.CalledProcessError:
            print "extractHDF5Metadata failed!"
            return None

        dumperDict = json.loads(data[data.find("{"):])

        md.update(cleanup_metadata(dumperDict))
        md['crc'] = fileEnstoreChecksum(inFile)

        return md

    else:
        md = {'file_name': filename, 'group': 'nova', 'file_size': filesize, 'file_format': 'artroot'}
        try:
            data = subprocess.check_output(["sam_metadata_dumper", str(inFile)])
        except subprocess.CalledProcessError:
            print "sam_metadata_dumper failed!"
            return None

        # sam_metadata_dumper prints out the 'parents' metadata twice:
        # once with format like '[ "<f1>", "<f2>", ... ]' (incorrect?)
        # and again with format like '[ {"filename": "<f1>"}, {"filename": "<f2>"}, ...]' (correct?).
        # Eliminate the first one.
        matched = False
        for matches in parent_pattern.findall(data):
            match = matches[-1]
            parent_dict = eval("{%s}" % match)
            if isinstance(parent_dict["parents"][0], basestring):
                matched = True
                break
        if matched:
            data = data.replace("".join(matches), "")

        dumperDict = json.loads(data.replace("\x1b[?1034h", ""))

        for dumperKey in dumperDict:
            if str(inFile) in str(dumperKey):
                tmpMD = dumperDict[dumperKey]
                md.update(cleanup_metadata(tmpMD))
                break
        md['crc'] = fileEnstoreChecksum(inFile)

        return md


class metaDataMgr:  # class name assumed; the original class declaration line was lost from this listing

    def __init__(self, inFileName, metadata, release, systematic, skim, cosmicsPolarity, subversion, specialName):

        # make all metadata keys lower case
        self.metadata = {k.lower(): v for k, v in metadata.items()}
        self.inFileName = inFileName
        self.release = release
        self.setRunInfo()
        self.setDatasetTag()
        if not self.isSam4Users():
            self.setDetector()
            self.setDataFlag()
            if self.dataFlag == "data":
                self.setDataInfo()
            else:
                self.setSimInfo()
        self.setSkim(skim)
        self.cosmicsPolarity = cosmicsPolarity
        self.setSpecialInfo(subversion, systematic, specialName)

    def getMetadataParameter(self, parName):
        retVal = None
        if parName in self.metadata:
            retVal = self.metadata[parName]
        return retVal

    def setDatasetTag(self):
        mytag = self.getMetadataParameter("dataset.tag")
        if mytag != None:
            self.mytag = mytag.lower()
        else:
            self.mytag = "none"

    def isSam4Users(self):
        return not isNone(self.mytag)

    def setDetector(self):
        detector = self.getMetadataParameter("online.detector")
        if detector != None:
            self.detector = detector.lower()
        else:
            detID = self.getMetadataParameter("nova.detectorid")
            if detID == None:
                print "Error!! No detector name found!!"
                exit(1)
            detID = detID.lower()
            if detID in detectors.keys():
                self.detector = detectors[detID]
            else:
                print "No detector name found for detectorid:", detID
                exit(1)

    def setDataFlag(self):

        self.fileType = self.getMetadataParameter("file_type")

        if self.fileType == None:
            print "No file_type set!"
            exit(1)

        elif self.fileType == "importedDetector":
            self.dataFlag = "data"

        elif self.fileType == "importedSimulated":
            self.dataFlag = "sim"

        elif self.fileType == "text":
            self.dataFlag = "text"

        else:
            print "Unrecognized file_type:", self.fileType
            exit(1)

    def setRunInfo(self):

        # if the new-format "runs" metadata is present, use it; otherwise get the
        # run/subrun numbers from the dataset tag or from the file name
        if "runs" in self.metadata and len(self.metadata["runs"][0]) == 3:
            self.runNum = int(self.metadata["runs"][0][0])
            self.subNum = int(self.metadata["runs"][0][1])
        else:
            nameofThetag = self.getMetadataParameter("dataset.tag")
            if not isNone(nameofThetag):
                TOTrunNum = subprocess.Popen(["samweb", "count-files", "dataset.tag " + str(nameofThetag)], shell=False, stdout=subprocess.PIPE).stdout.read()
                Tot = int(TOTrunNum)
                for i in range(0, Tot):
                    self.runNum = int(TOTrunNum)
                    self.subNum = int(i)

            else:
                inFileBase = os.path.basename(self.inFileName)
                run_match = run_pattern.match(inFileBase)
                if run_match != None:
                    self.runNum = int(run_match.groups()[0])
                    self.subNum = int(run_match.groups()[1])

                else:
                    print "No run number/subrun number found!"
                    exit(1)

    def setDataInfo(self):

        # get data_stream from the metadata; if it is "all", try extracting it
        # from the file name
        tmpStream = 'all'
        if "data_stream" in self.metadata:
            tmpStream = self.metadata["data_stream"]
        elif "online.stream" in self.metadata:
            tmpStream = self.metadata["online.stream"]

        if self.getMetadataParameter("nova.hornpolarity"):
            self.horn = self.getMetadataParameter("nova.hornpolarity")
        else:
            self.horn = ""

        if tmpStream == 'all':
            inFileBase = os.path.basename(self.inFileName)
            stream_match = stream_pattern.match(inFileBase)
            if stream_match != None:
                tmpStream = stream_match.groups()[0]
                if tmpStream[0] == 't':
                    tmpStream = tmpStream[1:]

        try:
            self.stream = int(tmpStream)
            self.streamEntry = 't{0:02d}'.format(self.stream)
        except ValueError:
            self.stream = tmpStream
            self.streamEntry = self.stream
        except:
            print "No data stream found!"
            exit(1)

    def setSimInfo(self):

        self.generator = self.getMetadataParameter("simulated.generator")

        # Some early gibuu files accidentally have the generator set to singlep
        # (fixed in r32723). If that's the case, fix it up here, in particular
        # because it impacts the correct setting of flavorset.
        if self.generator == 'singlep' and self.getMetadataParameter('simulated.gibuuused') == 'true':
            self.generator = 'gibuu'

        if self.getMetadataParameter("nova.hornpolarity"):
            self.horn = self.getMetadataParameter("nova.hornpolarity")
        else:
            self.horn = ""

        if self.generator in neutrinoGenerators:

            self.flavorset = self.getMetadataParameter("simulated.genieflavorset")
            if self.flavorset == "swap": self.flavorset = "fluxswap"

            self.genierw = self.getMetadataParameter("simulated.genierw")
            if self.genierw == None: self.genierw = "nogenierw"

            self.fluxVersion = self.getMetadataParameter("nova.flux_version")
            if self.fluxVersion == None:
                self.fluxVersion = os.environ["NOVA_FLUX_VERSION"].replace("nova_", "")

            if self.generator == 'genie' and self.getMetadataParameter('simulated.genieused') == 'true':
                self.NPPFX = self.getMetadataParameter("simulated.nppfx")
                self.genietune = self.getMetadataParameter("simulated.genietune")
                if self.genietune not in neutrinoGenieTunes:
                    # if no genietune metadata field is found, the default tune
                    # is assumed
                    if self.genietune == None:
                        self.genietune = "default"
                    else:
                        print "ERROR!! Invalid GENIE tune: ", self.genietune
                        exit(1)

        elif self.generator in cosmicGenerators:
            self.flavorset = self.getMetadataParameter("simulated.cryflavorset")

        self.nevt = self.getMetadataParameter("simulated.number_of_spills")
        self.cycle = self.getMetadataParameter("simulated.cycle")

        matches = sim_pattern.match(self.inFileName)
        if matches == None or len(matches.groups()) < 1:
            print "ERROR!! Couldn't find timestamp in", self.inFileName
            exit(1)
        self.timestamp = matches.groups()[0]

    def setSpecialInfo(self, subversion, systematic, specialName):

        self.systematic = "none"
        mdSystematic = self.getMetadataParameter("nova.systematic")
        if (not isNone(mdSystematic)) and (not isNone(systematic)):
            print "ERROR: you specified a systematic, but this is already a systematic file"
            exit(1)
        elif not isNone(mdSystematic):
            self.systematic = mdSystematic
        elif not isNone(systematic):
            self.systematic = systematic

        if subversion != None:
            self.subversion = subversion
        else:
            self.subversion = self.getMetadataParameter("nova.subversion")
            if self.subversion == None:
                self.subversion = "1"

        mdSpecialName = self.getMetadataParameter("nova.special")
        self.special = appendField(mdSpecialName, specialName)

    def setSkim(self, skim):
        mdSkim = self.getMetadataParameter("nova.skim")
        self.skim = appendField(mdSkim, skim)

    def getOutputFileName(self, tier):
        outFileName = ""
        if self.isSam4Users():
            # drop the ".<tier>.root" suffix from the input file name
            outFileName = os.path.basename(self.inFileName).replace("." + tier + ".root", "")
            outFileName += "_r%.8d_s%.2d" % (self.runNum, self.subNum)
        else:
            outFileName = self.detector

            if self.dataFlag == "data":
                # data
                # <det>_<run>_<subrun>_<horn>_<stream>_<tag>_<npass>_data_<syst>_<skim>_<spec>.<tier>.root
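                # For illustration, a data file might end up named something like
                # (all values hypothetical):
                #   fardet_r00012345_s06_t02_development_v1_data.reco.root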

                print "SETTING FILE NAME DATA"
                outFileName += "_r%.8d_s%.2d" % (self.runNum, self.subNum)

                if not isNone(self.cosmicsPolarity):
                    outFileName += "_%s" % self.cosmicsPolarity

                if self.horn != "":
                    outFileName += "_%s" % self.horn

                outFileName += "_%s" % self.streamEntry
                outFileName += "_%s" % self.release
                outFileName += "_v%s" % self.subversion
                outFileName += "_data"

            elif self.dataFlag == "sim":
                # sim
                # <det>_<gen>_<tune>_<flavor>_<genierw>_<horn>_<flux>_<nevt>_<run>_<subrun>_<cycle>_<tag>_<npass>_<timestamp>_sim_<syst>_<skim>_<spec>.<tier>.root
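                # For illustration, a neutrino MC file might end up named something
                # like (all values hypothetical):
                #   neardet_genie_default_nonswap_nogenierw_fhc_v08_500_r00010000_s01_c000_development_v1_20190101_120000_sim.caf.root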
                outFileName += "_%s" % self.generator
                if self.generator in neutrinoGenerators:
                    outFileName += "_%s" % self.genietune
                outFileName += "_%s" % self.flavorset
                if self.generator in neutrinoGenerators:  # these parameters do not apply for cosmic generators
                    outFileName += "_%s" % self.genierw
                if not isNone(self.cosmicsPolarity):
                    outFileName += "_%s" % self.cosmicsPolarity
                if self.horn != "":
                    outFileName += "_%s" % self.horn
                if self.generator in neutrinoGenerators:  # these parameters do not apply for cosmic generators
                    outFileName += "_%s_%s" % (self.fluxVersion, self.nevt)

                if self.cycle == None:  # cosmic MC files with cycle missing
                    outFileName += "_r%.8d_s%.2d" % (self.runNum, self.subNum)
                else:
                    outFileName += "_r%.8d_s%.2d_c%.3d" % (self.runNum, self.subNum, self.cycle)

                outFileName += "_%s" % self.release
                outFileName += "_v%s" % self.subversion
                outFileName += "_%s" % self.timestamp
                outFileName += "_sim"

        if not isNone(self.systematic):
            outFileName += "_%s" % self.systematic
        if not isNone(self.skim):
            outFileName += "_%s" % self.skim
        if not isNone(self.special):
            outFileName += "_%s" % self.special

        if tier == "artdaq":
            if self.dataFlag == "sim":
                outFileName += ".daq.root"
            elif self.dataFlag == "data":
                outFileName += ".%s.root" % tier
        elif tier == "h5caf":
            outFileName += ".h5caf"
        else:
            outFileName += ".%s.root" % tier

        return outFileName
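
# A rough usage sketch (hypothetical paths and arguments; the class name above
# is assumed, see the note at the class declaration):
#
#   md = createMetadata("myfile.caf.root")
#   mgr = metaDataMgr("myfile.caf.root", md, release="development",
#                     systematic=None, skim=None, cosmicsPolarity=None,
#                     subversion=None, specialName=None)
#   print mgr.getOutputFileName("caf")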