MetadataUtils.py
#!/bin/env python
from __future__ import print_function
from builtins import str
from builtins import range
from past.builtins import basestring
from builtins import object
import os, sys, stat, pwd
import datetime
from time import sleep
import re
import subprocess
import json
from samweb_client.utility import fileEnstoreChecksum

detectors = dict()
detectors["fd"] = "fardet"
detectors["nd"] = "neardet"
detectors["tb"] = "testbeam"
detectors["ndos"] = "ndos"

##allow this to potentially expand beyond just generators we use now
neutrinoGenerators=["genie", "gibuu"]
neutrinoGenieTunes=["default", "G1802a0211a", "G1810b0211a", "G1810i00000", "G1801b0211a", "G1802a00000", "G1801a0211a", "G1810a0211a", "G0000b00000", "G1802b0211a", "G1801a00000", "G1801b00000", "G1810a00000", "G1810j00000", "G1802b00000", "G1810b00000", "N1810j0211a", "NOVA1800a00000"]
cosmicGenerators=["cry"]

run_pattern = re.compile(r"^.*?_r([0-9]+)_s([0-9]+).+")
sim_pattern = re.compile(r"^.*_(20\d\d\d\d\d\d_\d\d\d\d\d\d)")
stream_pattern = re.compile(r"^.*?_r[0-9]+_s[0-9]+_(.*?)[\.,_].*")

parent_pattern = re.compile(r'(,\n.*)?(?P<parent_str>"parents":.*)', re.MULTILINE)

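# Illustrative sketch (not part of the original module): the patterns above pull
# run/subrun, trigger-stream, and timestamp tokens out of NOvA-style file names.
# Assuming a hypothetical raw-data name "fardet_r00019747_s14_t02_v1_data.raw":
#   run_pattern.match(name).groups()    -> ("00019747", "14")
#   stream_pattern.match(name).groups() -> ("t02",)
# and for a hypothetical simulation name ending in "..._20170815_102542_sim.reco.root",
# sim_pattern captures the "20170815_102542" timestamp.
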
def isNone(value):
    if value == None : return True
    if value == "none" : return True
    return False

def appendField(base,extension):

    if isNone( base) and isNone(extension): return "none"
    elif isNone( base): return extension
    elif isNone(extension): return base
    elif base == extension: return base

    return base +"_" + extension

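# Illustrative examples of appendField, with assumed inputs (not taken from any real file):
#   appendField("none", "cosmics")     -> "cosmics"
#   appendField("goodruns", "cosmics") -> "goodruns_cosmics"
#   appendField("cosmics", "cosmics")  -> "cosmics"
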

def addMetadataToFCL(fclFile,parName,parValue):
    fclFile.write("\nphysics.analyzers.metadata.params.")
    fclFile.write(parName)
    fclFile.write(": ")
    fclFile.write(parValue)
    fclFile.write(" \n")

def unCamelCase(s):
    """ Convert CamelCase to camel_case """
    # \B matches an empty string which is NOT at the beginning of a word
    # so requiring this means no _ will be inserted at the start
    return re.sub( r'\B([A-Z])', r'_\1', s).lower()

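# For instance (illustrative only): unCamelCase("OnlineDetector") returns "online_detector",
# while a leading capital gets no underscore: unCamelCase("Run") -> "run".
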
def cleanup_metadata(in_md):
    """ Numerous keys are stored in the files in ways that we need to 'translate'
        so that they can be given to SAM. """

    md = {}
    for k, v in list(in_md.items()):
        k = str(k)
        if len(k) == 0:
            continue
        if isinstance(v, basestring) and len(v) == 0:
            v = "none"

        # thanks, sam_metadata_dumper
        if '.' not in k:
            k = unCamelCase(k)
        else:
            k = k.lower()
            v = str(v)

        # fix for poorly formatted metadata from FileReducer pre-r24852
        if k.lower() == "parents" and '","file_name":' in v:
            v = v.replace('","file_name":', '"},{"file_name":')

        # note that for correct JSON serialization,
        # any lists or maps need to be processed into Python lists/maps.
        # if that fails, just fall back to the string we had.
        if isinstance(v, basestring) and any(c in v for c in "[]{}"):
            try:
                v = eval(v)
            except:
                pass

        # specific bugfix for run list duplicates
        # (fixed upstream in r24859)
        if k == "runs":
            v = list(set([tuple(run) for run in v]))

        # sam_metadata_dumper spits out [ run, subrun, event ],
        # but the field itself should just be the event number...
        if k in ('first_event', 'last_event'):
            try:
                v = int(v)
            except TypeError:
                if len(v) == 3:
                    v = v[2]
                else:
                    continue

        # this is historical, but I can't guarantee it's not needed any more,
        # even though every file I can see now has this as an int...
        if k == 'simulated.cycle':
            v = eval(v)

        # more type coercion
        if k in ('start_date', 'end_date'):
            v = int(v)

        # ummm... yeah.
        if k == 'process_name' and md.get('application', {}).get('name') is None:
            k = 'application_name'
        elif k == 'stream_name':
            k = 'data_stream'

        if k.startswith('application_'):
            if 'application' not in md:
                md['application'] = {}
            md['application'][k[12:]] = v
        elif k in ('file_format_era', 'file_format_version'):
            pass
        else:
            md[k] = v

    return md

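# A minimal before/after sketch of the translation above, using assumed dumper output
# (not from a real file):
#   cleanup_metadata({"processName": "reco", "firstEvent": "[ 100, 2, 5 ]"})
# would yield
#   {"application": {"name": "reco"}, "first_event": 5}
# i.e. keys are de-CamelCased, "application_*" keys are folded into the "application"
# map, and first/last event entries are reduced to the bare event number.
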
def createMetadata(inFile):
    filesize = os.path.getsize(inFile)
    filename = os.path.basename(inFile)
    if filename.endswith("caf.root"):
        md = {'file_name': filename, 'group': 'nova', 'file_size': filesize, 'file_format': 'unknown'}
        try:
            data = subprocess.check_output(["extractCAFMetadata", str(inFile)])
        except subprocess.CalledProcessError:
            print("extractCAFMetadata failed!")
            return None

        dumperDict = json.loads(data[data.find(b"{"):])

        md.update(cleanup_metadata(dumperDict))
        md['crc'] = fileEnstoreChecksum(inFile)

        return md
    elif filename.endswith(".h5"):
        md = {'file_name': filename, 'group': 'nova', 'file_size': filesize, 'file_format': 'unknown'}
        try:
            data = subprocess.check_output(["extractHDF5Metadata", str(inFile)])
        except subprocess.CalledProcessError:
            print("extractHDF5Metadata failed!")
            return None

        dumperDict = json.loads(data[data.find(b"{"):])

        md.update(cleanup_metadata(dumperDict))
        md['crc'] = fileEnstoreChecksum(inFile)

        return md

    else:
        md = {'file_name': filename, 'group': 'nova', 'file_size': filesize, 'file_format': 'artroot'}
        try:
            data = subprocess.check_output(["sam_metadata_dumper", str(inFile)])
        except subprocess.CalledProcessError:
            print("sam_metadata_dumper failed!")
            return None

        # hilariously, sam_metadata_dumper prints out the 'parents' metadata twice:
        # once, with format like '[ "<f1>", "<f2>", ... ]' (incorrect?)
        # and again, with format like '[ {"filename": "<f1>"}, {"filename": "<f2>"}, ...]' (correct?)
        # eliminate the first one.
        matched = False
        for matches in parent_pattern.findall(data):
            match = matches[-1]
            parent_dict = eval("{%s}" % match)
            if isinstance(parent_dict["parents"][0], basestring):
                matched = True
                break
        if matched:
            data = data.replace("".join(matches), "")

        dumperDict = json.loads(data.replace(b"\x1b[?1034h", b""))

        for dumperKey in dumperDict:
            if str(inFile) in str(dumperKey):
                tmpMD = dumperDict[dumperKey]
                md.update(cleanup_metadata(tmpMD))
                break
        md['crc'] = fileEnstoreChecksum(inFile)

        return md

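# Typical (hypothetical) usage sketch: build the SAM metadata dictionary for a file
# and inspect it; the path below is made up.
#
#   md = createMetadata("/pnfs/nova/somefile.reco.root")
#   if md is not None:
#       print(json.dumps(md, indent=2))
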
class metaDataMgr(object):

    def __init__(self,inFileName, metadata, release, systematic, skim, cosmicsPolarity, subversion, specialName):

        #make all lower case keys
        self.metadata = {k.lower():v for k,v in list(metadata.items())}
        self.inFileName=inFileName
        self.release = release
        self.setRunInfo()
        self.setDatasetTag()
        if not self.isSam4Users():
            self.setDetector()
            self.setDataFlag()
            if self.dataFlag == "data":
                self.setDataInfo()
            else:
                self.setSimInfo()
        self.setSkim(skim)
        self.cosmicsPolarity = cosmicsPolarity
        self.setSpecialInfo(subversion,systematic, specialName)

    def getMetadataParameter(self,parName):
        retVal=None
        if parName in self.metadata:
            retVal = self.metadata[parName]
        return retVal

    def setDatasetTag(self):
        mytag=self.getMetadataParameter("dataset.tag")
        if mytag!= None:
            self.mytag = mytag.lower()
        else:
            self.mytag = "none"

    def isSam4Users(self):
        return not isNone(self.mytag)

    def setDetector(self):
        detector=self.getMetadataParameter("online.detector")
        if detector != None:
            self.detector = detector.lower()
        else:
            detID = self.getMetadataParameter("nova.detectorid")
            if detID == None:
                print("Error!! No detector name found!!")
                exit(1)
            detID=detID.lower()
            if detID in list(detectors.keys()):
                self.detector=detectors[detID]
            else:
                print("No detector name found for detectorid:", detID)
                exit(1)


    def setDataFlag(self):

        self.fileType=self.getMetadataParameter("file_type")

        if self.fileType == None:
            print("No file_type set!")
            exit(1)

        elif self.fileType == "importedDetector":
            self.dataFlag = "data"

        elif self.fileType == "importedSimulated":
            self.dataFlag = "sim"

        elif self.fileType == "text":
            self.dataFlag = "text"

        else:
            print("Unrecognized file_type:", self.fileType)
            exit(1)


    def setRunInfo(self):

        #if new format "runs" metadata, use that, otherwise, get from file name
        if "runs" in self.metadata and len(self.metadata["runs"][0]) == 3:
            self.runNum = int(self.metadata["runs"][0][0])
            self.subNum = int(self.metadata["runs"][0][1])
        else:
            if "dataset.tag" in self.metadata and self.metadata["dataset.tag"] != "none":
                nameofThetag = self.getMetadataParameter("dataset.tag")

                TOTrunNum = subprocess.Popen( [ "samweb", "count-files", "dataset.tag " + str(nameofThetag) ], shell=False, stdout=subprocess.PIPE ).stdout.read()
                Tot = int(TOTrunNum)
                for i in range(0,Tot):
                    self.runNum = int(TOTrunNum)
                    self.subNum = int(i)

            else:
                inFileBase=os.path.basename(self.inFileName)
                run_match = run_pattern.match(inFileBase)
                if run_match != None:
                    self.runNum = int(run_match.groups()[0])
                    self.subNum = int(run_match.groups()[1])

                else:
                    print("No run number/subrun number found!")
                    exit(1)

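    # For reference, an assumed example of the "new format" runs metadata:
    #   metadata["runs"] = [[19747, 14, "physics"]]
    # from which setRunInfo() takes runNum = 19747 and subNum = 14; files without
    # such an entry fall back to parsing _r<run>_s<subrun> out of the file name.
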
    def setDataInfo(self):

        #get data_stream from metadata. If it is "all", try extracting from the file name.
        tmpStream = 'all'
        if "data_stream" in self.metadata:
            tmpStream = self.metadata["data_stream"]
        elif "online.stream" in self.metadata:
            tmpStream = self.metadata["online.stream"]

        if self.getMetadataParameter("nova.hornpolarity"):
            self.horn=self.getMetadataParameter("nova.hornpolarity")
        else:
            self.horn=""

        if tmpStream == 'all':
            inFileBase = os.path.basename(self.inFileName)
            stream_match = stream_pattern.match(inFileBase)
            if stream_match != None:
                tmpStream = stream_match.groups()[0]
                if tmpStream[0] == 't':
                    tmpStream = tmpStream[1:]

        try:
            self.stream = int(tmpStream)
            self.streamEntry = 't{0:02d}'.format(self.stream)
        except ValueError:
            self.stream = tmpStream
            self.streamEntry = self.stream
        except:
            print("No data stream found!")
            exit(1)

    def setSimInfo(self):

        self.generator = self.getMetadataParameter("simulated.generator")

        # Some early gibuu files accidentally have generator set to singlep
        # (fixed in r32723). If that's the case, fix it up here. In particular
        # because it impacts the correct setting of flavorset.
        if self.generator == 'singlep' and self.getMetadataParameter('simulated.gibuuused') == 'true':
            self.generator = 'gibuu'

        if self.getMetadataParameter("nova.hornpolarity"):
            self.horn=self.getMetadataParameter("nova.hornpolarity")
        else:
            self.horn=""

        if self.generator in neutrinoGenerators:

            self.flavorset=self.getMetadataParameter("simulated.genieflavorset")
            if self.flavorset=="swap": self.flavorset="fluxswap"

            self.genierw=self.getMetadataParameter("simulated.genierw")
            if self.genierw==None: self.genierw = "nogenierw"

            self.fluxVersion=self.getMetadataParameter("nova.flux_version")
            if self.fluxVersion == None:
                self.fluxVersion=os.environ["NOVA_FLUX_VERSION"].replace("nova_","")

            if self.generator == 'genie' and self.getMetadataParameter('simulated.genieused') == 'true':
                self.NPPFX=self.getMetadataParameter("simulated.NPPFX")
                self.genietune=self.getMetadataParameter("simulated.genietune")
                if self.genietune not in neutrinoGenieTunes:
                    # If no genietune metadata field is found, the default tune
                    # is assumed
                    if self.genietune==None:
                        self.genietune="default"
                    else:
                        print("ERROR!! Invalid GENIE tune: ", self.genietune)
                        exit(1)

        elif self.generator in cosmicGenerators:
            self.flavorset=self.getMetadataParameter("simulated.cryflavorset")

        self.nevt = self.getMetadataParameter("simulated.number_of_spills")
        self.cycle = self.getMetadataParameter("simulated.cycle")

        matches=sim_pattern.match(self.inFileName)
        matchGroups=matches.groups()
        if len(matchGroups)<1:
            print("ERROR!! Couldn't find timestamp in" , self.inFileName)
            exit(1)
        self.timestamp = matchGroups[0]

    def setSpecialInfo(self,subversion,systematic,specialName):

        self.systematic = "none"
        mdSystematic=self.getMetadataParameter("nova.systematic")
        if (not isNone(mdSystematic)) and (not isNone(systematic)):
            print("ERROR: you specified a systematic, but this is already a systematic file")
            exit(1)
        elif not isNone(mdSystematic):
            self.systematic = mdSystematic
        elif not isNone(systematic):
            self.systematic = systematic

        if subversion != None:
            self.subversion = subversion
        else:
            self.subversion = self.getMetadataParameter("nova.subversion")
            if self.subversion == None:
                self.subversion="1"

        mdSpecialName = self.getMetadataParameter("nova.special")
        self.special = appendField(mdSpecialName,specialName)

    def setSkim(self,skim):
        mdSkim = self.getMetadataParameter("nova.skim")
        self.skim = appendField(mdSkim,skim)

    def getOutputFileName(self,tier):
        outFileName=""
        if self.isSam4Users():
            outFileName = os.path.basename(self.inFileName.strip(tier+'.root'))
            outFileName += "_r%.8d_s%.2d" % (self.runNum, self.subNum)
        else:
            outFileName = self.detector

            if self.dataFlag == "data":
                # data
                # <det>_<run>_<subrun>_<horn>_<stream>_<tag>_<npass>_data_<syst>_<skim>_<spec>.<tier>.root

                print("SETTING FILE NAME DATA")
                outFileName += "_r%.8d_s%.2d" % (self.runNum, self.subNum)

                if not isNone(self.cosmicsPolarity):
                    outFileName += "_%s" % self.cosmicsPolarity

                if self.horn != "":
                    outFileName += "_%s" % self.horn


                outFileName += "_%s" % self.streamEntry
                outFileName += "_%s" % self.release
                outFileName += "_v%s" % self.subversion
                outFileName += "_data"

            elif self.dataFlag == "sim":
                # sim
                # <det>_<gen>_<tune>_<flavor>_<genierw>_<horn>_<flux>_<nevt>_<run>_<subrun>_<cycle>_<tag>_<npass>_<timestamp>_sim_<syst>_<skim>_<spec>.<tier>.root
                outFileName += "_%s" % self.generator
                if self.generator in neutrinoGenerators:
                    outFileName += "_%s" % self.genietune
                outFileName += "_%s" % self.flavorset
                if self.generator in neutrinoGenerators: #these parameters do not apply for cosmic generators
                    outFileName += "_%s" % self.genierw
                if not isNone(self.cosmicsPolarity):
                    outFileName += "_%s" % self.cosmicsPolarity
                if self.horn != "":
                    outFileName += "_%s" % self.horn
                if self.generator in neutrinoGenerators: #these parameters do not apply for cosmic generators
                    outFileName += "_%s_%s" % ( self.fluxVersion, self.nevt )

                if self.cycle == None: # cosmic MC files with cycle missing
                    outFileName += "_r%.8d_s%.2d" % (self.runNum, self.subNum)
                else:
                    outFileName += "_r%.8d_s%.2d_c%.3d" % (self.runNum, self.subNum, self.cycle)

                outFileName += "_%s" % self.release
                outFileName += "_v%s" % self.subversion
                outFileName += "_%s" % self.timestamp
                outFileName += "_sim"

            if not isNone(self.systematic):
                outFileName += "_%s" % self.systematic
            if not isNone(self.skim):
                outFileName += "_%s" % self.skim
            if not isNone(self.special):
                outFileName += "_%s" % self.special

        if tier == "artdaq" :
            if self.dataFlag == "sim":
                outFileName += ".daq.root"
            elif self.dataFlag == "data" :
                outFileName += ".%s.root" % tier
        elif tier == "h5caf":
            outFileName += ".h5caf"
        else :
            outFileName += ".%s.root" % tier

        return outFileName
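
# A purely illustrative example of the data-file naming above (made-up run, stream
# and release values): with detector "fardet", run 19747, subrun 14, stream "t02",
# release "R19-02-23", subversion "1" and no polarity/horn/systematic/skim/special,
# getOutputFileName("reco") would produce
#   fardet_r00019747_s14_t02_R19-02-23_v1_data.reco.root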