add_attributes.py
Go to the documentation of this file.
1 from __future__ import print_function
2 import h5py
3 import argparse
4 import numpy as np
5 
6 """
7 Turns out it is useful for hdf5 groups to be self-describing for the
8 PandAna framework. This is done by adding an HDF5 attribute to each group
9 that specifies the columns within that group that should be used as indices
10 when constructing a pandas DataFrame.
11 
12 HDF5Maker was updated to produce these attributes at the time of file creation.
13 For those files that were created before HDF5Maker was updated,
14 this script adds the attributes by hand.
15 
16 author: Derek.Doyle@colostate.edu
17 date: Feb 25, 2021
18 """
19 
20 # Here's the bit that makes this script NOvA-specific. Other
21 # experiments will have different indexing schemas
22 # All groups will be searched for datasets matching these names.
23 # When groups contain these datasets, we will add these to the index_cols
24 # attribute. Additional index columns will be determined by looking for
25 # datasets ending with '_idx'
required_indices = ['run', 'subrun', 'cycle', 'evt']
other_indices = ['subevt']


def determine_index_cols(h5group):
    """Return the names of the datasets in *h5group* that act as index columns.

    The result lists, in this order:
      1. entries of ``required_indices`` that are present in the group,
      2. entries of ``other_indices`` that are present in the group,
      3. every dataset whose name ends with ``'_idx'``, in the order the
         group yields its keys.

    *h5group* only needs to provide ``keys()``.
    """
    dataset_names = h5group.keys()
    named = [name
             for name in required_indices + other_indices
             if name in dataset_names]
    suffixed = [name for name in dataset_names if name.endswith('_idx')]
    return named + suffixed
40 
def encode_index_cols(index_cols):
    """Pack the column names into a one-element list holding a comma-separated string."""
    joined = ','.join(index_cols)
    return [joined]
43 
44 
45 # nova group naming scheme provides a full "path" to the data
46 # as arranged in standard record where nested objects
47 # are separated by a '.'
48 # The parentage can then be determined by removing
49 # name after the last '.'
def parent_table(group_name):
    """Return a one-element list naming the parent of *group_name*.

    The parent is everything before the last '.' in the name; a name
    without any '.' has the root, '/', as its parent.
    """
    if '.' in group_name:
        # rpartition splits on the LAST '.', leaving the parent path in head
        head, _, _ = group_name.rpartition('.')
        return [head]
    return ['/']
55 
56 
57 # function to write all attributes
def write_attributes(h5group, verbose=False, dry_run=True):
    """Compute the 'index_cols' and 'parent_table' attributes for *h5group*.

    h5group -- object providing ``name``, ``keys()`` and a mapping-like
               ``attrs`` (an h5py group when called from this script)
    verbose -- print the group name and, for each attribute, the value
               currently stored (or 'None' if absent) next to the new one
    dry_run -- when true (the default) nothing is written to ``attrs``
    """
    if verbose:
        print(h5group.name)

    # Everything below reads from the h5group argument; the function must
    # not depend on whatever name the caller's loop variable happens to have.
    new_values = (
        ('index_cols', encode_index_cols(determine_index_cols(h5group))),
        ('parent_table', parent_table(h5group.name)),
    )

    for attr_name, new_value in new_values:
        if verbose:
            # Only a missing attribute is expected here; any other error
            # propagates instead of being masked by the report line.
            try:
                existing = h5group.attrs[attr_name]
            except KeyError:
                existing = 'None'
            print('\t|{}: {} ---> {}'.format(attr_name, existing, new_value))

        if not dry_run:
            h5group.attrs[attr_name] = new_value
86 
87 
88 
89 # progress bar for funzies
def insert(astring, substr, pos, overwrite=True):
    """Return *astring* with *substr* placed at index *pos*.

    With overwrite=True the characters that *substr* covers are replaced;
    with overwrite=False the rest of *astring* is shifted right instead.
    """
    # where the untouched tail of astring resumes
    resume_at = pos + len(substr) if overwrite else pos
    return ''.join((astring[:pos], substr, astring[resume_at:]))
95 
class progbar:
    """Single-line terminal progress bar drawn with ANSI escape sequences.

    Calling an instance with a fraction prints the bar; the completed
    portion is rendered between ``invert_open`` and ``invert_close``.
    """
    # cursor to previous line, switch on reverse video, opening bracket
    invert_open = '\033[F\033[7m['
    # reset text attributes
    invert_close = '\033[0m'

    def __init__(self, message, nfields=50):
        """Print the header *message* and remember the bar width *nfields*."""
        self._nfields = nfields
        print(message, '....')

    def __call__(self, prog):
        """Print the bar for the fraction *prog* (1.0 means complete)."""
        width = self._nfields
        label = '{:.1f}%'.format(prog * 100)
        filled = int(prog * width)
        start = len(progbar.invert_open)

        line = ''.join([progbar.invert_open, ' ' * (width - 1), ']'])
        # percentage text overwrites the first cells of the bar
        line = insert(line, label, pos=start, overwrite=True)
        # the reset sequence is spliced in where the filled region ends
        line = insert(line, progbar.invert_close, pos=start + filled, overwrite=False)
        print(line)
        if filled == width:
            print()
112 
# Command-line driver: visit every top-level group (except 'MetaData') of
# each input file and hand it to write_attributes.
#
# The text is passed as description=; the first positional parameter of
# ArgumentParser is the program name shown in the usage line.
parser = argparse.ArgumentParser(description='Add index columns to the input h5 file(s)')
parser.add_argument('files', nargs='*', help='Input file(s). Wildcard allowed')
parser.add_argument('--dry_run', '-d', action='store_true', help='Don\'t actually add the attributes yet')
parser.add_argument('--verbose', '-v', action='store_true', help='Print out old and new attributes')

args = parser.parse_args()

files = args.files

# A dry run never writes, so open read-only; this also lets it run on
# files the user has no write permission for.
file_mode = 'r' if args.dry_run else 'r+'

# The progress bar is only shown when per-group output is not being printed.
if not args.verbose: prog = progbar('Adding attributes')
for i, fname in enumerate(files):
    with h5py.File(fname, file_mode) as h5file:
        for group_name in h5file.keys():
            if group_name == 'MetaData':
                continue
            g = h5file.get(group_name)

            write_attributes(g,
                             verbose=args.verbose,
                             dry_run=args.dry_run)

    # i + 1 files are finished at this point, so the bar reaches 100% after
    # the last one; float() keeps true division under Python 2 as well.
    if not args.verbose: prog(float(i + 1) / len(files))
134 
135 
136 
def parent_table(group_name)
def write_attributes(h5group, verbose=False, dry_run=True)
def determine_index_cols(h5group)
def encode_index_cols(index_cols)
def __call__(self, prog)
def __init__(self, message, nfields=50)
bool print
std::string format(const int32_t &value, const int &ndigits=8)
Definition: HexUtils.cpp:14
def insert(astring, substr, pos, overwrite=True)