bp_docdb.py
Go to the documentation of this file.
1 """ BLESSED PLOTS - DOC DB QUERYING INTERFACE
2 
3 This module includes functions which provide basic querying of
4 NOvA's DocDB document repository.
5 """
6 import HTMLParser
7 from datetime import datetime, timedelta
8 
9 # External module imports
10 from lib import requests #/// HTML CRUD operations
11 from lib import xmltodict #/// XML parsing
12 
13 # Blessed plot imports
14 import bp_config as config #/// User Configuration
15 
16 
class Document:
    """A class representing a single DocDB document

    All relevant metadata associated with a single DocDB document will be
    stored in a member of this class. The constructor performs no validation
    or conversion: every argument is stored unchanged under the attribute of
    the same name.

    Note:
        Storing both "first_author_id" and "first_author" (and likewise for
        the submitter) may seem redundant, but there is no way to query DocDB
        for the full author name given an ID, so we need to collect both
        pieces of information here.

    Attributes:
        id              : DocDB ID
        rev             : Revision number
        modtime         : Last modified datetime (in CT, presumably)
        authors         : Author record(s) attached to the revision
        first_author_id : DocDB ID of the first author
        first_author    : Full name of the first author
        submitter_id    : Submitter DocDB ID
        submitter       : Submitter full name
        modifygroup     : Name(s) of the modify group(s) of this doc
        categories      : List of categories applied to document
        files           : List of files attached to document
        title           : Title
        abstract        : Description
        is_blessed      : Flag supplied by the caller (see NOTE below)
        is_dep          : Flag supplied by the caller (see NOTE below)

    NOTE(review): "is_blessed" presumably marks a blessed-plot document and
    "is_dep" presumably means "deprecated" -- confirm with the code that
    builds Document instances.
    """
    # Constructor
    def __init__(self, id, rev, modtime, authors, first_author_id, first_author, submitter_id, submitter, modifygroup, categories, files, title, abstract, is_blessed, is_dep):
        # Identity of the document revision
        self.id = id
        self.rev = rev
        self.modtime = modtime

        # People and permissions
        self.authors = authors
        self.first_author_id = first_author_id
        self.first_author = first_author
        self.submitter_id = submitter_id
        self.submitter = submitter
        self.modifygroup = modifygroup

        # Content
        self.categories = categories
        self.files = files
        self.title = title
        self.abstract = abstract

        # Status flags
        self.is_blessed = is_blessed
        self.is_dep = is_dep

    def __repr__(self):
        """Return a short, debugging-friendly summary of the document."""
        return 'Document(id={!r}, rev={!r}, title={!r})'.format(self.id, self.rev, self.title)
59 
60 
# NOTE: Many of the functions contained within this class execute an HTTP
# request to DocDB. This process involves two-way communication and can
# be relatively slow. Minimizing the number of HTTP calls made is advised
# when using this class in a program.
class DocDB:
    """A class for interfacing with DocDB via HTTP

    A connection to DocDB is established over HTTP, and requests are made for
    data in XML format. The XML is converted to nested dicts/lists with
    xmltodict; an element that occurs once arrives as a single node while a
    repeated element arrives as a list, so every repeatable element is
    normalised through _as_list() before use.

    Attributes:
        uri (str): Base URI with which any desired DocDB query can be formed
    """
    # Topic IDs that set Document.is_blessed / Document.is_dep.
    # NOTE(review): "dep" presumably means "deprecated" -- confirm.
    _BLESSED_TOPIC_ID = 370
    _DEP_TOPIC_ID = 422

    # Constructor
    def __init__(self):
        # Class variables
        self.uri = config.DOCDB_URI

    @staticmethod
    def _as_list(node):
        """Return node unchanged if it is a list, otherwise wrap it in one."""
        return node if isinstance(node, list) else [node]

    @staticmethod
    def _parse_topic(topic):
        """Convert one 'topic' node into a category dict."""
        return {
            'id': int(topic['@id']),
            'name': topic['name'],
            'description': topic['description'],
            # Topics without a parent get an empty string, not None
            'p_id': int(topic['@parentid']) if '@parentid' in topic else '',
        }

    @staticmethod
    def _parse_file(item):
        """Convert one 'file' node into a file dict."""
        return {
            'href': item['@href'],
            'id': int(item['@id']),
            'name': item['name'],
        }

    @classmethod
    def _parse_categories(cls, revision):
        """Return (categories, is_blessed, is_dep) for a 'docrevision' node.

        categories is None (not an empty list) when the revision carries no
        'topic' element at all.
        """
        if 'topic' not in revision:
            return None, False, False

        categories = [cls._parse_topic(topic) for topic in cls._as_list(revision['topic'])]
        topic_ids = [category['id'] for category in categories]
        return categories, cls._BLESSED_TOPIC_ID in topic_ids, cls._DEP_TOPIC_ID in topic_ids

    @classmethod
    def _parse_files(cls, revision):
        """Return the list of file dicts for a 'docrevision' node.

        A revision without any 'file' element yields an empty list instead of
        raising KeyError.
        """
        attached = revision.get('file')
        if attached is None:
            return []
        return [cls._parse_file(item) for item in cls._as_list(attached)]

    @classmethod
    def _parse_search_results(cls, data):
        """Return the id/revision/modtime dicts found in a 'docdb' search node."""
        # xmltodict yields None for an element without children or text
        if not data or data.get('document') is None:
            return []

        return [
            {
                'id': int(doc['docrevision']['@docid']),
                'revision': int(doc['docrevision']['@version']),
                'modtime': doc['docrevision']['@modified'],
            }
            for doc in cls._as_list(data['document'])
        ]

    def GetDocument(self, id):
        """Retrieve metadata for a DocDB document given a DocDB ID.

        Args:
            id (int): DocDB ID

        Returns:
            Document: populated from the 'docrevision' element of the
            response. Only the first author and first submitter are used for
            the *_id / name fields; "categories" is None when the document has
            no topics and "files" is empty when it has no attachments.

        """

        # Make HTTP request
        response = requests.get(self.uri + 'ShowDocument?docid=' + str(id) + '&outformat=xml', auth=(config.DOCDB_USER, config.DOCDB_PWD))
        revision = xmltodict.parse(response.content)['docdb']['document']['docrevision']

        # HTML parser is used to convert hex characters in the XML response to Unicode
        parser = HTMLParser.HTMLParser()

        # Extract document metadata
        rev = int(revision['@version'])
        modtime = revision['@modified']
        authors = self._as_list(revision['author'])
        # Use only first author if there are multiple
        first_author_id = int(authors[0]['@id'])
        first_author = authors[0]['fullname']
        # Use only first submitter if there are multiple
        first_submitter = self._as_list(revision['submitter'])[0]
        submitter_id = int(first_submitter['@id'])
        submitter = first_submitter['fullname']
        modifygroup = [group['name'] for group in self._as_list(revision['modifygroup'])]
        title = parser.unescape(revision['title'])
        # An empty abstract element is parsed as None
        abstract = parser.unescape(revision['abstract']) if revision['abstract'] is not None else ''

        categories, is_blessed, is_dep = self._parse_categories(revision)
        files = self._parse_files(revision)

        return Document(id, rev, modtime, authors, first_author_id, first_author, submitter_id, submitter, modifygroup, categories, files, title, abstract, is_blessed, is_dep)

    def GetDocIDsByCategory(self, cat_id, full_lookback):
        """Retrieve list of DocDB IDs for a given DocDB category ID.

        Args:
            cat_id (int): DocDB category ID (subtopics are included)
            full_lookback (bool): If false, an "after" date of one day before
                the current local time is added to the search; if true no
                date restriction is applied.

        Returns:
            List of dicts with keys 'id' (int), 'revision' (int) and
            'modtime'; empty when the search returns no documents.

        """
        # Make HTTP request
        request_url = self.uri + 'Search?topics=' + str(cat_id) + '&includesubtopics=1&outformat=xml'
        if not full_lookback:
            request_url += datetime.strftime(datetime.now() - timedelta(1), '&afterday=%d&aftermonth=%b&afteryear=%Y')
        response = requests.get(request_url, auth=(config.DOCDB_USER, config.DOCDB_PWD))
        data = xmltodict.parse(response.content)['docdb']

        # Extract document IDs for this category
        return self._parse_search_results(data)

    def DownloadFile(self, doc_id, filename, dest):
        """Download a given file from a given DocDB document.

        The response body is written to "<dest>/<filename>" as-is; the HTTP
        status is not checked.

        Args:
            doc_id (int): DocDB document ID
            filename: Full filename to download
            dest: Destination directory


        Returns:
            NONE

        """
        r = requests.get(self.uri + 'RetrieveFile?docid={}&filename={}'.format(doc_id, filename), auth=(config.DOCDB_USER, config.DOCDB_PWD))
        with open(dest + '/{}'.format(filename), "wb") as d:
            d.write(r.content)

    def DownloadArchive(self, doc_id, filename, dest):
        """Download archive of files from a given DocDB document.

        The archive is requested as tar.gz and the response body is written
        to "<dest>/<filename>" as-is; the HTTP status is not checked.

        Args:
            doc_id (int): DocDB document ID
            filename: Name of the local file the archive is saved as
            dest: Destination directory


        Returns:
            NONE

        """
        r = requests.get(self.uri + 'RetrieveArchive?docid={}&type=tar.gz'.format(doc_id), auth=(config.DOCDB_USER, config.DOCDB_PWD))
        destination = '{}/{}'.format(dest, filename)
        with open(destination, "wb") as d:
            d.write(r.content)
::xsd::cxx::tree::type type
Definition: Database.h:110
def DownloadFile(self, doc_id, filename, dest)
Definition: bp_docdb.py:212
def GetDocument(self, id)
Definition: bp_docdb.py:80
def DownloadArchive(self, doc_id, filename, dest)
Definition: bp_docdb.py:230
def __init__(self)
Definition: bp_docdb.py:75
std::string format(const int32_t &value, const int &ndigits=8)
Definition: HexUtils.cpp:14
procfile open("FD_BRL_v0.txt")
def GetDocIDsByCategory(self, cat_id, full_lookback)
Definition: bp_docdb.py:174
def __init__(self, id, rev, modtime, authors, first_author_id, first_author, submitter_id, submitter, modifygroup, categories, files, title, abstract, is_blessed, is_dep)
Definition: bp_docdb.py:43