bp.py
Go to the documentation of this file.
1 import config #/// User Configuration
2 import helpers #/// Custom helper functions
3 import log #/// Basic console logging
4 
5 import json #/// write/read JSON
6 import os, subprocess, shutil #/// Interact with filesystem/os
7 import sys #/// System commands
8 import re #/// RegEx capabilities
9 from libs import requests #/// HTTP CRUD operations
10 import tarfile #/// Extract tarballs
11 import time #/// Used for sleep functions
12 import uuid #/// Unique uid generator
13 import zipfile #/// Extract zip files
14 import collections
15 
16 
17 #///////////////////////////////////////////////////////////////////////////////
18 # Class Definitions
class DocDBEntry(object):
    """Plain record holding the metadata collected for one DocDB document.

    Derives from ``object`` so that it is a new-style class under Python 2.
    Every instance gets its own fresh list objects, so entries never share
    state with each other.
    """

    def __init__(self):
        self.id = 0         # document number
        self.revision = 1   # document version number
        self.title = ""     # document title
        self.modified = ""  # modification stamp, stored as text
        self.url = ""       # link to the document
        self.files = []     # per-file records (filled in later)
        self.authors = []   # author records
        self.topics = []    # topics attached to the document
29 
30 
31 #///////////////////////////////////////////////////////////////////////////////
32 # Basic Initialization
def Init():
    """Show the splash banner and validate the on-disk configuration.

    Side effects:
      * Sets ``config.REGENERATE = True`` when the JSON index
        (``config.WEB_PATH + config.JSON_FILENAME``) does not exist.
      * Terminates the process with exit status 200 when the file named by
        ``config.BLESSED_PLOTS`` does not exist.

    Returns:
        0 when initialisation completed.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    banner = (
        "",
        "/////////////////////////////////////////////////////////////",
        "// BLESSED PLOTS BACKEND //",
        "/////////////////////////////////////////////////////////////",
        "",
    )
    for line in banner:
        print(line)

    time.sleep(1)

    # Without a previous JSON index there is nothing to diff against, so
    # fall back to regenerating everything.
    jsonPath = config.WEB_PATH + config.JSON_FILENAME
    if not os.path.isfile(jsonPath):
        log.error('File not found: ' + jsonPath + '. Let\'s start fresh.')
        config.REGENERATE = True
        time.sleep(1)

    # The blessed plots configuration is mandatory.
    if not os.path.isfile(config.BLESSED_PLOTS):
        log.error('No blessed plots configuration found. I don\'t know which plots go where. Aborting...')
        sys.exit(200)

    return 0
54 
55 
56 #///////////////////////////////////////////////////////////////////////////////
57 # Poll DocDB for Document Information
def GetDocumentInfoFromDocDB():
    """Collect metadata for every blessed-plots document known to DocDB.

    The document ids come from ``helpers.BlessedPlotsList()`` and are then
    passed, together with the DocDB topic search result, to
    ``helpers.CombineDocLists``. Each id is then fetched individually.

    Returns:
        A list with one dict per document found (the ``__dict__`` of a
        ``DocDBEntry``). Ids that DocDB does not return are logged and skipped.
    """
    log.info("Grabbing plots list from DocDB")

    # 370 = "BlessedPlots:Official", 416 = "BlessedPlots:Proposed"
    # You can find these numbers by looking at the URL after clicking on any
    # entry in Topics in docdb
    topicId = '416' if config.PROPOSED else '370'

    _xml = helpers.CallDocDB('Search', 'topics', topicId)['docdb']['document']
    _plotlistByTopic = helpers.BlessedPlotsList()

    # Flatten the per-topic lists into one list of unique ids, keeping the
    # order of first appearance.
    _docIds = []
    for topic in _plotlistByTopic:
        for doc in topic['docs']:
            if doc not in _docIds:
                _docIds.append(doc)

    # DocDB does a silly thing if there's only one document returned, which is
    # to give it to you straight, instead of as a one-entry list. Fix that up.
    if '@href' in _xml:
        _xml = [_xml]

    # Check to see if there are any docs in DocDB that are not listed in file,
    # add them as uncategorized.
    helpers.CombineDocLists(_xml, _docIds)

    log.success('Found %i officially blessed plots on DocDB' % len(_docIds))

    time.sleep(1)

    _entries = []
    total = len(_docIds)

    for idx, doc in enumerate(_docIds, 1):
        # Get document from DocDB and check that it actually exists
        res = helpers.GetDoc(doc)['docdb']
        if 'document' not in res:
            log.error('[%i/%i] ERROR: doc %i not found in DocDB' % (idx, total, doc))
            continue

        item = res['document']
        revision = item['docrevision']

        # Construct the entry
        entry = DocDBEntry()
        entry.id = int(item['@id'])
        entry.revision = int(revision['@version'])
        entry.title = revision['title']
        entry.modified = revision['@modified']
        entry.url = revision['@href']

        log.info('[%i/%i] Fetching metadata for doc %i v%i -- %s' % (idx, total, entry.id, entry.revision, entry.title))

        # A single author arrives as a bare record rather than a one-entry
        # list; normalise so both shapes share one code path.
        authors = revision['author']
        if not isinstance(authors, list):
            authors = [authors]
        for author in authors:
            entry.authors.append(
                {
                    'firstname' : author['firstname'],
                    'lastname'  : author['lastname'],
                    'id'        : int(author['@id'])
                }
            )

        # NOTE(review): the original fetched ShowDocument here and only copied
        # '@docid' / '@version' into locals that were never read. The request
        # is kept in case helpers.CallDocDB has side effects -- confirm and
        # remove if it does not.
        helpers.CallDocDB('ShowDocument', 'docid', item['@id'])

        # Get topics for this doc
        entry.topics = helpers.GetTopicsByDocID(doc)

        _entries.append(entry.__dict__)

    return _entries
144 
145 
146 #///////////////////////////////////////////////////////////////////////////////
147 # Get Document Information from JSON File
def GetDocumentInfoFromDisk():
    """Load the previously written document list from the JSON index.

    Returns:
        An empty list when ``config.REGENERATE`` is set; otherwise the
        parsed contents of ``config.WEB_PATH + config.JSON_FILENAME``.
    """
    if config.REGENERATE:
        return []

    # The context manager closes the handle even if parsing fails.
    with open(config.WEB_PATH + config.JSON_FILENAME) as jsonFile:
        return json.load(jsonFile)
153 
154 
155 #///////////////////////////////////////////////////////////////////////////////
156 # Return differences between current list of documents and previous
def FindChanges(new, old):
    """Work out which documents must be (re)processed.

    Args:
        new: current document dicts, each with 'id' and 'revision'.
        old: previous document dicts with the same keys. Not read when
            ``config.REGENERATE`` is set.

    Returns:
        The set of ids that were added or whose revision changed. Deleted
        ids are counted in the debug log line but are not returned.
    """
    # Reduce the current documents to id -> revision pairs
    latest = {doc['id'] : doc['revision'] for doc in new}

    if config.REGENERATE:
        # Starting from scratch: everything on DocDB counts as an addition
        added, modified, deleted = list(latest), [], []
    else:
        previous = {entry['id'] : entry['revision'] for entry in old}

        # Present now but not before
        added = [docID for docID in latest if docID not in previous]

        # Present in both with a different revision: needs reprocessing
        modified = [docID for docID in latest
                    if docID in previous and latest[docID] != previous[docID]]

        # Present before but missing now
        deleted = [docID for docID in previous if docID not in latest]

    log.debug('Detected %i additions, %i modifications, and %i deletions' % (len(added), len(modified), len(deleted)))

    return set(added) | set(modified)
201 
202 #///////////////////////////////////////////////////////////////////////////////
203 # Fetch all the documents for a list of docdb IDs to local disk
def _ExtractArchive(archivePath, destDir):
    """Unpack the tar or zip archive at archivePath into destDir."""
    # NOTE(review): these archives are downloaded content and extractall()
    # does not stop member names from escaping destDir -- consider
    # validating member paths before extraction.
    if archivePath.endswith('.zip'):
        with zipfile.ZipFile(archivePath, 'r') as archive:
            archive.extractall(destDir)
    else:
        with tarfile.open(archivePath, 'r') as archive:
            archive.extractall(destDir)


def DownloadFiles(docs):
    """Download and unpack the DocDB archive of every document in docs.

    A uniquely named scratch directory is created under
    ``config.WEB_PATH + config.PLOT_SUBDIR``, with one sub-directory per
    document id. Tarballs and zip files found inside a document's archive
    are unpacked into that document's sub-directory as well.

    Args:
        docs: sized iterable of integer DocDB ids.

    Returns:
        The scratch directory path, ending in '/'.
    """
    tempDir = config.WEB_PATH + config.PLOT_SUBDIR + str(uuid.uuid4()) + '/'
    os.mkdir(tempDir, 0o755)  # 0o755 is valid on Python 2.6+ and 3

    nestedArchives = ('.tar.gz', '.tar', '.tgz', '.zip')
    total = len(docs)

    for idx, doc in enumerate(docs, 1):
        log.info('[%i/%i] Downloading files from docdb %i' % (idx, total, doc))

        tempDocDir = tempDir + str(doc) + '/'
        os.mkdir(tempDocDir, 0o755)

        archivePath = tempDocDir + 'doc_archive.tar.gz'
        helpers.Download(config.DOCDB_URL + ('RetrieveArchive?docid=%d&type=tar.gz' % doc),
                         archivePath)

        # Extracting with an explicit destination avoids changing the
        # process-wide working directory.
        _ExtractArchive(archivePath, tempDocDir)

        for root, directories, filenames in os.walk(tempDocDir):
            for filename in filenames:
                if filename == 'doc_archive.tar.gz':
                    continue

                if filename.endswith(nestedArchives):
                    log.debug('Extracting '+filename)
                    # Nested archives unpack at the top of the document
                    # directory, not next to the archive itself.
                    _ExtractArchive(os.path.join(root, filename), tempDocDir)

    return tempDir
255 
256 
257 #///////////////////////////////////////////////////////////////////////////////
258 # Look through all the downloaded files for captions and images, update documents_curr
def FindFiles(tempdir, documents_curr):
    """Pair downloaded captions with their images and store them on the documents.

    Every non-hidden ``<base>.txt`` below ``tempdir/<docid>/`` becomes a
    record ``{'base', 'caption', 'exts'}``. Every non-hidden file whose name
    ends with one of ``config.EXTS`` and whose stem matches a caption adds
    its extension to that record's 'exts'.

    Args:
        tempdir: scratch directory, expected to end in '/', containing one
            sub-directory per document id.
        documents_curr: list of document dicts. Each one has its 'files' key
            replaced in place.
    """
    files = collections.defaultdict(dict) # dict from id to dict from base to record

    # Walk the tree once and remember (docid, path below docid dir, filename,
    # full path) for both passes below.
    found = []
    for root, directories, filenames in os.walk(tempdir):
        for filename in filenames:
            fullPath = os.path.join(root, filename)
            docDir, sep, rest = fullPath[len(tempdir):].partition('/')
            if not sep or not docDir.isdigit():
                # Not inside a per-document directory
                log.debug('Ignoring file outside a document directory: '+fullPath)
                continue
            found.append((int(docDir), rest, filename, fullPath))

    # First pass: captions
    for docid, rest, filename, fullPath in found:
        if filename.endswith('.txt') and not filename.startswith('.'):
            with open(fullPath, 'rb') as capFile:
                # Non-ASCII bytes are dropped, as unicode(..., errors='ignore')
                # does with the usual ASCII default encoding.
                cap = capFile.read().decode('ascii', 'ignore')
            base = rest[:-4]
            files[docid][base] = {'base': base, 'caption': cap, 'exts': []}

    # Second pass: image files matching a caption
    for docid, rest, filename, fullPath in found:
        if filename.endswith('.txt') or filename.startswith('.'):
            continue
        if not any(filename.endswith(e) for e in config.EXTS):
            continue

        dot = rest.rfind('.')
        stem = rest[:dot]
        ext = rest[dot+1:]
        record = files.get(docid, {}).get(stem)
        if record is None:
            log.debug('File with no .txt caption: '+rest)
        else:
            record['exts'].append(ext)

    # Save this information into the actual documents
    for doc in documents_curr:
        doc['files'] = list(files.get(doc['id'], {}).values())
299 
300 
301 #///////////////////////////////////////////////////////////////////////////////
302 # Make thumbnails from downloaded images
def _EnsureDir(path):
    """Create directory path (and parents) unless it already exists."""
    try:
        os.makedirs(path)
    except OSError:
        # Only "already there" is acceptable; anything else is a real error.
        if not os.path.isdir(path):
            raise


def _MakeThumbnail(source, thumb, isPdf):
    """Run ImageMagick ``convert`` to write a 400px-wide thumbnail; return True on success."""
    if isPdf:
        attempts = [
            ['-define', 'pdf:use-cropbox=true', '-transparent-color', 'white'],
            ['-transparent-color', 'white'],  # sometimes the cropbox is trouble
        ]
    else:
        attempts = [[]]

    for options in attempts:
        # Argument list instead of a shell string: file names come from
        # downloaded archives and may contain spaces or shell metacharacters.
        try:
            if subprocess.call(['convert'] + options + [source, '-resize', '400', thumb]) == 0:
                return True
        except OSError:
            # 'convert' could not be started at all
            return False
    return False


def ProcessImages(documents_to_process, documents, tempDir):
    """Create thumbnails for the given documents and publish the scratch directory.

    For each file record of each document, the first available format from a
    fixed preference list is converted into
    ``<tempDir><docid>//thumbs/<base>_thumb.png``. Afterwards the contents of
    ``tempDir`` are copied to ``config.WEB_PATH + config.PLOT_SUBDIR`` and
    ``tempDir`` is removed.

    Args:
        documents_to_process: sized iterable of document ids to handle. When
            empty, the function returns immediately and leaves ``tempDir``
            untouched.
        documents: list of document dicts with 'id' and 'files'.
        tempDir: scratch directory ending in '/'.

    Returns:
        0.
    """
    if not documents_to_process:
        return 0

    # Favoured versions to make a thumbnail version from
    srcs = ['png', 'jpg', 'jpeg', 'eps', 'pdf', 'ps']
    total = len(documents_to_process)

    for idx, docID in enumerate(documents_to_process, 1):
        log.info('[%i/%i] Processing images for docdb %i' % (idx, total, docID))

        document = next((doc for doc in documents if doc['id'] == docID), None)
        if document is None:
            log.error('No metadata found for docdb %i, skipping' % docID)
            continue

        tempDocDir = tempDir + str(document['id']) + '/'
        thumDir = tempDocDir + '/thumbs/'
        _EnsureDir(thumDir)

        for aFile in document['files']:
            base = aFile['base']

            # Only the first available format is attempted
            src = next((s for s in srcs if s in aFile['exts']), None)
            if src is None:
                continue

            thumb = thumDir + base + '_thumb.png'

            # In case of tarballs etc there can be subdirs required
            # in the thumbs directory.
            _EnsureDir(os.path.dirname(thumb))

            if _MakeThumbnail(tempDocDir + base + '.' + src, thumb, src == 'pdf'):
                log.success('Created thumbnail from '+base+'.'+src+': '+thumb)
            else:
                log.error('Could not create thumbnail from '+base+'.'+src)

    # Equivalent of "cp -rpf <tempDir>* <dest>" without a shell; like the
    # glob, hidden entries are not copied.
    sources = [tempDir + name for name in sorted(os.listdir(tempDir))
               if not name.startswith('.')]
    if sources:
        subprocess.call(['cp', '-rpf'] + sources + [config.WEB_PATH + config.PLOT_SUBDIR])
    shutil.rmtree(tempDir)

    return 0
356 
357 
358 #///////////////////////////////////////////////////////////////////////////////
359 # Write JSON file to disk
def WriteJSON(documents):
    """Publish the blessed plots configuration and the JSON document index.

    Copies ``config.BLESSED_PLOTS`` to ``config.WEB_PATH`` and writes
    ``documents`` as indented, key-sorted JSON to
    ``config.WEB_PATH + config.JSON_FILENAME``.

    Args:
        documents: JSON-serialisable list of document dicts.
    """
    # copy2 keeps mode and timestamps, like "cp -p", without a shell.
    try:
        shutil.copy2(config.BLESSED_PLOTS, config.WEB_PATH)
    except (IOError, OSError) as err:
        # The original ignored copy failures; still write the JSON, but
        # do not claim success.
        log.error('Could not copy ' + config.BLESSED_PLOTS + ' to ' + config.WEB_PATH + ': ' + str(err))
    else:
        log.success('Copied ' + config.BLESSED_PLOTS + ' to ' + config.WEB_PATH)

    jsonSerialized = json.dumps(documents, sort_keys = True, indent = 2)

    # The context manager closes (and flushes) the file even if the write fails.
    with open(config.WEB_PATH + config.JSON_FILENAME, 'w') as jsonFile:
        jsonFile.write(jsonSerialized)

    log.success('Wrote ' + config.WEB_PATH + config.JSON_FILENAME)
def FindChanges(new, old)
Definition: bp.py:157
def FindFiles(tempdir, documents_curr)
Definition: bp.py:259
def WriteJSON(documents)
Definition: bp.py:360
def CombineDocLists(_xml, _docIds)
Definition: helpers.py:54
def GetTopicsByDocID(docID)
Definition: helpers.py:40
def error(message)
Definition: log.py:7
modified
Definition: bp.py:24
def GetDoc(docID)
Definition: helpers.py:29
::xsd::cxx::tree::type type
Definition: Database.h:110
def DownloadFiles(docs)
Definition: bp.py:204
revision
Definition: bp.py:22
def debug(message)
Definition: log.py:3
def CallDocDB(function, parameter, value)
Definition: helpers.py:17
def success(message)
Definition: log.py:5
def info(message)
Definition: log.py:1
def BlessedPlotsList()
Definition: helpers.py:11
procfile open("FD_BRL_v0.txt")
def ProcessImages(documents_to_process, documents, tempDir)
Definition: bp.py:303
def Download(url, destination)
Definition: helpers.py:65
def GetDocumentInfoFromDisk()
Definition: bp.py:148
TFile * file
Definition: cellShifts.C:17
def Init()
Definition: bp.py:33
def __init__(self)
Definition: bp.py:20
def GetDocumentInfoFromDocDB()
Definition: bp.py:58
void next()
Definition: show_event.C:84