site_stats_from_log.py
Go to the documentation of this file.
#!/usr/bin/env python
2 
3 import argparse
4 import datetime
5 import re
6 import sys
7 
class Status:
    """Enumeration of Condor job lifecycle states tracked by this script.

    Values are plain ints in lifecycle order; UNKNOWN is the default for
    jobs whose state has not yet been observed in the log.
    """
    UNKNOWN, IDLE, RUNNING, HELD, FINISHED = range(5)
14 
class JobInfo(object):
    """Mutable record of everything learned about one Condor job from the log.

    Attributes mirror the constructor arguments and are filled in
    incrementally as log events for the job are parsed.
    """

    def __init__(self,
                 jobid,
                 site=None,
                 start_time=None,
                 status=Status.UNKNOWN,
                 status_time=None,
                 exit_code=None,
                 hold_reason=None):
        # jobid: Condor "cluster.proc" identifier string.
        self.jobid = jobid
        # Timestamp of the most recent "executing" event, if any.
        self.start_time = start_time
        # GLIDEIN site name, once a JOB_Site line has been seen.
        self.site = site
        # One of the Status.* constants.
        self.status = status
        # Timestamp of the most recent status transition.
        self.status_time = status_time
        # Integer return value for finished jobs, if seen.
        self.exit_code = exit_code
        # Matched hold-reason text for held jobs, if seen.
        self.hold_reason = hold_reason

    def __repr__(self):
        # Debug-friendly dump of all attributes.
        return str(self.__dict__)
31 
class SiteStats(object):
    """Aggregated per-site job counters rolled up from JobInfo records.

    NOTE(review): the ``class SiteStats(object):`` header line appears to
    have been lost from this copy of the file; it is restored here. The
    name is established by the ``SiteStats(name=job.site)`` call sites
    later in this script.
    """

    def __init__(self, name, success=0, held=0, failed=0, ongoing=0, disconnects=0):
        # Site name as reported in the log's JOB_Site attribute.
        self.name = name
        # Scalar counters, one per outcome category.
        self.success = success
        self.held = held
        self.failed = failed
        self.ongoing = ongoing
        self.disconnects = disconnects

        # Histograms: occurrence counts keyed by exit code / reason text.
        self.exit_codes = {}
        self.hold_reasons = {}
        self.disconnect_reasons = {}  # currently uninstrumented, though somebody could fill it if desired

    def __repr__(self):
        # Debug-friendly dump of all attributes.
        return str(self.__dict__)
47 
48 # class JobTable(object):
49 # def __init__(self):
50 # self._jobs = {}
51 #
52 # def __getitem__(self, key):
53 
# Patterns for the interesting lines of a Condor job event log.
# All raw strings: "\(", "\d", "\D" are regex syntax, not string escapes
# (the originals for the last three were not raw, which is a
# SyntaxWarning/DeprecationWarning on modern Python).
job_event_pattern = re.compile(r"\((?P<jobid>\d+\.\d+)\.000\).*(?P<when>\d{2}/\d{2} \d{2}:\d{2}:\d{2}).*Job (?P<action>.*)$")
job_site_pattern = re.compile(r"JOB_Site = \"(?P<job_site>[^\"]+)\"")
exit_code_pattern = re.compile(r"\(return value (?P<exit_code>\d+)\)")
hold_reason_pattern = re.compile(r"HoldReason = \".*(?P<reason>SYSTEM_PERIODIC_HOLD\D*|Docker job.*|SHADOW\D*)")
disconnected_pattern = re.compile(r"Reason = \"Job disconnected")
59 
# Command-line interface: one or more Condor log files, plus an optional
# regular expression selecting sites for a detailed breakdown.
parser = argparse.ArgumentParser(
    prog=sys.argv[0],
    description="Trawl Condor .log file and learn about site behavior",
)
parser.add_argument(
    "files",
    metavar="FILE",
    nargs="+",
    help="Examine these files",
)
parser.add_argument(
    "--site-details",
    default="",
    help="Show detailed output (exit codes and hold reasons) for site names matching this regular expression. (Use the expression '.*' to show details for each site independently.)",
)

args = parser.parse_args()

filenames = args.files
site_details_pattern = re.compile(args.site_details)

# jobid -> JobInfo, accumulated across every input log file.
job_table = {}
# site name -> number of "Job disconnected" events seen.
disconnects_by_site = {}
for logfile_name in filenames:
    with open(logfile_name) as logfile:
        # Condor closes each entry about a job with a line containing "...".
        # Track which job the current entry refers to; None between entries.
        job = None
        for line in logfile:
            # BUG FIX: lines yielded by file iteration keep their trailing
            # newline, so the old comparison (line == "...") never matched
            # and 'job' was never reset between entries.
            if line.rstrip("\n") == "...":
                job = None
                continue

            matches = job_event_pattern.search(line)
            if matches:
                jobid = matches.group("jobid")
                if jobid not in job_table:
                    job_table[jobid] = JobInfo(jobid=jobid)
                job = job_table[jobid]

                action = matches.group("action")

                # The log omits the year; assume the current one.
                when = datetime.datetime.strptime(matches.group("when"), "%m/%d %H:%M:%S")
                when = when.replace(year=datetime.datetime.now().year)

                if "submitted" in action:
                    job.status = Status.IDLE
                elif "executing" in action:
                    job.start_time = when
                    job.status = Status.RUNNING
                    job.status_time = when
                elif "held" in action:
                    job.status = Status.HELD
                    job.status_time = when
                elif "terminated" in action:
                    job.status = Status.FINISHED

                continue

            # BUG FIX: detail lines can only be attributed to a job once an
            # event line has been seen; previously a detail line arriving
            # first would dereference job=None and crash.
            if job is None:
                continue

            matches = job_site_pattern.search(line)
            if matches:
                # once they go 'held' they have no site any more...
                if "$$(GLIDEIN_Site:Unknown)" not in matches.group("job_site"):
                    job.site = matches.group("job_site")
                continue

            matches = hold_reason_pattern.search(line)
            if matches:
                job.hold_reason = matches.group("reason")
                continue

            matches = exit_code_pattern.search(line)
            if matches:
                job.exit_code = int(matches.group("exit_code"))
                continue

            # disconnects are special.
            # the job gets restarted with the same job ID,
            # we just have to count disconnects separately
            matches = disconnected_pattern.search(line)
            if matches:
                if job.site:
                    disconnects_by_site.setdefault(job.site, 0)
                    disconnects_by_site[job.site] += 1
141 
# Roll the per-job records up into per-site statistics.
site_stats = {}
for job in job_table.values():
    if job.site not in site_stats:
        site_stats[job.site] = SiteStats(name=job.site)

    # Idle jobs have not run anywhere yet; nothing to count.
    if job.status == Status.IDLE:
        continue

    site = site_stats[job.site]
    if job.status == Status.RUNNING:
        site.ongoing += 1
    elif job.status == Status.HELD:
        site.held += 1
        if job.hold_reason:
            site.hold_reasons.setdefault(job.hold_reason, 0)
            site.hold_reasons[job.hold_reason] += 1
    elif job.status == Status.FINISHED:
        site.exit_codes.setdefault(job.exit_code, 0)
        site.exit_codes[job.exit_code] += 1
        # A missing exit code (None) counts as success, matching the old
        # "> 0 means failed" logic without comparing None against an int.
        if job.exit_code:
            site.failed += 1
        else:
            site.success += 1

for site_name, disconnects in disconnects_by_site.items():
    # BUG FIX: the setdefault previously keyed on the stale loop variable
    # 'site' (a SiteStats object) instead of 'site_name', so disconnect
    # counts were filed under the wrong key.
    site = site_stats.setdefault(site_name, SiteStats(site_name))
    site.disconnects = disconnects
169 
sites_sorted = sorted(site_stats.keys())

# Fixed-width summary table, one row per site, sorted by site name.
# print() calls instead of Python-2-only print statements (works on both).
print("Site Ongoing Held Failed Disconnected Success")
for site_name in sites_sorted:
    site = site_stats[site_name]
    print("%-15s%7d %4d %6d %12d %7d" % (site.name, site.ongoing, site.held, site.failed, site.disconnects, site.success))
176 
# Fold every site's histograms into run-wide totals
# (dict.items() instead of Python-2-only iteritems()).
sum_exit_codes = {}
sum_hold_reasons = {}
for stats in site_stats.values():
    for summed, collection in [(sum_exit_codes, stats.exit_codes), (sum_hold_reasons, stats.hold_reasons)]:
        for key, val in collection.items():
            summed.setdefault(key, 0)
            summed[key] += val
184 
# Run-wide hold-reason and exit-code tables, most frequent first.
# print("") emits the same blank line as the old bare 'print' statement.
if len(sum_hold_reasons):
    print("")
    print("Held jobs:")
    print(" Count Hold reason")
    for reason in sorted(sum_hold_reasons, key=sum_hold_reasons.get, reverse=True):
        print(" %-7d%s" % (sum_hold_reasons[reason], reason))

if len(sum_exit_codes):
    print("")
    print("Exit codes:")
    print(" Count Code")
    for code in sorted(sum_exit_codes, key=sum_exit_codes.get, reverse=True):
        print(" %-7d%s" % (sum_exit_codes[code], code))
198 
199 
# Optional per-site detail section, gated on --site-details.
if args.site_details:
    print("")
    print("Details for site names matching pattern '%s':" % args.site_details)
    for site_name in sites_sorted:
        # match() anchors at the start of the site name.
        if not site_details_pattern.match(site_name):
            continue
        site = site_stats[site_name]
        print("")
        print(site_name)
        if len(site.hold_reasons) > 0:
            print(" Count Hold reason")
            print(" ------------------")
            for reason, count in site.hold_reasons.items():
                print(" %-7d%s" % (count, reason))

        if len(site.exit_codes) > 0:
            print("")
            print(" Count Exit code")
            print(" ------------------")
            for code, count in site.exit_codes.items():
                print(" %-7d%s" % (count, code))
221 
222 
223 
def __init__(self, jobid, site=None, start_time=None, status=Status.UNKNOWN, status_time=None, exit_code=None, hold_reason=None)
Definition: novas.h:112
procfile open("FD_BRL_v0.txt")
def __init__(self, name, success=0, held=0, failed=0, ongoing=0, disconnects=0)
char name[SIZE_OF_OBJ_NAME]
Definition: novas.h:116