site_stats_from_log.py
#!/usr/bin/env python
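"""Summarize per-site job outcomes from HTCondor job event (.log) files.

Each job found in the logs is classified by its last known status, and a
per-site table of ongoing, held, failed, disconnected, and successful jobs
is printed, followed by overall hold-reason and exit-code counts. Use
--site-details to also print per-site exit codes and hold reasons.

Example:
    python site_stats_from_log.py FILE [FILE ...] --site-details '.*'
"""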

from __future__ import print_function
from builtins import str
from builtins import object
import argparse
import datetime
import re
import sys

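# Job lifecycle states inferred from the event log.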
class Status(object):
    UNKNOWN = 0
    IDLE = 1
    RUNNING = 2
    HELD = 3
    FINISHED = 4

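# Per-job record accumulated while scanning the log.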
class JobInfo(object):
    def __init__(self,
                 jobid,
                 site=None,
                 start_time=None,
                 status=Status.UNKNOWN,
                 status_time=None,
                 exit_code=None,
                 hold_reason=None):
        self.jobid = jobid
        self.start_time = start_time
        self.site = site
        self.status = status
        self.status_time = status_time
        self.exit_code = exit_code
        self.hold_reason = hold_reason

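# Aggregate counters for one site, plus histograms of exit codes and hold reasons.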
class SiteStats(object):
    def __init__(self, name, success=0, held=0, failed=0, ongoing=0, disconnects=0):
        self.name = name
        self.success = success
        self.held = held
        self.failed = failed
        self.ongoing = ongoing
        self.disconnects = disconnects

        self.exit_codes = {}
        self.hold_reasons = {}
        self.disconnect_reasons = {}  # currently uninstrumented, though somebody could fill it if desired

    def __repr__(self):
        return str(self.__dict__)

# class JobTable(object):
#     def __init__(self):
#         self._jobs = {}
#
#     def __getitem__(self, key):

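# Patterns matched against each log line: the event header (job id, timestamp,
# action), the recorded JOB_Site, the terminate return value, known hold
# reasons, and disconnect notices.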
job_event_pattern = re.compile(r"\((?P<jobid>\d+\.\d+)\.000\).*(?P<when>\d{2}/\d{2} \d{2}:\d{2}:\d{2}).*Job (?P<action>.*)$")
job_site_pattern = re.compile(r"JOB_Site = \"(?P<job_site>[^\"]+)\"")
exit_code_pattern = re.compile(r"\(return value (?P<exit_code>\d+)\)")
hold_reason_pattern = re.compile(r"HoldReason = \".*(?P<reason>SYSTEM_PERIODIC_HOLD\D*|Docker job.*|SHADOW\D*)")
disconnected_pattern = re.compile(r"Reason = \"Job disconnected")

parser = argparse.ArgumentParser(prog=sys.argv[0], description="Trawl Condor .log file and learn about site behavior")
parser.add_argument("files",
                    metavar="FILE",
                    nargs="+",
                    help="Examine these files",
                    )
parser.add_argument("--site-details",
                    default="",
                    help="Show detailed output (exit codes and hold reasons) for site names matching this regular expression. (Use the expression '.*' to show details for each site independently.)",
                    )

args = parser.parse_args()

filenames = args.files
site_details_pattern = re.compile(args.site_details)

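# First pass: scan every log line, building one JobInfo per job id and
# tallying disconnect notices per site.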
job_table = {}
disconnects_by_site = {}
options = ("submitted", "executing", "terminated", "held", "disconnected")  # note: currently unused
for logfile_name in filenames:
    with open(logfile_name) as logfile:
        # Condor opens each entry about a job with "..."
        # Scan for those.

        job = None
        for line in logfile:
            # lines read from the file keep their trailing newline
            if line.rstrip() == "...":
                job = None
                continue

            matches = job_event_pattern.search(line)
            if matches:
                jobid = matches.group("jobid")
                if jobid not in job_table:
                    job_table[jobid] = JobInfo(jobid=jobid)
                job = job_table[jobid]

                action = matches.group("action")

                # the log omits the year, so assume the current one
                when = datetime.datetime.strptime(matches.group("when"), "%m/%d %H:%M:%S")
                when = when.replace(year=datetime.datetime.now().year)

                if "submitted" in action:
                    job.status = Status.IDLE
                elif "executing" in action:
                    job.start_time = when
                    job.status = Status.RUNNING
                    job.status_time = when
                elif "held" in action:
                    job.status = Status.HELD
                    job.status_time = when
                elif "terminated" in action:
                    job.status = Status.FINISHED

                continue

            matches = job_site_pattern.search(line)
            if matches:
                # once they go 'held' they have no site any more...
                if "$$(GLIDEIN_Site:Unknown)" not in matches.group("job_site"):
                    job.site = matches.group("job_site")
                continue

            matches = hold_reason_pattern.search(line)
            if matches:
                job.hold_reason = matches.group("reason")
                continue

            matches = exit_code_pattern.search(line)
            if matches:
                job.exit_code = int(matches.group("exit_code"))
                continue

            # disconnects are special.
            # the job gets restarted with the same job ID,
            # we just have to count disconnects separately
            matches = disconnected_pattern.search(line)
            if matches:
                if job.site:
                    disconnects_by_site.setdefault(job.site, 0)
                    disconnects_by_site[job.site] += 1

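# Second pass: fold the per-job records into per-site aggregates.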
site_stats = {}
for job in job_table.values():
    if job.site not in site_stats:
        site_stats[job.site] = SiteStats(name=job.site)

    if job.status == Status.IDLE:
        continue

    site = site_stats[job.site]
    if job.status == Status.RUNNING:
        site.ongoing += 1
    elif job.status == Status.HELD:
        site.held += 1
        if job.hold_reason:
            site.hold_reasons.setdefault(job.hold_reason, 0)
            site.hold_reasons[job.hold_reason] += 1
    elif job.status == Status.FINISHED:
        site.exit_codes.setdefault(job.exit_code, 0)
        site.exit_codes[job.exit_code] += 1
        # exit_code can be None if no "return value" line was seen;
        # only a nonzero integer counts as a failure
        if job.exit_code:
            site.failed += 1
        else:
            site.success += 1

for site_name, disconnects in disconnects_by_site.items():
    site = site_stats.setdefault(site_name, SiteStats(site_name))
    site.disconnects = disconnects

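# Print a per-site summary table.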
# jobs with no recorded site are keyed by None; sort by string form so the
# comparison also works under Python 3
sites_sorted = sorted(site_stats.keys(), key=str)

print("Site           Ongoing Held Failed Disconnected Success")
for site_name in sites_sorted:
    site = site_stats[site_name]
    print("%-15s%7d %4d %6d %12d %7d" % (site.name, site.ongoing, site.held, site.failed, site.disconnects, site.success))

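# Roll up exit codes and hold reasons across all sites for the global summaries.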
sum_exit_codes = {}
sum_hold_reasons = {}
for stats in site_stats.values():
    for summed, collection in [(sum_exit_codes, stats.exit_codes), (sum_hold_reasons, stats.hold_reasons)]:
        for key, val in collection.items():
            summed.setdefault(key, 0)
            summed[key] += val

if len(sum_hold_reasons):
    print()
    print("Held jobs:")
    print("  Count  Hold reason")
    for reason in sorted(sum_hold_reasons, key=sum_hold_reasons.get, reverse=True):
        print("  %-7d%s" % (sum_hold_reasons[reason], reason))

if len(sum_exit_codes):
    print()
    print("Exit codes:")
    print("  Count  Code")
    for code in sorted(sum_exit_codes, key=sum_exit_codes.get, reverse=True):
        print("  %-7d%s" % (sum_exit_codes[code], code))


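# Optional drill-down: per-site exit codes and hold reasons for sites whose
# name matches the --site-details pattern.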
if args.site_details:
    print()
    print("Details for site names matching pattern '%s':" % args.site_details)
    for site_name in sites_sorted:
        # skip jobs with no recorded site (key None) and non-matching names
        if not site_name or not site_details_pattern.match(site_name):
            continue
        site = site_stats[site_name]
        print()
        print(site_name)
        if len(site.hold_reasons) > 0:
            print("  Count  Hold reason")
            print("  ------------------")
            for reason, count in site.hold_reasons.items():
                print("  %-7d%s" % (count, reason))

        if len(site.exit_codes) > 0:
            print()
            print("  Count  Exit code")
            print("  ------------------")
            for code, count in site.exit_codes.items():
                print("  %-7d%s" % (count, code))