Skip to content

Commit a8330f0

Browse files
authored
Merge pull request #4767 from barnabasdomozi/task_stats
Introduce task statistics script
2 parents 5933886 + 6a8b5fc commit a8330f0

File tree

2 files changed

+306
-0
lines changed

2 files changed

+306
-0
lines changed

scripts/statistics/__init__.py

Whitespace-only changes.
Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
#!/usr/bin/env python3
2+
3+
# -------------------------------------------------------------------------
4+
#
5+
# Part of the CodeChecker project, under the Apache License v2.0 with
6+
# LLVM Exceptions. See LICENSE for license information.
7+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
#
9+
# -------------------------------------------------------------------------
10+
11+
import argparse
12+
import dataclasses
13+
import json
14+
import sys
15+
from dataclasses import dataclass, field
16+
from datetime import datetime
17+
from enum import Enum
18+
from typing import Optional
19+
20+
21+
def print_dict(d: dict):
    """Pretty-print a dict with the values aligned into one column.

    d: mapping of string keys to printable values.
    """
    # BUG FIX: max() over an empty dict raised ValueError; with
    # default=0 an empty dict simply prints nothing.
    max_key_length = max(map(len, d.keys()), default=0)
    for k, v in d.items():
        print(f"{k + ':':<{max_key_length + 2}}{v}")
25+
26+
27+
def parse_date(date: str) -> datetime:
    """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp string into a datetime."""
    fmt = "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(date, fmt)
29+
30+
31+
def date_diff(a: datetime, b: datetime) -> float:
    """Return the seconds elapsed from a to b (negative when b < a)."""
    delta = b - a
    return delta.total_seconds()
33+
34+
35+
@dataclass
class DataStats:
    """Min/max/average summary of a list of durations, in seconds."""
    # All fields default to 0, meaning "no data recorded".
    min: float = 0
    max: float = 0
    avg: float = 0
40+
41+
42+
class EventType(Enum):
    """Lifecycle stages of a task on the event timeline."""
    WAIT_START = 1     # task was enqueued and started waiting
    PROCESS_START = 2  # a worker picked the task up
    COMPLETED = 3      # the task finished
46+
47+
48+
@dataclass
class TaskEvent:
    """A single point on the task timeline.

    For WAIT_START/PROCESS_START events, duration holds the length in
    seconds of the phase starting here; COMPLETED events keep the
    default of 0.
    """
    date: datetime
    type: EventType
    duration: float = 0
53+
54+
55+
@dataclass
class DataPoint:
    """Aggregated metrics for one time interval, anchored at its start."""
    date: datetime
    # Counter snapshots taken when the interval is closed.
    waiting_count: int = 0
    processing_count: int = 0
    completed_count: int = 0
    # Per-interval duration statistics; None when nothing was recorded.
    waiting_stats: Optional[DataStats] = None
    processing_stats: Optional[DataStats] = None
63+
64+
65+
def calc_stats(float_list: list[float]) -> DataStats:
    """Build min/max/avg statistics from a non-empty list of durations."""
    total = sum(float_list)
    count = len(float_list)
    return DataStats(min=min(float_list),
                     max=max(float_list),
                     avg=round(total / count, 2))
70+
71+
72+
@dataclass
class ProcessingResult:
    """Full outcome of process(): overall totals plus the interval series."""
    # Number of tasks seen per status string (e.g. "COMPLETED").
    total_status_count: dict = field(default_factory=dict)
    # Whole-run duration statistics; None when no task completed.
    total_waiting_stats: Optional[DataStats] = None
    total_processing_stats: Optional[DataStats] = None
    # Chronological list of aggregated intervals.
    intervals: list[DataPoint] = field(default_factory=list)
78+
79+
80+
def check(data):
    """Validate the decoded JSON task list; exit the process on error.

    data: decoded JSON content. Must be a non-empty list of task dicts,
    each containing the required status and timestamp fields.
    """
    print("Checking JSON file ...")

    if not isinstance(data, list):
        # BUG FIX: the error was only printed and execution fell
        # through, causing confusing crashes later for non-list input.
        print("Error: Invalid JSON file!")
        sys.exit(1)

    if len(data) == 0:
        sys.exit("Error: No task data is available!")

    required_fields = ["status", "enqueuedAt", "startedAt", "completedAt"]

    for e in data:
        for r in required_fields:
            if r not in e:
                print("Error: Invalid JSON file!")
                print(f"Field '{r}' is missing in item {e}")
                sys.exit(1)

    print("JSON file is valid.")
99+
100+
101+
def process(data, interval_duration: int) -> ProcessingResult:
    """Compute total and per-interval task statistics.

    data: decoded JSON list of task dicts (validated by check()).
    interval_duration: bucket width in seconds for the interval series;
        0 keeps everything in one single interval.
    Returns a ProcessingResult; exits the process when the input is
    invalid or contains no completed task.
    """
    check(data)

    # Convert the date strings to datetime objects (missing → None).
    for e in data:
        for k in ["enqueuedAt", "startedAt", "completedAt"]:
            e[k] = parse_date(e[k]) if e[k] else None

    events: list[TaskEvent] = []
    total_waiting_durations: list[float] = []
    total_process_durations: list[float] = []
    status_count: dict[str, int] = {}

    for e in data:
        status: str = e["status"]
        status_count[status] = status_count.get(status, 0) + 1

        # Only COMPLETED tasks carry all three timestamps, so only they
        # contribute timeline events and duration statistics.
        if status == "COMPLETED":
            enqueued_at: datetime = e["enqueuedAt"]
            started_at: datetime = e["startedAt"]
            completed_at: datetime = e["completedAt"]

            waiting_dur: float = date_diff(enqueued_at, started_at)
            processing_dur: float = date_diff(started_at, completed_at)

            total_waiting_durations.append(waiting_dur)
            total_process_durations.append(processing_dur)

            events.append(TaskEvent(date=enqueued_at,
                                    type=EventType.WAIT_START,
                                    duration=waiting_dur))
            events.append(TaskEvent(date=started_at,
                                    type=EventType.PROCESS_START,
                                    duration=processing_dur))
            events.append(TaskEvent(date=completed_at,
                                    type=EventType.COMPLETED))

    if not events:
        # BUG FIX: without any COMPLETED task, events[0] below raised
        # an IndexError; exit with a clear message instead.
        sys.exit("Error: No completed task data is available!")

    events.sort(key=lambda ev: ev.date)

    intervals: list[DataPoint] = [DataPoint(date=events[0].date)]
    counter_waiting: int = 0
    counter_processing: int = 0
    waiting_durations: list[float] = []
    processing_durations: list[float] = []
    last: DataPoint = intervals[-1]

    def close_interval():
        # Snapshot the running counters and per-interval statistics
        # into the interval being closed.
        last.waiting_count = counter_waiting
        last.processing_count = counter_processing
        last.waiting_stats = calc_stats(waiting_durations) \
            if waiting_durations else None
        last.processing_stats = calc_stats(processing_durations) \
            if processing_durations else None

    for ev in events:
        # A non-zero interval_duration groups events into time buckets:
        # once an event falls outside the current bucket, close it and
        # open a new one anchored at this event.
        if (interval_duration != 0 and
                date_diff(last.date, ev.date) > interval_duration):
            close_interval()
            waiting_durations = []
            processing_durations = []

            intervals.append(DataPoint(date=ev.date))
            last = intervals[-1]

        if ev.type == EventType.WAIT_START:
            counter_waiting += 1
            waiting_durations.append(ev.duration)
        elif ev.type == EventType.PROCESS_START:
            counter_waiting -= 1
            counter_processing += 1
            processing_durations.append(ev.duration)
        elif ev.type == EventType.COMPLETED:
            counter_processing -= 1

    # BUG FIX: the final interval was never closed, so its counts and
    # statistics were left at their defaults (0 / None).
    close_interval()

    return ProcessingResult(
        total_status_count=status_count,
        total_waiting_stats=calc_stats(total_waiting_durations)
        if total_waiting_durations else None,
        total_processing_stats=calc_stats(total_process_durations)
        if total_process_durations else None,
        intervals=intervals)
187+
188+
189+
def present(result: ProcessingResult,
            plot: bool, plot_sum: str,
            plot_engine: str):
    """Print the aggregated statistics and optionally plot them.

    result: aggregated statistics produced by process().
    plot: when True, render the plots (requires matplotlib).
    plot_sum: DataStats field name shown in the right-hand plots.
    plot_engine: matplotlib backend name passed to matplotlib.use().
    """
    def print_stats(stats):
        # BUG FIX: stats is None when no task completed, and vars(None)
        # raised a TypeError; report the absence of data instead.
        if stats is None:
            print("No data.")
        else:
            print_dict({k: str(v) + " secs"
                        for k, v in vars(stats).items()})

    print("--- Total Status Count ---")
    print_dict(result.total_status_count)
    print("--- Total Waiting in Queue Time Statistics ---")
    print_stats(result.total_waiting_stats)
    print("--- Total Processing Time Statistics ---")
    print_stats(result.total_processing_stats)

    if plot:
        # Imported lazily so the script works without matplotlib when
        # plotting is not requested.
        import matplotlib
        import matplotlib.pyplot as plt

        matplotlib.use(plot_engine)

        iv = result.intervals
        fig, axs = plt.subplots(2, 2)
        fig.suptitle("Serverside Task Statistics")

        # Left column: queue depth and processing count over time.
        x = [i.date for i in iv]
        y = [i.waiting_count for i in iv]
        axs[0, 0].step(x, y, 'y')
        axs[0, 0].set_ylabel("Number of tasks waiting in queue")

        x = [i.date for i in iv]
        y = [i.processing_count for i in iv]
        axs[1, 0].step(x, y)
        axs[1, 0].set_ylabel("Number of tasks being processed")

        # Right column: the chosen per-interval summary statistic;
        # intervals without recorded durations are skipped.
        iv_wait = list(filter(lambda e: e.waiting_stats, iv))
        x = [i.date for i in iv_wait]
        y = [getattr(i.waiting_stats, plot_sum) for i in iv_wait]
        axs[0, 1].step(x, y, 'y')
        axs[0, 1].set_ylabel(
            f"{plot_sum.upper()} waiting in queue time (secs)")

        iv_process = list(filter(lambda e: e.processing_stats, iv))
        x = [i.date for i in iv_process]
        y = [getattr(i.processing_stats, plot_sum) for i in iv_process]
        axs[1, 1].step(x, y)
        axs[1, 1].set_ylabel(
            f"{plot_sum.upper()} task processing time (secs)")

        try:
            plt.show()
        except KeyboardInterrupt:
            pass
239+
240+
241+
def get_plot_sum_choices():
    """Return the DataStats field names accepted by --plot-summary."""
    return [f.name for f in dataclasses.fields(DataStats)]
243+
244+
245+
class CustomArgFormatter(argparse.RawTextHelpFormatter,
                         argparse.ArgumentDefaultsHelpFormatter):
    """Keep raw help-text formatting while also showing default values."""
248+
249+
250+
if __name__ == '__main__':
    # Command-line entry point: parse arguments, load the task JSON,
    # then compute and present the statistics.
    parser = argparse.ArgumentParser(formatter_class=CustomArgFormatter,
                                     description="""
This script can be used to display useful metrics about the server-side tasks.
The required input is a JSON file, which can be generated using the
\"CodeChecker cmd serverside-tasks\" command. Using the --plot option,
you can generate graphs showing various statistics. This requires matplotlib
to be installed.""")

    # BUG FIX: typo "containg" -> "containing" in the help text.
    parser.add_argument("json_file", help="""
JSON file containing a list of serverside tasks.
This file can be acquired by running the command below:
CodeChecker cmd serverside-tasks --url <server_url> --enqueued-after <date1>
--enqueued-before <date2> --output json > out.json""")

    parser.add_argument('-i', '--interval',
                        dest="interval_duration",
                        type=int,
                        required=False,
                        default=60,
                        help="""
Interval duration in seconds. Task events are grouped into intervals of this
duration to compute various metrics, calculated separately for each
interval.""")

    parser.add_argument('--plot',
                        dest="plot",
                        action="store_true",
                        required=False,
                        default=False,
                        help="""
Displays the statistics plot. This requires matplotlib to be installed.""")

    parser.add_argument('--plot-summary',
                        dest="plot_sum",
                        type=str,
                        required=False,
                        choices=get_plot_sum_choices(),
                        default="avg",
                        help="""
Specifies the statistics shown in the right-hand plots.""")

    parser.add_argument('--plot-engine',
                        dest="plot_engine",
                        type=str,
                        required=False,
                        default="Qt5Agg",
                        help="""
Defines which rendering engine matplotlib uses.""")

    args = parser.parse_args()
    with open(args.json_file, encoding="utf-8") as f:
        json_data = json.load(f)
    present(process(json_data, args.interval_duration),
            args.plot,
            args.plot_sum,
            args.plot_engine)

0 commit comments

Comments
 (0)