Skip to content

Commit c555d5a

Browse files
bench: add duckdb benchmark
1 parent 2443652 commit c555d5a

File tree

5 files changed

+224
-127
lines changed

5 files changed

+224
-127
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,8 @@ test-cov:
8282
python3 -m pytest -x -vv --cov=rayforce --cov-report=term-missing tests/
8383

8484
lint:
85-
python3 -m ruff format tests/ rayforce/
86-
python3 -m ruff check rayforce/ --fix
85+
python3 -m ruff format tests/ rayforce/ benchmark/
86+
python3 -m ruff check rayforce/ benchmark/ --fix
8787
python3 -m ruff check tests/ --fix --select I
8888
python3 -m mypy rayforce/
8989
clang-format -i rayforce/capi/*

benchmark/benchmarks.py

Lines changed: 118 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
from rayforce import eval_str, Column
2-
from timer import time_microseconds
31
import polars as pl
2+
from timer import time_microseconds
3+
4+
from rayforce import Column, eval_str
45

56

67
class BenchmarkError(Exception): ...
@@ -40,6 +41,17 @@ def run():
4041

4142
return time_microseconds(run)
4243

44+
@staticmethod
def benchmark_q1_duckdb(conn):
    """
    Q1 on DuckDB: group by id1, sum v1.

    The SQL is executed through ``conn.execute`` and the result is
    materialised with ``fetchdf()``; both steps happen inside the callable
    handed to ``time_microseconds``, whose return value is passed back
    unchanged.

    NOTE(review): the query reads from a relation named ``df``; presumably
    the caller makes it resolvable through ``conn`` -- confirm.
    """
    query = "SELECT id1, SUM(v1) as v1_sum FROM df GROUP BY id1"

    def run():
        cursor = conn.execute(query)
        return cursor.fetchdf()

    return time_microseconds(run)
54+
4355
@staticmethod
4456
def benchmark_q1_native_rayforce(table_name):
4557
"""
@@ -51,20 +63,17 @@ def benchmark_q1_native_rayforce(table_name):
5163

5264
if isinstance(result, dict) and "time" in result:
5365
return result["time"] * 1000, result
54-
elif isinstance(result, (int, float)):
66+
if isinstance(result, (int, float)):
5567
return result * 1000, result
56-
elif hasattr(result, "to_python"):
68+
if hasattr(result, "to_python"):
5769
# Handle Rayforce scalar types (F64, I64, etc.)
5870
value = result.to_python()
5971
return value * 1000, result
60-
elif hasattr(result, "value"):
72+
if hasattr(result, "value"):
6173
# Handle Rayforce scalar types with value property
6274
value = result.value
6375
return value * 1000, result
64-
else:
65-
raise BenchmarkError(
66-
f"rayforce runtime returned unsupported measure: {type(result)}"
67-
)
76+
raise BenchmarkError(f"rayforce runtime returned unsupported measure: {type(result)}")
6877

6978

7079
class Q2:
@@ -101,6 +110,19 @@ def run():
101110

102111
return time_microseconds(run)
103112

113+
@staticmethod
def benchmark_q2_duckdb(conn):
    """
    Q2 on DuckDB: group by id1 and id2, sum v1.

    The SQL is executed through ``conn.execute`` and the result is
    materialised with ``fetchdf()``; both steps happen inside the callable
    handed to ``time_microseconds``, whose return value is passed back
    unchanged.

    NOTE(review): the query reads from a relation named ``df``; presumably
    the caller makes it resolvable through ``conn`` -- confirm.
    """
    query = "SELECT id1, id2, SUM(v1) as v1_sum FROM df GROUP BY id1, id2"

    def run():
        cursor = conn.execute(query)
        return cursor.fetchdf()

    return time_microseconds(run)
125+
104126
@staticmethod
105127
def benchmark_q2_native_rayforce(table_name):
106128
"""
@@ -112,18 +134,15 @@ def benchmark_q2_native_rayforce(table_name):
112134

113135
if isinstance(result, dict) and "time" in result:
114136
return result["time"] * 1000, result
115-
elif isinstance(result, (int, float)):
137+
if isinstance(result, (int, float)):
116138
return result * 1000, result
117-
elif hasattr(result, "to_python"):
139+
if hasattr(result, "to_python"):
118140
value = result.to_python()
119141
return value * 1000, result
120-
elif hasattr(result, "value"):
142+
if hasattr(result, "value"):
121143
value = result.value
122144
return value * 1000, result
123-
else:
124-
raise BenchmarkError(
125-
f"rayforce runtime returned unsupported measure: {type(result)}"
126-
)
145+
raise BenchmarkError(f"rayforce runtime returned unsupported measure: {type(result)}")
127146

128147

129148
class Q3:
@@ -161,12 +180,24 @@ def benchmark_q3_polars(df):
161180

162181
def run():
163182
return df.group_by("id3").agg(
164-
pl.col("v1").sum().alias("v1_sum"),
165-
pl.col("v3").mean().alias("v3_avg")
183+
pl.col("v1").sum().alias("v1_sum"), pl.col("v3").mean().alias("v3_avg")
166184
)
167185

168186
return time_microseconds(run)
169187

188+
@staticmethod
def benchmark_q3_duckdb(conn):
    """
    Q3 on DuckDB: group by id3, sum v1 and average v3.

    The SQL is executed through ``conn.execute`` and the result is
    materialised with ``fetchdf()``; both steps happen inside the callable
    handed to ``time_microseconds``, whose return value is passed back
    unchanged.

    NOTE(review): the query reads from a relation named ``df``; presumably
    the caller makes it resolvable through ``conn`` -- confirm.
    """
    query = "SELECT id3, SUM(v1) as v1_sum, AVG(v3) as v3_avg FROM df GROUP BY id3"

    def run():
        cursor = conn.execute(query)
        return cursor.fetchdf()

    return time_microseconds(run)
200+
170201
@staticmethod
171202
def benchmark_q3_native_rayforce(table_name):
172203
"""
@@ -178,18 +209,15 @@ def benchmark_q3_native_rayforce(table_name):
178209

179210
if isinstance(result, dict) and "time" in result:
180211
return result["time"] * 1000, result
181-
elif isinstance(result, (int, float)):
212+
if isinstance(result, (int, float)):
182213
return result * 1000, result
183-
elif hasattr(result, "to_python"):
214+
if hasattr(result, "to_python"):
184215
value = result.to_python()
185216
return value * 1000, result
186-
elif hasattr(result, "value"):
217+
if hasattr(result, "value"):
187218
value = result.value
188219
return value * 1000, result
189-
else:
190-
raise BenchmarkError(
191-
f"rayforce runtime returned unsupported measure: {type(result)}"
192-
)
220+
raise BenchmarkError(f"rayforce runtime returned unsupported measure: {type(result)}")
193221

194222

195223
class Q4:
@@ -219,11 +247,7 @@ def benchmark_q4_pandas(df):
219247
"""
220248

221249
def run():
222-
return (
223-
df.groupby("id3")
224-
.agg({"v1": "mean", "v2": "mean", "v3": "mean"})
225-
.reset_index()
226-
)
250+
return df.groupby("id3").agg({"v1": "mean", "v2": "mean", "v3": "mean"}).reset_index()
227251

228252
return time_microseconds(run)
229253

@@ -237,11 +261,24 @@ def run():
237261
return df.group_by("id3").agg(
238262
pl.col("v1").mean().alias("v1_avg"),
239263
pl.col("v2").mean().alias("v2_avg"),
240-
pl.col("v3").mean().alias("v3_avg")
264+
pl.col("v3").mean().alias("v3_avg"),
241265
)
242266

243267
return time_microseconds(run)
244268

269+
@staticmethod
def benchmark_q4_duckdb(conn):
    """
    Q4 on DuckDB: group by id3, average v1, v2 and v3.

    The SQL is executed through ``conn.execute`` and the result is
    materialised with ``fetchdf()``; both steps happen inside the callable
    handed to ``time_microseconds``, whose return value is passed back
    unchanged.

    NOTE(review): the query reads from a relation named ``df``; presumably
    the caller makes it resolvable through ``conn`` -- confirm.
    """
    query = (
        "SELECT id3, AVG(v1) as v1_avg, AVG(v2) as v2_avg, AVG(v3) as v3_avg FROM df GROUP BY id3"
    )

    def run():
        cursor = conn.execute(query)
        return cursor.fetchdf()

    return time_microseconds(run)
281+
245282
@staticmethod
246283
def benchmark_q4_native_rayforce(table_name):
247284
"""
@@ -253,18 +290,15 @@ def benchmark_q4_native_rayforce(table_name):
253290

254291
if isinstance(result, dict) and "time" in result:
255292
return result["time"] * 1000, result
256-
elif isinstance(result, (int, float)):
293+
if isinstance(result, (int, float)):
257294
return result * 1000, result
258-
elif hasattr(result, "to_python"):
295+
if hasattr(result, "to_python"):
259296
value = result.to_python()
260297
return value * 1000, result
261-
elif hasattr(result, "value"):
298+
if hasattr(result, "value"):
262299
value = result.value
263300
return value * 1000, result
264-
else:
265-
raise BenchmarkError(
266-
f"rayforce runtime returned unsupported measure: {type(result)}"
267-
)
301+
raise BenchmarkError(f"rayforce runtime returned unsupported measure: {type(result)}")
268302

269303

270304
class Q5:
@@ -294,11 +328,7 @@ def benchmark_q5_pandas(df):
294328
"""
295329

296330
def run():
297-
return (
298-
df.groupby("id3")
299-
.agg({"v1": "sum", "v2": "sum", "v3": "sum"})
300-
.reset_index()
301-
)
331+
return df.groupby("id3").agg({"v1": "sum", "v2": "sum", "v3": "sum"}).reset_index()
302332

303333
return time_microseconds(run)
304334

@@ -312,11 +342,24 @@ def run():
312342
return df.group_by("id3").agg(
313343
pl.col("v1").sum().alias("v1_sum"),
314344
pl.col("v2").sum().alias("v2_sum"),
315-
pl.col("v3").sum().alias("v3_sum")
345+
pl.col("v3").sum().alias("v3_sum"),
316346
)
317347

318348
return time_microseconds(run)
319349

350+
@staticmethod
def benchmark_q5_duckdb(conn):
    """
    Q5 on DuckDB: group by id3, sum v1, v2 and v3.

    The SQL is executed through ``conn.execute`` and the result is
    materialised with ``fetchdf()``; both steps happen inside the callable
    handed to ``time_microseconds``, whose return value is passed back
    unchanged.

    NOTE(review): the query reads from a relation named ``df``; presumably
    the caller makes it resolvable through ``conn`` -- confirm.
    """
    query = (
        "SELECT id3, SUM(v1) as v1_sum, SUM(v2) as v2_sum, SUM(v3) as v3_sum FROM df GROUP BY id3"
    )

    def run():
        cursor = conn.execute(query)
        return cursor.fetchdf()

    return time_microseconds(run)
362+
320363
@staticmethod
321364
def benchmark_q5_native_rayforce(table_name):
322365
"""
@@ -328,18 +371,15 @@ def benchmark_q5_native_rayforce(table_name):
328371

329372
if isinstance(result, dict) and "time" in result:
330373
return result["time"] * 1000, result
331-
elif isinstance(result, (int, float)):
374+
if isinstance(result, (int, float)):
332375
return result * 1000, result
333-
elif hasattr(result, "to_python"):
376+
if hasattr(result, "to_python"):
334377
value = result.to_python()
335378
return value * 1000, result
336-
elif hasattr(result, "value"):
379+
if hasattr(result, "value"):
337380
value = result.value
338381
return value * 1000, result
339-
else:
340-
raise BenchmarkError(
341-
f"rayforce runtime returned unsupported measure: {type(result)}"
342-
)
382+
raise BenchmarkError(f"rayforce runtime returned unsupported measure: {type(result)}")
343383

344384

345385
class Q6:
@@ -351,9 +391,7 @@ def benchmark_q6_rayforce(table):
351391

352392
def run():
353393
return (
354-
table.select(
355-
range_v1_v2=(Column("v1").max() - Column("v2").min())
356-
)
394+
table.select(range_v1_v2=(Column("v1").max() - Column("v2").min()))
357395
.by("id3")
358396
.execute()
359397
)
@@ -386,29 +424,41 @@ def run():
386424

387425
return time_microseconds(run)
388426

427+
@staticmethod
def benchmark_q6_duckdb(conn):
    """
    Q6 on DuckDB: group by id3, max(v1) - min(v2).

    The SQL is executed through ``conn.execute`` and the result is
    materialised with ``fetchdf()``; both steps happen inside the callable
    handed to ``time_microseconds``, whose return value is passed back
    unchanged.

    NOTE(review): the query reads from a relation named ``df``; presumably
    the caller makes it resolvable through ``conn`` -- confirm.
    """
    query = "SELECT id3, MAX(v1) - MIN(v2) as range_v1_v2 FROM df GROUP BY id3"

    def run():
        cursor = conn.execute(query)
        return cursor.fetchdf()

    return time_microseconds(run)
439+
389440
@staticmethod
390441
def benchmark_q6_native_rayforce(table_name):
391442
"""
392443
Q6: Group by id3, max(v1) - min(v2)
393444
"""
394445

395-
query = f"(timeit (select {{range_v1_v2: (- (max v1) (min v2)) by: id3 from: {table_name}}}))"
446+
query = (
447+
f"(timeit (select {{range_v1_v2: (- (max v1) (min v2)) by: id3 from: {table_name}}}))"
448+
)
396449
result = eval_str(query)
397450

398451
if isinstance(result, dict) and "time" in result:
399452
return result["time"] * 1000, result
400-
elif isinstance(result, (int, float)):
453+
if isinstance(result, (int, float)):
401454
return result * 1000, result
402-
elif hasattr(result, "to_python"):
455+
if hasattr(result, "to_python"):
403456
value = result.to_python()
404457
return value * 1000, result
405-
elif hasattr(result, "value"):
458+
if hasattr(result, "value"):
406459
value = result.value
407460
return value * 1000, result
408-
else:
409-
raise BenchmarkError(
410-
f"rayforce runtime returned unsupported measure: {type(result)}"
411-
)
461+
raise BenchmarkError(f"rayforce runtime returned unsupported measure: {type(result)}")
412462

413463

414464
benchmarks = [
@@ -417,41 +467,47 @@ def benchmark_q6_native_rayforce(table_name):
417467
Q1.benchmark_q1_rayforce,
418468
Q1.benchmark_q1_pandas,
419469
Q1.benchmark_q1_polars,
470+
Q1.benchmark_q1_duckdb,
420471
Q1.benchmark_q1_native_rayforce,
421472
),
422473
(
423474
"Q2: Group by id1, id2, sum v1",
424475
Q2.benchmark_q2_rayforce,
425476
Q2.benchmark_q2_pandas,
426477
Q2.benchmark_q2_polars,
478+
Q2.benchmark_q2_duckdb,
427479
Q2.benchmark_q2_native_rayforce,
428480
),
429481
(
430482
"Q3: Group by id3, sum v1, avg v3",
431483
Q3.benchmark_q3_rayforce,
432484
Q3.benchmark_q3_pandas,
433485
Q3.benchmark_q3_polars,
486+
Q3.benchmark_q3_duckdb,
434487
Q3.benchmark_q3_native_rayforce,
435488
),
436489
(
437490
"Q4: Group by id3, avg v1, v2, v3",
438491
Q4.benchmark_q4_rayforce,
439492
Q4.benchmark_q4_pandas,
440493
Q4.benchmark_q4_polars,
494+
Q4.benchmark_q4_duckdb,
441495
Q4.benchmark_q4_native_rayforce,
442496
),
443497
(
444498
"Q5: Group by id3, sum v1, v2, v3",
445499
Q5.benchmark_q5_rayforce,
446500
Q5.benchmark_q5_pandas,
447501
Q5.benchmark_q5_polars,
502+
Q5.benchmark_q5_duckdb,
448503
Q5.benchmark_q5_native_rayforce,
449504
),
450505
(
451506
"Q6: Group by id3, max(v1) - min(v2)",
452507
Q6.benchmark_q6_rayforce,
453508
Q6.benchmark_q6_pandas,
454509
Q6.benchmark_q6_polars,
510+
Q6.benchmark_q6_duckdb,
455511
Q6.benchmark_q6_native_rayforce,
456512
),
457513
]

0 commit comments

Comments
 (0)