Merged (changes from 5 commits)
2 changes: 2 additions & 0 deletions kernels/pyproject.toml
@@ -32,13 +32,15 @@ dev = [
"torch>=2.5",
"types-pyyaml",
"types-requests",
"types-tabulate",
]

[project.optional-dependencies]
abi-check = ["kernel-abi-check>=0.6.2,<0.7.0"]
benchmark = [
"numpy>=2.0.2",
"requests>=2.32.5",
"tabulate>=0.9.0",
"torch",
]
torch = ["torch"]
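
With this extra in place, the table-rendering dependency installs alongside the other benchmark requirements, e.g. pip install "kernels[benchmark]" (assuming the published project name matches the kernels directory shown here).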
188 changes: 107 additions & 81 deletions kernels/src/kernels/benchmark.py
@@ -31,6 +31,12 @@
TORCH_AVAILABLE = False
MISSING_DEPS.append("torch")

try:
from tabulate import tabulate
except ImportError:
tabulate = None # type: ignore[assignment]
MISSING_DEPS.append("tabulate")


def _percentile(sorted_data: list[float], p: float) -> float:
n = len(sorted_data)
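
As an aside, the MISSING_DEPS pattern above only records what failed to import; some check elsewhere in the module presumably raises before a benchmark run. A minimal sketch of such a guard, purely hypothetical and not the actual code in benchmark.py:

# Hypothetical guard; the real check in benchmark.py may look different.
if MISSING_DEPS:
    raise RuntimeError(
        "Missing optional benchmark dependencies: " + ", ".join(MISSING_DEPS)
        + ". Install the project's 'benchmark' extra to get them."
    )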
@@ -73,8 +79,8 @@ class MyBenchmark(Benchmark):
seed = 42

def setup(self):
self.x = torch.randn(128, 1024, device="cuda", dtype=torch.float16)
self.out = torch.empty(128, 512, device="cuda", dtype=torch.float16)
self.x = torch.randn(128, 1024, device=self.device, dtype=torch.float16)
self.out = torch.empty(128, 512, device=self.device, dtype=torch.float16)

def benchmark_silu(self):
self.kernel.silu_and_mul(self.out, self.x)
@@ -87,6 +93,7 @@ def verify_silu(self) -> torch.Tensor:
"""

seed: int | None = None # Optional: seed for reproducibility
device: str = "cpu" # Set automatically by runner

def __init__(self) -> None:
self.kernel: Any = None
@@ -183,96 +190,85 @@ def _print_results_table(results: dict[str, TimingResults]) -> None:
cls, method = "", name
parsed.append((name, cls, method))

# Calculate column widths
cls_w = max((len(p[1]) for p in parsed), default=9)
cls_w = max(cls_w, 9) # minimum "Benchmark" header
method_w = max((len(p[2]) for p in parsed), default=8)
method_w = max(method_w, 8) # minimum "Workload" header
num_w = 10
out_w = 8
n_w = 5 # "N" column width

# Check if we have any ref times to show
has_ref = any(results[name].ref_mean_ms is not None for name in results)

# Build table border
match_w = 5 # "Match" column
speedup_w = 7 # "Speedup" column
if has_ref:
col_widths = [
cls_w,
method_w,
n_w,
speedup_w,
num_w,
num_w,
num_w,
num_w,
num_w,
out_w,
num_w,
match_w,
]
else:
col_widths = [
cls_w,
method_w,
n_w,
num_w,
num_w,
num_w,
num_w,
num_w,
out_w,
match_w,
]

def border(left, sep, right):
return left + sep.join("─" * (w + 2) for w in col_widths) + right

print(file=sys.stderr)
print(border("┌", "┬", "┐"), file=sys.stderr)
if has_ref:
print(
f"│ {'Benchmark':<{cls_w}} │ {'Workload':<{method_w}} │ {'N':>{n_w}} │ {'Speedup':>{speedup_w}} │ {'Mean(ms)':>{num_w}} │ {'Std(ms)':>{num_w}} │ "
f"{'Min(ms)':>{num_w}} │ {'Max(ms)':>{num_w}} │ {'IQR(ms)':>{num_w}} │ {'Outliers':>{out_w}} │ {'Ref(ms)':>{num_w}} │ {'Match':^{match_w}} │",
file=sys.stderr,
)
else:
print(
f"│ {'Benchmark':<{cls_w}} │ {'Workload':<{method_w}} │ {'N':>{n_w}} │ {'Mean(ms)':>{num_w}} │ {'Std(ms)':>{num_w}} │ "
f"{'Min(ms)':>{num_w}} │ {'Max(ms)':>{num_w}} │ {'IQR(ms)':>{num_w}} │ {'Outliers':>{out_w}} │ {'Match':^{match_w}} │",
file=sys.stderr,
)
print(border("├", "┼", "┤"), file=sys.stderr)

# Build table data
rows = []
for full_name, cls, method in parsed:
t = results[full_name]
check = "✓" if t.verified else ("✗" if t.verified is False else "·")

if has_ref:
ref_str = (
f"{t.ref_mean_ms:>{num_w}.4f}"
if t.ref_mean_ms is not None
else " " * num_w
)
ref_str = f"{t.ref_mean_ms:.4f}" if t.ref_mean_ms is not None else ""
if t.ref_mean_ms is not None and t.mean_ms > 0:
speedup = t.ref_mean_ms / t.mean_ms
speedup_str = f"{speedup:.2f}x"
speedup_str = f"{t.ref_mean_ms / t.mean_ms:.2f}x"
else:
speedup_str = ""
print(
f"│ {cls:<{cls_w}} │ {method:<{method_w}} │ {t.iterations:>{n_w}} │ {speedup_str:>{speedup_w}} │ {t.mean_ms:>{num_w}.4f} │ {t.std_ms:>{num_w}.4f} │ "
f"{t.min_ms:>{num_w}.4f} │ {t.max_ms:>{num_w}.4f} │ {t.iqr_ms:>{num_w}.4f} │ {t.outliers:>{out_w}} │ {ref_str} │ {check:^{match_w}} │",
file=sys.stderr,
rows.append(
[
cls,
method,
t.iterations,
speedup_str,
f"{t.mean_ms:.4f}",
f"{t.std_ms:.4f}",
f"{t.min_ms:.4f}",
f"{t.max_ms:.4f}",
f"{t.iqr_ms:.4f}",
t.outliers,
ref_str,
check,
]
)
else:
print(
f"│ {cls:<{cls_w}} │ {method:<{method_w}} │ {t.iterations:>{n_w}} │ {t.mean_ms:>{num_w}.4f} │ {t.std_ms:>{num_w}.4f} │ "
f"{t.min_ms:>{num_w}.4f} │ {t.max_ms:>{num_w}.4f} │ {t.iqr_ms:>{num_w}.4f} │ {t.outliers:>{out_w}} │ {check:^{match_w}} │",
file=sys.stderr,
rows.append(
[
cls,
method,
t.iterations,
f"{t.mean_ms:.4f}",
f"{t.std_ms:.4f}",
f"{t.min_ms:.4f}",
f"{t.max_ms:.4f}",
f"{t.iqr_ms:.4f}",
t.outliers,
check,
]
)

print(border("└", "┴", "┘"), file=sys.stderr)
# Define headers
if has_ref:
headers = [
"Benchmark",
"Workload",
"N",
"Speedup",
"Mean(ms)",
"Std(ms)",
"Min(ms)",
"Max(ms)",
"IQR(ms)",
"Outliers",
"Ref(ms)",
"Match",
]
else:
headers = [
"Benchmark",
"Workload",
"N",
"Mean(ms)",
"Std(ms)",
"Min(ms)",
"Max(ms)",
"IQR(ms)",
"Outliers",
"Match",
]

print(file=sys.stderr)
print(tabulate(rows, headers=headers, tablefmt="simple_outline"), file=sys.stderr)

# Print statistical significance analysis if we have ref times
if has_ref:
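
Not part of the diff, but as a standalone illustration of the call above: tablefmt="simple_outline" produces the same kind of box-drawn table that the removed manual border code built by hand. The values here are invented purely for the example:

from tabulate import tabulate

# Toy data mirroring the rows/headers built in _print_results_table above.
headers = ["Benchmark", "Workload", "N", "Mean(ms)", "Std(ms)", "Match"]
rows = [["MyBenchmark", "silu", 100, "0.4123", "0.0031", "✓"]]
print(tabulate(rows, headers=headers, tablefmt="simple_outline"))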
@@ -409,6 +405,17 @@ def _get_macos_gpu() -> tuple[str | None, int | None]:
return None, None


def get_device() -> str:
if TORCH_AVAILABLE:
if torch.cuda.is_available():
return "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
return "xpu"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
return "mps"
return "cpu"
Member:

I think we can reuse this function:

def backend() -> str:

But we can do that in a follow-up PR.

Collaborator (Author):

Oh, great point. Just took a look and updated it in this PR!


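For reference, a minimal sketch of that follow-up, assuming the existing backend() helper returns strings such as "cuda", "rocm", "xpu", "mps", or "cpu" (its exact return values are not shown in this diff):

# Hypothetical follow-up only; backend()'s return values are assumed, not confirmed here.
def get_device() -> str:
    name = backend()
    # PyTorch addresses ROCm GPUs via the "cuda" device string, so map it back.
    return "cuda" if name == "rocm" else name
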

def collect_machine_info() -> MachineInfo:
gpu = "N/A"
gpu_cores = None
@@ -426,21 +433,24 @@ def collect_machine_info() -> MachineInfo:

if TORCH_AVAILABLE:
pytorch_version = torch.__version__
if torch.cuda.is_available():
device = get_device()
if device == "cuda":
gpu = torch.cuda.get_device_name(0)
# ROCm uses the CUDA API but has torch.version.hip
if hasattr(torch.version, "hip") and torch.version.hip:
backend = f"ROCm {torch.version.hip}"
else:
backend = f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
elif device == "xpu":
gpu = torch.xpu.get_device_name(0)
backend = "XPU"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
elif device == "mps":
macos_gpu, macos_cores = _get_macos_gpu()
gpu = macos_gpu or "Apple MPS"
gpu_cores = macos_cores
backend = "MPS"
else:
backend = "CPU"

return MachineInfo(
gpu=gpu,
@@ -474,6 +484,7 @@ def run_benchmark_class(
warmup: int,
repo_id: str,
revision: str,
machine_info: MachineInfo,
) -> tuple[dict[str, TimingResults], str]:
results = {}

@@ -492,13 +503,16 @@

kernel = get_kernel(repo_id, revision=revision)
kernel_sha = get_kernel_sha_from_build_name(kernel)
device = get_device()
print(f" Running {benchmark_cls.__name__} on {device}", file=sys.stderr)

for method_name in benchmark_methods:
workload_name = method_name.replace("benchmark_", "")

# Create fresh instance for each workload
instance = benchmark_cls()
instance.kernel = kernel
instance.device = device

# Apply seed for reproducibility
if instance.seed is not None:
@@ -647,6 +661,17 @@ def run_benchmark_script(
if not classes:
raise RuntimeError(f"No Benchmark subclasses found in {script_path}")

machine_info = collect_machine_info()
gpu_cores_str = (
f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else ""
)
print(file=sys.stderr)
print(f" GPU {machine_info.gpu}{gpu_cores_str}", file=sys.stderr)
print(f" CPU {machine_info.cpu}", file=sys.stderr)
print(f" OS {machine_info.os}", file=sys.stderr)
print(f" PyTorch {machine_info.pytorch_version}", file=sys.stderr)
print(file=sys.stderr)

all_results = {}
kernel_sha = ""
for cls in classes:
@@ -656,6 +681,7 @@
warmup=warmup,
repo_id=repo_id,
revision=revision,
machine_info=machine_info,
)
for name, timing in results.items():
all_results[f"{cls.__name__}.{name}"] = timing
11 changes: 11 additions & 0 deletions kernels/uv.lock

Some generated files are not rendered by default.
