Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
cbdb757
add localized allocation and deallocation
dsding2 Jun 2, 2025
2fee158
delete commented out code
dsding2 Jun 2, 2025
8ace895
deal with base storage
dsding2 Jun 4, 2025
c4e635c
ruff check fixes
dsding2 Jun 5, 2025
24b1a47
rework to push allocations outside of loops
dsding2 Jun 8, 2025
be78797
add types, fix ruff
dsding2 Jun 9, 2025
461558d
Merge remote-tracking branch 'upstream/main' into opencl_allocation
dsding2 Jun 13, 2025
0bcf4df
Merge branch 'main' into opencl_allocation
dsding2 Jun 17, 2025
0b6abdd
refactor to make more target-generic
dsding2 Jun 17, 2025
4f95a6b
resolve lingering merge issues
dsding2 Jun 17, 2025
bd98636
fix to only allocate global temporaries
dsding2 Jun 17, 2025
47dda68
move temp declarations to ASTBuilder
dsding2 Jun 19, 2025
e494a3b
Merge branch 'main' into opencl_allocation
dsding2 Jun 19, 2025
88c436f
fix typing
dsding2 Jun 19, 2025
dae91e2
fix typing hopefully
dsding2 Jun 23, 2025
1cfe83a
add basic test
dsding2 Jun 23, 2025
3c3bb78
Merge branch 'main' into opencl_allocation
dsding2 Jun 30, 2025
3ef324c
more typing/ruff fixes
dsding2 Jun 30, 2025
f708b66
fix tutorial.rst and add to baseline
dsding2 Jun 30, 2025
a0a8365
Merge branch 'main' into opencl_allocation
inducer Jul 5, 2025
f12ce9f
Merge branch 'main' into opencl_allocation
inducer Jul 10, 2025
452be6b
Merge branch 'main' into opencl_allocation
inducer Jul 10, 2025
3985576
Update loopy/schedule/tools.py
dsding2 Jul 11, 2025
95e119e
Apply suggested test changes
dsding2 Jul 11, 2025
5cbfbf1
implement rename and documentation suggestions
dsding2 Jul 11, 2025
612b238
ruff fixes, revert broken change
dsding2 Jul 12, 2025
4b0d754
Merge branch 'main' into opencl_allocation
inducer Jul 28, 2025
e591ae6
Merge branch 'main' into opencl_allocation
inducer Jul 31, 2025
1290c64
Merge branch 'main' into opencl_allocation
inducer Aug 28, 2025
b24fe99
Improvements
inducer Aug 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions loopy/codegen/control.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@
glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
get_insn_ids_for_block_at(kernel.linearization, sched_index),
codegen_state.callables_table)

from loopy.target.pyopencl import PyOpenCLPythonASTBuilder
if isinstance(codegen_state.ast_builder, PyOpenCLPythonASTBuilder):
prefix, postfix = (
codegen_state.ast_builder
.get_temporary_decl_at_index(codegen_state, sched_index)
)
results = [
prefix,
codegen_result,
codegen_state.ast_builder.get_kernel_call(
codegen_state,
sched_item.kernel_name,
glob_grid, loc_grid),
postfix
]
return merge_codegen_results(codegen_state, results)

Check warning on line 95 in loopy/codegen/control.py

View workflow job for this annotation

GitHub Actions / basedpyright

Argument type is partially unknown   Argument corresponds to parameter "elements" in function "merge_codegen_results"   Argument type is "list[Suite | CodeGenerationResult | Unknown]" (reportUnknownArgumentType)

return merge_codegen_results(codegen_state, [
codegen_result,

Expand Down Expand Up @@ -127,6 +145,15 @@
"for '%s', tagged '%s'"
% (sched_item.iname, ", ".join(str(tag) for tag in tags)))

from loopy.target.pyopencl import PyOpenCLPythonASTBuilder
if isinstance(codegen_state.ast_builder, PyOpenCLPythonASTBuilder):
prefix, postfix = (
codegen_state.ast_builder
.get_temporary_decl_at_index(codegen_state, sched_index)
)
results = [prefix, func(codegen_state, sched_index), postfix]
return merge_codegen_results(codegen_state, results)

Check warning on line 155 in loopy/codegen/control.py

View workflow job for this annotation

GitHub Actions / basedpyright

Argument type is partially unknown   Argument corresponds to parameter "elements" in function "merge_codegen_results"   Argument type is "list[Suite | CodeGenerationResult | Unknown]" (reportUnknownArgumentType)

return func(codegen_state, sched_index)

elif isinstance(sched_item, Barrier):
Expand Down
245 changes: 199 additions & 46 deletions loopy/target/pyopencl.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,13 @@
ValueArg,
)
from loopy.kernel.function_interface import ScalarCallable
from loopy.schedule import CallKernel
from loopy.schedule import (
CallKernel,
EnterLoop,
LeaveLoop,
ReturnFromKernel,
ScheduleItem,
)
from loopy.target.opencl import (
ExpressionToOpenCLCExpressionMapper,
OpenCLCASTBuilder,
Expand All @@ -71,7 +77,7 @@
logger = logging.getLogger(__name__)

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
from collections.abc import Iterable, Mapping, Sequence

import genpy
import pyopencl as cl
Expand All @@ -80,26 +86,25 @@
from loopy.codegen import CodeGenerationState
from loopy.codegen.result import CodeGenerationResult
from loopy.kernel import LoopKernel
from loopy.schedule import ScheduleItem
from loopy.target.pyopencl_execution import PyOpenCLExecutor
from loopy.translation_unit import (
CallableId,
CallablesInferenceContext,
TranslationUnit,
)
from loopy.translation_unit import FunctionIdT, TranslationUnit

Check warning on line 91 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Import "FunctionIdT" is not accessed (reportUnusedImport)

Check failure on line 91 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

"FunctionIdT" is unknown import symbol (reportAttributeAccessIssue)
from loopy.typing import Expression

Check warning on line 92 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

"Expression" is not exported from module "loopy.typing"   Import from "pymbolic.typing" instead (reportPrivateLocalImportUsage)


# {{{ pyopencl function scopers


class PyOpenCLCallable(ScalarCallable):
"""
Records information about the callables which are not covered by
:class:`loopy.target.opencl.OpenCLCallable`
"""
@override
def with_types(self,

Check warning on line 104 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Return type, "tuple[ScalarCallable, Unknown]", is partially unknown (reportUnknownParameterType)
arg_id_to_dtype: Mapping[int | str, LoopyType],
clbl_inf_ctx: CallablesInferenceContext,

Check failure on line 106 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

"CallablesInferenceContext" is not defined (reportUndefinedVariable)

Check warning on line 106 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Type of parameter "clbl_inf_ctx" is unknown (reportUnknownParameterType)
) -> tuple[ScalarCallable, CallablesInferenceContext]:

Check failure on line 107 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

"CallablesInferenceContext" is not defined (reportUndefinedVariable)

name = self.name

Expand Down Expand Up @@ -169,7 +174,7 @@
arg_id_to_dtype=self.arg_id_to_dtype,
arg_id_to_descr=self.arg_id_to_descr,
name_in_target=self.name_in_target).with_types(
arg_id_to_dtype, clbl_inf_ctx)

Check warning on line 177 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Argument type is unknown   Argument corresponds to parameter "clbl_inf_ctx" in function "with_types" (reportUnknownArgumentType)

def generate_preambles(self, target):
name = self.name_in_target
Expand Down Expand Up @@ -652,7 +657,7 @@
# and mypy doesn't like it.
def get_kernel_executor(self, t_unit: TranslationUnit, # type: ignore[override]
queue_or_context: cl.CommandQueue | cl.Context,
*args: Any, entrypoint: CallableId, **kwargs: Any

Check failure on line 660 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

"CallableId" is not defined (reportUndefinedVariable)

Check warning on line 660 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Type of parameter "entrypoint" is unknown (reportUnknownParameterType)
) -> PyOpenCLExecutor:
from pyopencl import CommandQueue
if isinstance(queue_or_context, CommandQueue):
Expand All @@ -661,7 +666,7 @@
context = queue_or_context

from loopy.target.pyopencl_execution import PyOpenCLExecutor
return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint)

Check warning on line 669 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Argument type is unknown   Argument corresponds to parameter "entrypoint" in function "__init__" (reportUnknownArgumentType)

# }}}

Expand Down Expand Up @@ -809,25 +814,15 @@
["_lpy_cl_kernels", "queue", *kai.passed_arg_names,
"wait_for=None", "allocator=None"])

from genpy import For, Function, Line, Return, Statement as S, Suite
from genpy import Function, Line, Return, Suite
return Function(
codegen_result.current_program(codegen_state).name,
args,
Suite([
Line(),
] + [
Line(),
function_body,
Line(),
] + ([
For("_tv", "_global_temporaries",
# Free global temporaries.
# Zero-size temporaries allocate as None, tolerate that.
# https://documen.tician.de/pyopencl/tools.html#pyopencl.tools.ImmediateAllocator
S("if _tv is not None: _tv.release()"))
] if self._get_global_temporaries(codegen_state) else []
) + [
Line(),
Return("_lpy_evt"),
]))

Expand All @@ -847,56 +842,215 @@
key=lambda tv: tv.name)

def get_temporary_decls(self, codegen_state, schedule_index):
from genpy import Assign, Comment, Line
from pymbolic.mapper.stringifier import PREC_NONE
ecm = self.get_expression_to_code_mapper(codegen_state)
return []

global_temporaries = self._get_global_temporaries(codegen_state)
if not global_temporaries:
return []
def get_temporary_decl_locations(
self, codegen_state: CodeGenerationState
) -> tuple[Mapping[int, set[str]], Mapping[int, set[str]]]:
from collections import defaultdict

allocated_var_names = []
code_lines = []
code_lines.append(Line())
code_lines.append(Comment("{{{ allocate global temporaries"))
code_lines.append(Line())
from loopy.schedule.tools import (
temporaries_read_in_subkernel,
temporaries_written_in_subkernel,
)
# Find sub-kernels
kernel = codegen_state.kernel
assert kernel.linearization is not None
sched_index = 0

# deal with base storage
storage_variables: defaultdict[str, set[str]] = defaultdict(set)
global_temporaries = self._get_global_temporaries(codegen_state)
for tv in global_temporaries:
if tv.base_storage:

Check warning on line 865 in loopy/target/pyopencl.py

View workflow job for this annotation

GitHub Actions / basedpyright

Type of "base_storage" is unknown (reportUnknownMemberType)
storage_variables[tv.base_storage].add(tv.name)
else:
storage_variables[tv.name].add(tv.name)

# Collapse into blocks
def get_temporaries_in_bounds(
        linearization: Sequence[ScheduleItem],
        lower_bound: int,
        upper_bound: int
        ) -> frozenset[str]:
    """Collect the temporaries touched by subkernel calls in an index range.

    Walks *linearization* from *lower_bound* to *upper_bound* (both
    inclusive) and, for every :class:`CallKernel` item found, gathers the
    names reported by ``temporaries_written_in_subkernel`` and
    ``temporaries_read_in_subkernel`` for that subkernel. The kernel
    queried is the ``kernel`` variable of the enclosing scope.

    :returns: the union of all names gathered; empty if the range
        contains no :class:`CallKernel`.
    """
    accessed: frozenset[str] = frozenset()
    # upper_bound is part of the range, hence the +1.
    for idx in range(lower_bound, upper_bound + 1):
        item = linearization[idx]
        if not isinstance(item, CallKernel):
            continue
        written = temporaries_written_in_subkernel(kernel, item.kernel_name)
        read = temporaries_read_in_subkernel(kernel, item.kernel_name)
        accessed = written.union(read).union(accessed)
    return accessed

def get_leave_loop_index(
        linearization: Sequence[ScheduleItem],
        iname: str,
        starting_index: int
        ) -> int:
    """Find the schedule index at which the loop over *iname* is left.

    Scans *linearization* from *starting_index* to its end.

    :returns: the index of the first :class:`LeaveLoop` item whose
        ``iname`` equals *iname*.
    :raises LoopyError: if no such item exists at or after
        *starting_index*.
    """
    for idx in range(starting_index, len(linearization)):
        item = linearization[idx]
        if not isinstance(item, LeaveLoop):
            continue
        if item.iname == iname:
            return idx
    raise LoopyError("LeaveLoop for iname '%s' not found" % iname)

def get_return_from_kernel_index(
        linearization: Sequence[ScheduleItem],
        kernel_name: str,
        starting_index: int
        ) -> int:
    """Find the schedule index at which subkernel *kernel_name* returns.

    Scans *linearization* from *starting_index* to its end.

    :returns: the index of the first :class:`ReturnFromKernel` item whose
        ``kernel_name`` equals *kernel_name*.
    :raises LoopyError: if no such item exists at or after
        *starting_index*.
    """
    for idx in range(starting_index, len(linearization)):
        item = linearization[idx]
        if (
                isinstance(item, ReturnFromKernel)
                and item.kernel_name == kernel_name):
            return idx
    # A single literal: the previous two adjacent literals were joined
    # without a separating space ("subkernel'name' not found").
    raise LoopyError(
        "ReturnFromKernel for subkernel '%s' not found" % kernel_name)

bounds: dict[int, frozenset[str]] = {}
sched_index = 0
while sched_index < codegen_state.schedule_index_end:
sched_item = kernel.linearization[sched_index]
if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel):
if isinstance(sched_item, CallKernel):
block_end = get_return_from_kernel_index(
kernel.linearization, sched_item.kernel_name, sched_index
)
accessed_temporaries = (
temporaries_written_in_subkernel(kernel, sched_item.kernel_name)
.union(temporaries_read_in_subkernel(
kernel, sched_item.kernel_name)
)
)
else:
block_end = get_leave_loop_index(
kernel.linearization, sched_item.iname, sched_index
)
accessed_temporaries = get_temporaries_in_bounds(
kernel.linearization, sched_index, block_end
)
bounds[sched_index] = accessed_temporaries
sched_index = block_end + 1
else:
sched_index += 1

# forward pass for first accesses
first_accesses: dict[int, set[str]] = {}
unseen_storage_variables = set(storage_variables.keys())
for sched_index in range(0, codegen_state.schedule_index_end):
if (sched_index not in bounds):
continue
sched_item = kernel.linearization[sched_index]
new_temporary_variables = bounds[sched_index]
fwd_new_storage_variables: set[str] = set()
for sv in unseen_storage_variables:
if not storage_variables[sv].isdisjoint(new_temporary_variables):
fwd_new_storage_variables.add(sv)
unseen_storage_variables = (
unseen_storage_variables - fwd_new_storage_variables
)
if (len(fwd_new_storage_variables) > 0):
target_index = sched_index
if target_index in first_accesses:
first_accesses[target_index] = (
first_accesses[target_index].union(fwd_new_storage_variables)
)
else:
first_accesses[target_index] = fwd_new_storage_variables

last_accesses: dict[int, set[str]] = {}
unseen_storage_variables = set(storage_variables.keys())
for sched_index in range(codegen_state.schedule_index_end-1, -1, -1):
if (sched_index not in bounds):
continue
sched_item = kernel.linearization[sched_index]
new_temporary_variables = bounds[sched_index]
back_new_storage_variables: set[str] = set()
for sv in unseen_storage_variables:
if not storage_variables[sv].isdisjoint(new_temporary_variables):
back_new_storage_variables.add(sv)
unseen_storage_variables = (
unseen_storage_variables - back_new_storage_variables
)
if (len(back_new_storage_variables) > 0):
target_index = sched_index
if target_index in last_accesses:
last_accesses[target_index] = (
last_accesses[target_index].union(back_new_storage_variables)
)
else:
last_accesses[target_index] = back_new_storage_variables
return (first_accesses, last_accesses)

def get_temporary_allocation(
self,
codegen_state: CodeGenerationState,
temporary_variable_names: Iterable[str]
) -> genpy.Suite:
from genpy import Assign, Suite
from pymbolic.mapper.stringifier import PREC_NONE
kernel = codegen_state.kernel
ecm = self.get_expression_to_code_mapper(codegen_state)
allocation_code_lines: list[Assign] = []
for tv_name in temporary_variable_names:
tv = kernel.temporary_variables[tv_name]
if not tv.base_storage:
if tv.nbytes:
# NB: This does not prevent all zero-size allocations,
# as sizes are parametric, and allocation size
# could turn out to be zero at runtime.
nbytes_str = ecm(tv.nbytes, PREC_NONE, "i")
allocated_var_names.append(tv.name)
code_lines.append(Assign(tv.name,
allocation_code_lines.append(Assign(tv.name,
f"allocator({nbytes_str})"))
else:
code_lines.append(Assign(tv.name, "None"))

code_lines.append(Assign("_global_temporaries", "[{tvs}]".format(
tvs=", ".join(tv for tv in allocated_var_names))))

code_lines.append(Line())
code_lines.append(Comment("}}}"))
code_lines.append(Line())
allocation_code_lines.append(Assign(tv.name, "None"))
return Suite(allocation_code_lines)

return code_lines
def get_temporary_deallocation(
        self,
        codegen_state: CodeGenerationState,
        temporary_variable_names: Iterable[str]
        ) -> genpy.Suite:
    """Build the host-side code that releases the named temporaries.

    For every name in *temporary_variable_names*, one statement of the
    form ``if <name> is not None: <name>.release()`` is emitted, so a
    name that is bound to ``None`` at run time is skipped rather than
    released.

    :arg codegen_state: not used by this method.
    :arg temporary_variable_names: names to release. They are emitted in
        sorted order so the generated code does not depend on the
        iteration order of the iterable (e.g. an unordered ``set``).
    :returns: a :class:`genpy.Suite` with one statement per name; empty
        if no names are given.
    """
    from genpy import Statement, Suite

    return Suite([
        Statement(f"if {tv_name} is not None: {tv_name}.release()")
        for tv_name in sorted(temporary_variable_names)
    ])

def get_temporary_decl_at_index(
        self, codegen_state: CodeGenerationState, sched_index: int
        ) -> tuple[genpy.Suite, genpy.Suite]:
    """Return the code to place before and after schedule item *sched_index*.

    Looks *sched_index* up in the two mappings returned by
    :meth:`get_temporary_decl_locations`.

    :returns: a tuple ``(prefix, postfix)``. *prefix* is the result of
        :meth:`get_temporary_allocation` for the names recorded in the
        first mapping at this index; *postfix* is the result of
        :meth:`get_temporary_deallocation` for the names recorded in the
        second. Either is an empty :class:`genpy.Suite` when the index
        has no entry in the corresponding mapping.
    """
    from genpy import Suite

    # NOTE: the mappings are recomputed on every call.
    alloc_at, release_at = self.get_temporary_decl_locations(codegen_state)

    if sched_index in alloc_at:
        prefix = self.get_temporary_allocation(
            codegen_state, alloc_at[sched_index])
    else:
        prefix = Suite()

    if sched_index in release_at:
        postfix = self.get_temporary_deallocation(
            codegen_state, release_at[sched_index])
    else:
        postfix = Suite()

    return (prefix, postfix)

def get_kernel_call(
self, codegen_state: CodeGenerationState,
subkernel_name: str,
gsize: tuple[Expression, ...], lsize: tuple[Expression, ...]
) -> genpy.Suite:
from genpy import Assert, Assign, Comment, Line, Suite
from pymbolic.mapper.stringifier import PREC_NONE

kernel = codegen_state.kernel
ecm = self.get_expression_to_code_mapper(codegen_state)

from loopy.schedule.tools import get_subkernel_arg_info
skai = get_subkernel_arg_info(kernel, subkernel_name)

ecm = self.get_expression_to_code_mapper(codegen_state)

if not gsize:
gsize = (1,)
if not lsize:
Expand Down Expand Up @@ -963,7 +1117,6 @@
overflow_args_code = Suite([])

import pyopencl.version as cl_ver
from pymbolic.mapper.stringifier import PREC_NONE
if cl_ver.VERSION < (2020, 2):
from warnings import warn
warn("Your kernel invocation will likely fail because your "
Expand Down
Loading