
Commit a094bde

Caching improvements
1 parent bbfe28b commit a094bde

5 files changed: 27 additions, 21 deletions

python/cuda_cccl/cuda/compute/_caching.py

Lines changed: 10 additions & 5 deletions

@@ -14,7 +14,7 @@
 except ImportError:
     from cuda.core.experimental import Device

-from ._utils.protocols import get_dtype, get_shape
+from ._utils.protocols import get_dtype, get_shape, is_device_array
 from .typing import DeviceArrayLike, GpuStruct

 # Registry thet maps type -> key function for extracting cache key
@@ -45,9 +45,16 @@ def _key_for(value: Any) -> Hashable:
     if value_type in _KEY_FUNCTIONS:
         return _KEY_FUNCTIONS[value_type](value)

+    # DeviceArrayLike is not a runtime-checkable protocol, so
+    # we cannot isinstance() with it.
+    if is_device_array(value):
+        return _KEY_FUNCTIONS[DeviceArrayLike](value)
+
     # Check for instance match (handles inheritance)
     for registered_type, keyer in _KEY_FUNCTIONS.items():
-        if isinstance(value, registered_type):
+        if registered_type is not DeviceArrayLike and isinstance(
+            value, registered_type
+        ):
             return keyer(value)

     # Fallback: use value directly (assumes it's hashable)
@@ -138,12 +145,10 @@ def register(self, type_: type, key_function: Callable[[Any], Hashable]) -> None


 def _make_hashable(value):
-    from .typing import DeviceArrayLike
-
     # duck-type check for numba.cuda.CUDADispatcher:
     if hasattr(value, "py_func") and callable(value.py_func):
         return CachableFunction(value.py_func)
-    elif isinstance(value, DeviceArrayLike):
+    elif is_device_array(value):
         # Ops with device arrays in globals/closures will be handled
         # by stateful op machinery, which enables updating the state
         # (pointers). Thus, we only cache on the dtype and shape of
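The key change here is that cache-key extraction no longer relies on isinstance() against the DeviceArrayLike protocol. The standalone sketch below (not part of the commit; names are illustrative) shows why: isinstance() raises TypeError for a Protocol that is not decorated with @runtime_checkable, whereas a hasattr() duck-type check on __cuda_array_interface__ works for any device array.

# Sketch only: illustrates the TypeError and the hasattr()-based check.
from typing import Protocol


class DeviceArrayLike(Protocol):
    __cuda_array_interface__: dict


class FakeDeviceArray:
    # Any object exposing the CUDA Array Interface qualifies as a device array.
    __cuda_array_interface__ = {"shape": (4,), "typestr": "<f4", "version": 3}


def is_device_array(obj: object) -> bool:
    return hasattr(obj, "__cuda_array_interface__")


try:
    isinstance(FakeDeviceArray(), DeviceArrayLike)
except TypeError as exc:
    print("isinstance() rejected:", exc)

print(is_device_array(FakeDeviceArray()))  # True
print(is_device_array([1, 2, 3]))          # False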

python/cuda_cccl/cuda/compute/_jit.py

Lines changed: 3 additions & 2 deletions

@@ -34,6 +34,7 @@
 from numba.extending import lower_builtin, lower_cast

 from ._caching import CachableFunction, cache_with_registered_key_functions
+from ._utils.protocols import is_device_array
 from .op import Op, OpAdapter
 from .typing import DeviceArrayLike

@@ -768,7 +769,7 @@ def _detect_device_array_globals(func: Callable) -> List[Tuple[str, object]]:

     for name in code.co_names:
         val = func.__globals__.get(name)
-        if val is not None and isinstance(val, DeviceArrayLike):
+        if val is not None and hasattr(val, "__cuda_array_interface__"):
             state_arrays.append((name, val))

     return state_arrays
@@ -795,7 +796,7 @@ def _detect_device_array_closures(func: Callable) -> List[Tuple[str, object]]:
     for name, cell in zip(code.co_freevars, closure):
         try:
             val = cell.cell_contents
-            if isinstance(val, DeviceArrayLike):
+            if is_device_array(val):
                 state_arrays.append((name, val))
         except ValueError:
             # Cell is empty
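These two helpers find device arrays that an op references as state. The simplified, self-contained sketch below (an assumption about the shape of the logic, not the library's exact code) shows how globals and closure cells can be scanned with the same duck-type check:

# Sketch only: collect (name, array) pairs referenced by a function.
def detect_device_array_state(func):
    state_arrays = []
    code = func.__code__

    # Globals referenced by name in the function's bytecode.
    for name in code.co_names:
        val = func.__globals__.get(name)
        if val is not None and hasattr(val, "__cuda_array_interface__"):
            state_arrays.append((name, val))

    # Free variables captured in closure cells.
    for name, cell in zip(code.co_freevars, func.__closure__ or ()):
        try:
            val = cell.cell_contents
        except ValueError:
            continue  # empty cell
        if hasattr(val, "__cuda_array_interface__"):
            state_arrays.append((name, val))

    return state_arrays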

python/cuda_cccl/cuda/compute/_utils/protocols.py

Lines changed: 5 additions & 0 deletions

@@ -14,6 +14,11 @@
 from ..typing import DeviceArrayLike, GpuStruct


+def is_device_array(obj: object) -> bool:
+    """Check if an object implements the `__cuda_array_interface__` protocol."""
+    return hasattr(obj, "__cuda_array_interface__")
+
+
 def get_data_pointer(arr: DeviceArrayLike) -> int:
     # TODO: these are fast paths for CuPy and PyTorch until
     # we have a more general solution.
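A quick usage sketch of the new helper (assuming CuPy is installed; the import path is the package-internal module shown above, so treat it as illustrative): any producer of the CUDA Array Interface passes the check, while host-memory objects do not.

import numpy as np

from cuda.compute._utils.protocols import is_device_array

try:
    import cupy as cp

    print(is_device_array(cp.zeros(8)))  # True: CuPy arrays expose __cuda_array_interface__
except ImportError:
    pass

print(is_device_array(np.zeros(8)))  # False: NumPy arrays live in host memory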

python/cuda_cccl/cuda/compute/algorithms/_scan.py

Lines changed: 6 additions & 2 deletions

@@ -16,7 +16,11 @@
     set_cccl_iterator_state,
     to_cccl_value_state,
 )
-from .._utils.protocols import get_data_pointer, validate_and_get_stream
+from .._utils.protocols import (
+    get_data_pointer,
+    is_device_array,
+    validate_and_get_stream,
+)
 from .._utils.temp_storage_buffer import TempStorageBuffer
 from ..iterators._iterators import IteratorBase
 from ..op import OpAdapter, OpKind, make_op_adapter
@@ -29,7 +33,7 @@ def get_init_kind(
     match init_value:
         case None:
             return _bindings.InitKind.NO_INIT
-        case _ if isinstance(init_value, DeviceArrayLike):
+        case _ if is_device_array(init_value):
            return _bindings.InitKind.FUTURE_VALUE_INIT
         case _:
             return _bindings.InitKind.VALUE_INIT
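The dispatch in get_init_kind distinguishes three cases: no initial value, an initial value passed as a device array (mapped to FUTURE_VALUE_INIT), and an ordinary host value. The standalone sketch below uses a stand-in enum, since _bindings.InitKind appears to come from the compiled bindings:

# Sketch only: stand-in enum and dispatch mirroring the logic above.
from enum import Enum, auto


class InitKind(Enum):
    NO_INIT = auto()
    VALUE_INIT = auto()
    FUTURE_VALUE_INIT = auto()


def get_init_kind(init_value):
    match init_value:
        case None:
            return InitKind.NO_INIT
        case _ if hasattr(init_value, "__cuda_array_interface__"):
            return InitKind.FUTURE_VALUE_INIT
        case _:
            return InitKind.VALUE_INIT


print(get_init_kind(None))  # InitKind.NO_INIT
print(get_init_kind(3.0))   # InitKind.VALUE_INIT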

python/cuda_cccl/cuda/compute/typing.py

Lines changed: 3 additions & 12 deletions

@@ -3,12 +3,11 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

-from typing import Protocol, runtime_checkable
+from typing import Protocol

-import numpy as np
+from .struct import _Struct


-@runtime_checkable
 class DeviceArrayLike(Protocol):
     """
     Objects representing a device array, having a `.__cuda_array_interface__`
@@ -26,12 +25,4 @@ class StreamLike(Protocol):
     def __cuda_stream__(self) -> tuple[int, int]: ...


-@runtime_checkable
-class GpuStruct(Protocol):
-    """
-    Type of instances of structs created with gpu_struct().
-    """
-
-    _data: np.ndarray
-    __array_interface__: dict
-    dtype: np.dtype
+GpuStruct = _Struct
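With @runtime_checkable removed, DeviceArrayLike remains a purely static protocol: it still works as a type annotation for mypy/pyright, while runtime checks go through is_device_array(). A hedged sketch of the intended split (module paths are the package-internal ones shown above):

# Sketch only: static annotation plus runtime duck-type check.
from cuda.compute.typing import DeviceArrayLike
from cuda.compute._utils.protocols import is_device_array


def scale_inplace(arr: DeviceArrayLike, factor: float) -> None:
    # Static checkers verify the annotation; at runtime we duck-type.
    if not is_device_array(arr):
        raise TypeError("expected an object exposing __cuda_array_interface__")
    ...  # launch a kernel on `arr`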
