feat[cuda]: patches kernel (#6231)

a10y · web-flow · commit 782d65014aa0 · 2026-02-02T15:09:07.000Z
Apply patches in-place for BP and ALP.

Added unit tests, and also added patches to the existing BP/ALP tests to
verify that patching works.

---------

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs
@@ -101,7 +101,7 @@ fn launch_dict_kernel_timed<V: cudarc::driver::DeviceRepr, I: cudarc::driver::De
     let events = vortex_cuda::launch_cuda_kernel!(
         execution_ctx: cuda_ctx,
         module: "dict",
-        ptypes: &[value_ptype.to_string().as_str(), code_ptype.to_string().as_str()],
+        ptypes: &[value_ptype, code_ptype],
         launch_args: [codes_view, codes_len_u64, values_view, output_view],
         event_recording: CU_EVENT_BLOCKING_SYNC,
         array_len: codes_len
diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs
@@ -89,7 +89,7 @@ fn launch_for_kernel_timed_u8(
     let events = vortex_cuda::launch_cuda_kernel!(
         execution_ctx: cuda_ctx,
         module: "for",
-        ptypes: &[for_array.ptype().to_string().as_str()],
+        ptypes: &[for_array.ptype()],
         launch_args: [device_data, reference, array_len_u64],
         event_recording: CU_EVENT_BLOCKING_SYNC,
         array_len: for_array.len()
@@ -110,7 +110,7 @@ fn launch_for_kernel_timed_u16(
     let events = vortex_cuda::launch_cuda_kernel!(
         execution_ctx: cuda_ctx,
         module: "for",
-        ptypes: &[for_array.ptype().to_string().as_str()],
+        ptypes: &[for_array.ptype()],
         launch_args: [device_data, reference, array_len_u64],
         event_recording: CU_EVENT_BLOCKING_SYNC,
         array_len: for_array.len()
@@ -131,7 +131,7 @@ fn launch_for_kernel_timed_u32(
     let events = vortex_cuda::launch_cuda_kernel!(
         execution_ctx: cuda_ctx,
         module: "for",
-        ptypes: &[for_array.ptype().to_string().as_str()],
+        ptypes: &[for_array.ptype()],
         launch_args: [device_data, reference, array_len_u64],
         event_recording: CU_EVENT_BLOCKING_SYNC,
         array_len: for_array.len()
@@ -152,7 +152,7 @@ fn launch_for_kernel_timed_u64(
     let events = vortex_cuda::launch_cuda_kernel!(
         execution_ctx: cuda_ctx,
         module: "for",
-        ptypes: &[for_array.ptype().to_string().as_str()],
+        ptypes: &[for_array.ptype()],
         launch_args: [device_data, reference, array_len_u64],
         event_recording: CU_EVENT_BLOCKING_SYNC,
         array_len: for_array.len()
diff --git a/vortex-cuda/kernels/src/config.cuh b/vortex-cuda/kernels/src/config.cuh
@@ -3,10 +3,17 @@
 
 #pragma once
 
+#include <stdint.h>
+
 // Kernel launch configuration constants.
 // Must match the Rust launch config in src/kernel/mod.rs.
 //
 // With THREADS_PER_BLOCK=64 (set by Rust) and ELEMENTS_PER_THREAD=32:
 //   elements_per_block = 64 * 32 = 2048
 //   grid_dim = ceil(array_len / 2048)
 constexpr uint32_t ELEMENTS_PER_THREAD = 32;
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+#define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len))
+#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len))
diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include "config.cuh"
+
+// Apply patches in place: values[patchIndices[i]] = patchValues[i] for every i < patchesLen.
+// Each thread handles the chunk [START_ELEM, STOP_ELEM) of at most ELEMENTS_PER_THREAD patches.
+// TODO(aduffy): this is very naive. In the future we need to transpose the patches, see G-ALP paper.
+template<typename ValueT, typename IndexT>
+__device__ void patches(
+    ValueT *const values,
+    const IndexT *const patchIndices,
+    const ValueT *const patchValues,
+    uint64_t patchesLen
+) {
+    const uint64_t worker = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    const uint64_t startElem = START_ELEM(worker, patchesLen);
+    const uint64_t stopElem = STOP_ELEM(worker, patchesLen);
+    // stopElem must come from STOP_ELEM; using START_ELEM yields an empty range and no patching.
+    if (startElem >= patchesLen) {
+        return;
+    }
+
+    for (uint64_t idx = startElem; idx < stopElem; idx++) {
+        const IndexT patchIdx = patchIndices[idx];
+        const ValueT patchVal = patchValues[idx];
+
+        const size_t valueIdx = static_cast<size_t>(patchIdx);
+        values[valueIdx] = patchVal;
+    }
+}
+
+#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \
+extern "C" __global__ void patches_##value_suffix##_##index_suffix( \
+    ValueT *const values, \
+    const IndexT *const patchIndices, \
+    const ValueT *const patchValues, \
+    uint64_t patchesLen \
+) { \
+    patches(values, patchIndices, patchValues, patchesLen); \
+}
+
+#define GENERATE_PATCHES_KERNEL_FOR_VALUE(ValueT, value_suffix) \
+     GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \
+     GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \
+     GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \
+     GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64)
+
+
+GENERATE_PATCHES_KERNEL_FOR_VALUE(uint8_t, u8)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(uint16_t, u16)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(uint32_t, u32)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(uint64_t, u64)
+
+GENERATE_PATCHES_KERNEL_FOR_VALUE(int8_t, i8)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(int16_t, i16)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(int32_t, i32)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(int64_t, i64)
+
+GENERATE_PATCHES_KERNEL_FOR_VALUE(float, f32)
+GENERATE_PATCHES_KERNEL_FOR_VALUE(double, f64)
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
@@ -27,6 +27,7 @@ use crate::stream::await_stream_callback;
 /// A [`DeviceBuffer`] wrapping a CUDA GPU allocation.
 ///
 /// Like the host `BufferHandle` variant, all slicing/referencing works in terms of byte units.
+#[derive(Clone)]
 pub struct CudaDeviceBuffer {
     allocation: Arc<dyn private::DeviceAllocation>,
     /// Offset in bytes from the start of the allocation
@@ -39,8 +40,6 @@ pub struct CudaDeviceBuffer {
     alignment: Alignment,
 }
 
-// We can call the sys methods, it's just a lot of extra code...fuck that lol
-
 mod private {
     use std::fmt::Debug;
     use std::sync::Arc;
diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -129,7 +129,7 @@ async fn execute_dict_prim_typed<V: DeviceRepr + NativePType, I: DeviceRepr + Na
     let _cuda_events = crate::launch_cuda_kernel!(
         execution_ctx: ctx,
         module: "dict",
-        ptypes: &[value_ptype.to_string().as_str(), I::PTYPE.to_string().as_str()],
+        ptypes: &[value_ptype, I::PTYPE],
         launch_args: [codes_view, codes_len_u64, values_view, output_view],
         event_recording: cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING,
         array_len: codes_len
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -20,6 +20,7 @@ use vortex_array::arrays::PrimitiveArrayParts;
 use vortex_array::buffer::BufferHandle;
 use vortex_cuda_macros::cuda_tests;
 use vortex_dtype::NativePType;
+use vortex_dtype::match_each_unsigned_integer_ptype;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 
@@ -28,6 +29,7 @@ use crate::CudaDeviceBuffer;
 use crate::executor::CudaArrayExt;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
+use crate::kernel::patches::execute_patches;
 use crate::launch_cuda_kernel_impl;
 
 /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
@@ -88,20 +90,33 @@ where
     // Load kernel function
     let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
     let cuda_function = ctx.load_function_ptype("alp", &kernel_ptypes)?;
-    let mut launch_builder = ctx.launch_builder(&cuda_function);
+    {
+        let mut launch_builder = ctx.launch_builder(&cuda_function);
+
+        // Build launch args: input, output, f, e, length
+        launch_builder.arg(&input_view);
+        launch_builder.arg(&output_view);
+        launch_builder.arg(&f);
+        launch_builder.arg(&e);
+        launch_builder.arg(&array_len_u64);
+
+        // Launch kernel
+        let _cuda_events =
+            launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?;
+    }
 
-    // Build launch args: input, output, f, e, length
-    launch_builder.arg(&input_view);
-    launch_builder.arg(&output_view);
-    launch_builder.arg(&f);
-    launch_builder.arg(&e);
-    launch_builder.arg(&array_len_u64);
+    // Check if there are any patches to decode here
+    let output_buf = if let Some(patches) = array.patches() {
+        match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
+            execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
+        })
+    } else {
+        output_buf
+    };
 
-    // Launch kernel
-    let _cuda_events =
-        launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?;
+    // TODO(aduffy): scatter patch values validity. There are several places we'll need to start
+    //  handling validity.
 
-    // Build result with newly allocated buffer
     let output_handle = BufferHandle::new_device(Arc::new(output_buf));
     Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
         output_handle,
@@ -117,8 +132,10 @@ mod tests {
     use vortex_array::IntoArray;
     use vortex_array::arrays::PrimitiveArray;
     use vortex_array::assert_arrays_eq;
-    use vortex_array::validity::Validity::NonNullable;
+    use vortex_array::patches::Patches;
+    use vortex_array::validity::Validity;
     use vortex_buffer::Buffer;
+    use vortex_buffer::buffer;
     use vortex_error::VortexExpect;
     use vortex_session::VortexSession;
 
@@ -138,13 +155,24 @@ mod tests {
         let encoded_data: Vec<i32> = vec![100, 200, 300, 400, 500];
         let exponents = Exponents { e: 0, f: 2 }; // multiply by 100
 
+        // Patches
+        let patches = Patches::new(
+            5,
+            0,
+            PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(),
+            PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(),
+            None,
+        )
+        .unwrap();
+
         let alp_array = ALPArray::try_new(
-            PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable).into_array(),
+            PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable)
+                .into_array(),
             exponents,
-            None,
+            Some(patches),
         )?;
 
-        let cpu_result = alp_array.to_canonical()?;
+        let cpu_result = alp_array.to_canonical()?.into_array();
 
         let gpu_result = ALPExecutor
             .execute(alp_array.to_array(), &mut cuda_ctx)
@@ -154,7 +182,7 @@ mod tests {
             .await?
             .into_array();
 
-        assert_arrays_eq!(cpu_result.into_array(), gpu_result);
+        assert_arrays_eq!(cpu_result, gpu_result);
 
         Ok(())
     }
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -2,6 +2,7 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt::Debug;
+use std::sync::Arc;
 
 use async_trait::async_trait;
 use cudarc::driver::DeviceRepr;
@@ -16,6 +17,8 @@ use vortex_array::buffer::DeviceBufferExt;
 use vortex_cuda_macros::cuda_tests;
 use vortex_dtype::NativePType;
 use vortex_dtype::match_each_integer_ptype;
+use vortex_dtype::match_each_unsigned_integer_ptype;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_ensure;
 use vortex_error::vortex_err;
@@ -29,6 +32,7 @@ use crate::CudaDeviceBuffer;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
 use crate::kernel::launch_cuda_kernel_with_config;
+use crate::kernel::patches::execute_patches;
 
 /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
 #[derive(Debug)]
@@ -74,7 +78,6 @@ where
     } = array.into_parts();
 
     vortex_ensure!(len > 0, "Non empty array");
-    vortex_ensure!(patches.is_none(), "Patches not supported");
     let offset = offset as usize;
 
     let device_input: BufferHandle = if packed.is_on_device() {
@@ -97,27 +100,46 @@ where
     let thread_count = if bits == 64 { 16 } else { 32 };
     let suffixes: [&str; _] = [&format!("{bit_width}bw"), &format!("{thread_count}t")];
     let cuda_function = ctx.load_function(&format!("bit_unpack_{}", bits), &suffixes)?;
-    let mut launch_builder = ctx.launch_builder(&cuda_function);
 
-    // Build launch args: input, output, f, e, length
-    launch_builder.arg(&input_view);
-    launch_builder.arg(&output_view);
+    {
+        let mut launch_builder = ctx.launch_builder(&cuda_function);
 
-    let num_blocks = u32::try_from(len.div_ceil(1024))?;
+        // Build launch args: input, output
+        launch_builder.arg(&input_view);
+        launch_builder.arg(&output_view);
 
-    let config = LaunchConfig {
-        grid_dim: (num_blocks, 1, 1),
-        block_dim: (thread_count, 1, 1),
-        shared_mem_bytes: 0,
-    };
+        let num_blocks = u32::try_from(len.div_ceil(1024))?;
+
+        let config = LaunchConfig {
+            grid_dim: (num_blocks, 1, 1),
+            block_dim: (thread_count, 1, 1),
+            shared_mem_bytes: 0,
+        };
 
-    // Launch kernel
-    let _cuda_events =
-        launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?;
+        // Launch kernel
+        let _cuda_events =
+            launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?;
+    }
+
+    let output_handle = match patches {
+        None => BufferHandle::new_device(output_buf.slice_typed::<A>(offset..(offset + len))),
+        Some(p) => {
+            let output_buf = output_buf.slice_typed::<A>(offset..(offset + len));
+            let buf = output_buf
+                .as_any()
+                .downcast_ref::<CudaDeviceBuffer>()
+                .vortex_expect("we created this as CudaDeviceBuffer")
+                .clone();
+
+            let patched_buf = match_each_unsigned_integer_ptype!(p.indices_ptype()?, |I| {
+                execute_patches::<A, I>(p, buf, ctx).await?
+            });
+
+            BufferHandle::new_device(Arc::new(patched_buf))
+        }
+    };
 
     // Build result with newly allocated buffer
-    let output_handle =
-        BufferHandle::new_device(output_buf.slice_typed::<A>(offset..(offset + len)));
     Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
         output_handle,
         A::PTYPE,
@@ -141,6 +163,34 @@ mod tests {
     use crate::CanonicalCudaExt;
     use crate::session::CudaSession;
 
+    #[test]
+    fn test_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        let array = PrimitiveArray::new((0u16..=513).collect::<Buffer<_>>(), NonNullable);
+
+        // Last two items should be patched
+        let bp_with_patches = BitPackedArray::encode(array.as_ref(), 9)?;
+        assert!(bp_with_patches.patches().is_some());
+
+        let cpu_result = bp_with_patches.to_canonical()?.into_array();
+
+        let gpu_result = block_on(async {
+            BitPackedExecutor
+                .execute(bp_with_patches.to_array(), &mut cuda_ctx)
+                .await
+                .vortex_expect("GPU decompression failed")
+                .into_host()
+                .await
+                .map(|a| a.into_array())
+        })?;
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+
+        Ok(())
+    }
+
     #[rstest]
     #[case::bw_1(1)]
     #[case::bw_2(2)]
diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs
diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs