From e54c3cbdcbcf11a274e64b5a6ae3ecc8d0b17cd3 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 15:03:48 -0500 Subject: [PATCH 01/22] initial work toward export to device interface Signed-off-by: Andrew Duffy --- Cargo.lock | 2 + vortex-cuda/Cargo.toml | 2 + vortex-cuda/src/arrow/canonical.rs | 110 ++++++++++++++++++++++++++ vortex-cuda/src/arrow/mod.rs | 119 +++++++++++++++++++++++++++++ vortex-cuda/src/device_buffer.rs | 22 +++++- vortex-cuda/src/lib.rs | 1 + 6 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 vortex-cuda/src/arrow/canonical.rs create mode 100644 vortex-cuda/src/arrow/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 1d3854d23ef..5a830952573 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10420,6 +10420,8 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "arc-swap", + "arrow-data 57.2.0", + "arrow-schema 57.2.0", "async-trait", "codspeed-criterion-compat-walltime", "cudarc", diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index ed85085545a..30db4d6a30a 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -22,6 +22,8 @@ _test-harness = [] [dependencies] arc-swap = { workspace = true } +arrow-data = { workspace = true, features = ["ffi"] } +arrow-schema = { workspace = true, features = ["ffi"] } async-trait = { workspace = true } cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs new file mode 100644 index 00000000000..902cafba5c8 --- /dev/null +++ b/vortex-cuda/src/arrow/canonical.rs @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use cudarc::driver::sys; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::PrimitiveArrayParts; +use vortex_array::buffer::BufferHandle; +use vortex_array::validity::Validity; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::CudaBufferExt; +use crate::CudaExecutionCtx; +use crate::arrow::ArrowArray; +use crate::arrow::ArrowDeviceArray; +use crate::arrow::CudaDeviceArrayExecute; +use crate::arrow::CudaPrivateData; +use crate::arrow::DeviceType; +use crate::executor::CudaArrayExt; + +// Impl it for the execution context instead here...I think this is right? +impl CudaDeviceArrayExecute for Canonical { + async fn execute( + &self, + array: ArrayRef, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult { + let cuda_array = array.execute_cuda(ctx).await?; + + match cuda_array { + Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await, + c => todo!("implement support for exporting {}", c.dtype()), + } + } +} + +async fn export_primitive( + array: PrimitiveArray, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { + let len = array.len(); + let PrimitiveArrayParts { + buffer, + ptype, + validity, + .. + } = array.into_parts(); + + unsafe extern "C" fn release(array: *mut ArrowArray) { + // SAFETY: this is only safe if the caller provides a valid pointer to an `ArrowArray`. + drop(unsafe { Box::from_raw(array) }); + } + + let null_count = match validity { + Validity::NonNullable | Validity::AllValid => 0, + Validity::AllInvalid => len, + Validity::Array(_) => { + vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") + } + }; + + // TODO(aduffy): currently the null buffer is always empty, in the future we will need + // to pass it. + let buffers: Box<[Option]> = vec![None, Some(buffer)].into_boxed_slice(); + + let buffer_ptrs: Box<[sys::CUdeviceptr]> = buffers + .iter() + .map(|buf| { + match buf { + None => { + // null pointer + Ok(sys::CUdeviceptr::default()) + } + Some(handle) => handle.cuda_device_ptr(), + } + }) + .collect::>>()? + .into_boxed_slice(); + + let mut private_data = Box::new(CudaPrivateData { + cuda_stream: Arc::clone(ctx.stream()), + buffers, + buffer_ptrs, + }); + + let arrow_array = ArrowArray { + length: array.len() as i64, + null_count: null_count as i64, + offset: 0, + // 1 (optional) buffer for nulls, one buffer for data + n_buffers: 2, + buffers: private_data.buffer_ptrs.as_mut_ptr(), + n_children: 0, + children: std::ptr::null_mut(), + release: Some(release), + dictionary: std::ptr::null_mut(), + private_data: Box::into_raw(private_data).cast(), + }; + + Ok(ArrowDeviceArray { + array: arrow_array, + device_id: 0, + device_type: DeviceType::Cuda, + sync_event: None, + }) +} diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs new file mode 100644 index 00000000000..783da3196e5 --- /dev/null +++ b/vortex-cuda/src/arrow/mod.rs @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! This module implements the Arrow C Data Device Interface extension for sharing GPU-resident +//! data. +//! +//! This is an extension to the Arrow C Data Interface. +//! +//! More documentation at + +mod canonical; + +use std::ffi::c_void; +use std::ptr::NonNull; +use std::sync::Arc; + +use cudarc::driver::CudaStream; +use cudarc::driver::sys; +use cudarc::runtime::sys::cudaEvent_t; +use vortex_array::ArrayRef; +use vortex_array::Executable; +use vortex_array::buffer::BufferHandle; +use vortex_error::VortexResult; + +use crate::CudaExecutionCtx; +use crate::executor::CudaArrayExt; +use crate::executor::CudaExecute; + +#[derive(Debug, Copy, Clone)] +#[repr(i32)] +pub enum DeviceType { + /// Host-resident data buffer + Cpu = 1, + Cuda = 2, + CudaHost = 3, + // OpenCL = 4, + // Vulkan = 7, + // Metal = 8, + // Vpi = 9, + // Rocm = 10, + // RocmHost = 11, + CudaManaged = 13, + // OneApi = 14, + // WebGPU = 15, + // Hexagon = 16, +} + +pub type SyncEvent = Option>; + +/// The C Data Device Interface representation of an Arrow array. +/// +/// This array contains on-device pointers to Arrow array data, along with a synchronization +/// event that the client must wait on. +#[repr(C)] +#[derive(Debug)] +pub(crate) struct ArrowDeviceArray { + array: ArrowArray, + device_id: i64, + device_type: DeviceType, + sync_event: SyncEvent, +} + +/// An FFI-compatible version of the ArrowArray that holds pointers to device buffers. +#[repr(C)] +#[derive(Debug)] +pub(crate) struct ArrowArray { + length: i64, + null_count: i64, + offset: i64, + n_buffers: i64, + n_children: i64, + buffers: *mut sys::CUdeviceptr, + children: *mut *mut ArrowArray, + dictionary: *mut ArrowArray, + release: Option, + // When exported, this MUST contain everything that is owned by this array. + // for example, any buffer pointed to in `buffers` must be here, as well + // as the `buffers` pointer itself. + // In other words, everything in [FFI_ArrowArray] must be owned by + // `private_data` and can assume that they do not outlive `private_data`. + private_data: *mut c_void, +} + +impl ArrowArray { + pub fn empty() -> Self { + Self { + length: 0, + null_count: 0, + offset: 0, + n_buffers: 0, + n_children: 0, + buffers: std::ptr::null_mut(), + children: std::ptr::null_mut(), + dictionary: std::ptr::null_mut(), + release: None, + private_data: std::ptr::null_mut(), + } + } +} + +pub(crate) struct CudaPrivateData { + /// Hold a reference to the CudaStream so that it stays alive even after CudaExecutionCtx + /// has been dropped. + pub(crate) cuda_stream: Arc, + /// The single boxed slice which owns all buffers that the Rust code allocated on the device. + pub(crate) buffers: Box<[Option]>, + /// Boxed slice of buffer pointers. We return a pointer to the start of this allocation over + /// the interface, so we hold it here so the Box contents are not freed. + pub(crate) buffer_ptrs: Box<[sys::CUdeviceptr]>, +} + +/// Trait implemented for types that can be exported to [`ArrowDeviceArray`]. +pub(crate) trait CudaDeviceArrayExecute { + async fn execute( + &self, + array: ArrayRef, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult; +} diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs index 520f4551e96..363f160c815 100644 --- a/vortex-cuda/src/device_buffer.rs +++ b/vortex-cuda/src/device_buffer.rs @@ -12,6 +12,7 @@ use cudarc::driver::DevicePtr; use cudarc::driver::DeviceRepr; use cudarc::driver::sys; use futures::future::BoxFuture; +use futures::future::ok; use vortex_array::buffer::BufferHandle; use vortex_array::buffer::DeviceBuffer; use vortex_buffer::Alignment; @@ -124,8 +125,15 @@ pub trait CudaBufferExt { /// /// # Errors /// - /// Returns an error if the buffer is not on the device. + /// Returns an error if the buffer is not a CUDA buffer. fn cuda_view(&self) -> VortexResult>; + + /// Returns the on-device pointer for the start of the buffer handle. + /// + /// # Errors + /// + /// Returns an error if the buffer is not a CUDA buffer. + fn cuda_device_ptr(&self) -> VortexResult; } impl CudaBufferExt for BufferHandle { @@ -141,6 +149,18 @@ impl CudaBufferExt for BufferHandle { Ok(cuda_buf.as_view::()) } + + fn cuda_device_ptr(&self) -> VortexResult { + let ptr = self + .as_device_opt() + .ok_or_else(|| vortex_err!("Buffer is not on device"))? + .as_any() + .downcast_ref::() + .ok_or_else(|| vortex_err!("expected CudaDeviceBuffer"))? + .device_ptr; + + Ok(ptr) + } } impl Debug for CudaDeviceBuffer { diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index c186ea687c0..485fcc80fa8 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -5,6 +5,7 @@ use std::process::Command; +mod arrow; mod canonical; mod device_buffer; pub mod executor; From 113574815650200ee1d2f6d325a3ad0dfd692a35 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 15:53:50 -0500 Subject: [PATCH 02/22] starting on cudf integration tests Signed-off-by: Andrew Duffy --- Cargo.lock | 7 + Cargo.toml | 3 +- vortex-cuda/cudf-test/Cargo.toml | 25 +++ vortex-cuda/cudf-test/build.rs | 83 +++++++++ vortex-cuda/cudf-test/cpp/CMakeLists.txt | 46 +++++ vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp | 182 +++++++++++++++++++ vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h | 120 ++++++++++++ vortex-cuda/cudf-test/src/lib.rs | 182 +++++++++++++++++++ vortex-cuda/src/arrow/canonical.rs | 38 ++-- vortex-cuda/src/arrow/mod.rs | 3 + 10 files changed, 673 insertions(+), 16 deletions(-) create mode 100644 vortex-cuda/cudf-test/Cargo.toml create mode 100644 vortex-cuda/cudf-test/build.rs create mode 100644 vortex-cuda/cudf-test/cpp/CMakeLists.txt create mode 100644 vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp create mode 100644 vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h create mode 100644 vortex-cuda/cudf-test/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 5a830952573..3e0119b8621 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10463,6 +10463,13 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "vortex-cudf-test" +version = "0.1.0" +dependencies = [ + "bindgen", +] + [[package]] name = "vortex-cxx" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index c63e310ab97..a87b67a052e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,9 +25,10 @@ members = [ "vortex-datafusion", "vortex-duckdb", "vortex-cuda", + "vortex-cuda/cub", + "vortex-cuda/cudf-test", "vortex-cuda/macros", "vortex-cuda/nvcomp", - "vortex-cuda/cub", "vortex-cxx", "vortex-ffi", "fuzz", diff --git a/vortex-cuda/cudf-test/Cargo.toml b/vortex-cuda/cudf-test/Cargo.toml new file mode 100644 index 00000000000..847e6e3e678 --- /dev/null +++ b/vortex-cuda/cudf-test/Cargo.toml @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +[package] +name = "vortex-cudf-test" +authors.workspace = true +description = "Test for cuDF integration" +edition = { workspace = true } +homepage = { workspace = true } +categories = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] + +[build-dependencies] +bindgen = { workspace = true } diff --git a/vortex-cuda/cudf-test/build.rs b/vortex-cuda/cudf-test/build.rs new file mode 100644 index 00000000000..863d85a846c --- /dev/null +++ b/vortex-cuda/cudf-test/build.rs @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::env; +use std::path::PathBuf; +use std::process::Command; + +fn main() { + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let cpp_dir = manifest_dir.join("cpp"); + + // Create build directory + let build_dir = out_dir.join("cmake_build"); + std::fs::create_dir_all(&build_dir).expect("Failed to create build directory"); + + // Get conda prefix for finding cudf + let conda_prefix = env::var("CONDA_PREFIX").ok(); + + // Configure CMake + let mut cmake_cmd = Command::new("cmake"); + cmake_cmd + .current_dir(&build_dir) + .arg(&cpp_dir) + .arg(format!("-DCMAKE_BUILD_TYPE=Release")); + + // Add conda prefix to CMAKE_PREFIX_PATH if available + if let Some(prefix) = &conda_prefix { + cmake_cmd.arg(format!("-DCMAKE_PREFIX_PATH={}", prefix)); + } + + let status = cmake_cmd + .status() + .expect("Failed to run cmake configure"); + + if !status.success() { + panic!("CMake configure failed"); + } + + // Build + let status = Command::new("cmake") + .current_dir(&build_dir) + .args(["--build", ".", "--config", "Release", "-j"]) + .status() + .expect("Failed to run cmake build"); + + if !status.success() { + panic!("CMake build failed"); + } + + // Tell cargo where to find the library + println!("cargo:rustc-link-search=native={}", build_dir.display()); + println!("cargo:rustc-link-lib=dylib=cudf_arrow_ffi"); + + // Also link to cudf and its dependencies + if let Some(prefix) = &conda_prefix { + println!("cargo:rustc-link-search=native={}/lib", prefix); + } + + // Rebuild if C++ sources change + println!("cargo:rerun-if-changed=cpp/cudf_arrow_ffi.cpp"); + println!("cargo:rerun-if-changed=cpp/cudf_arrow_ffi.h"); + println!("cargo:rerun-if-changed=cpp/CMakeLists.txt"); + + // Generate bindings using bindgen + let bindings = bindgen::Builder::default() + .header(cpp_dir.join("cudf_arrow_ffi.h").to_string_lossy()) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + .allowlist_function("cudf_.*") + .allowlist_type("CudfResult") + .allowlist_type("CudfErrorCode") + .allowlist_type("ArrowSchema") + .allowlist_type("ArrowArray") + .allowlist_type("ArrowDeviceArray") + .allowlist_type("ArrowDeviceType") + .allowlist_var("ARROW_DEVICE_.*") + .generate() + .expect("Unable to generate bindings"); + + bindings + .write_to_file(out_dir.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} diff --git a/vortex-cuda/cudf-test/cpp/CMakeLists.txt b/vortex-cuda/cudf-test/cpp/CMakeLists.txt new file mode 100644 index 00000000000..f329930ffb5 --- /dev/null +++ b/vortex-cuda/cudf-test/cpp/CMakeLists.txt @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +project(cudf_arrow_ffi LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Find RAPIDS dependencies +find_package(cudf REQUIRED) +find_package(rmm REQUIRED) + +# Create the shared library +add_library(cudf_arrow_ffi SHARED + cudf_arrow_ffi.cpp +) + +target_include_directories(cudf_arrow_ffi + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} +) + +target_link_libraries(cudf_arrow_ffi + PUBLIC + cudf::cudf + rmm::rmm +) + +# Set output directory to parent directory for easier linking from Rust +set_target_properties(cudf_arrow_ffi PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) + +# Install the library +install(TARGETS cudf_arrow_ffi + LIBRARY DESTINATION lib + RUNTIME DESTINATION bin +) + +install(FILES cudf_arrow_ffi.h + DESTINATION include +) diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp new file mode 100644 index 00000000000..ebb583dec76 --- /dev/null +++ b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#include "cudf_arrow_ffi.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// Global table storage (in real code, you'd want proper handle management) +static std::unique_ptr g_loaded_table; + +extern "C" { + +CudfResult cudf_init() { + try { + // Initialize RMM with default CUDA memory resource + static rmm::mr::cuda_memory_resource cuda_mr; + rmm::mr::set_current_device_resource(&cuda_mr); + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + // Note: In production, you'd want to properly manage this string's lifetime + return CudfResult{CUDF_ERROR_INIT_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_load_from_arrow_device( + const ArrowSchema* schema, + const ArrowDeviceArray* device_array +) { + if (!schema || !device_array) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "schema or device_array is null"}; + } + + try { + // Use cudf's from_arrow_device to import the data + // This takes ownership of the ArrowDeviceArray + g_loaded_table = cudf::from_arrow_device(schema, device_array); + + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + return CudfResult{CUDF_ERROR_LOAD_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_load_column_from_arrow_device( + const ArrowSchema* schema, + const ArrowDeviceArray* device_array +) { + if (!schema || !device_array) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "schema or device_array is null"}; + } + + try { + // Use cudf's from_arrow_device_column to import a single column + auto column = cudf::from_arrow_device_column(schema, device_array); + + // Wrap the column in a table for consistent handling + std::vector> columns; + columns.push_back(std::move(column)); + g_loaded_table = std::make_unique(std::move(columns)); + + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + return CudfResult{CUDF_ERROR_LOAD_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_get_row_count(int64_t* count) { + if (!count) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "count pointer is null"}; + } + + if (!g_loaded_table) { + return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; + } + + try { + *count = static_cast(g_loaded_table->num_rows()); + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_get_column_count(int32_t* count) { + if (!count) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "count pointer is null"}; + } + + if (!g_loaded_table) { + return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; + } + + try { + *count = static_cast(g_loaded_table->num_columns()); + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_count_valid(int32_t column_index, int64_t* valid_count) { + if (!valid_count) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "valid_count pointer is null"}; + } + + if (!g_loaded_table) { + return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; + } + + try { + auto view = g_loaded_table->view(); + if (column_index < 0 || column_index >= view.num_columns()) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "column index out of bounds"}; + } + + auto column_view = view.column(column_index); + + // count_all aggregation counts all non-null values + auto agg = cudf::make_count_aggregation(); + auto result = cudf::reduce(column_view, *agg, cudf::data_type{cudf::type_id::INT64}); + + // Get the scalar value + auto* int_scalar = static_cast*>(result.get()); + *valid_count = int_scalar->value(); + + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_sum_int64(int32_t column_index, int64_t* sum) { + if (!sum) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "sum pointer is null"}; + } + + if (!g_loaded_table) { + return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; + } + + try { + auto view = g_loaded_table->view(); + if (column_index < 0 || column_index >= view.num_columns()) { + return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "column index out of bounds"}; + } + + auto column_view = view.column(column_index); + + auto agg = cudf::make_sum_aggregation(); + auto result = cudf::reduce(column_view, *agg, cudf::data_type{cudf::type_id::INT64}); + + auto* int_scalar = static_cast*>(result.get()); + *sum = int_scalar->value(); + + return CudfResult{CUDF_SUCCESS, nullptr}; + } catch (const std::exception& e) { + return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + } +} + +CudfResult cudf_free_table() { + g_loaded_table.reset(); + return CudfResult{CUDF_SUCCESS, nullptr}; +} + +void cudf_free_error(const char* error_msg) { + if (error_msg) { + free(const_cast(error_msg)); + } +} + +} // extern "C" diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h new file mode 100644 index 00000000000..aef904c5b4d --- /dev/null +++ b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#ifndef CUDF_ARROW_FFI_H +#define CUDF_ARROW_FFI_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Arrow C Device Data Interface structures +// These match the Arrow specification for device data exchange + +struct ArrowSchema { + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + void (*release)(struct ArrowSchema*); + void* private_data; +}; + +struct ArrowArray { + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + void (*release)(struct ArrowArray*); + void* private_data; +}; + +// Arrow Device type constants +typedef int32_t ArrowDeviceType; +#define ARROW_DEVICE_CPU 1 +#define ARROW_DEVICE_CUDA 2 +#define ARROW_DEVICE_CUDA_HOST 3 +#define ARROW_DEVICE_OPENCL 4 +#define ARROW_DEVICE_VULKAN 7 +#define ARROW_DEVICE_METAL 8 +#define ARROW_DEVICE_VPI 9 +#define ARROW_DEVICE_ROCM 10 +#define ARROW_DEVICE_ROCM_HOST 11 +#define ARROW_DEVICE_EXT_DEV 12 +#define ARROW_DEVICE_CUDA_MANAGED 13 +#define ARROW_DEVICE_ONEAPI 14 +#define ARROW_DEVICE_WEBGPU 15 +#define ARROW_DEVICE_HEXAGON 16 + +struct ArrowDeviceArray { + struct ArrowArray array; + int64_t device_id; + ArrowDeviceType device_type; + void* sync_event; +}; + +// Error codes for cudf operations +typedef enum { + CUDF_SUCCESS = 0, + CUDF_ERROR_INIT_FAILED = 1, + CUDF_ERROR_INVALID_ARGUMENT = 2, + CUDF_ERROR_LOAD_FAILED = 3, + CUDF_ERROR_NO_DATA = 4, + CUDF_ERROR_OPERATION_FAILED = 5, +} CudfErrorCode; + +// Result type for cudf operations +typedef struct { + CudfErrorCode code; + const char* error_message; // NULL on success, caller must free with cudf_free_error +} CudfResult; + +// Initialize cudf/RMM runtime +CudfResult cudf_init(void); + +// Load Arrow data from device memory into cudf +// Takes a table (struct of arrays) +CudfResult cudf_load_from_arrow_device( + const struct ArrowSchema* schema, + const struct ArrowDeviceArray* device_array +); + +// Load a single Arrow column from device memory into cudf +CudfResult cudf_load_column_from_arrow_device( + const struct ArrowSchema* schema, + const struct ArrowDeviceArray* device_array +); + +// Get the number of rows in the loaded table +CudfResult cudf_get_row_count(int64_t* count); + +// Get the number of columns in the loaded table +CudfResult cudf_get_column_count(int32_t* count); + +// Count valid (non-null) values in a column +CudfResult cudf_count_valid(int32_t column_index, int64_t* valid_count); + +// Sum values in an int64 column +CudfResult cudf_sum_int64(int32_t column_index, int64_t* sum); + +// Free the loaded table +CudfResult cudf_free_table(void); + +// Free an error message returned by a CudfResult +void cudf_free_error(const char* error_msg); + +#ifdef __cplusplus +} +#endif + +#endif // CUDF_ARROW_FFI_H diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs new file mode 100644 index 00000000000..76ba2d2b33a --- /dev/null +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Rust bindings for cudf Arrow Device FFI operations. +//! +//! This crate provides a safe Rust interface to cudf's Arrow Device data +//! import functionality, allowing GPU data to be passed directly to cudf +//! for processing. + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] + +use std::ffi::CStr; +use std::fmt; + +// Include the generated bindings +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +/// Error type for cudf operations +#[derive(Debug)] +pub struct CudfError { + pub code: CudfErrorCode, + pub message: String, +} + +impl fmt::Display for CudfError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "CudfError({:?}): {}", self.code, self.message) + } +} + +impl std::error::Error for CudfError {} + +/// Result type for cudf operations +pub type Result = std::result::Result; + +/// Convert a CudfResult to a Rust Result +fn check_result(result: CudfResult) -> Result<()> { + if result.code == CudfErrorCode_CUDF_SUCCESS { + Ok(()) + } else { + let message = if result.error_message.is_null() { + format!("Unknown error (code: {:?})", result.code) + } else { + let msg = unsafe { CStr::from_ptr(result.error_message) } + .to_string_lossy() + .into_owned(); + // Free the error message + unsafe { cudf_free_error(result.error_message) }; + msg + }; + Err(CudfError { + code: result.code, + message, + }) + } +} + +/// Initialize the cudf/RMM runtime. +/// +/// This must be called before any other cudf operations. +pub fn init() -> Result<()> { + let result = unsafe { cudf_init() }; + check_result(result) +} + +/// Load Arrow data from device memory into cudf. +/// +/// # Safety +/// +/// The schema and device_array must be valid Arrow C Data Interface structures +/// with device memory pointers. +pub unsafe fn load_from_arrow_device( + schema: *const ArrowSchema, + device_array: *const ArrowDeviceArray, +) -> Result<()> { + let result = cudf_load_from_arrow_device(schema, device_array); + check_result(result) +} + +/// Load a single Arrow column from device memory into cudf. +/// +/// # Safety +/// +/// The schema and device_array must be valid Arrow C Data Interface structures +/// with device memory pointers. +pub unsafe fn load_column_from_arrow_device( + schema: *const ArrowSchema, + device_array: *const ArrowDeviceArray, +) -> Result<()> { + let result = cudf_load_column_from_arrow_device(schema, device_array); + check_result(result) +} + +/// Get the number of rows in the loaded table. +pub fn get_row_count() -> Result { + let mut count: i64 = 0; + let result = unsafe { cudf_get_row_count(&mut count) }; + check_result(result)?; + Ok(count) +} + +/// Get the number of columns in the loaded table. +pub fn get_column_count() -> Result { + let mut count: i32 = 0; + let result = unsafe { cudf_get_column_count(&mut count) }; + check_result(result)?; + Ok(count) +} + +/// Count valid (non-null) values in a column. +pub fn count_valid(column_index: i32) -> Result { + let mut count: i64 = 0; + let result = unsafe { cudf_count_valid(column_index, &mut count) }; + check_result(result)?; + Ok(count) +} + +/// Sum values in an int64 column. +pub fn sum_int64(column_index: i32) -> Result { + let mut sum: i64 = 0; + let result = unsafe { cudf_sum_int64(column_index, &mut sum) }; + check_result(result)?; + Ok(sum) +} + +/// Free the currently loaded table. +pub fn free_table() -> Result<()> { + let result = unsafe { cudf_free_table() }; + check_result(result) +} + +/// RAII guard for the loaded table. +/// +/// Automatically frees the table when dropped. +pub struct TableGuard; + +impl Drop for TableGuard { + fn drop(&mut self) { + let _ = free_table(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_init() -> Result<()> { + // This will fail if CUDA/cudf is not available, which is expected + // in CI environments without GPU + match init() { + Ok(()) => { + println!("cudf initialized successfully"); + Ok(()) + } + Err(e) => { + println!("cudf init failed (expected without GPU): {}", e); + Ok(()) + } + } + } + + #[test] + fn test_no_data_error() { + // Without loading data, operations should fail with NO_DATA error + let result = get_row_count(); + match result { + Err(e) if e.code == CudfErrorCode_CUDF_ERROR_NO_DATA => { + // Expected + } + Err(e) => { + // Also acceptable - might fail for other reasons without GPU + println!("Got error (acceptable): {}", e); + } + Ok(_) => { + panic!("Expected error when no data loaded"); + } + } + } +} diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 902cafba5c8..177a4ffd1e4 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -31,17 +31,22 @@ impl CudaDeviceArrayExecute for Canonical { ) -> VortexResult { let cuda_array = array.execute_cuda(ctx).await?; - match cuda_array { + let arrow_array = match cuda_array { Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await, c => todo!("implement support for exporting {}", c.dtype()), - } + }; + + Ok(ArrowDeviceArray { + array: arrow_array, + device_id: 0, + device_type: DeviceType::Cuda, + sync_event: None, + _reserved: Default::default(), + }) } } -async fn export_primitive( - array: PrimitiveArray, - ctx: &mut CudaExecutionCtx, -) -> VortexResult { +fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> VortexResult { let len = array.len(); let PrimitiveArrayParts { buffer, @@ -87,8 +92,8 @@ async fn export_primitive( buffer_ptrs, }); - let arrow_array = ArrowArray { - length: array.len() as i64, + Ok(ArrowArray { + length: len as i64, null_count: null_count as i64, offset: 0, // 1 (optional) buffer for nulls, one buffer for data @@ -99,12 +104,15 @@ async fn export_primitive( release: Some(release), dictionary: std::ptr::null_mut(), private_data: Box::into_raw(private_data).cast(), - }; - - Ok(ArrowDeviceArray { - array: arrow_array, - device_id: 0, - device_type: DeviceType::Cuda, - sync_event: None, }) } + +// Get the DecimalArray and the VarBinViewArray so we know +// how to treat all of these timestamps and such. + +#[cfg(test)] +mod tests { + #[tokio::test] + async fn test_export_primitive() { + } +} diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 783da3196e5..ff43432dffa 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -58,6 +58,9 @@ pub(crate) struct ArrowDeviceArray { device_id: i64, device_type: DeviceType, sync_event: SyncEvent, + + // unused space reserved for future fields + _reserved: [i64; 3], } /// An FFI-compatible version of the ArrowArray that holds pointers to device buffers. From 032e3342e09b20676dca566fec8509ce5ebf89b9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 22:37:11 +0000 Subject: [PATCH 03/22] add cudf test harness Signed-off-by: Andrew Duffy --- vortex-cuda/cudf-test/build.rs | 17 +- vortex-cuda/cudf-test/cpp/CMakeLists.txt | 2 +- vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp | 255 +++++++++++++------ vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h | 97 ++++--- vortex-cuda/cudf-test/src/lib.rs | 253 ++++++++++-------- vortex-cuda/src/arrow/canonical.rs | 11 +- 6 files changed, 384 insertions(+), 251 deletions(-) diff --git a/vortex-cuda/cudf-test/build.rs b/vortex-cuda/cudf-test/build.rs index 863d85a846c..8b4dea3136c 100644 --- a/vortex-cuda/cudf-test/build.rs +++ b/vortex-cuda/cudf-test/build.rs @@ -1,6 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +// Build scripts use expect/panic to fail the build with clear error messages +#![allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)] + use std::env; use std::path::PathBuf; use std::process::Command; @@ -22,20 +25,16 @@ fn main() { cmake_cmd .current_dir(&build_dir) .arg(&cpp_dir) - .arg(format!("-DCMAKE_BUILD_TYPE=Release")); + .arg("-DCMAKE_BUILD_TYPE=Release"); // Add conda prefix to CMAKE_PREFIX_PATH if available if let Some(prefix) = &conda_prefix { cmake_cmd.arg(format!("-DCMAKE_PREFIX_PATH={}", prefix)); } - let status = cmake_cmd - .status() - .expect("Failed to run cmake configure"); + let status = cmake_cmd.status().expect("Failed to run cmake configure"); - if !status.success() { - panic!("CMake configure failed"); - } + assert!(status.success(), "CMake configure failed"); // Build let status = Command::new("cmake") @@ -44,9 +43,7 @@ fn main() { .status() .expect("Failed to run cmake build"); - if !status.success() { - panic!("CMake build failed"); - } + assert!(status.success(), "CMake build failed"); // Tell cargo where to find the library println!("cargo:rustc-link-search=native={}", build_dir.display()); diff --git a/vortex-cuda/cudf-test/cpp/CMakeLists.txt b/vortex-cuda/cudf-test/cpp/CMakeLists.txt index f329930ffb5..5d310446beb 100644 --- a/vortex-cuda/cudf-test/cpp/CMakeLists.txt +++ b/vortex-cuda/cudf-test/cpp/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) project(cudf_arrow_ffi LANGUAGES CXX CUDA) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp index ebb583dec76..1e6efee56f4 100644 --- a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp +++ b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp @@ -14,168 +14,261 @@ #include #include +#include +#include -// Global table storage (in real code, you'd want proper handle management) -static std::unique_ptr g_loaded_table; +// Internal struct definitions for opaque types + +struct cudf_context { + std::unique_ptr cuda_mr; +}; + +struct cudf_tableview { + cudf::unique_table_view_t view; + + explicit cudf_tableview(cudf::unique_table_view_t v) : view(std::move(v)) {} +}; + +struct cudf_columnview { + cudf::unique_column_view_t view; + + explicit cudf_columnview(cudf::unique_column_view_t v) : view(std::move(v)) {} +}; + +// Helper to create an error string +static cudf_err_t make_error(const char* msg) { + return strdup(msg); +} + +static cudf_err_t make_error(const std::string& msg) { + return strdup(msg.c_str()); +} extern "C" { -CudfResult cudf_init() { +cudf_err_t cudf_context_create(cudf_context_t** ctx) { + if (!ctx) { + return make_error("ctx pointer is null"); + } + try { - // Initialize RMM with default CUDA memory resource - static rmm::mr::cuda_memory_resource cuda_mr; - rmm::mr::set_current_device_resource(&cuda_mr); - return CudfResult{CUDF_SUCCESS, nullptr}; + auto context = std::make_unique(); + context->cuda_mr = std::make_unique(); + rmm::mr::set_current_device_resource(context->cuda_mr.get()); + *ctx = context.release(); + return nullptr; } catch (const std::exception& e) { - // Note: In production, you'd want to properly manage this string's lifetime - return CudfResult{CUDF_ERROR_INIT_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_load_from_arrow_device( +void cudf_context_free(cudf_context_t* ctx) { + delete ctx; +} + +cudf_err_t cudf_tableview_from_device( + cudf_context_t* ctx, const ArrowSchema* schema, - const ArrowDeviceArray* device_array + const ArrowDeviceArray* device_array, + cudf_tableview_t** out ) { + if (!ctx) { + return make_error("context is null"); + } if (!schema || !device_array) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "schema or device_array is null"}; + return make_error("schema or device_array is null"); + } + if (!out) { + return make_error("out pointer is null"); } try { - // Use cudf's from_arrow_device to import the data - // This takes ownership of the ArrowDeviceArray - g_loaded_table = cudf::from_arrow_device(schema, device_array); - - return CudfResult{CUDF_SUCCESS, nullptr}; + auto view = cudf::from_arrow_device(schema, device_array); + *out = new cudf_tableview(std::move(view)); + return nullptr; } catch (const std::exception& e) { - return CudfResult{CUDF_ERROR_LOAD_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_load_column_from_arrow_device( +cudf_err_t cudf_columnview_from_device( + cudf_context_t* ctx, const ArrowSchema* schema, - const ArrowDeviceArray* device_array + const ArrowDeviceArray* device_array, + cudf_columnview_t** out ) { + if (!ctx) { + return make_error("context is null"); + } if (!schema || !device_array) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "schema or device_array is null"}; + return make_error("schema or device_array is null"); + } + if (!out) { + return make_error("out pointer is null"); } try { - // Use cudf's from_arrow_device_column to import a single column - auto column = cudf::from_arrow_device_column(schema, device_array); - - // Wrap the column in a table for consistent handling - std::vector> columns; - columns.push_back(std::move(column)); - g_loaded_table = std::make_unique(std::move(columns)); - - return CudfResult{CUDF_SUCCESS, nullptr}; + auto view = cudf::from_arrow_device_column(schema, device_array); + *out = new cudf_columnview(std::move(view)); + return nullptr; } catch (const std::exception& e) { - return CudfResult{CUDF_ERROR_LOAD_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_get_row_count(int64_t* count) { - if (!count) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "count pointer is null"}; +cudf_err_t cudf_tableview_num_rows(const cudf_tableview_t* tv, int64_t* count) { + if (!tv) { + return make_error("table view is null"); } - - if (!g_loaded_table) { - return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; + if (!count) { + return make_error("count pointer is null"); } try { - *count = static_cast(g_loaded_table->num_rows()); - return CudfResult{CUDF_SUCCESS, nullptr}; + *count = static_cast(tv->view->num_rows()); + return nullptr; } catch (const std::exception& e) { - return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_get_column_count(int32_t* count) { +cudf_err_t cudf_tableview_num_columns(const cudf_tableview_t* tv, int32_t* count) { + if (!tv) { + return make_error("table view is null"); + } if (!count) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "count pointer is null"}; + return make_error("count pointer is null"); + } + + try { + *count = static_cast(tv->view->num_columns()); + return nullptr; + } catch (const std::exception& e) { + return make_error(e.what()); } +} - if (!g_loaded_table) { - return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; +cudf_err_t cudf_columnview_size(const cudf_columnview_t* cv, int64_t* count) { + if (!cv) { + return make_error("column view is null"); + } + if (!count) { + return make_error("count pointer is null"); } try { - *count = static_cast(g_loaded_table->num_columns()); - return CudfResult{CUDF_SUCCESS, nullptr}; + *count = static_cast(cv->view->size()); + return nullptr; } catch (const std::exception& e) { - return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_count_valid(int32_t column_index, int64_t* valid_count) { - if (!valid_count) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "valid_count pointer is null"}; +cudf_err_t cudf_tableview_count_valid(const cudf_tableview_t* tv, int32_t column_index, int64_t* valid_count) { + if (!tv) { + return make_error("table view is null"); } - - if (!g_loaded_table) { - return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; + if (!valid_count) { + return make_error("valid_count pointer is null"); } try { - auto view = g_loaded_table->view(); - if (column_index < 0 || column_index >= view.num_columns()) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "column index out of bounds"}; + if (column_index < 0 || column_index >= tv->view->num_columns()) { + return make_error("column index out of bounds"); } - auto column_view = view.column(column_index); - - // count_all aggregation counts all non-null values + auto col_view = tv->view->column(column_index); auto agg = cudf::make_count_aggregation(); - auto result = cudf::reduce(column_view, *agg, cudf::data_type{cudf::type_id::INT64}); + auto result = cudf::reduce(col_view, *agg, cudf::data_type{cudf::type_id::INT64}); - // Get the scalar value auto* int_scalar = static_cast*>(result.get()); *valid_count = int_scalar->value(); - return CudfResult{CUDF_SUCCESS, nullptr}; + return nullptr; } catch (const std::exception& e) { - return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_sum_int64(int32_t column_index, int64_t* sum) { - if (!sum) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "sum pointer is null"}; +cudf_err_t cudf_columnview_count_valid(const cudf_columnview_t* cv, int64_t* valid_count) { + if (!cv) { + return make_error("column view is null"); + } + if (!valid_count) { + return make_error("valid_count pointer is null"); + } + + try { + auto agg = cudf::make_count_aggregation(); + auto result = cudf::reduce(*cv->view, *agg, cudf::data_type{cudf::type_id::INT64}); + + auto* int_scalar = static_cast*>(result.get()); + *valid_count = int_scalar->value(); + + return nullptr; + } catch (const std::exception& e) { + return make_error(e.what()); } +} - if (!g_loaded_table) { - return CudfResult{CUDF_ERROR_NO_DATA, "no table loaded"}; +cudf_err_t cudf_tableview_sum_int64(const cudf_tableview_t* tv, int32_t column_index, int64_t* sum) { + if (!tv) { + return make_error("table view is null"); + } + if (!sum) { + return make_error("sum pointer is null"); } try { - auto view = g_loaded_table->view(); - if (column_index < 0 || column_index >= view.num_columns()) { - return CudfResult{CUDF_ERROR_INVALID_ARGUMENT, "column index out of bounds"}; + if (column_index < 0 || column_index >= tv->view->num_columns()) { + return make_error("column index out of bounds"); } - auto column_view = view.column(column_index); + auto col_view = tv->view->column(column_index); + auto agg = cudf::make_sum_aggregation(); + auto result = cudf::reduce(col_view, *agg, cudf::data_type{cudf::type_id::INT64}); + + auto* int_scalar = static_cast*>(result.get()); + *sum = int_scalar->value(); + + return nullptr; + } catch (const std::exception& e) { + return make_error(e.what()); + } +} + +cudf_err_t cudf_columnview_sum_int64(const cudf_columnview_t* cv, int64_t* sum) { + if (!cv) { + return make_error("column view is null"); + } + if (!sum) { + return make_error("sum pointer is null"); + } + try { auto agg = cudf::make_sum_aggregation(); - auto result = cudf::reduce(column_view, *agg, cudf::data_type{cudf::type_id::INT64}); + auto result = cudf::reduce(*cv->view, *agg, cudf::data_type{cudf::type_id::INT64}); auto* int_scalar = static_cast*>(result.get()); *sum = int_scalar->value(); - return CudfResult{CUDF_SUCCESS, nullptr}; + return nullptr; } catch (const std::exception& e) { - return CudfResult{CUDF_ERROR_OPERATION_FAILED, strdup(e.what())}; + return make_error(e.what()); } } -CudfResult cudf_free_table() { - g_loaded_table.reset(); - return CudfResult{CUDF_SUCCESS, nullptr}; +void cudf_tableview_free(cudf_tableview_t* tv) { + delete tv; +} + +void cudf_columnview_free(cudf_columnview_t* cv) { + delete cv; } -void cudf_free_error(const char* error_msg) { - if (error_msg) { - free(const_cast(error_msg)); +void cudf_err_free(cudf_err_t err) { + if (err) { + free(const_cast(err)); } } diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h index aef904c5b4d..ef8ae6693e0 100644 --- a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h +++ b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h @@ -63,55 +63,76 @@ struct ArrowDeviceArray { void* sync_event; }; -// Error codes for cudf operations -typedef enum { - CUDF_SUCCESS = 0, - CUDF_ERROR_INIT_FAILED = 1, - CUDF_ERROR_INVALID_ARGUMENT = 2, - CUDF_ERROR_LOAD_FAILED = 3, - CUDF_ERROR_NO_DATA = 4, - CUDF_ERROR_OPERATION_FAILED = 5, -} CudfErrorCode; - -// Result type for cudf operations -typedef struct { - CudfErrorCode code; - const char* error_message; // NULL on success, caller must free with cudf_free_error -} CudfResult; - -// Initialize cudf/RMM runtime -CudfResult cudf_init(void); - -// Load Arrow data from device memory into cudf -// Takes a table (struct of arrays) -CudfResult cudf_load_from_arrow_device( +// Error type: NULL on success, pointer to error string on failure. +// Caller must free with cudf_err_free() when non-NULL. +typedef const char* cudf_err_t; + +// Opaque context type that holds CUDA memory resources and global state. +typedef struct cudf_context cudf_context_t; + +// Opaque table view type wrapping cudf::unique_table_view_t +typedef struct cudf_tableview cudf_tableview_t; + +// Opaque column view type wrapping cudf::unique_column_view_t +typedef struct cudf_columnview cudf_columnview_t; + +// Create a new cudf context and initialize RMM. +// On success, *ctx is set to the new context and NULL is returned. +// On failure, *ctx is unchanged and an error string is returned. +cudf_err_t cudf_context_create(cudf_context_t** ctx); + +// Free a cudf context and all associated resources. +void cudf_context_free(cudf_context_t* ctx); + +// Import an Arrow table from device memory into a cudf table view. +// On success, *out is set to the new table view and NULL is returned. +// On failure, *out is unchanged and an error string is returned. +cudf_err_t cudf_tableview_from_device( + cudf_context_t* ctx, const struct ArrowSchema* schema, - const struct ArrowDeviceArray* device_array + const struct ArrowDeviceArray* device_array, + cudf_tableview_t** out ); -// Load a single Arrow column from device memory into cudf -CudfResult cudf_load_column_from_arrow_device( +// Import an Arrow column from device memory into a cudf column view. +// On success, *out is set to the new column view and NULL is returned. +// On failure, *out is unchanged and an error string is returned. +cudf_err_t cudf_columnview_from_device( + cudf_context_t* ctx, const struct ArrowSchema* schema, - const struct ArrowDeviceArray* device_array + const struct ArrowDeviceArray* device_array, + cudf_columnview_t** out ); -// Get the number of rows in the loaded table -CudfResult cudf_get_row_count(int64_t* count); +// Get the number of rows in a table view. +cudf_err_t cudf_tableview_num_rows(const cudf_tableview_t* tv, int64_t* count); + +// Get the number of columns in a table view. +cudf_err_t cudf_tableview_num_columns(const cudf_tableview_t* tv, int32_t* count); + +// Get the number of rows in a column view. +cudf_err_t cudf_columnview_size(const cudf_columnview_t* cv, int64_t* count); + +// Count valid (non-null) values in a table column. +cudf_err_t cudf_tableview_count_valid(const cudf_tableview_t* tv, int32_t column_index, int64_t* valid_count); + +// Count valid (non-null) values in a column view. +cudf_err_t cudf_columnview_count_valid(const cudf_columnview_t* cv, int64_t* valid_count); -// Get the number of columns in the loaded table -CudfResult cudf_get_column_count(int32_t* count); +// Sum values in an int64 table column. +cudf_err_t cudf_tableview_sum_int64(const cudf_tableview_t* tv, int32_t column_index, int64_t* sum); -// Count valid (non-null) values in a column -CudfResult cudf_count_valid(int32_t column_index, int64_t* valid_count); +// Sum values in an int64 column view. +cudf_err_t cudf_columnview_sum_int64(const cudf_columnview_t* cv, int64_t* sum); -// Sum values in an int64 column -CudfResult cudf_sum_int64(int32_t column_index, int64_t* sum); +// Free a table view. +void cudf_tableview_free(cudf_tableview_t* tv); -// Free the loaded table -CudfResult cudf_free_table(void); +// Free a column view. +void cudf_columnview_free(cudf_columnview_t* cv); -// Free an error message returned by a CudfResult -void cudf_free_error(const char* error_msg); +// Free an error string. +void cudf_err_free(cudf_err_t err); #ifdef __cplusplus } diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index 76ba2d2b33a..cacbb7ce115 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -13,132 +13,183 @@ use std::ffi::CStr; use std::fmt; +use std::ptr; // Include the generated bindings include!(concat!(env!("OUT_DIR"), "/bindings.rs")); -/// Error type for cudf operations +/// Error type for cudf operations. #[derive(Debug)] pub struct CudfError { - pub code: CudfErrorCode, pub message: String, } impl fmt::Display for CudfError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "CudfError({:?}): {}", self.code, self.message) + write!(f, "CudfError: {}", self.message) } } impl std::error::Error for CudfError {} -/// Result type for cudf operations +/// Result type for cudf operations. pub type Result = std::result::Result; -/// Convert a CudfResult to a Rust Result -fn check_result(result: CudfResult) -> Result<()> { - if result.code == CudfErrorCode_CUDF_SUCCESS { +/// Check a cudf_err_t and convert to Result. +fn check_err(err: cudf_err_t) -> Result<()> { + if err.is_null() { Ok(()) } else { - let message = if result.error_message.is_null() { - format!("Unknown error (code: {:?})", result.code) - } else { - let msg = unsafe { CStr::from_ptr(result.error_message) } - .to_string_lossy() - .into_owned(); - // Free the error message - unsafe { cudf_free_error(result.error_message) }; - msg - }; - Err(CudfError { - code: result.code, - message, - }) + let message = unsafe { CStr::from_ptr(err) } + .to_string_lossy() + .into_owned(); + unsafe { cudf_err_free(err) }; + Err(CudfError { message }) } } -/// Initialize the cudf/RMM runtime. -/// -/// This must be called before any other cudf operations. -pub fn init() -> Result<()> { - let result = unsafe { cudf_init() }; - check_result(result) +/// RAII wrapper for cudf_context_t. +pub struct CudfContext { + ctx: *mut cudf_context_t, } -/// Load Arrow data from device memory into cudf. -/// -/// # Safety -/// -/// The schema and device_array must be valid Arrow C Data Interface structures -/// with device memory pointers. -pub unsafe fn load_from_arrow_device( - schema: *const ArrowSchema, - device_array: *const ArrowDeviceArray, -) -> Result<()> { - let result = cudf_load_from_arrow_device(schema, device_array); - check_result(result) -} +impl CudfContext { + /// Create a new cudf context and initialize RMM. + pub fn new() -> Result { + let mut ctx: *mut cudf_context_t = ptr::null_mut(); + let err = unsafe { cudf_context_create(&raw mut ctx) }; + check_err(err)?; + Ok(Self { ctx }) + } -/// Load a single Arrow column from device memory into cudf. -/// -/// # Safety -/// -/// The schema and device_array must be valid Arrow C Data Interface structures -/// with device memory pointers. -pub unsafe fn load_column_from_arrow_device( - schema: *const ArrowSchema, - device_array: *const ArrowDeviceArray, -) -> Result<()> { - let result = cudf_load_column_from_arrow_device(schema, device_array); - check_result(result) + /// Import an Arrow table from device memory into a cudf table view. + /// + /// # Safety + /// + /// The schema and device_array must be valid Arrow C Device Data Interface structures + /// with device memory pointers. + pub unsafe fn tableview_from_device( + &self, + schema: *const ArrowSchema, + device_array: *const ArrowDeviceArray, + ) -> Result { + let mut tv: *mut cudf_tableview_t = ptr::null_mut(); + let err = + unsafe { cudf_tableview_from_device(self.ctx, schema, device_array, &raw mut tv) }; + check_err(err)?; + Ok(CudfTableView { tv }) + } + + /// Import an Arrow column from device memory into a cudf column view. + /// + /// # Safety + /// + /// The schema and device_array must be valid Arrow C Data Interface structures + /// with device memory pointers. + pub unsafe fn columnview_from_device( + &self, + schema: *const ArrowSchema, + device_array: *const ArrowDeviceArray, + ) -> Result { + let mut cv: *mut cudf_columnview_t = ptr::null_mut(); + let err = + unsafe { cudf_columnview_from_device(self.ctx, schema, device_array, &raw mut cv) }; + check_err(err)?; + Ok(CudfColumnView { cv }) + } } -/// Get the number of rows in the loaded table. -pub fn get_row_count() -> Result { - let mut count: i64 = 0; - let result = unsafe { cudf_get_row_count(&mut count) }; - check_result(result)?; - Ok(count) +impl Drop for CudfContext { + fn drop(&mut self) { + if !self.ctx.is_null() { + unsafe { cudf_context_free(self.ctx) }; + } + } } -/// Get the number of columns in the loaded table. -pub fn get_column_count() -> Result { - let mut count: i32 = 0; - let result = unsafe { cudf_get_column_count(&mut count) }; - check_result(result)?; - Ok(count) +/// RAII wrapper for cudf_tableview_t. +pub struct CudfTableView { + tv: *mut cudf_tableview_t, } -/// Count valid (non-null) values in a column. -pub fn count_valid(column_index: i32) -> Result { - let mut count: i64 = 0; - let result = unsafe { cudf_count_valid(column_index, &mut count) }; - check_result(result)?; - Ok(count) +impl CudfTableView { + /// Get the number of rows in the table. + pub fn num_rows(&self) -> Result { + let mut count: i64 = 0; + let err = unsafe { cudf_tableview_num_rows(self.tv, &raw mut count) }; + check_err(err)?; + Ok(count) + } + + /// Get the number of columns in the table. + pub fn num_columns(&self) -> Result { + let mut count: i32 = 0; + let err = unsafe { cudf_tableview_num_columns(self.tv, &raw mut count) }; + check_err(err)?; + Ok(count) + } + + /// Count valid (non-null) values in a column. + pub fn count_valid(&self, column_index: i32) -> Result { + let mut count: i64 = 0; + let err = unsafe { cudf_tableview_count_valid(self.tv, column_index, &raw mut count) }; + check_err(err)?; + Ok(count) + } + + /// Sum values in an int64 column. + pub fn sum_int64(&self, column_index: i32) -> Result { + let mut sum: i64 = 0; + let err = unsafe { cudf_tableview_sum_int64(self.tv, column_index, &raw mut sum) }; + check_err(err)?; + Ok(sum) + } } -/// Sum values in an int64 column. -pub fn sum_int64(column_index: i32) -> Result { - let mut sum: i64 = 0; - let result = unsafe { cudf_sum_int64(column_index, &mut sum) }; - check_result(result)?; - Ok(sum) +impl Drop for CudfTableView { + fn drop(&mut self) { + if !self.tv.is_null() { + unsafe { cudf_tableview_free(self.tv) }; + } + } } -/// Free the currently loaded table. -pub fn free_table() -> Result<()> { - let result = unsafe { cudf_free_table() }; - check_result(result) +/// RAII wrapper for cudf_columnview_t. +pub struct CudfColumnView { + cv: *mut cudf_columnview_t, } -/// RAII guard for the loaded table. -/// -/// Automatically frees the table when dropped. -pub struct TableGuard; +impl CudfColumnView { + /// Get the number of rows in the column. + pub fn size(&self) -> Result { + let mut count: i64 = 0; + let err = unsafe { cudf_columnview_size(self.cv, &raw mut count) }; + check_err(err)?; + Ok(count) + } + + /// Count valid (non-null) values in the column. + pub fn count_valid(&self) -> Result { + let mut count: i64 = 0; + let err = unsafe { cudf_columnview_count_valid(self.cv, &raw mut count) }; + check_err(err)?; + Ok(count) + } + + /// Sum values in the column (int64). + pub fn sum_int64(&self) -> Result { + let mut sum: i64 = 0; + let err = unsafe { cudf_columnview_sum_int64(self.cv, &raw mut sum) }; + check_err(err)?; + Ok(sum) + } +} -impl Drop for TableGuard { +impl Drop for CudfColumnView { fn drop(&mut self) { - let _ = free_table(); + if !self.cv.is_null() { + unsafe { cudf_columnview_free(self.cv) }; + } } } @@ -147,36 +198,16 @@ mod tests { use super::*; #[test] - fn test_init() -> Result<()> { - // This will fail if CUDA/cudf is not available, which is expected - // in CI environments without GPU - match init() { - Ok(()) => { - println!("cudf initialized successfully"); + fn test_context_create() -> Result<()> { + match CudfContext::new() { + Ok(_ctx) => { + println!("cudf context created successfully"); Ok(()) } Err(e) => { - println!("cudf init failed (expected without GPU): {}", e); + println!("cudf context creation failed (expected without GPU): {}", e); Ok(()) } } } - - #[test] - fn test_no_data_error() { - // Without loading data, operations should fail with NO_DATA error - let result = get_row_count(); - match result { - Err(e) if e.code == CudfErrorCode_CUDF_ERROR_NO_DATA => { - // Expected - } - Err(e) => { - // Also acceptable - might fail for other reasons without GPU - println!("Got error (acceptable): {}", e); - } - Ok(_) => { - panic!("Expected error when no data loaded"); - } - } - } } diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 177a4ffd1e4..796d078dc91 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -32,7 +32,7 @@ impl CudaDeviceArrayExecute for Canonical { let cuda_array = array.execute_cuda(ctx).await?; let arrow_array = match cuda_array { - Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await, + Canonical::Primitive(primitive) => export_primitive(primitive, ctx)?, c => todo!("implement support for exporting {}", c.dtype()), }; @@ -107,12 +107,3 @@ fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> Vortex }) } -// Get the DecimalArray and the VarBinViewArray so we know -// how to treat all of these timestamps and such. - -#[cfg(test)] -mod tests { - #[tokio::test] - async fn test_export_primitive() { - } -} From 182e35b00055983b2c8485464218cb0ebcdbdfc6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 23:43:55 +0000 Subject: [PATCH 04/22] more test Signed-off-by: Andrew Duffy --- Cargo.lock | 7 +++ vortex-cuda/cudf-test/Cargo.toml | 7 +++ vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h | 1 + vortex-cuda/cudf-test/src/lib.rs | 72 ++++++++++++++++++++++ vortex-cuda/src/arrow/canonical.rs | 24 +++++--- vortex-cuda/src/arrow/mod.rs | 11 ++-- vortex-cuda/src/device_buffer.rs | 1 - vortex-cuda/src/lib.rs | 2 +- 8 files changed, 109 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3e0119b8621..a8991c396d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10467,7 +10467,14 @@ dependencies = [ name = "vortex-cudf-test" version = "0.1.0" dependencies = [ + "arrow-array 57.2.0", + "arrow-schema 57.2.0", "bindgen", + "futures", + "vortex-array", + "vortex-buffer", + "vortex-cuda", + "vortex-session", ] [[package]] diff --git a/vortex-cuda/cudf-test/Cargo.toml b/vortex-cuda/cudf-test/Cargo.toml index 847e6e3e678..84d92f7cec4 100644 --- a/vortex-cuda/cudf-test/Cargo.toml +++ b/vortex-cuda/cudf-test/Cargo.toml @@ -20,6 +20,13 @@ version = { workspace = true } workspace = true [dependencies] +arrow-array = { workspace = true, features = ["ffi"] } +arrow-schema = { workspace = true, features = ["ffi"] } +futures = { workspace = true, features = ["executor"] } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-cuda = { path = "..", features = ["_test-harness"] } +vortex-session = { workspace = true } [build-dependencies] bindgen = { workspace = true } diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h index ef8ae6693e0..89a02019ba4 100644 --- a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h +++ b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h @@ -61,6 +61,7 @@ struct ArrowDeviceArray { int64_t device_id; ArrowDeviceType device_type; void* sync_event; + int64_t reserved[3]; }; // Error type: NULL on success, pointer to error string on failure. diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index cacbb7ce115..1b6b014001e 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -195,6 +195,19 @@ impl Drop for CudfColumnView { #[cfg(test)] mod tests { + use arrow_array::ffi::FFI_ArrowSchema; + use arrow_schema::DataType; + use futures::executor::block_on; + use vortex_array::Canonical; + use vortex_array::IntoArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_cuda::CudaSession; + use vortex_cuda::arrow::CudaDeviceArrayExecute; + use vortex_cuda::executor::CudaArrayExt; + use vortex_session::VortexSession; + use super::*; #[test] @@ -210,4 +223,63 @@ mod tests { } } } + + #[test] + fn test_primitive_array_to_cudf_tableview() -> Result<()> { + // Create a PrimitiveArray with 100 i64 values + let data: Vec = (0..100).collect(); + let expected_len = data.len(); + let primitive_array = + PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array(); + + // Create CUDA execution context + let mut cuda_ctx = match CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); + + // Export as ArrowDeviceArray using CudaDeviceArrayExecute + let device_array = block_on(Canonical::execute( + &primitive_array, + primitive_array.clone(), + &mut cuda_ctx, + )) + .unwrap(); + + // Synchronize the CUDA stream to ensure the data is ready + cuda_ctx.synchronize_stream().map_err(|e| CudfError { + message: e.to_string(), + })?; + + // Create FFI_ArrowSchema from the data type + let mut ffi_schema = + FFI_ArrowSchema::try_from(&DataType::Int64).map_err(|e| CudfError { + message: format!("Failed to create FFI schema: {}", e), + })?; + + // Create cudf context + let cudf_ctx = CudfContext::new()?; + + // Import into cudf tableview + let tableview = unsafe { + cudf_ctx.tableview_from_device( + (&raw mut ffi_schema).cast::(), + (&raw const device_array).cast::(), + )? + }; + + // Verify row count + let num_rows = tableview.num_rows()?; + assert_eq!(num_rows, expected_len as i64, "Row count mismatch"); + println!( + "Successfully imported PrimitiveArray into cudf tableview with {} rows", + num_rows + ); + + // Verify column count (should be 1 for a primitive array) + let num_columns = tableview.num_columns()?; + assert_eq!(num_columns, 1, "Column count mismatch"); + println!("Tableview has {} column(s)", num_columns); + + // Tableview and cudf_ctx will be deallocated automatically via Drop + + Ok(()) + } } diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 796d078dc91..2d3893f90cd 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -3,6 +3,7 @@ use std::sync::Arc; +use async_trait::async_trait; use cudarc::driver::sys; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -22,7 +23,7 @@ use crate::arrow::CudaPrivateData; use crate::arrow::DeviceType; use crate::executor::CudaArrayExt; -// Impl it for the execution context instead here...I think this is right? +#[async_trait] impl CudaDeviceArrayExecute for Canonical { async fn execute( &self, @@ -32,7 +33,7 @@ impl CudaDeviceArrayExecute for Canonical { let cuda_array = array.execute_cuda(ctx).await?; let arrow_array = match cuda_array { - Canonical::Primitive(primitive) => export_primitive(primitive, ctx)?, + Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await?, c => todo!("implement support for exporting {}", c.dtype()), }; @@ -46,19 +47,25 @@ impl CudaDeviceArrayExecute for Canonical { } } -fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> VortexResult { +async fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> VortexResult { + unsafe extern "C" fn release(array: *mut ArrowArray) { + // SAFETY: this is only safe if the caller provides a valid pointer to an `ArrowArray`. + drop(unsafe { Box::from_raw(array) }); + } + let len = array.len(); let PrimitiveArrayParts { buffer, - ptype, validity, .. } = array.into_parts(); - unsafe extern "C" fn release(array: *mut ArrowArray) { - // SAFETY: this is only safe if the caller provides a valid pointer to an `ArrowArray`. - drop(unsafe { Box::from_raw(array) }); - } + let buffer = if buffer.is_on_device() { + buffer + } else { + // TODO(aduffy): I don't think this type parameter does anything + ctx.move_to_device::(buffer)?.await? + }; let null_count = match validity { Validity::NonNullable | Validity::AllValid => 0, @@ -106,4 +113,3 @@ fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> Vortex private_data: Box::into_raw(private_data).cast(), }) } - diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index ff43432dffa..22eb5cdd391 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -14,17 +14,15 @@ use std::ffi::c_void; use std::ptr::NonNull; use std::sync::Arc; +use async_trait::async_trait; use cudarc::driver::CudaStream; use cudarc::driver::sys; use cudarc::runtime::sys::cudaEvent_t; use vortex_array::ArrayRef; -use vortex_array::Executable; use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; use crate::CudaExecutionCtx; -use crate::executor::CudaArrayExt; -use crate::executor::CudaExecute; #[derive(Debug, Copy, Clone)] #[repr(i32)] @@ -53,7 +51,7 @@ pub type SyncEvent = Option>; /// event that the client must wait on. #[repr(C)] #[derive(Debug)] -pub(crate) struct ArrowDeviceArray { +pub struct ArrowDeviceArray { array: ArrowArray, device_id: i64, device_type: DeviceType, @@ -85,6 +83,7 @@ pub(crate) struct ArrowArray { } impl ArrowArray { + #[allow(unused)] pub fn empty() -> Self { Self { length: 0, @@ -101,6 +100,7 @@ impl ArrowArray { } } +#[expect(unused, reason = "cuda_stream and cuda_buffers need to have deferred drop")] pub(crate) struct CudaPrivateData { /// Hold a reference to the CudaStream so that it stays alive even after CudaExecutionCtx /// has been dropped. @@ -113,7 +113,8 @@ pub(crate) struct CudaPrivateData { } /// Trait implemented for types that can be exported to [`ArrowDeviceArray`]. -pub(crate) trait CudaDeviceArrayExecute { +#[async_trait] +pub trait CudaDeviceArrayExecute { async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs index 363f160c815..f1ad5e06af0 100644 --- a/vortex-cuda/src/device_buffer.rs +++ b/vortex-cuda/src/device_buffer.rs @@ -12,7 +12,6 @@ use cudarc::driver::DevicePtr; use cudarc::driver::DeviceRepr; use cudarc::driver::sys; use futures::future::BoxFuture; -use futures::future::ok; use vortex_array::buffer::BufferHandle; use vortex_array::buffer::DeviceBuffer; use vortex_buffer::Alignment; diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index 485fcc80fa8..94c1ea50c30 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -5,7 +5,7 @@ use std::process::Command; -mod arrow; +pub mod arrow; mod canonical; mod device_buffer; pub mod executor; From a31192f6e35c52b9147d28764e8d6f9f95ccaff9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 23:48:20 +0000 Subject: [PATCH 05/22] fix tests Signed-off-by: Andrew Duffy --- vortex-cuda/cudf-test/src/lib.rs | 46 +++++++++++++++--------------- vortex-cuda/src/arrow/canonical.rs | 9 +++--- vortex-cuda/src/arrow/mod.rs | 5 +++- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index 1b6b014001e..b4591dfd70e 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -198,14 +198,13 @@ mod tests { use arrow_array::ffi::FFI_ArrowSchema; use arrow_schema::DataType; use futures::executor::block_on; - use vortex_array::Canonical; + use vortex_array::Array; use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; use vortex_cuda::arrow::CudaDeviceArrayExecute; - use vortex_cuda::executor::CudaArrayExt; use vortex_session::VortexSession; use super::*; @@ -225,7 +224,7 @@ mod tests { } #[test] - fn test_primitive_array_to_cudf_tableview() -> Result<()> { + fn test_primitive_array_to_cudf_columnview() -> Result<()> { // Create a PrimitiveArray with 100 i64 values let data: Vec = (0..100).collect(); let expected_len = data.len(); @@ -233,15 +232,16 @@ mod tests { PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array(); // Create CUDA execution context - let mut cuda_ctx = match CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); - // Export as ArrowDeviceArray using CudaDeviceArrayExecute - let device_array = block_on(Canonical::execute( - &primitive_array, - primitive_array.clone(), - &mut cuda_ctx, - )) - .unwrap(); + // Get canonical form and export as ArrowDeviceArray + let canonical = primitive_array.to_canonical().map_err(|e| CudfError { + message: e.to_string(), + })?; + let device_array = block_on(canonical.execute(primitive_array.clone(), &mut cuda_ctx)) + .map_err(|e| CudfError { + message: e.to_string(), + })?; // Synchronize the CUDA stream to ensure the data is ready cuda_ctx.synchronize_stream().map_err(|e| CudfError { @@ -257,28 +257,28 @@ mod tests { // Create cudf context let cudf_ctx = CudfContext::new()?; - // Import into cudf tableview - let tableview = unsafe { - cudf_ctx.tableview_from_device( + // Import into cudf columnview + let columnview = unsafe { + cudf_ctx.columnview_from_device( (&raw mut ffi_schema).cast::(), (&raw const device_array).cast::(), )? }; // Verify row count - let num_rows = tableview.num_rows()?; - assert_eq!(num_rows, expected_len as i64, "Row count mismatch"); + let size = columnview.size()?; + assert_eq!(size, expected_len as i64, "Size mismatch"); println!( - "Successfully imported PrimitiveArray into cudf tableview with {} rows", - num_rows + "Successfully imported PrimitiveArray into cudf columnview with {} rows", + size ); - // Verify column count (should be 1 for a primitive array) - let num_columns = tableview.num_columns()?; - assert_eq!(num_columns, 1, "Column count mismatch"); - println!("Tableview has {} column(s)", num_columns); + // Verify valid count (should be same as length since NonNullable) + let valid_count = columnview.count_valid()?; + assert_eq!(valid_count, expected_len as i64, "Valid count mismatch"); + println!("Columnview has {} valid values", valid_count); - // Tableview and cudf_ctx will be deallocated automatically via Drop + // Columnview and cudf_ctx will be deallocated automatically via Drop Ok(()) } diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 2d3893f90cd..c0a902454b3 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -47,7 +47,10 @@ impl CudaDeviceArrayExecute for Canonical { } } -async fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> VortexResult { +async fn export_primitive( + array: PrimitiveArray, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { unsafe extern "C" fn release(array: *mut ArrowArray) { // SAFETY: this is only safe if the caller provides a valid pointer to an `ArrowArray`. drop(unsafe { Box::from_raw(array) }); @@ -55,9 +58,7 @@ async fn export_primitive(array: PrimitiveArray, ctx: &mut CudaExecutionCtx) -> let len = array.len(); let PrimitiveArrayParts { - buffer, - validity, - .. + buffer, validity, .. } = array.into_parts(); let buffer = if buffer.is_on_device() { diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 22eb5cdd391..89a476ee54e 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -100,7 +100,10 @@ impl ArrowArray { } } -#[expect(unused, reason = "cuda_stream and cuda_buffers need to have deferred drop")] +#[expect( + unused, + reason = "cuda_stream and cuda_buffers need to have deferred drop" +)] pub(crate) struct CudaPrivateData { /// Hold a reference to the CudaStream so that it stays alive even after CudaExecutionCtx /// has been dropped. From da8c68081663a3f8a2d3bf074ca1ff61c3a34622 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 12:10:48 -0500 Subject: [PATCH 06/22] delete a bunch of code, add some more to the test Signed-off-by: Andrew Duffy --- Cargo.lock | 6 +- vortex-cuda/cudf-test/Cargo.toml | 14 +- vortex-cuda/cudf-test/build.rs | 80 ----- vortex-cuda/cudf-test/cpp/CMakeLists.txt | 46 --- vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp | 275 ---------------- vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h | 142 -------- vortex-cuda/cudf-test/src/lib.rs | 320 +++---------------- vortex-cuda/src/arrow/canonical.rs | 14 +- vortex-cuda/src/arrow/mod.rs | 33 +- vortex-cuda/src/executor.rs | 6 + vortex-cuda/src/lib.rs | 2 + vortex-cuda/src/session.rs | 11 + 12 files changed, 110 insertions(+), 839 deletions(-) delete mode 100644 vortex-cuda/cudf-test/build.rs delete mode 100644 vortex-cuda/cudf-test/cpp/CMakeLists.txt delete mode 100644 vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp delete mode 100644 vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h diff --git a/Cargo.lock b/Cargo.lock index a8991c396d3..8e4626e8fad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10467,14 +10467,10 @@ dependencies = [ name = "vortex-cudf-test" version = "0.1.0" dependencies = [ - "arrow-array 57.2.0", "arrow-schema 57.2.0", - "bindgen", "futures", - "vortex-array", - "vortex-buffer", + "vortex", "vortex-cuda", - "vortex-session", ] [[package]] diff --git a/vortex-cuda/cudf-test/Cargo.toml b/vortex-cuda/cudf-test/Cargo.toml index 84d92f7cec4..6bac8c4e4cd 100644 --- a/vortex-cuda/cudf-test/Cargo.toml +++ b/vortex-cuda/cudf-test/Cargo.toml @@ -11,22 +11,20 @@ categories = { workspace = true } include = { workspace = true } keywords = { workspace = true } license = { workspace = true } +publish = false readme = { workspace = true } repository = { workspace = true } rust-version = { workspace = true } version = { workspace = true } +[lib] +crate-type = ["cdylib"] + [lints] workspace = true [dependencies] -arrow-array = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true, features = ["ffi"] } futures = { workspace = true, features = ["executor"] } -vortex-array = { workspace = true } -vortex-buffer = { workspace = true } -vortex-cuda = { path = "..", features = ["_test-harness"] } -vortex-session = { workspace = true } - -[build-dependencies] -bindgen = { workspace = true } +vortex = { workspace = true } +vortex-cuda = { workspace = true, features = ["_test-harness"] } diff --git a/vortex-cuda/cudf-test/build.rs b/vortex-cuda/cudf-test/build.rs deleted file mode 100644 index 8b4dea3136c..00000000000 --- a/vortex-cuda/cudf-test/build.rs +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -// Build scripts use expect/panic to fail the build with clear error messages -#![allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)] - -use std::env; -use std::path::PathBuf; -use std::process::Command; - -fn main() { - let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); - let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); - let cpp_dir = manifest_dir.join("cpp"); - - // Create build directory - let build_dir = out_dir.join("cmake_build"); - std::fs::create_dir_all(&build_dir).expect("Failed to create build directory"); - - // Get conda prefix for finding cudf - let conda_prefix = env::var("CONDA_PREFIX").ok(); - - // Configure CMake - let mut cmake_cmd = Command::new("cmake"); - cmake_cmd - .current_dir(&build_dir) - .arg(&cpp_dir) - .arg("-DCMAKE_BUILD_TYPE=Release"); - - // Add conda prefix to CMAKE_PREFIX_PATH if available - if let Some(prefix) = &conda_prefix { - cmake_cmd.arg(format!("-DCMAKE_PREFIX_PATH={}", prefix)); - } - - let status = cmake_cmd.status().expect("Failed to run cmake configure"); - - assert!(status.success(), "CMake configure failed"); - - // Build - let status = Command::new("cmake") - .current_dir(&build_dir) - .args(["--build", ".", "--config", "Release", "-j"]) - .status() - .expect("Failed to run cmake build"); - - assert!(status.success(), "CMake build failed"); - - // Tell cargo where to find the library - println!("cargo:rustc-link-search=native={}", build_dir.display()); - println!("cargo:rustc-link-lib=dylib=cudf_arrow_ffi"); - - // Also link to cudf and its dependencies - if let Some(prefix) = &conda_prefix { - println!("cargo:rustc-link-search=native={}/lib", prefix); - } - - // Rebuild if C++ sources change - println!("cargo:rerun-if-changed=cpp/cudf_arrow_ffi.cpp"); - println!("cargo:rerun-if-changed=cpp/cudf_arrow_ffi.h"); - println!("cargo:rerun-if-changed=cpp/CMakeLists.txt"); - - // Generate bindings using bindgen - let bindings = bindgen::Builder::default() - .header(cpp_dir.join("cudf_arrow_ffi.h").to_string_lossy()) - .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) - .allowlist_function("cudf_.*") - .allowlist_type("CudfResult") - .allowlist_type("CudfErrorCode") - .allowlist_type("ArrowSchema") - .allowlist_type("ArrowArray") - .allowlist_type("ArrowDeviceArray") - .allowlist_type("ArrowDeviceType") - .allowlist_var("ARROW_DEVICE_.*") - .generate() - .expect("Unable to generate bindings"); - - bindings - .write_to_file(out_dir.join("bindings.rs")) - .expect("Couldn't write bindings!"); -} diff --git a/vortex-cuda/cudf-test/cpp/CMakeLists.txt b/vortex-cuda/cudf-test/cpp/CMakeLists.txt deleted file mode 100644 index 5d310446beb..00000000000 --- a/vortex-cuda/cudf-test/cpp/CMakeLists.txt +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) - -project(cudf_arrow_ffi LANGUAGES CXX CUDA) - -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -# Find RAPIDS dependencies -find_package(cudf REQUIRED) -find_package(rmm REQUIRED) - -# Create the shared library -add_library(cudf_arrow_ffi SHARED - cudf_arrow_ffi.cpp -) - -target_include_directories(cudf_arrow_ffi - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} -) - -target_link_libraries(cudf_arrow_ffi - PUBLIC - cudf::cudf - rmm::rmm -) - -# Set output directory to parent directory for easier linking from Rust -set_target_properties(cudf_arrow_ffi PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" -) - -# Install the library -install(TARGETS cudf_arrow_ffi - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin -) - -install(FILES cudf_arrow_ffi.h - DESTINATION include -) diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp deleted file mode 100644 index 1e6efee56f4..00000000000 --- a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.cpp +++ /dev/null @@ -1,275 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -#include "cudf_arrow_ffi.h" - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -// Internal struct definitions for opaque types - -struct cudf_context { - std::unique_ptr cuda_mr; -}; - -struct cudf_tableview { - cudf::unique_table_view_t view; - - explicit cudf_tableview(cudf::unique_table_view_t v) : view(std::move(v)) {} -}; - -struct cudf_columnview { - cudf::unique_column_view_t view; - - explicit cudf_columnview(cudf::unique_column_view_t v) : view(std::move(v)) {} -}; - -// Helper to create an error string -static cudf_err_t make_error(const char* msg) { - return strdup(msg); -} - -static cudf_err_t make_error(const std::string& msg) { - return strdup(msg.c_str()); -} - -extern "C" { - -cudf_err_t cudf_context_create(cudf_context_t** ctx) { - if (!ctx) { - return make_error("ctx pointer is null"); - } - - try { - auto context = std::make_unique(); - context->cuda_mr = std::make_unique(); - rmm::mr::set_current_device_resource(context->cuda_mr.get()); - *ctx = context.release(); - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -void cudf_context_free(cudf_context_t* ctx) { - delete ctx; -} - -cudf_err_t cudf_tableview_from_device( - cudf_context_t* ctx, - const ArrowSchema* schema, - const ArrowDeviceArray* device_array, - cudf_tableview_t** out -) { - if (!ctx) { - return make_error("context is null"); - } - if (!schema || !device_array) { - return make_error("schema or device_array is null"); - } - if (!out) { - return make_error("out pointer is null"); - } - - try { - auto view = cudf::from_arrow_device(schema, device_array); - *out = new cudf_tableview(std::move(view)); - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_columnview_from_device( - cudf_context_t* ctx, - const ArrowSchema* schema, - const ArrowDeviceArray* device_array, - cudf_columnview_t** out -) { - if (!ctx) { - return make_error("context is null"); - } - if (!schema || !device_array) { - return make_error("schema or device_array is null"); - } - if (!out) { - return make_error("out pointer is null"); - } - - try { - auto view = cudf::from_arrow_device_column(schema, device_array); - *out = new cudf_columnview(std::move(view)); - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_tableview_num_rows(const cudf_tableview_t* tv, int64_t* count) { - if (!tv) { - return make_error("table view is null"); - } - if (!count) { - return make_error("count pointer is null"); - } - - try { - *count = static_cast(tv->view->num_rows()); - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_tableview_num_columns(const cudf_tableview_t* tv, int32_t* count) { - if (!tv) { - return make_error("table view is null"); - } - if (!count) { - return make_error("count pointer is null"); - } - - try { - *count = static_cast(tv->view->num_columns()); - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_columnview_size(const cudf_columnview_t* cv, int64_t* count) { - if (!cv) { - return make_error("column view is null"); - } - if (!count) { - return make_error("count pointer is null"); - } - - try { - *count = static_cast(cv->view->size()); - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_tableview_count_valid(const cudf_tableview_t* tv, int32_t column_index, int64_t* valid_count) { - if (!tv) { - return make_error("table view is null"); - } - if (!valid_count) { - return make_error("valid_count pointer is null"); - } - - try { - if (column_index < 0 || column_index >= tv->view->num_columns()) { - return make_error("column index out of bounds"); - } - - auto col_view = tv->view->column(column_index); - auto agg = cudf::make_count_aggregation(); - auto result = cudf::reduce(col_view, *agg, cudf::data_type{cudf::type_id::INT64}); - - auto* int_scalar = static_cast*>(result.get()); - *valid_count = int_scalar->value(); - - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_columnview_count_valid(const cudf_columnview_t* cv, int64_t* valid_count) { - if (!cv) { - return make_error("column view is null"); - } - if (!valid_count) { - return make_error("valid_count pointer is null"); - } - - try { - auto agg = cudf::make_count_aggregation(); - auto result = cudf::reduce(*cv->view, *agg, cudf::data_type{cudf::type_id::INT64}); - - auto* int_scalar = static_cast*>(result.get()); - *valid_count = int_scalar->value(); - - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_tableview_sum_int64(const cudf_tableview_t* tv, int32_t column_index, int64_t* sum) { - if (!tv) { - return make_error("table view is null"); - } - if (!sum) { - return make_error("sum pointer is null"); - } - - try { - if (column_index < 0 || column_index >= tv->view->num_columns()) { - return make_error("column index out of bounds"); - } - - auto col_view = tv->view->column(column_index); - auto agg = cudf::make_sum_aggregation(); - auto result = cudf::reduce(col_view, *agg, cudf::data_type{cudf::type_id::INT64}); - - auto* int_scalar = static_cast*>(result.get()); - *sum = int_scalar->value(); - - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -cudf_err_t cudf_columnview_sum_int64(const cudf_columnview_t* cv, int64_t* sum) { - if (!cv) { - return make_error("column view is null"); - } - if (!sum) { - return make_error("sum pointer is null"); - } - - try { - auto agg = cudf::make_sum_aggregation(); - auto result = cudf::reduce(*cv->view, *agg, cudf::data_type{cudf::type_id::INT64}); - - auto* int_scalar = static_cast*>(result.get()); - *sum = int_scalar->value(); - - return nullptr; - } catch (const std::exception& e) { - return make_error(e.what()); - } -} - -void cudf_tableview_free(cudf_tableview_t* tv) { - delete tv; -} - -void cudf_columnview_free(cudf_columnview_t* cv) { - delete cv; -} - -void cudf_err_free(cudf_err_t err) { - if (err) { - free(const_cast(err)); - } -} - -} // extern "C" diff --git a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h b/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h deleted file mode 100644 index 89a02019ba4..00000000000 --- a/vortex-cuda/cudf-test/cpp/cudf_arrow_ffi.h +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -#ifndef CUDF_ARROW_FFI_H -#define CUDF_ARROW_FFI_H - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Arrow C Device Data Interface structures -// These match the Arrow specification for device data exchange - -struct ArrowSchema { - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - void (*release)(struct ArrowSchema*); - void* private_data; -}; - -struct ArrowArray { - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - void (*release)(struct ArrowArray*); - void* private_data; -}; - -// Arrow Device type constants -typedef int32_t ArrowDeviceType; -#define ARROW_DEVICE_CPU 1 -#define ARROW_DEVICE_CUDA 2 -#define ARROW_DEVICE_CUDA_HOST 3 -#define ARROW_DEVICE_OPENCL 4 -#define ARROW_DEVICE_VULKAN 7 -#define ARROW_DEVICE_METAL 8 -#define ARROW_DEVICE_VPI 9 -#define ARROW_DEVICE_ROCM 10 -#define ARROW_DEVICE_ROCM_HOST 11 -#define ARROW_DEVICE_EXT_DEV 12 -#define ARROW_DEVICE_CUDA_MANAGED 13 -#define ARROW_DEVICE_ONEAPI 14 -#define ARROW_DEVICE_WEBGPU 15 -#define ARROW_DEVICE_HEXAGON 16 - -struct ArrowDeviceArray { - struct ArrowArray array; - int64_t device_id; - ArrowDeviceType device_type; - void* sync_event; - int64_t reserved[3]; -}; - -// Error type: NULL on success, pointer to error string on failure. -// Caller must free with cudf_err_free() when non-NULL. -typedef const char* cudf_err_t; - -// Opaque context type that holds CUDA memory resources and global state. -typedef struct cudf_context cudf_context_t; - -// Opaque table view type wrapping cudf::unique_table_view_t -typedef struct cudf_tableview cudf_tableview_t; - -// Opaque column view type wrapping cudf::unique_column_view_t -typedef struct cudf_columnview cudf_columnview_t; - -// Create a new cudf context and initialize RMM. -// On success, *ctx is set to the new context and NULL is returned. -// On failure, *ctx is unchanged and an error string is returned. -cudf_err_t cudf_context_create(cudf_context_t** ctx); - -// Free a cudf context and all associated resources. -void cudf_context_free(cudf_context_t* ctx); - -// Import an Arrow table from device memory into a cudf table view. -// On success, *out is set to the new table view and NULL is returned. -// On failure, *out is unchanged and an error string is returned. -cudf_err_t cudf_tableview_from_device( - cudf_context_t* ctx, - const struct ArrowSchema* schema, - const struct ArrowDeviceArray* device_array, - cudf_tableview_t** out -); - -// Import an Arrow column from device memory into a cudf column view. -// On success, *out is set to the new column view and NULL is returned. -// On failure, *out is unchanged and an error string is returned. -cudf_err_t cudf_columnview_from_device( - cudf_context_t* ctx, - const struct ArrowSchema* schema, - const struct ArrowDeviceArray* device_array, - cudf_columnview_t** out -); - -// Get the number of rows in a table view. -cudf_err_t cudf_tableview_num_rows(const cudf_tableview_t* tv, int64_t* count); - -// Get the number of columns in a table view. -cudf_err_t cudf_tableview_num_columns(const cudf_tableview_t* tv, int32_t* count); - -// Get the number of rows in a column view. -cudf_err_t cudf_columnview_size(const cudf_columnview_t* cv, int64_t* count); - -// Count valid (non-null) values in a table column. -cudf_err_t cudf_tableview_count_valid(const cudf_tableview_t* tv, int32_t column_index, int64_t* valid_count); - -// Count valid (non-null) values in a column view. -cudf_err_t cudf_columnview_count_valid(const cudf_columnview_t* cv, int64_t* valid_count); - -// Sum values in an int64 table column. -cudf_err_t cudf_tableview_sum_int64(const cudf_tableview_t* tv, int32_t column_index, int64_t* sum); - -// Sum values in an int64 column view. -cudf_err_t cudf_columnview_sum_int64(const cudf_columnview_t* cv, int64_t* sum); - -// Free a table view. -void cudf_tableview_free(cudf_tableview_t* tv); - -// Free a column view. -void cudf_columnview_free(cudf_columnview_t* cv); - -// Free an error string. -void cudf_err_free(cudf_err_t err); - -#ifdef __cplusplus -} -#endif - -#endif // CUDF_ARROW_FFI_H diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index b4591dfd70e..76aab24f758 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -1,285 +1,49 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Rust bindings for cudf Arrow Device FFI operations. -//! -//! This crate provides a safe Rust interface to cudf's Arrow Device data -//! import functionality, allowing GPU data to be passed directly to cudf -//! for processing. - -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] - -use std::ffi::CStr; -use std::fmt; -use std::ptr; - -// Include the generated bindings -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); - -/// Error type for cudf operations. -#[derive(Debug)] -pub struct CudfError { - pub message: String, -} - -impl fmt::Display for CudfError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "CudfError: {}", self.message) - } -} - -impl std::error::Error for CudfError {} - -/// Result type for cudf operations. -pub type Result = std::result::Result; - -/// Check a cudf_err_t and convert to Result. -fn check_err(err: cudf_err_t) -> Result<()> { - if err.is_null() { - Ok(()) - } else { - let message = unsafe { CStr::from_ptr(err) } - .to_string_lossy() - .into_owned(); - unsafe { cudf_err_free(err) }; - Err(CudfError { message }) - } -} - -/// RAII wrapper for cudf_context_t. -pub struct CudfContext { - ctx: *mut cudf_context_t, -} - -impl CudfContext { - /// Create a new cudf context and initialize RMM. - pub fn new() -> Result { - let mut ctx: *mut cudf_context_t = ptr::null_mut(); - let err = unsafe { cudf_context_create(&raw mut ctx) }; - check_err(err)?; - Ok(Self { ctx }) - } - - /// Import an Arrow table from device memory into a cudf table view. - /// - /// # Safety - /// - /// The schema and device_array must be valid Arrow C Device Data Interface structures - /// with device memory pointers. - pub unsafe fn tableview_from_device( - &self, - schema: *const ArrowSchema, - device_array: *const ArrowDeviceArray, - ) -> Result { - let mut tv: *mut cudf_tableview_t = ptr::null_mut(); - let err = - unsafe { cudf_tableview_from_device(self.ctx, schema, device_array, &raw mut tv) }; - check_err(err)?; - Ok(CudfTableView { tv }) - } - - /// Import an Arrow column from device memory into a cudf column view. - /// - /// # Safety - /// - /// The schema and device_array must be valid Arrow C Data Interface structures - /// with device memory pointers. - pub unsafe fn columnview_from_device( - &self, - schema: *const ArrowSchema, - device_array: *const ArrowDeviceArray, - ) -> Result { - let mut cv: *mut cudf_columnview_t = ptr::null_mut(); - let err = - unsafe { cudf_columnview_from_device(self.ctx, schema, device_array, &raw mut cv) }; - check_err(err)?; - Ok(CudfColumnView { cv }) - } -} - -impl Drop for CudfContext { - fn drop(&mut self) { - if !self.ctx.is_null() { - unsafe { cudf_context_free(self.ctx) }; - } - } -} - -/// RAII wrapper for cudf_tableview_t. -pub struct CudfTableView { - tv: *mut cudf_tableview_t, -} - -impl CudfTableView { - /// Get the number of rows in the table. - pub fn num_rows(&self) -> Result { - let mut count: i64 = 0; - let err = unsafe { cudf_tableview_num_rows(self.tv, &raw mut count) }; - check_err(err)?; - Ok(count) - } - - /// Get the number of columns in the table. - pub fn num_columns(&self) -> Result { - let mut count: i32 = 0; - let err = unsafe { cudf_tableview_num_columns(self.tv, &raw mut count) }; - check_err(err)?; - Ok(count) - } - - /// Count valid (non-null) values in a column. - pub fn count_valid(&self, column_index: i32) -> Result { - let mut count: i64 = 0; - let err = unsafe { cudf_tableview_count_valid(self.tv, column_index, &raw mut count) }; - check_err(err)?; - Ok(count) - } - - /// Sum values in an int64 column. - pub fn sum_int64(&self, column_index: i32) -> Result { - let mut sum: i64 = 0; - let err = unsafe { cudf_tableview_sum_int64(self.tv, column_index, &raw mut sum) }; - check_err(err)?; - Ok(sum) - } -} - -impl Drop for CudfTableView { - fn drop(&mut self) { - if !self.tv.is_null() { - unsafe { cudf_tableview_free(self.tv) }; - } - } -} - -/// RAII wrapper for cudf_columnview_t. -pub struct CudfColumnView { - cv: *mut cudf_columnview_t, -} - -impl CudfColumnView { - /// Get the number of rows in the column. - pub fn size(&self) -> Result { - let mut count: i64 = 0; - let err = unsafe { cudf_columnview_size(self.cv, &raw mut count) }; - check_err(err)?; - Ok(count) - } - - /// Count valid (non-null) values in the column. - pub fn count_valid(&self) -> Result { - let mut count: i64 = 0; - let err = unsafe { cudf_columnview_count_valid(self.cv, &raw mut count) }; - check_err(err)?; - Ok(count) - } - - /// Sum values in the column (int64). - pub fn sum_int64(&self) -> Result { - let mut sum: i64 = 0; - let err = unsafe { cudf_columnview_sum_int64(self.cv, &raw mut sum) }; - check_err(err)?; - Ok(sum) - } -} - -impl Drop for CudfColumnView { - fn drop(&mut self) { - if !self.cv.is_null() { - unsafe { cudf_columnview_free(self.cv) }; +//! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. + +use std::sync::LazyLock; + +use futures::executor::block_on; +use vortex::array::IntoArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::session::ArraySession; +use vortex::error::VortexResult; +use vortex::expr::session::ExprSession; +use vortex::io::session::RuntimeSession; +use vortex::layout::session::LayoutSession; +use vortex::metrics::VortexMetrics; +use vortex::session::VortexSession; +use vortex_cuda::CudaSession; +use vortex_cuda::arrow::ArrowDeviceArray; +use vortex_cuda::arrow::DeviceArrayExt; + +static SESSION: LazyLock = LazyLock::new(|| { + VortexSession::empty() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::() +}); + +/// External array +#[unsafe(no_mangle)] +pub extern "C" fn export_array(array_ptr: &mut ArrowDeviceArray) -> i32 { + let mut ctx = CudaSession::create_execution_ctx(&SESSION).unwrap(); + + let primitive = PrimitiveArray::from_iter(0u32..1024); + + match block_on(primitive.into_array().export_device_array(&mut ctx)) { + Ok(exported) => { + *array_ptr = exported; + 0 } - } -} - -#[cfg(test)] -mod tests { - use arrow_array::ffi::FFI_ArrowSchema; - use arrow_schema::DataType; - use futures::executor::block_on; - use vortex_array::Array; - use vortex_array::IntoArray; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_cuda::CudaSession; - use vortex_cuda::arrow::CudaDeviceArrayExecute; - use vortex_session::VortexSession; - - use super::*; - - #[test] - fn test_context_create() -> Result<()> { - match CudfContext::new() { - Ok(_ctx) => { - println!("cudf context created successfully"); - Ok(()) - } - Err(e) => { - println!("cudf context creation failed (expected without GPU): {}", e); - Ok(()) - } + Err(err) => { + eprintln!("error in export_device_array: {err}"); + 1 } } - - #[test] - fn test_primitive_array_to_cudf_columnview() -> Result<()> { - // Create a PrimitiveArray with 100 i64 values - let data: Vec = (0..100).collect(); - let expected_len = data.len(); - let primitive_array = - PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array(); - - // Create CUDA execution context - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); - - // Get canonical form and export as ArrowDeviceArray - let canonical = primitive_array.to_canonical().map_err(|e| CudfError { - message: e.to_string(), - })?; - let device_array = block_on(canonical.execute(primitive_array.clone(), &mut cuda_ctx)) - .map_err(|e| CudfError { - message: e.to_string(), - })?; - - // Synchronize the CUDA stream to ensure the data is ready - cuda_ctx.synchronize_stream().map_err(|e| CudfError { - message: e.to_string(), - })?; - - // Create FFI_ArrowSchema from the data type - let mut ffi_schema = - FFI_ArrowSchema::try_from(&DataType::Int64).map_err(|e| CudfError { - message: format!("Failed to create FFI schema: {}", e), - })?; - - // Create cudf context - let cudf_ctx = CudfContext::new()?; - - // Import into cudf columnview - let columnview = unsafe { - cudf_ctx.columnview_from_device( - (&raw mut ffi_schema).cast::(), - (&raw const device_array).cast::(), - )? - }; - - // Verify row count - let size = columnview.size()?; - assert_eq!(size, expected_len as i64, "Size mismatch"); - println!( - "Successfully imported PrimitiveArray into cudf columnview with {} rows", - size - ); - - // Verify valid count (should be same as length since NonNullable) - let valid_count = columnview.count_valid()?; - assert_eq!(valid_count, expected_len as i64, "Valid count mismatch"); - println!("Columnview has {} valid values", valid_count); - - // Columnview and cudf_ctx will be deallocated automatically via Drop - - Ok(()) - } } diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index c0a902454b3..3f1c4d09602 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -18,14 +18,20 @@ use crate::CudaBufferExt; use crate::CudaExecutionCtx; use crate::arrow::ArrowArray; use crate::arrow::ArrowDeviceArray; -use crate::arrow::CudaDeviceArrayExecute; use crate::arrow::CudaPrivateData; use crate::arrow::DeviceType; +use crate::arrow::ExportDeviceArray; use crate::executor::CudaArrayExt; +/// An implementation of `ExportDeviceArray` that exports Vortex arrays to `ArrowDeviceArray` by +/// first decoding the array on the GPU and then converting the canonical type to the nearest +/// Arrow equivalent. +#[derive(Debug)] +pub(crate) struct CanonicalDeviceArrayExport; + #[async_trait] -impl CudaDeviceArrayExecute for Canonical { - async fn execute( +impl ExportDeviceArray for CanonicalDeviceArrayExport { + async fn export_device_array( &self, array: ArrayRef, ctx: &mut CudaExecutionCtx, @@ -34,6 +40,8 @@ impl CudaDeviceArrayExecute for Canonical { let arrow_array = match cuda_array { Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await?, + // Canonical::Decimal(decimal) => todo!("export decimal"), + // Canonical::VarBinView(varbinview) => todo!("export varbinview"), c => todo!("implement support for exporting {}", c.dtype()), }; diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 89a476ee54e..dc19cc4d134 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -11,13 +11,16 @@ mod canonical; use std::ffi::c_void; +use std::fmt::Debug; use std::ptr::NonNull; use std::sync::Arc; use async_trait::async_trait; +pub(crate) use canonical::CanonicalDeviceArrayExport; use cudarc::driver::CudaStream; use cudarc::driver::sys; use cudarc::runtime::sys::cudaEvent_t; +use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; @@ -115,10 +118,36 @@ pub(crate) struct CudaPrivateData { pub(crate) buffer_ptrs: Box<[sys::CUdeviceptr]>, } +#[async_trait] +pub trait DeviceArrayExt: Array { + async fn export_device_array( + self, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult; +} + +#[async_trait] +impl DeviceArrayExt for ArrayRef { + async fn export_device_array( + self, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult { + let exporter = Arc::clone(ctx.exporter()); + exporter.export_device_array(self, ctx).await + } +} + /// Trait implemented for types that can be exported to [`ArrowDeviceArray`]. #[async_trait] -pub trait CudaDeviceArrayExecute { - async fn execute( +pub trait ExportDeviceArray: Debug + Send + Sync + 'static { + /// Export a Vortex array as an [`ArrowDeviceArray`]. + /// + /// The Arrow Device Array is part of the Arrow C Data Device Interface extension to the Arrow + /// specification. It enables passing Vortex arrays to other processes that consume Arrow + /// arrays, such as cudf. + /// + /// See . + async fn export_device_array( &self, array: ArrayRef, ctx: &mut CudaExecutionCtx, diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 426d428d163..7c2ed2a1436 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -27,6 +27,7 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use crate::CudaSession; +use crate::ExportDeviceArray; use crate::session::CudaSessionExt; use crate::stream::VortexCudaStream; @@ -160,6 +161,11 @@ impl CudaExecutionCtx { pub fn stream(&self) -> &Arc { &self.stream.0 } + + /// Get a handle to the exporter that can convert arrays into `ArrowDeviceArray`. + pub fn exporter(&self) -> &Arc { + self.cuda_session.export_device_array() + } } /// Support trait for CUDA-accelerated decompression of arrays. diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index 94c1ea50c30..a1b938e6f95 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -15,6 +15,8 @@ mod session; mod stream; mod stream_pool; +pub(crate) use arrow::CanonicalDeviceArrayExport; +pub use arrow::ExportDeviceArray; pub use canonical::CanonicalCudaExt; pub use device_buffer::CudaBufferExt; pub use device_buffer::CudaDeviceBuffer; diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index b21901aab18..d28774383d9 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -5,6 +5,7 @@ use std::fmt::Debug; use std::sync::Arc; use cudarc::driver::CudaContext; +use vortex_array::ArrayRef; use vortex_array::VortexSessionExecute; use vortex_array::vtable::ArrayId; use vortex_error::VortexResult; @@ -12,6 +13,9 @@ use vortex_session::Ref; use vortex_session::SessionExt; use vortex_utils::aliases::dash_map::DashMap; +use crate::ExportDeviceArray; +use crate::arrow::ArrowDeviceArray; +use crate::arrow::CanonicalDeviceArrayExport; use crate::executor::CudaExecute; pub use crate::executor::CudaExecutionCtx; use crate::kernel::KernelLoader; @@ -29,6 +33,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4; pub struct CudaSession { context: Arc, kernels: Arc>, + export_device_array: Arc, kernel_loader: Arc, stream_pool: Arc, } @@ -52,6 +57,7 @@ impl CudaSession { context, kernels: Arc::new(DashMap::default()), kernel_loader: Arc::new(KernelLoader::new()), + export_device_array: Arc::new(CanonicalDeviceArrayExport), stream_pool, } } @@ -116,6 +122,11 @@ impl CudaSession { self.kernel_loader .load_function(module_name, type_suffixes, &self.context) } + + /// Get a handle to the exporter that converts Vortex arrays to [`A`rrowDeviceArray`]. + pub fn export_device_array(&self) -> &Arc { + &self.export_device_array + } } impl Default for CudaSession { From fc53a448d8a52360a0c3a723da45c4d2ad1ac0f9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 12:18:27 -0500 Subject: [PATCH 07/22] cleanup Signed-off-by: Andrew Duffy --- vortex-cuda/cudf-test/src/lib.rs | 13 ++++++++++--- vortex-cuda/src/lib.rs | 1 - vortex-cuda/src/session.rs | 2 -- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index 76aab24f758..4ab11a701b4 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -3,13 +3,15 @@ //! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. -use std::sync::LazyLock; +#![allow(clippy::unwrap_used)] +use arrow_schema::DataType; +use arrow_schema::ffi::FFI_ArrowSchema; use futures::executor::block_on; +use std::sync::LazyLock; use vortex::array::IntoArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::session::ArraySession; -use vortex::error::VortexResult; use vortex::expr::session::ExprSession; use vortex::io::session::RuntimeSession; use vortex::layout::session::LayoutSession; @@ -31,11 +33,16 @@ static SESSION: LazyLock = LazyLock::new(|| { /// External array #[unsafe(no_mangle)] -pub extern "C" fn export_array(array_ptr: &mut ArrowDeviceArray) -> i32 { +pub extern "C" fn export_array( + schema_ptr: &mut FFI_ArrowSchema, + array_ptr: &mut ArrowDeviceArray, +) -> i32 { let mut ctx = CudaSession::create_execution_ctx(&SESSION).unwrap(); let primitive = PrimitiveArray::from_iter(0u32..1024); + *schema_ptr = FFI_ArrowSchema::try_from(DataType::UInt32).unwrap(); + match block_on(primitive.into_array().export_device_array(&mut ctx)) { Ok(exported) => { *array_ptr = exported; diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index a1b938e6f95..9221b2a1288 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -15,7 +15,6 @@ mod session; mod stream; mod stream_pool; -pub(crate) use arrow::CanonicalDeviceArrayExport; pub use arrow::ExportDeviceArray; pub use canonical::CanonicalCudaExt; pub use device_buffer::CudaBufferExt; diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index d28774383d9..f28380c7bb3 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -5,7 +5,6 @@ use std::fmt::Debug; use std::sync::Arc; use cudarc::driver::CudaContext; -use vortex_array::ArrayRef; use vortex_array::VortexSessionExecute; use vortex_array::vtable::ArrayId; use vortex_error::VortexResult; @@ -14,7 +13,6 @@ use vortex_session::SessionExt; use vortex_utils::aliases::dash_map::DashMap; use crate::ExportDeviceArray; -use crate::arrow::ArrowDeviceArray; use crate::arrow::CanonicalDeviceArrayExport; use crate::executor::CudaExecute; pub use crate::executor::CudaExecutionCtx; From 2c94c6931d4cad285d4a23f0ebab6afe79aab05d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 12:22:39 -0500 Subject: [PATCH 08/22] drop impl Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/mod.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index dc19cc4d134..98fba922285 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -103,6 +103,15 @@ impl ArrowArray { } } +impl Drop for ArrowArray { + fn drop(&mut self) { + // SAFETY: this is only safe if we're dropping an ArrowArray that was created from Rust + // code. This is necessary to ensure that the fields inside of the CudaPrivateData + // get dropped to free native/GPU memory. + drop(unsafe { Box::from_raw(self.private_data.cast::()) }) + } +} + #[expect( unused, reason = "cuda_stream and cuda_buffers need to have deferred drop" From 0990251e258e715dbed4924bc0551612da01924c Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 12:33:33 -0500 Subject: [PATCH 09/22] free the right way Signed-off-by: Andrew Duffy --- vortex-cuda/cudf-test/src/lib.rs | 3 ++- vortex-cuda/src/arrow/canonical.rs | 14 +++++++++++--- vortex-cuda/src/arrow/mod.rs | 9 --------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index 4ab11a701b4..5c1767c302f 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -5,10 +5,11 @@ #![allow(clippy::unwrap_used)] +use std::sync::LazyLock; + use arrow_schema::DataType; use arrow_schema::ffi::FFI_ArrowSchema; use futures::executor::block_on; -use std::sync::LazyLock; use vortex::array::IntoArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::session::ArraySession; diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 3f1c4d09602..7a685db9028 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -60,10 +60,18 @@ async fn export_primitive( ctx: &mut CudaExecutionCtx, ) -> VortexResult { unsafe extern "C" fn release(array: *mut ArrowArray) { - // SAFETY: this is only safe if the caller provides a valid pointer to an `ArrowArray`. - drop(unsafe { Box::from_raw(array) }); - } + // SAFETY: this is only safe if we're dropping an ArrowArray that was created from Rust + // code. This is necessary to ensure that the fields inside the CudaPrivateData + // get dropped to free native/GPU memory. + unsafe { + let private_data_ptr = + std::ptr::replace(&raw mut (*array).private_data, std::ptr::null_mut()); + drop(Box::from_raw(private_data_ptr.cast::())); + // update the release function to NULL to avoid any possibility of double-frees. + (*array).release = None; + } + } let len = array.len(); let PrimitiveArrayParts { buffer, validity, .. diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 98fba922285..dc19cc4d134 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -103,15 +103,6 @@ impl ArrowArray { } } -impl Drop for ArrowArray { - fn drop(&mut self) { - // SAFETY: this is only safe if we're dropping an ArrowArray that was created from Rust - // code. This is necessary to ensure that the fields inside of the CudaPrivateData - // get dropped to free native/GPU memory. - drop(unsafe { Box::from_raw(self.private_data.cast::()) }) - } -} - #[expect( unused, reason = "cuda_stream and cuda_buffers need to have deferred drop" From 1495dbcdc7f7efe3fa4c3fdd42c05496b72dc282 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 12:52:54 -0500 Subject: [PATCH 10/22] add CI step to validate cudf compatibility Signed-off-by: Andrew Duffy --- .github/workflows/ci.yml | 7 +++++++ vortex-cuda/src/session.rs | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3b0b7cd5635..1f4bdf0c259 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -537,6 +537,13 @@ jobs: --no-fail-fast \ --target x86_64-unknown-linux-gnu \ --verbose + - name: Build cudf-test library + run: cargo +nightly build --locked -p vortex-cudf-test --target x86_64-unknown-linux-gnu + - name: Download and run cudf-test-harness + run: | + curl -fsSL https://github.com/vortex-data/cudf-test-harness/releases/latest/download/cudf-test-harness-x86_64.tar.gz | tar -xz + cd cudf-test-harness-x86_64 + ./cudf-test-harness check $GITHUB_WORKSPACE/target/x86_64-unknown-linux-gnu/debug/libvortex_cudf_test.so rust-test-other: name: "Rust tests (${{ matrix.os }})" diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index f28380c7bb3..1f95430e6ae 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -121,7 +121,7 @@ impl CudaSession { .load_function(module_name, type_suffixes, &self.context) } - /// Get a handle to the exporter that converts Vortex arrays to [`A`rrowDeviceArray`]. + /// Get a handle to the exporter that converts Vortex arrays to [`ArrowDeviceArray`]. pub fn export_device_array(&self) -> &Arc { &self.export_device_array } From b48971222e0cf978e48c330e4072494488b9edde Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 13:19:21 -0500 Subject: [PATCH 11/22] lol Signed-off-by: Andrew Duffy --- vortex-cuda/src/session.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 1f95430e6ae..33233582116 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -121,7 +121,7 @@ impl CudaSession { .load_function(module_name, type_suffixes, &self.context) } - /// Get a handle to the exporter that converts Vortex arrays to [`ArrowDeviceArray`]. + /// Get a handle to the exporter that converts Vortex arrays to `ArrowDeviceArray`. pub fn export_device_array(&self) -> &Arc { &self.export_device_array } From e67b57a8c326e46f0f6bdcf5196ec0292feda4f4 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 13:30:12 -0500 Subject: [PATCH 12/22] comment Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index dc19cc4d134..933383002af 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -137,7 +137,7 @@ impl DeviceArrayExt for ArrayRef { } } -/// Trait implemented for types that can be exported to [`ArrowDeviceArray`]. +/// A type that can convert a Vortex array into an [`ArrowDeviceArray`]. #[async_trait] pub trait ExportDeviceArray: Debug + Send + Sync + 'static { /// Export a Vortex array as an [`ArrowDeviceArray`]. From b672525f6d785917b47f25e38d9a7bf9bf699f46 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 13:47:53 -0500 Subject: [PATCH 13/22] fix Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/canonical.rs | 6 +++++- vortex-cuda/src/arrow/mod.rs | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 7a685db9028..00ef00434d1 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use async_trait::async_trait; +use cudarc::driver::result; use cudarc::driver::sys; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -45,9 +46,12 @@ impl ExportDeviceArray for CanonicalDeviceArrayExport { c => todo!("implement support for exporting {}", c.dtype()), }; + ctx.stream() + .record_event(); + Ok(ArrowDeviceArray { array: arrow_array, - device_id: 0, + device_id: ctx.stream().context().ordinal() as i64, device_type: DeviceType::Cuda, sync_event: None, _reserved: Default::default(), diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 933383002af..f41960fe5bb 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! This module implements the Arrow C Data Device Interface extension for sharing GPU-resident +//! This module implements the Arrow C Device data interface extension for sharing GPU-resident //! data. //! //! This is an extension to the Arrow C Data Interface. @@ -48,7 +48,7 @@ pub enum DeviceType { pub type SyncEvent = Option>; -/// The C Data Device Interface representation of an Arrow array. +/// The C Device data interface representation of an Arrow array. /// /// This array contains on-device pointers to Arrow array data, along with a synchronization /// event that the client must wait on. @@ -142,7 +142,7 @@ impl DeviceArrayExt for ArrayRef { pub trait ExportDeviceArray: Debug + Send + Sync + 'static { /// Export a Vortex array as an [`ArrowDeviceArray`]. /// - /// The Arrow Device Array is part of the Arrow C Data Device Interface extension to the Arrow + /// The Arrow Device Array is part of the Arrow C Device data interface extension to the Arrow /// specification. It enables passing Vortex arrays to other processes that consume Arrow /// arrays, such as cudf. /// From 75cb1ee1d5404be77b7164f1b59094ea01ad082f Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 16:06:16 -0500 Subject: [PATCH 14/22] sync event propagation Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/canonical.rs | 72 +++++++++++++++++++----------- vortex-cuda/src/arrow/mod.rs | 6 ++- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 00ef00434d1..b5789e4e7c6 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -1,10 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::ptr::NonNull; use std::sync::Arc; use async_trait::async_trait; -use cudarc::driver::result; use cudarc::driver::sys; use vortex_array::ArrayRef; use vortex_array::Canonical; @@ -14,14 +14,16 @@ use vortex_array::buffer::BufferHandle; use vortex_array::validity::Validity; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_error::vortex_err; use crate::CudaBufferExt; use crate::CudaExecutionCtx; use crate::arrow::ArrowArray; use crate::arrow::ArrowDeviceArray; -use crate::arrow::CudaPrivateData; use crate::arrow::DeviceType; use crate::arrow::ExportDeviceArray; +use crate::arrow::PrivateData; +use crate::arrow::SyncEvent; use crate::executor::CudaArrayExt; /// An implementation of `ExportDeviceArray` that exports Vortex arrays to `ArrowDeviceArray` by @@ -39,21 +41,16 @@ impl ExportDeviceArray for CanonicalDeviceArrayExport { ) -> VortexResult { let cuda_array = array.execute_cuda(ctx).await?; - let arrow_array = match cuda_array { + let (arrow_array, sync_event) = match cuda_array { Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await?, - // Canonical::Decimal(decimal) => todo!("export decimal"), - // Canonical::VarBinView(varbinview) => todo!("export varbinview"), c => todo!("implement support for exporting {}", c.dtype()), }; - ctx.stream() - .record_event(); - Ok(ArrowDeviceArray { array: arrow_array, + sync_event, device_id: ctx.stream().context().ordinal() as i64, device_type: DeviceType::Cuda, - sync_event: None, _reserved: Default::default(), }) } @@ -62,20 +59,7 @@ impl ExportDeviceArray for CanonicalDeviceArrayExport { async fn export_primitive( array: PrimitiveArray, ctx: &mut CudaExecutionCtx, -) -> VortexResult { - unsafe extern "C" fn release(array: *mut ArrowArray) { - // SAFETY: this is only safe if we're dropping an ArrowArray that was created from Rust - // code. This is necessary to ensure that the fields inside the CudaPrivateData - // get dropped to free native/GPU memory. - unsafe { - let private_data_ptr = - std::ptr::replace(&raw mut (*array).private_data, std::ptr::null_mut()); - drop(Box::from_raw(private_data_ptr.cast::())); - - // update the release function to NULL to avoid any possibility of double-frees. - (*array).release = None; - } - } +) -> VortexResult<(ArrowArray, SyncEvent)> { let len = array.len(); let PrimitiveArrayParts { buffer, validity, .. @@ -114,13 +98,28 @@ async fn export_primitive( .collect::>>()? .into_boxed_slice(); - let mut private_data = Box::new(CudaPrivateData { + // Create an event object that can be synchronized on to wait for any writes in this stream + // to complete. + // This is stored in the PrivateData so that it will be dropped when the native code calls + // the arrow_array->release callback. + let cuda_event = ctx + .stream() + .record_event(None) + .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; + + let mut private_data = Box::new(PrivateData { cuda_stream: Arc::clone(ctx.stream()), buffers, buffer_ptrs, + cuda_event_ptr: cuda_event.cu_event().cast(), + cuda_event, }); - Ok(ArrowArray { + // The sync_event should point to the cudaEvent_t saved in the private data + let sync_event: SyncEvent = NonNull::new(&raw mut private_data.cuda_event_ptr); + + // Return a copy of the CudaEvent + let arrow_array = ArrowArray { length: len as i64, null_count: null_count as i64, offset: 0, @@ -129,8 +128,27 @@ async fn export_primitive( buffers: private_data.buffer_ptrs.as_mut_ptr(), n_children: 0, children: std::ptr::null_mut(), - release: Some(release), + release: Some(release_array), dictionary: std::ptr::null_mut(), private_data: Box::into_raw(private_data).cast(), - }) + }; + + Ok((arrow_array, sync_event)) +} + +unsafe extern "C" fn release_array(array: *mut ArrowArray) { + // SAFETY: this is only safe if we're dropping an ArrowArray that was created from Rust + // code. This is necessary to ensure that the fields inside the CudaPrivateData + // get dropped to free native/GPU memory. + unsafe { + let private_data_ptr = + std::ptr::replace(&raw mut (*array).private_data, std::ptr::null_mut()); + + if !private_data_ptr.is_null() { + drop(Box::from_raw(private_data_ptr.cast::())); + } + + // update the release function to NULL to avoid any possibility of double-frees. + (*array).release = None; + } } diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index f41960fe5bb..d9b816d79c4 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use async_trait::async_trait; pub(crate) use canonical::CanonicalDeviceArrayExport; +use cudarc::driver::CudaEvent; use cudarc::driver::CudaStream; use cudarc::driver::sys; use cudarc::runtime::sys::cudaEvent_t; @@ -46,6 +47,7 @@ pub enum DeviceType { // Hexagon = 16, } +/// A (potentially null) pointer to a `cudaEvent_t`. pub type SyncEvent = Option>; /// The C Device data interface representation of an Arrow array. @@ -107,7 +109,7 @@ impl ArrowArray { unused, reason = "cuda_stream and cuda_buffers need to have deferred drop" )] -pub(crate) struct CudaPrivateData { +pub(crate) struct PrivateData { /// Hold a reference to the CudaStream so that it stays alive even after CudaExecutionCtx /// has been dropped. pub(crate) cuda_stream: Arc, @@ -116,6 +118,8 @@ pub(crate) struct CudaPrivateData { /// Boxed slice of buffer pointers. We return a pointer to the start of this allocation over /// the interface, so we hold it here so the Box contents are not freed. pub(crate) buffer_ptrs: Box<[sys::CUdeviceptr]>, + pub(crate) cuda_event: CudaEvent, + pub(crate) cuda_event_ptr: cudaEvent_t, } #[async_trait] From a06184dca54bfc1807abf0e30611cf0ded3c67c9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 3 Feb 2026 18:31:31 -0500 Subject: [PATCH 15/22] add struct array for C export (#6294) Signed-off-by: Andrew Duffy --- vortex-cuda/cudf-test/src/lib.rs | 24 ++++++-- vortex-cuda/src/arrow/canonical.rs | 95 ++++++++++++++++++++++++++++-- vortex-cuda/src/arrow/mod.rs | 10 +++- 3 files changed, 119 insertions(+), 10 deletions(-) diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index 5c1767c302f..11c1b673979 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -3,16 +3,19 @@ //! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. -#![allow(clippy::unwrap_used)] +#![allow(clippy::unwrap_used, clippy::expect_used)] use std::sync::LazyLock; -use arrow_schema::DataType; use arrow_schema::ffi::FFI_ArrowSchema; use futures::executor::block_on; +use vortex::array::Array; use vortex::array::IntoArray; use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::StructArray; use vortex::array::session::ArraySession; +use vortex::array::validity::Validity; +use vortex::dtype::FieldNames; use vortex::expr::session::ExprSession; use vortex::io::session::RuntimeSession; use vortex::layout::session::LayoutSession; @@ -42,9 +45,22 @@ pub extern "C" fn export_array( let primitive = PrimitiveArray::from_iter(0u32..1024); - *schema_ptr = FFI_ArrowSchema::try_from(DataType::UInt32).unwrap(); + let array = StructArray::new( + FieldNames::from_iter(["a"]), + vec![primitive.into_array()], + 1024, + Validity::NonNullable, + ) + .into_array(); - match block_on(primitive.into_array().export_device_array(&mut ctx)) { + let data_type = array + .dtype() + .to_arrow_dtype() + .expect("converting schema to Arrow DataType"); + + *schema_ptr = FFI_ArrowSchema::try_from(data_type).expect("data_type to FFI_ArrowSchema"); + + match block_on(array.export_device_array(&mut ctx)) { Ok(exported) => { *array_ptr = exported; 0 diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index b5789e4e7c6..3347084a464 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -6,10 +6,13 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::sys; +use futures::future::BoxFuture; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveArrayParts; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::StructArrayParts; use vortex_array::buffer::BufferHandle; use vortex_array::validity::Validity; use vortex_error::VortexResult; @@ -41,10 +44,7 @@ impl ExportDeviceArray for CanonicalDeviceArrayExport { ) -> VortexResult { let cuda_array = array.execute_cuda(ctx).await?; - let (arrow_array, sync_event) = match cuda_array { - Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await?, - c => todo!("implement support for exporting {}", c.dtype()), - }; + let (arrow_array, sync_event) = export_canonical(cuda_array, ctx).await?; Ok(ArrowDeviceArray { array: arrow_array, @@ -56,6 +56,85 @@ impl ExportDeviceArray for CanonicalDeviceArrayExport { } } +fn export_canonical( + cuda_array: Canonical, + ctx: &mut CudaExecutionCtx, +) -> BoxFuture<'_, VortexResult<(ArrowArray, SyncEvent)>> { + Box::pin(async { + match cuda_array { + Canonical::Struct(struct_array) => export_struct(struct_array, ctx).await, + Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await, + c => todo!("support for exporting {} arrays", c.dtype()), + } + }) +} + +async fn export_struct( + array: StructArray, + ctx: &mut CudaExecutionCtx, +) -> VortexResult<(ArrowArray, SyncEvent)> { + let len = array.len(); + let StructArrayParts { + validity, fields, .. + } = array.into_parts(); + + let null_count = match validity { + Validity::NonNullable | Validity::AllValid => 0, + Validity::AllInvalid => len, + Validity::Array(_) => { + vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") + } + }; + + // We need the children to be held across await points. + let mut children = Vec::with_capacity(fields.len()); + + for field in fields.iter() { + let cuda_field = field.clone().execute_cuda(ctx).await?; + let (arrow_field, _) = export_canonical(cuda_field, ctx).await?; + children.push(arrow_field); + } + + let cuda_event = ctx + .stream() + .record_event(None) + .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; + + let children = children + .into_iter() + .map(|array| Box::into_raw(Box::new(array))) + .collect::>(); + + let buffer_ptrs = vec![sys::CUdeviceptr::default()].into_boxed_slice(); + + let mut private_data = Box::new(PrivateData { + cuda_stream: Arc::clone(ctx.stream()), + buffers: Box::new([None]), + buffer_ptrs, + cuda_event_ptr: cuda_event.cu_event().cast(), + cuda_event, + children, + }); + + let sync_event: SyncEvent = NonNull::new(&raw mut private_data.cuda_event_ptr); + + // Populate the ArrowArray with the child arrays. + let mut arrow_struct = ArrowArray::empty(); + arrow_struct.length = len as i64; + arrow_struct.null_count = null_count as i64; + arrow_struct.n_children = fields.len() as i64; + arrow_struct.children = private_data.children.as_mut_ptr(); + + // StructArray _can_ contain a validity buffer. In this case, we just write a null pointer + // for it. + arrow_struct.n_buffers = 1; + arrow_struct.buffers = private_data.buffer_ptrs.as_mut_ptr(); + arrow_struct.release = Some(release_array); + arrow_struct.private_data = Box::into_raw(private_data).cast(); + + Ok((arrow_struct, sync_event)) +} + async fn export_primitive( array: PrimitiveArray, ctx: &mut CudaExecutionCtx, @@ -109,6 +188,7 @@ async fn export_primitive( let mut private_data = Box::new(PrivateData { cuda_stream: Arc::clone(ctx.stream()), + children: Box::new([]), buffers, buffer_ptrs, cuda_event_ptr: cuda_event.cu_event().cast(), @@ -145,7 +225,12 @@ unsafe extern "C" fn release_array(array: *mut ArrowArray) { std::ptr::replace(&raw mut (*array).private_data, std::ptr::null_mut()); if !private_data_ptr.is_null() { - drop(Box::from_raw(private_data_ptr.cast::())); + let mut private_data = Box::from_raw(private_data_ptr.cast::()); + let children = std::mem::take(&mut private_data.children); + for child in children { + release_array(child); + } + drop(private_data); } // update the release function to NULL to avoid any possibility of double-frees. diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index d9b816d79c4..3ed46adafce 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -66,6 +66,9 @@ pub struct ArrowDeviceArray { _reserved: [i64; 3], } +unsafe impl Send for ArrowDeviceArray {} +unsafe impl Sync for ArrowDeviceArray {} + /// An FFI-compatible version of the ArrowArray that holds pointers to device buffers. #[repr(C)] #[derive(Debug)] @@ -77,7 +80,8 @@ pub(crate) struct ArrowArray { n_children: i64, buffers: *mut sys::CUdeviceptr, children: *mut *mut ArrowArray, - dictionary: *mut ArrowArray, + // NOTE: we don't support exporting dictionary arrays, so we leave this as an opaque pointer. + dictionary: *mut (), release: Option, // When exported, this MUST contain everything that is owned by this array. // for example, any buffer pointed to in `buffers` must be here, as well @@ -105,6 +109,9 @@ impl ArrowArray { } } +unsafe impl Send for ArrowArray {} +unsafe impl Sync for ArrowArray {} + #[expect( unused, reason = "cuda_stream and cuda_buffers need to have deferred drop" @@ -120,6 +127,7 @@ pub(crate) struct PrivateData { pub(crate) buffer_ptrs: Box<[sys::CUdeviceptr]>, pub(crate) cuda_event: CudaEvent, pub(crate) cuda_event_ptr: cudaEvent_t, + pub(crate) children: Box<[*mut ArrowArray]>, } #[async_trait] From 8a3bf78916076800bd504b889c6cb664f1d5f7ea Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 11:59:56 -0500 Subject: [PATCH 16/22] decimal Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/canonical.rs | 100 +++++++++++++++++++++++++++-- vortex-cuda/src/arrow/mod.rs | 2 +- 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 3347084a464..82aa272afee 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -9,14 +9,18 @@ use cudarc::driver::sys; use futures::future::BoxFuture; use vortex_array::ArrayRef; use vortex_array::Canonical; +use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::DecimalArrayParts; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::arrays::StructArray; use vortex_array::arrays::StructArrayParts; use vortex_array::buffer::BufferHandle; use vortex_array::validity::Validity; +use vortex_dtype::DecimalType; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; use vortex_error::vortex_err; use crate::CudaBufferExt; @@ -64,6 +68,7 @@ fn export_canonical( match cuda_array { Canonical::Struct(struct_array) => export_struct(struct_array, ctx).await, Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await, + Canonical::Decimal(decimal) => export_decimal(decimal, ctx).await, c => todo!("support for exporting {} arrays", c.dtype()), } }) @@ -80,8 +85,7 @@ async fn export_struct( let null_count = match validity { Validity::NonNullable | Validity::AllValid => 0, - Validity::AllInvalid => len, - Validity::Array(_) => { + _ => { vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") } }; @@ -153,8 +157,96 @@ async fn export_primitive( let null_count = match validity { Validity::NonNullable | Validity::AllValid => 0, - Validity::AllInvalid => len, - Validity::Array(_) => { + _ => { + vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") + } + }; + + // TODO(aduffy): currently the null buffer is always empty, in the future we will need + // to pass it. + let buffers: Box<[Option]> = vec![None, Some(buffer)].into_boxed_slice(); + + let buffer_ptrs: Box<[sys::CUdeviceptr]> = buffers + .iter() + .map(|buf| { + match buf { + None => { + // null pointer + Ok(sys::CUdeviceptr::default()) + } + Some(handle) => handle.cuda_device_ptr(), + } + }) + .collect::>>()? + .into_boxed_slice(); + + // Create an event object that can be synchronized on to wait for any writes in this stream + // to complete. + // This is stored in the PrivateData so that it will be dropped when the native code calls + // the arrow_array->release callback. + let cuda_event = ctx + .stream() + .record_event(None) + .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; + + let mut private_data = Box::new(PrivateData { + cuda_stream: Arc::clone(ctx.stream()), + children: Box::new([]), + buffers, + buffer_ptrs, + cuda_event_ptr: cuda_event.cu_event().cast(), + cuda_event, + }); + + // The sync_event should point to the cudaEvent_t saved in the private data + let sync_event: SyncEvent = NonNull::new(&raw mut private_data.cuda_event_ptr); + + // Return a copy of the CudaEvent + let arrow_array = ArrowArray { + length: len as i64, + null_count: null_count as i64, + offset: 0, + // 1 (optional) buffer for nulls, one buffer for data + n_buffers: 2, + buffers: private_data.buffer_ptrs.as_mut_ptr(), + n_children: 0, + children: std::ptr::null_mut(), + release: Some(release_array), + dictionary: std::ptr::null_mut(), + private_data: Box::into_raw(private_data).cast(), + }; + + Ok((arrow_array, sync_event)) +} + +async fn export_decimal( + array: DecimalArray, + ctx: &mut CudaExecutionCtx, +) -> VortexResult<(ArrowArray, SyncEvent)> { + let len = array.len(); + let DecimalArrayParts { + values, + values_type, + validity, + .. + } = array.into_parts(); + + // TODO(aduffy): GPU kernel for upcasting. + vortex_ensure!( + values_type >= DecimalType::I32, + "cannot export DecimalArray with values type {values_type}. must be i32 or wider." + ); + + let buffer = if values.is_on_device() { + values + } else { + // TODO(aduffy): I don't think this type parameter does anything + ctx.move_to_device::(values)?.await? + }; + + let null_count = match validity { + Validity::NonNullable | Validity::AllValid => 0, + _ => { vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") } }; diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 3ed46adafce..671e38e6372 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -86,7 +86,7 @@ pub(crate) struct ArrowArray { // When exported, this MUST contain everything that is owned by this array. // for example, any buffer pointed to in `buffers` must be here, as well // as the `buffers` pointer itself. - // In other words, everything in [FFI_ArrowArray] must be owned by + // In other words, everything in ArrowArray must be owned by // `private_data` and can assume that they do not outlive `private_data`. private_data: *mut c_void, } From 3b4357233dea96aa26a57a7131ad7f5a6558ed59 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 14:01:11 -0500 Subject: [PATCH 17/22] rest of canonical types Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/canonical.rs | 331 +++++++++--------- vortex-cuda/src/arrow/mod.rs | 49 +++ vortex-cuda/src/executor.rs | 4 +- vortex-cuda/src/kernel/arrays/dict.rs | 8 +- vortex-cuda/src/kernel/encodings/alp.rs | 2 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 2 +- vortex-cuda/src/kernel/encodings/for_.rs | 2 +- vortex-cuda/src/kernel/encodings/runend.rs | 4 +- vortex-cuda/src/kernel/encodings/zigzag.rs | 2 +- vortex-cuda/src/kernel/filter/mod.rs | 2 +- vortex-cuda/src/kernel/filter/varbinview.rs | 2 +- vortex-cuda/src/kernel/patches/mod.rs | 4 +- vortex-cuda/src/stream.rs | 6 +- 13 files changed, 229 insertions(+), 189 deletions(-) diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 82aa272afee..0e71a95a610 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -1,29 +1,26 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::ptr::NonNull; -use std::sync::Arc; - use async_trait::async_trait; -use cudarc::driver::sys; use futures::future::BoxFuture; use vortex_array::ArrayRef; use vortex_array::Canonical; -use vortex_array::arrays::DecimalArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::BoolArrayParts; use vortex_array::arrays::DecimalArrayParts; -use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::arrays::StructArray; use vortex_array::arrays::StructArrayParts; +use vortex_array::arrays::VarBinViewArrayParts; use vortex_array::buffer::BufferHandle; use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; use vortex_dtype::DecimalType; +use vortex_dtype::datetime::AnyTemporal; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; -use vortex_error::vortex_err; -use crate::CudaBufferExt; use crate::CudaExecutionCtx; use crate::arrow::ArrowArray; use crate::arrow::ArrowDeviceArray; @@ -67,8 +64,101 @@ fn export_canonical( Box::pin(async { match cuda_array { Canonical::Struct(struct_array) => export_struct(struct_array, ctx).await, - Canonical::Primitive(primitive) => export_primitive(primitive, ctx).await, - Canonical::Decimal(decimal) => export_decimal(decimal, ctx).await, + Canonical::Primitive(primitive) => { + let len = primitive.len(); + let PrimitiveArrayParts { + buffer, validity, .. + } = primitive.into_parts(); + + check_validity_empty(validity)?; + + let buffer = ensure_device_resident(buffer, ctx).await?; + + export_fixed_size(buffer, len, 0, ctx) + } + Canonical::Null(null_array) => { + let len = null_array.len(); + + // The null array has no buffers, no children, just metadata. + let mut array = ArrowArray::empty(); + array.length = len as i64; + array.null_count = len as i64; + array.release = Some(release_array); + + // we don't need a sync event for Null since no data is copied. + Ok((array, None)) + } + Canonical::Decimal(decimal) => { + let len = decimal.len(); + let DecimalArrayParts { + values, + values_type, + validity, + .. + } = decimal.into_parts(); + + // verify that there is no null buffer + check_validity_empty(validity)?; + + // TODO(aduffy): GPU kernel for upcasting. + vortex_ensure!( + values_type >= DecimalType::I32, + "cannot export DecimalArray with values type {values_type}. must be i32 or wider." + ); + + let buffer = if values.is_on_device() { + values + } else { + ctx.move_to_device(values)?.await? + }; + + export_fixed_size(buffer, len, 0, ctx) + } + Canonical::Extension(extension) => { + if !extension.ext_dtype().is::() { + vortex_bail!("only support temporal extension types currently"); + } + + let values = extension.storage().to_primitive(); + let len = extension.len(); + + let PrimitiveArrayParts { + buffer, validity, .. + } = values.into_parts(); + + check_validity_empty(validity)?; + + let buffer = ensure_device_resident(buffer, ctx).await?; + export_fixed_size(buffer, len, 0, ctx) + } + + Canonical::Bool(bool_array) => { + let BoolArrayParts { + bits, + offset, + len, + validity, + .. + } = bool_array.into_parts(); + + check_validity_empty(validity)?; + + export_fixed_size(bits, len, offset, ctx) + } + Canonical::VarBinView(view) => { + let len = view.len(); + + let VarBinViewArrayParts { + views, + buffers, + validity, + .. + } = view.into_parts(); + + check_validity_empty(validity)?; + + export_variadic(Some(views), buffers.to_vec(), len, ctx).await + } c => todo!("support for exporting {} arrays", c.dtype()), } }) @@ -83,12 +173,7 @@ async fn export_struct( validity, fields, .. } = array.into_parts(); - let null_count = match validity { - Validity::NonNullable | Validity::AllValid => 0, - _ => { - vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") - } - }; + check_validity_empty(validity)?; // We need the children to be held across await points. let mut children = Vec::with_capacity(fields.len()); @@ -99,33 +184,12 @@ async fn export_struct( children.push(arrow_field); } - let cuda_event = ctx - .stream() - .record_event(None) - .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; - - let children = children - .into_iter() - .map(|array| Box::into_raw(Box::new(array))) - .collect::>(); - - let buffer_ptrs = vec![sys::CUdeviceptr::default()].into_boxed_slice(); - - let mut private_data = Box::new(PrivateData { - cuda_stream: Arc::clone(ctx.stream()), - buffers: Box::new([None]), - buffer_ptrs, - cuda_event_ptr: cuda_event.cu_event().cast(), - cuda_event, - children, - }); - - let sync_event: SyncEvent = NonNull::new(&raw mut private_data.cuda_event_ptr); + let mut private_data = PrivateData::new(vec![None], children, ctx)?; + let sync_event: SyncEvent = private_data.sync_event(); // Populate the ArrowArray with the child arrays. let mut arrow_struct = ArrowArray::empty(); arrow_struct.length = len as i64; - arrow_struct.null_count = null_count as i64; arrow_struct.n_children = fields.len() as i64; arrow_struct.children = private_data.children.as_mut_ptr(); @@ -139,74 +203,29 @@ async fn export_struct( Ok((arrow_struct, sync_event)) } -async fn export_primitive( - array: PrimitiveArray, +/// Export fixed-size array data that owns a single buffer of values. +fn export_fixed_size( + buffer: BufferHandle, + len: usize, + offset: usize, ctx: &mut CudaExecutionCtx, ) -> VortexResult<(ArrowArray, SyncEvent)> { - let len = array.len(); - let PrimitiveArrayParts { - buffer, validity, .. - } = array.into_parts(); - - let buffer = if buffer.is_on_device() { - buffer - } else { - // TODO(aduffy): I don't think this type parameter does anything - ctx.move_to_device::(buffer)?.await? - }; - - let null_count = match validity { - Validity::NonNullable | Validity::AllValid => 0, - _ => { - vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") - } - }; + vortex_ensure!( + buffer.is_on_device(), + "buffer must already be copied to device before calling" + ); - // TODO(aduffy): currently the null buffer is always empty, in the future we will need + // TODO(aduffy): currently the null buffer is always None, in the future we will need // to pass it. - let buffers: Box<[Option]> = vec![None, Some(buffer)].into_boxed_slice(); - - let buffer_ptrs: Box<[sys::CUdeviceptr]> = buffers - .iter() - .map(|buf| { - match buf { - None => { - // null pointer - Ok(sys::CUdeviceptr::default()) - } - Some(handle) => handle.cuda_device_ptr(), - } - }) - .collect::>>()? - .into_boxed_slice(); - - // Create an event object that can be synchronized on to wait for any writes in this stream - // to complete. - // This is stored in the PrivateData so that it will be dropped when the native code calls - // the arrow_array->release callback. - let cuda_event = ctx - .stream() - .record_event(None) - .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; - - let mut private_data = Box::new(PrivateData { - cuda_stream: Arc::clone(ctx.stream()), - children: Box::new([]), - buffers, - buffer_ptrs, - cuda_event_ptr: cuda_event.cu_event().cast(), - cuda_event, - }); - - // The sync_event should point to the cudaEvent_t saved in the private data - let sync_event: SyncEvent = NonNull::new(&raw mut private_data.cuda_event_ptr); + let mut private_data = PrivateData::new(vec![None, Some(buffer)], vec![], ctx)?; + let sync_event: SyncEvent = private_data.sync_event(); // Return a copy of the CudaEvent let arrow_array = ArrowArray { length: len as i64, - null_count: null_count as i64, - offset: 0, - // 1 (optional) buffer for nulls, one buffer for data + null_count: 0, + offset: offset as i64, + // 1 (optional) buffer for nulls, one buffer for the data n_buffers: 2, buffers: private_data.buffer_ptrs.as_mut_ptr(), n_children: 0, @@ -219,95 +238,69 @@ async fn export_primitive( Ok((arrow_array, sync_event)) } -async fn export_decimal( - array: DecimalArray, +async fn export_variadic( + buffer: Option, + variadic_buffers: Vec, + len: usize, ctx: &mut CudaExecutionCtx, ) -> VortexResult<(ArrowArray, SyncEvent)> { - let len = array.len(); - let DecimalArrayParts { - values, - values_type, - validity, - .. - } = array.into_parts(); + let mut buffers = vec![]; - // TODO(aduffy): GPU kernel for upcasting. - vortex_ensure!( - values_type >= DecimalType::I32, - "cannot export DecimalArray with values type {values_type}. must be i32 or wider." - ); + // push an empty buffer for the nulls. + buffers.push(None); - let buffer = if values.is_on_device() { - values - } else { - // TODO(aduffy): I don't think this type parameter does anything - ctx.move_to_device::(values)?.await? - }; + if let Some(buf) = buffer { + buffers.push(Some(buf)); + } - let null_count = match validity { - Validity::NonNullable | Validity::AllValid => 0, - _ => { - vortex_bail!("Exporting PrimitiveArray with non-trivial validity not supported yet") - } - }; + // We create a new buffer that contains the lengths of the variadic buffers as i64. + let mut variadic_buffer_lens = BufferMut::with_capacity(variadic_buffers.len()); + for buffer in variadic_buffers { + variadic_buffer_lens.push(buffer.len() as i64); + buffers.push(Some(buffer)); + } - // TODO(aduffy): currently the null buffer is always empty, in the future we will need - // to pass it. - let buffers: Box<[Option]> = vec![None, Some(buffer)].into_boxed_slice(); - - let buffer_ptrs: Box<[sys::CUdeviceptr]> = buffers - .iter() - .map(|buf| { - match buf { - None => { - // null pointer - Ok(sys::CUdeviceptr::default()) - } - Some(handle) => handle.cuda_device_ptr(), - } - }) - .collect::>>()? - .into_boxed_slice(); - - // Create an event object that can be synchronized on to wait for any writes in this stream - // to complete. - // This is stored in the PrivateData so that it will be dropped when the native code calls - // the arrow_array->release callback. - let cuda_event = ctx - .stream() - .record_event(None) - .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; - - let mut private_data = Box::new(PrivateData { - cuda_stream: Arc::clone(ctx.stream()), - children: Box::new([]), - buffers, - buffer_ptrs, - cuda_event_ptr: cuda_event.cu_event().cast(), - cuda_event, - }); - - // The sync_event should point to the cudaEvent_t saved in the private data - let sync_event: SyncEvent = NonNull::new(&raw mut private_data.cuda_event_ptr); + let mut private_data = PrivateData::new(buffers, vec![], ctx)?; + let sync_event = private_data.sync_event(); - // Return a copy of the CudaEvent let arrow_array = ArrowArray { length: len as i64, - null_count: null_count as i64, - offset: 0, - // 1 (optional) buffer for nulls, one buffer for data - n_buffers: 2, + n_buffers: private_data.buffers.len() as i64, buffers: private_data.buffer_ptrs.as_mut_ptr(), n_children: 0, children: std::ptr::null_mut(), - release: Some(release_array), + offset: 0, + null_count: 0, dictionary: std::ptr::null_mut(), private_data: Box::into_raw(private_data).cast(), + release: Some(release_array), }; Ok((arrow_array, sync_event)) } +/// Check that the validity buffer is empty and does not need to be copied over the device boundary. +fn check_validity_empty(validity: Validity) -> VortexResult<()> { + if let Validity::AllInvalid | Validity::Array(_) = validity { + vortex_bail!("Exporting array with non-trivial validity not supported yet") + } + + Ok(()) +} + +async fn ensure_device_resident( + buffer_handle: BufferHandle, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { + if buffer_handle.is_on_device() { + Ok(buffer_handle) + } else { + ctx.move_to_device(buffer_handle)?.await + } +} + +// export some nested data + unsafe extern "C" fn release_array(array: *mut ArrowArray) { // SAFETY: this is only safe if we're dropping an ArrowArray that was created from Rust // code. This is necessary to ensure that the fields inside the CudaPrivateData diff --git a/vortex-cuda/src/arrow/mod.rs b/vortex-cuda/src/arrow/mod.rs index 671e38e6372..6d8b53221a5 100644 --- a/vortex-cuda/src/arrow/mod.rs +++ b/vortex-cuda/src/arrow/mod.rs @@ -25,7 +25,9 @@ use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; +use vortex_error::vortex_err; +use crate::CudaBufferExt; use crate::CudaExecutionCtx; #[derive(Debug, Copy, Clone)] @@ -130,6 +132,53 @@ pub(crate) struct PrivateData { pub(crate) children: Box<[*mut ArrowArray]>, } +impl PrivateData { + pub(crate) fn new( + buffers: Vec>, + children: Vec, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult> { + let buffers = buffers.into_boxed_slice(); + let buffer_ptrs: Box<[sys::CUdeviceptr]> = buffers + .iter() + .map(|buf| { + match buf { + None => { + // null pointer + Ok(sys::CUdeviceptr::default()) + } + Some(handle) => handle.cuda_device_ptr(), + } + }) + .collect::>>()? + .into_boxed_slice(); + + let children = children + .into_iter() + .map(|array| Box::into_raw(Box::new(array))) + .collect::>(); + + // generate the synchronization event + let cuda_event = ctx + .stream() + .record_event(None) + .map_err(|_| vortex_err!("failed to create cudaEvent_t"))?; + + Ok(Box::new(Self { + buffers, + buffer_ptrs, + cuda_stream: Arc::clone(ctx.stream()), + children, + cuda_event_ptr: cuda_event.cu_event().cast(), + cuda_event, + })) + } + + pub(crate) fn sync_event(&mut self) -> SyncEvent { + NonNull::new(&raw mut self.cuda_event_ptr) + } +} + #[async_trait] pub trait DeviceArrayExt: Array { async fn export_device_array( diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 7c2ed2a1436..f5717a94dcd 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -150,11 +150,11 @@ impl CudaExecutionCtx { } /// See `VortexCudaStream::move_to_device`. - pub fn move_to_device( + pub fn move_to_device( &self, handle: BufferHandle, ) -> VortexResult>> { - self.stream.move_to_device::(handle) + self.stream.move_to_device(handle) } /// Returns a reference to the underlying CUDA stream. diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index e8f10a56d61..a65eef98d04 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -105,13 +105,13 @@ async fn execute_dict_prim_typed(values_buffer)?.await? + ctx.move_to_device(values_buffer)?.await? }; let codes_device = if codes_buffer.is_on_device() { codes_buffer } else { - ctx.move_to_device::(codes_buffer)?.await? + ctx.move_to_device(codes_buffer)?.await? }; // Allocate output buffer on device @@ -204,13 +204,13 @@ async fn execute_dict_decimal_typed< let values_device = if values_buffer.is_on_device() { values_buffer } else { - ctx.move_to_device::(values_buffer)?.await? + ctx.move_to_device(values_buffer)?.await? }; let codes_device = if codes_buffer.is_on_device() { codes_buffer } else { - ctx.move_to_device::(codes_buffer)?.await? + ctx.move_to_device(codes_buffer)?.await? }; // Allocate output buffer on device (codes_len * value_byte_width bytes) diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index da1e8669420..de4deb61b4f 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -74,7 +74,7 @@ where let device_input: BufferHandle = if buffer.is_on_device() { buffer } else { - ctx.move_to_device::(buffer)?.await? + ctx.move_to_device(buffer)?.await? }; // Get CUDA view of input diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index c1e905aff77..c9385b6a862 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -83,7 +83,7 @@ where let device_input: BufferHandle = if packed.is_on_device() { packed } else { - ctx.move_to_device::(packed)?.await? + ctx.move_to_device(packed)?.await? }; // Get CUDA view of input diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 1a68e2a3d7c..a93cea4d0d5 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -73,7 +73,7 @@ where let device_buffer: BufferHandle = if buffer.is_on_device() { buffer } else { - ctx.move_to_device::

(buffer)?.await? + ctx.move_to_device(buffer)?.await? }; // Get CUDA view of the buffer diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs index f66774f36a0..9ba2601f3a3 100644 --- a/vortex-cuda/src/kernel/encodings/runend.rs +++ b/vortex-cuda/src/kernel/encodings/runend.rs @@ -116,13 +116,13 @@ async fn decode_runend_typed(ends_buffer)?.await? + ctx.move_to_device(ends_buffer)?.await? }; let values_device = if values_buffer.is_on_device() { values_buffer } else { - ctx.move_to_device::(values_buffer)?.await? + ctx.move_to_device(values_buffer)?.await? }; let output_slice = ctx.device_alloc::(output_len)?; diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index c6c545e394f..1e4c97263b9 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -78,7 +78,7 @@ where let device_buffer: BufferHandle = if buffer.is_on_device() { buffer } else { - ctx.move_to_device::(buffer)?.await? + ctx.move_to_device(buffer)?.await? }; // Get CUDA view of the buffer diff --git a/vortex-cuda/src/kernel/filter/mod.rs b/vortex-cuda/src/kernel/filter/mod.rs index 102b9ef34a5..23e12daa167 100644 --- a/vortex-cuda/src/kernel/filter/mod.rs +++ b/vortex-cuda/src/kernel/filter/mod.rs @@ -95,7 +95,7 @@ async fn filter_sized(input)?.await? + ctx.move_to_device(input)?.await? }; // Construct the inputs for the cub::DeviceSelect::Flagged call. diff --git a/vortex-cuda/src/kernel/filter/varbinview.rs b/vortex-cuda/src/kernel/filter/varbinview.rs index f6bc39a7729..5ba1bddd510 100644 --- a/vortex-cuda/src/kernel/filter/varbinview.rs +++ b/vortex-cuda/src/kernel/filter/varbinview.rs @@ -28,7 +28,7 @@ pub(super) async fn filter_varbinview( let d_views = if views.is_on_device() { views } else { - ctx.move_to_device::(views)?.await? + ctx.move_to_device(views)?.await? }; let filtered_views = filter_sized::(d_views, mask, ctx).await?; diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index 9922c7bf56d..ed40da8d6db 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -73,13 +73,13 @@ pub(crate) async fn execute_patches< let d_patch_indices = if indices_buffer.is_on_device() { indices_buffer } else { - ctx.move_to_device::(indices_buffer)?.await? + ctx.move_to_device(indices_buffer)?.await? }; let d_patch_values = if values_buffer.is_on_device() { values_buffer } else { - ctx.move_to_device::(values_buffer)?.await? + ctx.move_to_device(values_buffer)?.await? }; let d_target_view = target.as_view::(); diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs index 98f1af8d994..fad54b36ab4 100644 --- a/vortex-cuda/src/stream.rs +++ b/vortex-cuda/src/stream.rs @@ -15,7 +15,6 @@ use cudarc::driver::result::stream; use futures::future::BoxFuture; use kanal::Sender; use vortex_array::buffer::BufferHandle; -use vortex_buffer::Buffer; use vortex_error::VortexResult; use vortex_error::vortex_err; @@ -98,7 +97,7 @@ impl VortexCudaStream { /// # Returns /// /// A future that resolves to the device buffer handle when the copy completes. - pub fn move_to_device( + pub fn move_to_device( &self, handle: BufferHandle, ) -> VortexResult>> { @@ -106,8 +105,7 @@ impl VortexCudaStream { .as_host_opt() .ok_or_else(|| vortex_err!("Buffer is not on host"))?; - let buffer: Buffer = Buffer::from_byte_buffer(host_buffer.clone()); - self.copy_to_device(buffer) + self.copy_to_device(host_buffer.clone()) } } From 66a3e3c3183af87e2bd81394898b94f70bdf9c4c Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 14:13:25 -0500 Subject: [PATCH 18/22] fix up test lib Signed-off-by: Andrew Duffy --- vortex-cuda/cudf-test/src/lib.rs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs index 11c1b673979..db13c1a8cc4 100644 --- a/vortex-cuda/cudf-test/src/lib.rs +++ b/vortex-cuda/cudf-test/src/lib.rs @@ -2,6 +2,14 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors //! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. +//! +//! The flow is +//! +//! * test harness calls `dlopen` in this library +//! * invokes the `export_array` function to get back the device array +//! * pass the arrays to `cudf`'s `from_arrow_device_column` +//! * run some operations on the loaded column view +//! * call `array->release()` to drop the data allocated from the Rust side #![allow(clippy::unwrap_used, clippy::expect_used)] @@ -11,10 +19,13 @@ use arrow_schema::ffi::FFI_ArrowSchema; use futures::executor::block_on; use vortex::array::Array; use vortex::array::IntoArray; +use vortex::array::arrays::DecimalArray; use vortex::array::arrays::PrimitiveArray; use vortex::array::arrays::StructArray; +use vortex::array::arrays::VarBinViewArray; use vortex::array::session::ArraySession; use vortex::array::validity::Validity; +use vortex::dtype::DecimalDType; use vortex::dtype::FieldNames; use vortex::expr::session::ExprSession; use vortex::io::session::RuntimeSession; @@ -35,7 +46,6 @@ static SESSION: LazyLock = LazyLock::new(|| { .with::() }); -/// External array #[unsafe(no_mangle)] pub extern "C" fn export_array( schema_ptr: &mut FFI_ArrowSchema, @@ -44,10 +54,17 @@ pub extern "C" fn export_array( let mut ctx = CudaSession::create_execution_ctx(&SESSION).unwrap(); let primitive = PrimitiveArray::from_iter(0u32..1024); + let string = + VarBinViewArray::from_iter_str((0..1024).map(|idx| format!("this is string {idx}"))); + let decimal = DecimalArray::from_iter(0i64..1024, DecimalDType::new(19, 2)); let array = StructArray::new( - FieldNames::from_iter(["a"]), - vec![primitive.into_array()], + FieldNames::from_iter(["prims", "strings", "decimals"]), + vec![ + primitive.into_array(), + string.into_array(), + decimal.into_array(), + ], 1024, Validity::NonNullable, ) From bf87ef3310f6c297bb7013d4b83e778f4ea6450b Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 14:16:52 -0500 Subject: [PATCH 19/22] copy all buffers to device Signed-off-by: Andrew Duffy --- vortex-cuda/src/arrow/canonical.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs index 0e71a95a610..7e8fe44ad6e 100644 --- a/vortex-cuda/src/arrow/canonical.rs +++ b/vortex-cuda/src/arrow/canonical.rs @@ -250,16 +250,24 @@ async fn export_variadic( buffers.push(None); if let Some(buf) = buffer { - buffers.push(Some(buf)); + buffers.push(Some(ensure_device_resident(buf, ctx).await?)); } // We create a new buffer that contains the lengths of the variadic buffers as i64. let mut variadic_buffer_lens = BufferMut::with_capacity(variadic_buffers.len()); for buffer in variadic_buffers { variadic_buffer_lens.push(buffer.len() as i64); - buffers.push(Some(buffer)); + buffers.push(Some(ensure_device_resident(buffer, ctx).await?)); } + let variadic_buffer_lens = ensure_device_resident( + BufferHandle::new_host(variadic_buffer_lens.freeze().into_byte_buffer()), + ctx, + ) + .await?; + + buffers.push(Some(variadic_buffer_lens)); + let mut private_data = PrivateData::new(buffers, vec![], ctx)?; let sync_event = private_data.sync_event(); From 0b523c1f4954215c060d5e5c374db2b9d2de8663 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 14:48:47 -0500 Subject: [PATCH 20/22] update Signed-off-by: Andrew Duffy --- .github/workflows/ci.yml | 6 +-- Cargo.lock | 18 ++----- Cargo.toml | 1 - vortex-cuda/cudf-test/Cargo.toml | 30 ----------- vortex-cuda/cudf-test/src/lib.rs | 90 -------------------------------- vortex-test/e2e-cuda/Cargo.toml | 11 ++-- vortex-test/e2e-cuda/src/lib.rs | 88 ++++++++++++++++++++++++++++++- 7 files changed, 99 insertions(+), 145 deletions(-) delete mode 100644 vortex-cuda/cudf-test/Cargo.toml delete mode 100644 vortex-cuda/cudf-test/src/lib.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1f4bdf0c259..4d608622f2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -537,13 +537,13 @@ jobs: --no-fail-fast \ --target x86_64-unknown-linux-gnu \ --verbose - - name: Build cudf-test library - run: cargo +nightly build --locked -p vortex-cudf-test --target x86_64-unknown-linux-gnu + - name: Build cudf test library + run: cargo +nightly build --locked -p vortex-test-e2e-cuda --target x86_64-unknown-linux-gnu - name: Download and run cudf-test-harness run: | curl -fsSL https://github.com/vortex-data/cudf-test-harness/releases/latest/download/cudf-test-harness-x86_64.tar.gz | tar -xz cd cudf-test-harness-x86_64 - ./cudf-test-harness check $GITHUB_WORKSPACE/target/x86_64-unknown-linux-gnu/debug/libvortex_cudf_test.so + ./cudf-test-harness check $GITHUB_WORKSPACE/target/x86_64-unknown-linux-gnu/debug/libvortex_test_e2e_cuda.so rust-test-other: name: "Rust tests (${{ matrix.os }})" diff --git a/Cargo.lock b/Cargo.lock index 8e4626e8fad..944b8fa0832 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10463,16 +10463,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "vortex-cudf-test" -version = "0.1.0" -dependencies = [ - "arrow-schema 57.2.0", - "futures", - "vortex", - "vortex-cuda", -] - [[package]] name = "vortex-cxx" version = "0.1.0" @@ -11112,12 +11102,10 @@ dependencies = [ name = "vortex-test-e2e-cuda" version = "0.1.0" dependencies = [ - "cudarc", - "rstest", - "tokio", - "vortex-array", + "arrow-schema 57.2.0", + "futures", + "vortex", "vortex-cuda", - "vortex-error", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a87b67a052e..01914a4d9b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,6 @@ members = [ "vortex-duckdb", "vortex-cuda", "vortex-cuda/cub", - "vortex-cuda/cudf-test", "vortex-cuda/macros", "vortex-cuda/nvcomp", "vortex-cxx", diff --git a/vortex-cuda/cudf-test/Cargo.toml b/vortex-cuda/cudf-test/Cargo.toml deleted file mode 100644 index 6bac8c4e4cd..00000000000 --- a/vortex-cuda/cudf-test/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -[package] -name = "vortex-cudf-test" -authors.workspace = true -description = "Test for cuDF integration" -edition = { workspace = true } -homepage = { workspace = true } -categories = { workspace = true } -include = { workspace = true } -keywords = { workspace = true } -license = { workspace = true } -publish = false -readme = { workspace = true } -repository = { workspace = true } -rust-version = { workspace = true } -version = { workspace = true } - -[lib] -crate-type = ["cdylib"] - -[lints] -workspace = true - -[dependencies] -arrow-schema = { workspace = true, features = ["ffi"] } -futures = { workspace = true, features = ["executor"] } -vortex = { workspace = true } -vortex-cuda = { workspace = true, features = ["_test-harness"] } diff --git a/vortex-cuda/cudf-test/src/lib.rs b/vortex-cuda/cudf-test/src/lib.rs deleted file mode 100644 index db13c1a8cc4..00000000000 --- a/vortex-cuda/cudf-test/src/lib.rs +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. -//! -//! The flow is -//! -//! * test harness calls `dlopen` in this library -//! * invokes the `export_array` function to get back the device array -//! * pass the arrays to `cudf`'s `from_arrow_device_column` -//! * run some operations on the loaded column view -//! * call `array->release()` to drop the data allocated from the Rust side - -#![allow(clippy::unwrap_used, clippy::expect_used)] - -use std::sync::LazyLock; - -use arrow_schema::ffi::FFI_ArrowSchema; -use futures::executor::block_on; -use vortex::array::Array; -use vortex::array::IntoArray; -use vortex::array::arrays::DecimalArray; -use vortex::array::arrays::PrimitiveArray; -use vortex::array::arrays::StructArray; -use vortex::array::arrays::VarBinViewArray; -use vortex::array::session::ArraySession; -use vortex::array::validity::Validity; -use vortex::dtype::DecimalDType; -use vortex::dtype::FieldNames; -use vortex::expr::session::ExprSession; -use vortex::io::session::RuntimeSession; -use vortex::layout::session::LayoutSession; -use vortex::metrics::VortexMetrics; -use vortex::session::VortexSession; -use vortex_cuda::CudaSession; -use vortex_cuda::arrow::ArrowDeviceArray; -use vortex_cuda::arrow::DeviceArrayExt; - -static SESSION: LazyLock = LazyLock::new(|| { - VortexSession::empty() - .with::() - .with::() - .with::() - .with::() - .with::() - .with::() -}); - -#[unsafe(no_mangle)] -pub extern "C" fn export_array( - schema_ptr: &mut FFI_ArrowSchema, - array_ptr: &mut ArrowDeviceArray, -) -> i32 { - let mut ctx = CudaSession::create_execution_ctx(&SESSION).unwrap(); - - let primitive = PrimitiveArray::from_iter(0u32..1024); - let string = - VarBinViewArray::from_iter_str((0..1024).map(|idx| format!("this is string {idx}"))); - let decimal = DecimalArray::from_iter(0i64..1024, DecimalDType::new(19, 2)); - - let array = StructArray::new( - FieldNames::from_iter(["prims", "strings", "decimals"]), - vec![ - primitive.into_array(), - string.into_array(), - decimal.into_array(), - ], - 1024, - Validity::NonNullable, - ) - .into_array(); - - let data_type = array - .dtype() - .to_arrow_dtype() - .expect("converting schema to Arrow DataType"); - - *schema_ptr = FFI_ArrowSchema::try_from(data_type).expect("data_type to FFI_ArrowSchema"); - - match block_on(array.export_device_array(&mut ctx)) { - Ok(exported) => { - *array_ptr = exported; - 0 - } - Err(err) => { - eprintln!("error in export_device_array: {err}"); - 1 - } - } -} diff --git a/vortex-test/e2e-cuda/Cargo.toml b/vortex-test/e2e-cuda/Cargo.toml index c1318c2f898..52eedb4e71e 100644 --- a/vortex-test/e2e-cuda/Cargo.toml +++ b/vortex-test/e2e-cuda/Cargo.toml @@ -12,13 +12,14 @@ repository = { workspace = true } rust-version = { workspace = true } version = { workspace = true } +[lib] +crate-type = ["cdylib"] + [lints] workspace = true [dependencies] -cudarc = { workspace = true } -rstest = { workspace = true } -tokio = { workspace = true, features = ["rt", "macros"] } -vortex-array = { workspace = true, features = ["_test-harness"] } +arrow-schema = { workspace = true, features = ["ffi"] } +futures = { workspace = true, features = ["executor"] } +vortex = { workspace = true } vortex-cuda = { workspace = true, features = ["_test-harness"] } -vortex-error = { workspace = true } diff --git a/vortex-test/e2e-cuda/src/lib.rs b/vortex-test/e2e-cuda/src/lib.rs index 6b479935adb..db13c1a8cc4 100644 --- a/vortex-test/e2e-cuda/src/lib.rs +++ b/vortex-test/e2e-cuda/src/lib.rs @@ -1,4 +1,90 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! End-to-end CUDA tests for Vortex. +//! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. +//! +//! The flow is +//! +//! * test harness calls `dlopen` in this library +//! * invokes the `export_array` function to get back the device array +//! * pass the arrays to `cudf`'s `from_arrow_device_column` +//! * run some operations on the loaded column view +//! * call `array->release()` to drop the data allocated from the Rust side + +#![allow(clippy::unwrap_used, clippy::expect_used)] + +use std::sync::LazyLock; + +use arrow_schema::ffi::FFI_ArrowSchema; +use futures::executor::block_on; +use vortex::array::Array; +use vortex::array::IntoArray; +use vortex::array::arrays::DecimalArray; +use vortex::array::arrays::PrimitiveArray; +use vortex::array::arrays::StructArray; +use vortex::array::arrays::VarBinViewArray; +use vortex::array::session::ArraySession; +use vortex::array::validity::Validity; +use vortex::dtype::DecimalDType; +use vortex::dtype::FieldNames; +use vortex::expr::session::ExprSession; +use vortex::io::session::RuntimeSession; +use vortex::layout::session::LayoutSession; +use vortex::metrics::VortexMetrics; +use vortex::session::VortexSession; +use vortex_cuda::CudaSession; +use vortex_cuda::arrow::ArrowDeviceArray; +use vortex_cuda::arrow::DeviceArrayExt; + +static SESSION: LazyLock = LazyLock::new(|| { + VortexSession::empty() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::() +}); + +#[unsafe(no_mangle)] +pub extern "C" fn export_array( + schema_ptr: &mut FFI_ArrowSchema, + array_ptr: &mut ArrowDeviceArray, +) -> i32 { + let mut ctx = CudaSession::create_execution_ctx(&SESSION).unwrap(); + + let primitive = PrimitiveArray::from_iter(0u32..1024); + let string = + VarBinViewArray::from_iter_str((0..1024).map(|idx| format!("this is string {idx}"))); + let decimal = DecimalArray::from_iter(0i64..1024, DecimalDType::new(19, 2)); + + let array = StructArray::new( + FieldNames::from_iter(["prims", "strings", "decimals"]), + vec![ + primitive.into_array(), + string.into_array(), + decimal.into_array(), + ], + 1024, + Validity::NonNullable, + ) + .into_array(); + + let data_type = array + .dtype() + .to_arrow_dtype() + .expect("converting schema to Arrow DataType"); + + *schema_ptr = FFI_ArrowSchema::try_from(data_type).expect("data_type to FFI_ArrowSchema"); + + match block_on(array.export_device_array(&mut ctx)) { + Ok(exported) => { + *array_ptr = exported; + 0 + } + Err(err) => { + eprintln!("error in export_device_array: {err}"); + 1 + } + } +} From a504e89a32933671827b22ffb3e2c58c4c374a76 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 15:11:26 -0500 Subject: [PATCH 21/22] one more Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index ed40da8d6db..870abff3564 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -176,7 +176,7 @@ mod tests { } = values.into_parts(); let handle = ctx - .move_to_device::(cuda_buffer) + .move_to_device(cuda_buffer) .unwrap() .await .unwrap(); From b90267d06c47d19ce5d54ddb1cba023b81d0a596 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Feb 2026 15:52:38 -0500 Subject: [PATCH 22/22] fix lint Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/mod.rs | 6 +----- vortex-test/e2e-cuda/src/lib.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index 870abff3564..20dd03a15bb 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -175,11 +175,7 @@ mod tests { .. } = values.into_parts(); - let handle = ctx - .move_to_device(cuda_buffer) - .unwrap() - .await - .unwrap(); + let handle = ctx.move_to_device(cuda_buffer).unwrap().await.unwrap(); let device_buf = handle .as_device() .as_any() diff --git a/vortex-test/e2e-cuda/src/lib.rs b/vortex-test/e2e-cuda/src/lib.rs index db13c1a8cc4..06e30909d43 100644 --- a/vortex-test/e2e-cuda/src/lib.rs +++ b/vortex-test/e2e-cuda/src/lib.rs @@ -3,13 +3,13 @@ //! This file is a simple C-compatible API that is called from the cudf-test-harness at CI time. //! -//! The flow is +//! The flow is: //! -//! * test harness calls `dlopen` in this library -//! * invokes the `export_array` function to get back the device array -//! * pass the arrays to `cudf`'s `from_arrow_device_column` -//! * run some operations on the loaded column view -//! * call `array->release()` to drop the data allocated from the Rust side +//! * test harness calls `dlopen` in this library +//! * invokes the `export_array` function to get back the device array +//! * pass the arrays to `cudf`'s `from_arrow_device_column` +//! * run some operations on the loaded column view +//! * call `array->release()` to drop the data allocated from the Rust side #![allow(clippy::unwrap_used, clippy::expect_used)]