Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ vortex-array = { workspace = true }
vortex-buffer = { workspace = true }
vortex-cub = { path = "cub" }
vortex-cuda-macros = { workspace = true }
vortex-datetime-parts = { workspace = true }
vortex-decimal-byte-parts = { workspace = true }
vortex-dtype = { workspace = true, features = ["cudarc"] }
vortex-error = { workspace = true }
Expand Down Expand Up @@ -81,3 +82,7 @@ harness = false
[[bench]]
name = "runend_cuda"
harness = false

[[bench]]
name = "date_time_parts_cuda"
harness = false
169 changes: 169 additions & 0 deletions vortex-cuda/benches/date_time_parts_cuda.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! CUDA benchmarks for DateTimeParts decoding.

#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

use std::mem::size_of;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use futures::executor::block_on;
use vortex_array::IntoArray;
use vortex_array::ToCanonical;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_cuda::CudaBufferExt;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::CudaSession;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_datetime_parts::DateTimePartsArray;
use vortex_dtype::DType;
use vortex_dtype::Nullability;
use vortex_dtype::PType;
use vortex_dtype::datetime::TimeUnit;
use vortex_dtype::datetime::Timestamp;
use vortex_error::VortexExpect;
use vortex_session::VortexSession;

fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray {
let days: Vec<i16> = (0..len).map(|i| (i / 1000) as i16).collect();
let days_arr = PrimitiveArray::new(Buffer::from(days), Validity::NonNullable).into_array();
let seconds_arr = ConstantArray::new(0i8, len).into_array();
let subseconds_arr = ConstantArray::new(0i8, len).into_array();

let dtype = DType::Extension(Timestamp::new(time_unit, Nullability::NonNullable).erased());

DateTimePartsArray::try_new(dtype, days_arr, seconds_arr, subseconds_arr)
.vortex_expect("Failed to create DateTimePartsArray")
}

/// Launches DateTimeParts decode kernel and returns elapsed GPU time.
fn launch_datetimeparts_kernel_timed(
dtp_array: &DateTimePartsArray,
time_unit: TimeUnit,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
let days_prim = dtp_array.days().to_primitive();

// TODO(0ax1): figure out how to represent constant array in CUDA kernels
let seconds_prim = dtp_array.seconds().to_primitive();
let subseconds_prim = dtp_array.subseconds().to_primitive();

let output_len = dtp_array.len();

let divisor: i64 = match time_unit {
TimeUnit::Nanoseconds => 1_000_000_000,
TimeUnit::Microseconds => 1_000_000,
TimeUnit::Milliseconds => 1_000,
TimeUnit::Seconds => 1,
TimeUnit::Days => unreachable!("Days not supported for DateTimeParts"),
};

let days_device = block_on(
cuda_ctx
.copy_to_device(days_prim.as_slice::<i16>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy days to device");

let seconds_device = block_on(
cuda_ctx
.copy_to_device(seconds_prim.as_slice::<i8>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy seconds to device");

let subseconds_device = block_on(
cuda_ctx
.copy_to_device(subseconds_prim.as_slice::<i8>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy subseconds to device");

// Allocate output buffer
let output_device = block_on(cuda_ctx.copy_to_device(vec![0i64; output_len]).unwrap())
.vortex_expect("failed to allocate output buffer");

let days_view = days_device
.cuda_view::<i32>()
.vortex_expect("failed to get days view");
let seconds_view = seconds_device
.cuda_view::<i32>()
.vortex_expect("failed to get seconds view");
let subseconds_view = subseconds_device
.cuda_view::<i64>()
.vortex_expect("failed to get subseconds view");
let output_view = output_device
.cuda_view::<i64>()
.vortex_expect("failed to get output view");

let array_len_u64 = output_len as u64;

let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "date_time_parts",
ptypes: &[PType::I32, PType::I32, PType::I64],
launch_args: [days_view, seconds_view, subseconds_view, divisor, output_view, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: output_len
);

events.duration()
}

fn benchmark_datetimeparts(c: &mut Criterion) {
let mut group = c.benchmark_group("datetimeparts_cuda");
group.sample_size(10);

for (len, len_str) in [
(1_000_000usize, "1M"),
(10_000_000usize, "10M"),
(100_000_000usize, "100M"),
] {
group.throughput(Throughput::Bytes((len * size_of::<i64>()) as u64));

let (time_unit, unit_str) = (TimeUnit::Milliseconds, "ms");
let dtp_array = make_datetimeparts_array(len, time_unit);

group.bench_with_input(
BenchmarkId::new("datetimeparts", format!("{len_str}_{unit_str}")),
&dtp_array,
|b, dtp_array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");

let mut total_time = Duration::ZERO;

for _ in 0..iters {
let kernel_time =
launch_datetimeparts_kernel_timed(dtp_array, time_unit, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
}

total_time
});
},
);
}

group.finish();
}

criterion::criterion_group!(benches, benchmark_datetimeparts);

#[cuda_available]
criterion::criterion_main!(benches);

#[cuda_not_available]
fn main() {}
63 changes: 63 additions & 0 deletions vortex-cuda/kernels/src/date_time_parts.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#include "config.cuh"
#include "types.cuh"

constexpr int64_t SECONDS_PER_DAY = 86400;

// Combines date/time parts (days, seconds, subseconds) into timestamp values.
template<typename DaysT, typename SecondsT, typename SubsecondsT>
__device__ void date_time_parts(
const DaysT *__restrict days,
const SecondsT *__restrict seconds,
const SubsecondsT *__restrict subseconds,
int64_t divisor,
int64_t *__restrict output,
uint64_t array_len
) {
const int64_t ticks_per_day = SECONDS_PER_DAY * divisor;
const uint32_t elements_per_block = blockDim.x * ELEMENTS_PER_THREAD;

const uint64_t block_start = static_cast<uint64_t>(blockIdx.x) * elements_per_block;
const uint64_t block_end = min(block_start + elements_per_block, array_len);

for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) {
output[idx] = static_cast<int64_t>(days[idx]) * ticks_per_day
+ static_cast<int64_t>(seconds[idx]) * divisor
+ static_cast<int64_t>(subseconds[idx]);
}
}

#define GENERATE_DATE_TIME_PARTS_KERNEL(days_suffix, DaysT, seconds_suffix, SecondsT, subseconds_suffix, SubsecondsT) \
extern "C" __global__ void date_time_parts_##days_suffix##_##seconds_suffix##_##subseconds_suffix( \
const DaysT *__restrict days, \
const SecondsT *__restrict seconds, \
const SubsecondsT *__restrict subseconds, \
int64_t divisor, \
int64_t *__restrict output, \
uint64_t array_len \
) { \
date_time_parts(days, seconds, subseconds, divisor, output, array_len); \
}

#define EXPAND_DAYS(X) \
X(i8, int8_t) \
X(i16, int16_t) \
X(i32, int32_t) \
X(i64, int64_t)

#define EXPAND_SUBSECONDS(d, DT, s, ST) \
GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i8, int8_t) \
GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i16, int16_t) \
GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i32, int32_t) \
GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i64, int64_t)

#define EXPAND_SECONDS(d, DT) \
EXPAND_SUBSECONDS(d, DT, i8, int8_t) \
EXPAND_SUBSECONDS(d, DT, i16, int16_t) \
EXPAND_SUBSECONDS(d, DT, i32, int32_t) \
EXPAND_SUBSECONDS(d, DT, i64, int64_t)

// Generate all 64 kernels (4³)
EXPAND_DAYS(EXPAND_SECONDS)
1 change: 0 additions & 1 deletion vortex-cuda/kernels/src/dict.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,3 @@ extern "C" __global__ void dict_##value_suffix##_##index_suffix( \

// Generate for all native ptypes & decimal values
FOR_EACH_NUMERIC(GENERATE_DICT_FOR_ALL_INDICES)

14 changes: 14 additions & 0 deletions vortex-cuda/src/canonical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use vortex_array::arrays::BoolArray;
use vortex_array::arrays::BoolArrayParts;
use vortex_array::arrays::DecimalArray;
use vortex_array::arrays::DecimalArrayParts;
use vortex_array::arrays::ExtensionArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::PrimitiveArrayParts;
use vortex_array::arrays::StructArray;
Expand Down Expand Up @@ -129,6 +130,19 @@ impl CanonicalCudaExt for Canonical {
VarBinViewArray::new_unchecked(host_views, host_buffers, dtype, validity)
}))
}
Canonical::Extension(ext) => {
// Copy the storage array to host and rewrap in ExtensionArray.
let host_storage = ext
.storage()
.to_canonical()?
.into_host()
.await?
.into_array();
Ok(Canonical::Extension(ExtensionArray::new(
ext.ext_dtype().clone(),
host_storage,
)))
}
c => todo!("{} not implemented", c.dtype()),
}
}
Expand Down
Loading
Loading