Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/cudart_utils.hpp> // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
Expand Down Expand Up @@ -589,8 +590,7 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
}

RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
// Initialize hash table
const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
set_value_batch(traversed_hashmap_ptr,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/integer_utils.hpp>
Expand Down Expand Up @@ -1980,8 +1981,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b
auto* dd_dev_ptr = dd_host.dev_ptr(stream);

// set kernel attributes same as in normal kernel
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);

// set kernel launch parameters
dim3 gs = calc_coop_grid_size(block_size, smem_size, persistent_device_usage);
Expand Down Expand Up @@ -2312,8 +2312,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
auto kernel = search_kernel_config<false, descriptor_base_type, SourceIndexT, SampleFilterT>::
choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
dim3 thread_dims(block_size, 1, 1);
dim3 block_dims(1, num_queries, 1);
RAFT_LOG_DEBUG(
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/neighbors/detail/smem_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <raft/util/cuda_rt_essentials.hpp>

#include <atomic>
#include <cstdint>
#include <mutex>

namespace cuvs::neighbors::detail {

/**
 * @brief Optionally raise the max dynamic shared memory size attribute of a kernel.
 *
 * `cudaFuncSetAttribute` is not thread-safe; concurrent callers may also request
 * different sizes. This helper serializes the attribute updates behind a mutex and
 * only ever grows the attribute to the largest size requested so far, so the
 * attribute monotonically accommodates all concurrent requests.
 *
 * NOTE(review): the running maximum is `static`, i.e. one per KernelT
 * instantiation. This is correct as long as each distinct CUDA kernel yields a
 * distinct KernelT type — presumably true for the kernel function types used by
 * the callers, but worth confirming if new call sites are added.
 *
 * @tparam KernelT The type of the kernel (one shared running maximum per
 *                 template instantiation).
 * @param smem_size The dynamic shared memory size (in bytes) to accommodate.
 * @param kernel The kernel whose attribute is to be set.
 */
template <typename KernelT>
void optionally_set_larger_max_smem_size(uint32_t smem_size, KernelT& kernel)
{
  static auto mutex = std::mutex{};
  // Atomic: the unlocked fast-path read below would otherwise race with the
  // locked writes (double-checked locking requires an atomic for correctness).
  static auto running_max_smem_size = std::atomic<uint32_t>{0};
  if (smem_size > running_max_smem_size.load(std::memory_order_acquire)) {
    auto guard = std::lock_guard<std::mutex>{mutex};
    // Re-check under the lock: another thread may have raised the attribute
    // to a sufficient size while we were waiting.
    if (smem_size > running_max_smem_size.load(std::memory_order_relaxed)) {
      RAFT_CUDA_TRY(cudaFuncSetAttribute(
        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
      // Publish only after the attribute call succeeded, so a throwing
      // RAFT_CUDA_TRY cannot leave the recorded max ahead of the real
      // attribute (which would suppress a necessary retry).
      running_max_smem_size.store(smem_size, std::memory_order_release);
    }
  }
}

} // namespace cuvs::neighbors::detail
Loading