Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/cudart_utils.hpp> // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
Expand Down Expand Up @@ -589,8 +590,7 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
}

RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
// Initialize hash table
const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
set_value_batch(traversed_hashmap_ptr,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// TODO: This shouldn't be invoking anything from spatial/knn
#include "../ann_utils.cuh"
#include "../smem_utils.cuh"

#include <raft/util/cuda_rt_essentials.hpp>
#include <raft/util/integer_utils.hpp>
Expand Down Expand Up @@ -1980,8 +1981,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b
auto* dd_dev_ptr = dd_host.dev_ptr(stream);

// set kernel attributes same as in normal kernel
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);

// set kernel launch parameters
dim3 gs = calc_coop_grid_size(block_size, smem_size, persistent_device_usage);
Expand Down Expand Up @@ -2312,8 +2312,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
auto kernel = search_kernel_config<false, descriptor_base_type, SourceIndexT, SampleFilterT>::
choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
dim3 thread_dims(block_size, 1, 1);
dim3 block_dims(1, num_queries, 1);
RAFT_LOG_DEBUG(
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/neighbors/detail/smem_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <raft/util/cuda_rt_essentials.hpp>

#include <atomic>
#include <cstdint>
#include <mutex>

namespace cuvs::neighbors::detail {

/**
 * @brief Optionally raise the max dynamic shared memory size attribute of a kernel.
 *
 * `cudaFuncSetAttribute` is not thread-safe; concurrent callers may also request
 * different sizes. This helper serializes the attribute updates behind a mutex and
 * only ever grows the attribute to the largest size requested so far, so the
 * attribute monotonically accommodates all concurrent requests.
 *
 * NOTE(review): the running maximum is `static`, i.e. one per KernelT
 * instantiation. This is correct as long as each distinct CUDA kernel yields a
 * distinct KernelT type — presumably true for the kernel function types used by
 * the callers, but worth confirming if new call sites are added.
 *
 * @tparam KernelT The type of the kernel (one shared running maximum per
 *                 template instantiation).
 * @param smem_size The dynamic shared memory size (in bytes) to accommodate.
 * @param kernel The kernel whose attribute is to be set.
 */
template <typename KernelT>
void optionally_set_larger_max_smem_size(uint32_t smem_size, KernelT& kernel)
{
  static auto mutex = std::mutex{};
  // Atomic: the unlocked fast-path read below would otherwise race with the
  // locked writes (double-checked locking requires an atomic for correctness).
  static auto running_max_smem_size = std::atomic<uint32_t>{0};
  if (smem_size > running_max_smem_size.load(std::memory_order_acquire)) {
    auto guard = std::lock_guard<std::mutex>{mutex};
    // Re-check under the lock: another thread may have raised the attribute
    // to a sufficient size while we were waiting.
    if (smem_size > running_max_smem_size.load(std::memory_order_relaxed)) {
      RAFT_CUDA_TRY(cudaFuncSetAttribute(
        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
      // Publish only after the attribute call succeeded, so a throwing
      // RAFT_CUDA_TRY cannot leave the recorded max ahead of the real
      // attribute (which would suppress a necessary retry).
      running_max_smem_size.store(smem_size, std::memory_order_release);
    }
  }
}

} // namespace cuvs::neighbors::detail
Loading