rapidsai · mythrocks · Feb 3, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026
@@ -26,6 +26,7 @@
 
 // TODO: This shouldn't be invoking anything from spatial/knn
 #include "../ann_utils.cuh"
+#include "../smem_utils.cuh"
 
 #include <raft/util/cuda_rt_essentials.hpp>
 #include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
@@ -589,8 +590,7 @@ void select_and_run(const dataset_descriptor_host<DataT, IndexT, DistanceT>& dat
     THROW("Result buffer size %u larger than max buffer size %u", result_buffer_size, 256);
   }
 
-  RAFT_CUDA_TRY(
-    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+  cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
   // Initialize hash table
   const uint32_t traversed_hash_size = hashmap::get_size(traversed_hash_bitlen);
   set_value_batch(traversed_hashmap_ptr,

@@ -26,6 +26,7 @@
 
 // TODO: This shouldn't be invoking anything from spatial/knn
 #include "../ann_utils.cuh"
+#include "../smem_utils.cuh"
 
 #include <raft/util/cuda_rt_essentials.hpp>
 #include <raft/util/integer_utils.hpp>
@@ -1980,8 +1981,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b
     auto* dd_dev_ptr = dd_host.dev_ptr(stream);
 
     // set kernel attributes same as in normal kernel
-    RAFT_CUDA_TRY(
-      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
 
     // set kernel launch parameters
     dim3 gs = calc_coop_grid_size(block_size, smem_size, persistent_device_usage);
@@ -2312,8 +2312,7 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r
     using descriptor_base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
     auto kernel = search_kernel_config<false, descriptor_base_type, SourceIndexT, SampleFilterT>::
       choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size);
-    RAFT_CUDA_TRY(
-      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    cuvs::neighbors::detail::optionally_set_larger_max_smem_size(smem_size, kernel);
     dim3 thread_dims(block_size, 1, 1);
     dim3 block_dims(1, num_queries, 1);
     RAFT_LOG_DEBUG(

@@ -0,0 +1,37 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <raft/util/cuda_rt_essentials.hpp>
+
+#include <cstdint>
+#include <mutex>
+
+namespace cuvs::neighbors::detail {
+
+/**
+ * @brief Raise the max dynamic shared memory size of a kernel if the request exceeds it.
+ *
+ * Concurrent callers may request different sizes for the same kernel. To accommodate the
+ * largest requested size, the attribute is only ever increased here, never lowered. The query
+ * and the update are serialized by a mutex shared by all kernels of type `KernelT`.
+ *
+ * The limit currently in effect is queried from the CUDA runtime for the given kernel instead
+ * of being cached in a function-local static. Such a static would be shared by every kernel
+ * that has the same `KernelT` (e.g. kernels selected at runtime behind a single
+ * function-pointer type), so a maximum recorded for one kernel would wrongly suppress the
+ * update for another kernel that has not been configured yet.
+ *
+ * @tparam KernelT The type of the kernel.
+ * @param smem_size The size of the dynamic shared memory to be set.
+ * @param kernel The kernel to be set.
+ */
+template <typename KernelT>
+void optionally_set_larger_max_smem_size(uint32_t smem_size, KernelT& kernel)
+{
+  static auto mutex = std::mutex{};
+  // Hold the lock across both the query and the update, so that the limit observed below
+  // cannot be changed by another caller of this function before we act on it.
+  auto guard = std::lock_guard<std::mutex>{mutex};
+  auto attrs = cudaFuncAttributes{};
+  RAFT_CUDA_TRY(cudaFuncGetAttributes(&attrs, kernel));
+  // `maxDynamicSharedSizeBytes` is a signed int; compare in a wider signed type.
+  auto const current_max = static_cast<int64_t>(attrs.maxDynamicSharedSizeBytes);
+  if (static_cast<int64_t>(smem_size) > current_max) {
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+  }
+}
+
+}  // namespace cuvs::neighbors::detail