From d0019ae89597928cdb82ff560920ff554148c083 Mon Sep 17 00:00:00 2001
From: jnke2016 <jnke2016@gmail.com>
Date: Wed, 1 Oct 2025 20:55:17 -0700
Subject: [PATCH 1/3] add changes for undirected MIS

---
 cpp/src/components/mis_impl.cuh  | 548 ++++++++++++++-----------------
 cpp/tests/components/mis_test.cu | 124 +++++--
 2 files changed, 340 insertions(+), 332 deletions(-)
diff --git a/cpp/src/components/mis_impl.cuh b/cpp/src/components/mis_impl.cuh
index 085b4837873..8390dd30678 100644
--- a/cpp/src/components/mis_impl.cuh
+++ b/cpp/src/components/mis_impl.cuh
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -11,325 +10,276 @@
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * See the License for the specific language governin_from_mtxg permissions and
  * limitations under the License.
  */
-#pragma once
 
-#include "prims/fill_edge_src_dst_property.cuh"
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
-#include "prims/update_edge_src_dst_property.cuh"
+#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh"
+#include "prims/reduce_op.cuh"
+#include "utilities/base_fixture.hpp"
+#include "utilities/test_graphs.hpp"
 
 #include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
 #include <cugraph/edge_property.hpp>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
-#include <cugraph/utilities/host_scalar_comm.hpp>
-
-#include <cuda/functional>
-#include <cuda/std/iterator>
-#include <cuda/std/tuple>
-#include <thrust/count.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/merge.h>
-#include <thrust/remove.h>
-#include <thrust/set_operations.h>
-#include <thrust/transform.h>
-#include <thrust/transform_reduce.h>
-
-#include <cmath>
-#include <optional>
-
-namespace cugraph {
-
-namespace detail {
-
-template <typename vertex_t, typename edge_t, bool multi_gpu>
-rmm::device_uvector<vertex_t> maximal_independent_set(
-  raft::handle_t const& handle,
-  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-  raft::random::RngState& rng_state)
-{
-  using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
-
-  vertex_t local_vtx_partitoin_size = graph_view.local_vertex_partition_range_size();
-
-  rmm::device_uvector<vertex_t> remaining_vertices(local_vtx_partitoin_size, handle.get_stream());
-
-  auto vertex_begin =
-    thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
-  auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
-
-  auto out_degrees = graph_view.compute_out_degrees(handle);
-  auto in_degrees  = graph_view.compute_in_degrees(handle);
-
-  // Vertices with degree zero are always part of MIS
-  remaining_vertices.resize(
-    cuda::std::distance(
-      remaining_vertices.begin(),
-      thrust::copy_if(handle.get_thrust_policy(),
-                      vertex_begin,
-                      vertex_end,
-                      thrust::make_zip_iterator(out_degrees.begin(), in_degrees.begin()),
-                      remaining_vertices.begin(),
-                      [] __device__(auto out_deg_and_in_deg) {
-                        return !((cuda::std::get<0>(out_deg_and_in_deg) == 0) &&
-                                 (cuda::std::get<1>(out_deg_and_in_deg) == 0));
-                      })),
-    handle.get_stream());
-
-  // Set ID of each vertex as its rank
-  rmm::device_uvector<vertex_t> ranks(local_vtx_partitoin_size, handle.get_stream());
-  thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
-
-  // Set ranks of zero degree vetices to std::numeric_limits<vertex_t>::max()
-  thrust::transform_if(handle.get_thrust_policy(),
-                       thrust::make_zip_iterator(out_degrees.begin(), in_degrees.begin()),
-                       thrust::make_zip_iterator(out_degrees.end(), in_degrees.end()),
-                       ranks.begin(),
-                       cuda::proclaim_return_type<vertex_t>(
-                         [] __device__(auto) { return std::numeric_limits<vertex_t>::max(); }),
-                       [] __device__(auto in_out_degree) {
-                         return (cuda::std::get<0>(in_out_degree) == 0) &&
-                                (cuda::std::get<1>(in_out_degree) == 0);
-                       });
-
-  out_degrees.resize(0, handle.get_stream());
-  out_degrees.shrink_to_fit(handle.get_stream());
-
-  in_degrees.resize(0, handle.get_stream());
-  in_degrees.shrink_to_fit(handle.get_stream());
-
-  size_t loop_counter = 0;
-  while (true) {
-    loop_counter++;
-
-    // Copy ranks into temporary vector to begin with
-
-    rmm::device_uvector<vertex_t> temporary_ranks(local_vtx_partitoin_size, handle.get_stream());
-    thrust::copy(handle.get_thrust_policy(), ranks.begin(), ranks.end(), temporary_ranks.begin());
-
-    // Select a random set of candidate vertices
-
-    vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
-    if (multi_gpu) {
-      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
-                                                             nr_remaining_vertices_to_check,
-                                                             raft::comms::op_t::SUM,
-                                                             handle.get_stream());
+#include <cugraph/large_buffer_manager.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+struct MaximalIndependentSet_Usecase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_SGMaximalIndependentSet
+  : public ::testing::TestWithParam<std::tuple<MaximalIndependentSet_Usecase, input_usecase_t>> {
+ public:
+  Tests_SGMaximalIndependentSet() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<MaximalIndependentSet_Usecase, input_usecase_t> const& param)
+  {
+  
+    auto [mis_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      hr_timer.start("Construct graph");
     }
 
-    vertex_t nr_candidates = (nr_remaining_vertices_to_check < 1024)
-                               ? nr_remaining_vertices_to_check
-                               : std::min(static_cast<vertex_t>((0.50 + 0.25 * loop_counter) *
-                                                                nr_remaining_vertices_to_check),
-                                          nr_remaining_vertices_to_check);
-
-    // FIXME: Can we improve performance here?
-    // FIXME: if(nr_remaining_vertices_to_check < 1024), may avoid calling select_random_vertices
-    auto d_sampled_vertices =
-      cugraph::select_random_vertices(handle,
-                                      graph_view,
-                                      std::make_optional(raft::device_span<vertex_t const>{
-                                        remaining_vertices.data(), remaining_vertices.size()}),
-                                      rng_state,
-                                      nr_candidates,
-                                      false,
-                                      true);
-
-    rmm::device_uvector<vertex_t> non_candidate_vertices(
-      remaining_vertices.size() - d_sampled_vertices.size(), handle.get_stream());
-
-    thrust::set_difference(handle.get_thrust_policy(),
-                           remaining_vertices.begin(),
-                           remaining_vertices.end(),
-                           d_sampled_vertices.begin(),
-                           d_sampled_vertices.end(),
-                           non_candidate_vertices.begin());
-
-    // Set temporary ranks of non-candidate vertices to std::numeric_limits<vertex_t>::lowest()
-    thrust::for_each(
-      handle.get_thrust_policy(),
-      non_candidate_vertices.begin(),
-      non_candidate_vertices.end(),
-      [temporary_ranks =
-         raft::device_span<vertex_t>(temporary_ranks.data(), temporary_ranks.size()),
-       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
-        //
-        // if rank of a non-candidate vertex is not std::numeric_limits<vertex_t>::max() (i.e. the
-        // vertex is not already in MIS), set it to std::numeric_limits<vertex_t>::lowest()
-        //
-        auto v_offset = v - v_first;
-        if (temporary_ranks[v_offset] < std::numeric_limits<vertex_t>::max()) {
-          temporary_ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
-        }
-      });
+    constexpr bool multi_gpu = false;
+
+    bool test_weighted = false;
+    bool renumber = true;
+    bool drop_self_loops  = true;
+    bool drop_multi_edges = true;
+
+    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
 
-    // Caches for ranks
-    edge_src_property_t<vertex_t, vertex_t> src_rank_cache(handle);
-    edge_dst_property_t<vertex_t, vertex_t> dst_rank_cache(handle);
-
-    // Update rank caches with temporary ranks
-    if constexpr (multi_gpu) {
-      src_rank_cache = edge_src_property_t<vertex_t, vertex_t>(handle, graph_view);
-      dst_rank_cache = edge_dst_property_t<vertex_t, vertex_t>(handle, graph_view);
-      update_edge_src_property(
-        handle, graph_view, temporary_ranks.begin(), src_rank_cache.mutable_view());
-      update_edge_dst_property(
-        handle, graph_view, temporary_ranks.begin(), dst_rank_cache.mutable_view());
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
     }
+    
+    auto sg_graph_view = sg_graph.view();
+    auto sg_edge_weight_view =
+      sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+    auto edge_partition = sg_graph_view.local_edge_partition_view(0);
+    
+    auto number_of_local_edges = edge_partition.number_of_edges();
+
+    raft::random::RngState rng_state(0);
+    auto d_mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
+      handle, sg_graph_view, rng_state);
+
+    // Test MIS
+    if (mis_usecase.check_correctness) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      std::vector<vertex_t> h_mis(d_mis.size());
+      raft::update_host(h_mis.data(), d_mis.data(), d_mis.size(), handle.get_stream());
 
-    //
-    // Find maximum rank outgoing neighbor for each vertex
-    //
-
-    rmm::device_uvector<vertex_t> max_outgoing_ranks(local_vtx_partitoin_size, handle.get_stream());
-
-    per_v_transform_reduce_outgoing_e(
-      handle,
-      graph_view,
-      multi_gpu ? src_rank_cache.view()
-                : make_edge_src_property_view<vertex_t, vertex_t>(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      multi_gpu ? dst_rank_cache.view()
-                : make_edge_dst_property_view<vertex_t, vertex_t>(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      edge_dummy_property_t{}.view(),
-      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
-      std::numeric_limits<vertex_t>::lowest(),
-      cugraph::reduce_op::maximum<vertex_t>{},
-      max_outgoing_ranks.begin());
-
-    //
-    // Find maximum rank incoming neighbor for each vertex
-    //
-
-    rmm::device_uvector<vertex_t> max_incoming_ranks(local_vtx_partitoin_size, handle.get_stream());
-
-    per_v_transform_reduce_incoming_e(
-      handle,
-      graph_view,
-      multi_gpu ? src_rank_cache.view()
-                : make_edge_src_property_view<vertex_t, vertex_t>(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      multi_gpu ? dst_rank_cache.view()
-                : make_edge_dst_property_view<vertex_t, vertex_t>(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      edge_dummy_property_t{}.view(),
-      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return src_rank; },
-      std::numeric_limits<vertex_t>::lowest(),
-      cugraph::reduce_op::maximum<vertex_t>{},
-      max_incoming_ranks.begin());
-
-    temporary_ranks.resize(0, handle.get_stream());
-    temporary_ranks.shrink_to_fit(handle.get_stream());
-
-    //
-    // Compute max of outgoing and incoming neighbors
-    //
-    thrust::transform(handle.get_thrust_policy(),
-                      max_incoming_ranks.begin(),
-                      max_incoming_ranks.end(),
-                      max_outgoing_ranks.begin(),
-                      max_outgoing_ranks.begin(),
-                      thrust::maximum<vertex_t>());
-
-    max_incoming_ranks.resize(0, handle.get_stream());
-    max_incoming_ranks.shrink_to_fit(handle.get_stream());
-
-    //
-    // If the max neighbor of a vertex is already in MIS (i.e. has rank
-    // std::numeric_limits<vertex_t>::max()), discard it, otherwise,
-    // include the vertex if it has larger rank than its maximum rank neighbor
-    //
-    auto last = thrust::remove_if(
-      handle.get_thrust_policy(),
-      d_sampled_vertices.begin(),
-      d_sampled_vertices.end(),
-      [max_rank_neighbor_first = max_outgoing_ranks.begin(),
-       ranks                   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
-       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
-        auto v_offset          = v - v_first;
-        auto max_neighbor_rank = *(max_rank_neighbor_first + v_offset);
-        auto rank_of_v         = ranks[v_offset];
-
-        if (max_neighbor_rank >= std::numeric_limits<vertex_t>::max()) {
-          // Maximum rank neighbor is alreay in MIS
-          // Discard current vertex by setting its rank to
-          // std::numeric_limits<vertex_t>::lowest()
-          ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
-          return true;
-        }
-
-        if (rank_of_v >= max_neighbor_rank) {
-          // Include v and set its rank to std::numeric_limits<vertex_t>::max()
-          ranks[v_offset] = std::numeric_limits<vertex_t>::max();
-          return true;
-        }
-        return false;
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
+      auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
+
+      std::for_each(h_mis.begin(), h_mis.end(), [vertex_first, vertex_last](vertex_t v) {
+        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
       });
 
-    max_outgoing_ranks.resize(0, handle.get_stream());
-    max_outgoing_ranks.shrink_to_fit(handle.get_stream());
-
-    d_sampled_vertices.resize(cuda::std::distance(d_sampled_vertices.begin(), last),
-                              handle.get_stream());
-    d_sampled_vertices.shrink_to_fit(handle.get_stream());
-
-    remaining_vertices.resize(non_candidate_vertices.size() + d_sampled_vertices.size(),
-                              handle.get_stream());
-    remaining_vertices.shrink_to_fit(handle.get_stream());
-
-    // merge non-candidate and remaining candidate vertices
-    thrust::merge(handle.get_thrust_policy(),
-                  non_candidate_vertices.begin(),
-                  non_candidate_vertices.end(),
-                  d_sampled_vertices.begin(),
-                  d_sampled_vertices.end(),
-                  remaining_vertices.begin());
-
-    nr_remaining_vertices_to_check = remaining_vertices.size();
-    if (multi_gpu) {
-      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
-                                                             nr_remaining_vertices_to_check,
-                                                             raft::comms::op_t::SUM,
-                                                             handle.get_stream());
+      // If a vertex is included in MIS, then none of its neighbor should be
+
+      vertex_t local_vtx_partitoin_size = sg_graph_view.local_vertex_partition_range_size();
+      rmm::device_uvector<vertex_t> d_any_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
+                                                                       handle.get_stream());
+
+      rmm::device_uvector<vertex_t> inclusiong_flags(local_vtx_partitoin_size, handle.get_stream());
+
+      thrust::uninitialized_fill(
+        handle.get_thrust_policy(), inclusiong_flags.begin(), inclusiong_flags.end(), vertex_t{0});
+
+      thrust::for_each(
+        handle.get_thrust_policy(),
+        d_mis.begin(),
+        d_mis.end(),
+        [inclusiong_flags =
+           raft::device_span<vertex_t>(inclusiong_flags.data(), inclusiong_flags.size()),
+         v_first = sg_graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+          auto v_offset              = v - v_first;
+          inclusiong_flags[v_offset] = vertex_t{1};
+        });
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      per_v_transform_reduce_if_outgoing_e(
+        handle,
+        sg_graph_view,
+        cugraph::make_edge_src_property_view<vertex_t, vertex_t>(
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+        cugraph::make_edge_dst_property_view<vertex_t, vertex_t>(
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
+        vertex_t{0},
+        cugraph::reduce_op::any<vertex_t>(),
+        // just use auto, auto remove src_rank and wt # FIXME: address this.
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          // Adjacent vertices are in the MIS
+          return (src_included == dst_included) && (src_included == 1);
+          },
+        d_any_outgoing_nbrs_included_mis.begin(),
+        false);
+      
+      auto num_invalid_vertices_in_mis = thrust::reduce(
+        handle.get_thrust_policy(),
+        d_any_outgoing_nbrs_included_mis.begin(),
+        d_any_outgoing_nbrs_included_mis.end());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      ASSERT_TRUE(num_invalid_vertices_in_mis == 0);
+
+      auto vertex_begin =
+        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first());
+
+      auto vertex_end = 
+        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last());
+
+      rmm::device_uvector<vertex_t> vertices(local_vtx_partitoin_size,
+                                              handle.get_stream());
+      
+      thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, vertices.begin());
+
+      rmm::device_uvector<vertex_t> non_candidate_vertices(
+        vertices.size() - d_mis.size(), handle.get_stream());
+      
+      thrust::set_difference(handle.get_thrust_policy(),
+                             vertices.begin(),
+                             vertices.end(),
+                             d_mis.begin(),
+                             d_mis.end(),
+                             non_candidate_vertices.begin());
+      
+      cugraph::vertex_frontier_t<vertex_t, void, /*GraphViewType::is_multi_gpu*/false, true> vertex_frontier(
+        handle,
+        1);
+      
+      vertex_frontier.bucket(0).insert(non_candidate_vertices.begin(), non_candidate_vertices.end());
+      
+      d_any_outgoing_nbrs_included_mis.resize(non_candidate_vertices.size(), handle.get_stream());
+
+      per_v_transform_reduce_if_outgoing_e(
+        handle,
+        sg_graph_view,
+        vertex_frontier.bucket(0),
+        cugraph::make_edge_src_property_view<vertex_t, vertex_t>(
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+        cugraph::make_edge_dst_property_view<vertex_t, vertex_t>(
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
+        vertex_t{0},
+        cugraph::reduce_op::any<vertex_t>(),
+        // just use auto, auto remove src_rank and wt # FIXME: address this.
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          // Adjacent vertices are in the MIS
+          return dst_included == 1;
+          },
+        d_any_outgoing_nbrs_included_mis.begin(),
+        false);
+
+      auto num_invalid_non_candidate_vertices_out_mis = thrust::reduce(
+        handle.get_thrust_policy(),
+        d_any_outgoing_nbrs_included_mis.begin(),
+        d_any_outgoing_nbrs_included_mis.end());
+      
+      // FIXME: Add an error message
+      ASSERT_TRUE(
+        num_invalid_non_candidate_vertices_out_mis == d_any_outgoing_nbrs_included_mis.size());
     }
-
-    if (nr_remaining_vertices_to_check == 0) { break; }
+    
   }
+};
+
+using Tests_SGMaximalIndependentSet_File =
+  Tests_SGMaximalIndependentSet<cugraph::test::File_Usecase>;
+using Tests_SGMaximalIndependentSet_Rmat =
+  Tests_SGMaximalIndependentSet<cugraph::test::Rmat_Usecase>;
 
-  // Count number of vertices included in MIS
-
-  vertex_t nr_vertices_included_in_mis = thrust::count_if(
-    handle.get_thrust_policy(), ranks.begin(), ranks.end(), [] __device__(auto v_rank) {
-      return v_rank >= std::numeric_limits<vertex_t>::max();
-    });
-
-  // Build MIS and return
-  rmm::device_uvector<vertex_t> mis(nr_vertices_included_in_mis, handle.get_stream());
-  thrust::copy_if(
-    handle.get_thrust_policy(),
-    vertex_begin,
-    vertex_end,
-    ranks.begin(),
-    mis.begin(),
-    [] __device__(auto v_rank) { return v_rank >= std::numeric_limits<vertex_t>::max(); });
-
-  ranks.resize(0, handle.get_stream());
-  ranks.shrink_to_fit(handle.get_stream());
-  return mis;
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
-}  // namespace detail
 
-template <typename vertex_t, typename edge_t, bool multi_gpu>
-rmm::device_uvector<vertex_t> maximal_independent_set(
-  raft::handle_t const& handle,
-  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-  raft::random::RngState& rng_state)
+TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
 {
-  return detail::maximal_independent_set(handle, graph_view, rng_state);
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
-}  // namespace cugraph
+bool constexpr check_correctness = false;
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_SGMaximalIndependentSet_File,
+  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                                       MaximalIndependentSet_Usecase{check_correctness}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_SGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_SGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
+                      MaximalIndependentSet_Usecase{check_correctness}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/mis_test.cu b/cpp/tests/components/mis_test.cu
index 0edbccfc356..8390dd30678 100644
--- a/cpp/tests/components/mis_test.cu
+++ b/cpp/tests/components/mis_test.cu
@@ -15,6 +15,7 @@
  */
 
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh"
 #include "prims/reduce_op.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/test_graphs.hpp"
@@ -25,6 +26,7 @@
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
+#include <cugraph/large_buffer_manager.hpp>
 #include <cugraph/utilities/high_res_timer.hpp>
 
 #include <raft/random/rng_state.hpp>
@@ -51,9 +53,10 @@ class Tests_SGMaximalIndependentSet
   virtual void SetUp() {}
   virtual void TearDown() {}
 
-  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  template <typename vertex_t, typename edge_t, typename weight_t>
   void run_current_test(std::tuple<MaximalIndependentSet_Usecase, input_usecase_t> const& param)
   {
+  
     auto [mis_usecase, input_usecase] = param;
 
     raft::handle_t handle{};
@@ -66,9 +69,14 @@ class Tests_SGMaximalIndependentSet
 
     constexpr bool multi_gpu = false;
 
+    bool test_weighted = false;
+    bool renumber = true;
+    bool drop_self_loops  = true;
+    bool drop_multi_edges = true;
+
     auto [sg_graph, sg_edge_weights, sg_renumber_map] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
-        handle, input_usecase, false, true);
+        handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
@@ -76,11 +84,15 @@ class Tests_SGMaximalIndependentSet
       hr_timer.stop();
       hr_timer.display_and_clear(std::cout);
     }
-
+    
     auto sg_graph_view = sg_graph.view();
     auto sg_edge_weight_view =
       sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
 
+    auto edge_partition = sg_graph_view.local_edge_partition_view(0);
+    
+    auto number_of_local_edges = edge_partition.number_of_edges();
+
     raft::random::RngState rng_state(0);
     auto d_mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
       handle, sg_graph_view, rng_state);
@@ -103,7 +115,7 @@ class Tests_SGMaximalIndependentSet
       // If a vertex is included in MIS, then none of its neighbor should be
 
       vertex_t local_vtx_partitoin_size = sg_graph_view.local_vertex_partition_range_size();
-      rmm::device_uvector<vertex_t> d_total_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
+      rmm::device_uvector<vertex_t> d_any_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
                                                                        handle.get_stream());
 
       rmm::device_uvector<vertex_t> inclusiong_flags(local_vtx_partitoin_size, handle.get_stream());
@@ -124,47 +136,93 @@ class Tests_SGMaximalIndependentSet
 
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
-      per_v_transform_reduce_outgoing_e(
+      per_v_transform_reduce_if_outgoing_e(
         handle,
         sg_graph_view,
         cugraph::make_edge_src_property_view<vertex_t, vertex_t>(
-          sg_graph_view, inclusiong_flags.data(), 1),
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
         cugraph::make_edge_dst_property_view<vertex_t, vertex_t>(
-          sg_graph_view, inclusiong_flags.data(), 1),
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
         cugraph::edge_dummy_property_t{}.view(),
-        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
-          return (src == dst) ? 0 : dst_included;
-        },
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
         vertex_t{0},
-        cugraph::reduce_op::plus<vertex_t>{},
-        d_total_outgoing_nbrs_included_mis.begin());
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-
-      std::vector<vertex_t> h_total_outgoing_nbrs_included_mis(
-        d_total_outgoing_nbrs_included_mis.size());
-      raft::update_host(h_total_outgoing_nbrs_included_mis.data(),
-                        d_total_outgoing_nbrs_included_mis.data(),
-                        d_total_outgoing_nbrs_included_mis.size(),
-                        handle.get_stream());
+        cugraph::reduce_op::any<vertex_t>(),
+        // just use auto, auto remove src_rank and wt # FIXME: address this.
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          // Adjacent vertices are in the MIS
+          return (src_included == dst_included) && (src_included == 1);
+          },
+        d_any_outgoing_nbrs_included_mis.begin(),
+        false);
+      
+      auto num_invalid_vertices_in_mis = thrust::reduce(
+        handle.get_thrust_policy(),
+        d_any_outgoing_nbrs_included_mis.begin(),
+        d_any_outgoing_nbrs_included_mis.end());
 
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
-      {
-        auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
-        auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
+      ASSERT_TRUE(num_invalid_vertices_in_mis == 0);
+
+      auto vertex_begin =
+        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first());
+
+      auto vertex_end = 
+        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last());
+
+      rmm::device_uvector<vertex_t> vertices(local_vtx_partitoin_size,
+                                              handle.get_stream());
+      
+      thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, vertices.begin());
+
+      rmm::device_uvector<vertex_t> non_candidate_vertices(
+        vertices.size() - d_mis.size(), handle.get_stream());
+      
+      thrust::set_difference(handle.get_thrust_policy(),
+                             vertices.begin(),
+                             vertices.end(),
+                             d_mis.begin(),
+                             d_mis.end(),
+                             non_candidate_vertices.begin());
+      
+      cugraph::vertex_frontier_t<vertex_t, void, /*GraphViewType::is_multi_gpu*/false, true> vertex_frontier(
+        handle,
+        1);
+      
+      vertex_frontier.bucket(0).insert(non_candidate_vertices.begin(), non_candidate_vertices.end());
+      
+      d_any_outgoing_nbrs_included_mis.resize(non_candidate_vertices.size(), handle.get_stream());
 
-        std::for_each(h_mis.begin(),
-                      h_mis.end(),
-                      [vertex_first, vertex_last, &h_total_outgoing_nbrs_included_mis](vertex_t v) {
-                        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last))
-                          << v << " is not within vertex parition range" << std::endl;
+      per_v_transform_reduce_if_outgoing_e(
+        handle,
+        sg_graph_view,
+        vertex_frontier.bucket(0),
+        cugraph::make_edge_src_property_view<vertex_t, vertex_t>(
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+        cugraph::make_edge_dst_property_view<vertex_t, vertex_t>(
+          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
+        vertex_t{0},
+        cugraph::reduce_op::any<vertex_t>(),
+        // just use auto, auto remove src_rank and wt # FIXME: address this.
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          // Adjacent vertices are in the MIS
+          return dst_included == 1;
+          },
+        d_any_outgoing_nbrs_included_mis.begin(),
+        false);
 
-                        ASSERT_TRUE(h_total_outgoing_nbrs_included_mis[v - vertex_first] == 0)
-                          << v << "'s neighbor is included in MIS" << std::endl;
-                      });
-      }
+      auto num_invalid_non_candidate_vertices_out_mis = thrust::reduce(
+        handle.get_thrust_policy(),
+        d_any_outgoing_nbrs_included_mis.begin(),
+        d_any_outgoing_nbrs_included_mis.end());
+      
+      // FIXME: Add an error message
+      ASSERT_TRUE(
+        num_invalid_non_candidate_vertices_out_mis == d_any_outgoing_nbrs_included_mis.size());
     }
+    
   }
 };
 

From 75879f47f79b40493eb0b200a1c7607d7502e9fc Mon Sep 17 00:00:00 2001
From: jnke2016 <jnke2016@gmail.com>
Date: Wed, 1 Oct 2025 21:01:44 -0700
Subject: [PATCH 2/3] fix wrong copy

---
 cpp/src/components/mis_impl.cuh | 509 +++++++++++++++++---------------
 1 file changed, 277 insertions(+), 232 deletions(-)

diff --git a/cpp/src/components/mis_impl.cuh b/cpp/src/components/mis_impl.cuh
index 8390dd30678..c567e0320d0 100644
--- a/cpp/src/components/mis_impl.cuh
+++ b/cpp/src/components/mis_impl.cuh
@@ -1,5 +1,6 @@
+
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -10,276 +11,320 @@
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governin_from_mtxg permissions and
+ * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
 
-#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/fill_edge_src_dst_property.cuh"
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" // FIXME: remove if unused
 #include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh"
-#include "prims/reduce_op.cuh"
-#include "utilities/base_fixture.hpp"
-#include "utilities/test_graphs.hpp"
+#include "prims/update_edge_src_dst_property.cuh"
 
 #include <cugraph/algorithms.hpp>
-#include <cugraph/edge_partition_view.hpp>
 #include <cugraph/edge_property.hpp>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
-#include <cugraph/large_buffer_manager.hpp>
-#include <cugraph/utilities/high_res_timer.hpp>
-
-#include <raft/random/rng_state.hpp>
-
-#include <gtest/gtest.h>
-
-#include <chrono>
-#include <iostream>
-#include <random>
-
-struct MaximalIndependentSet_Usecase {
-  bool check_correctness{true};
-};
-
-template <typename input_usecase_t>
-class Tests_SGMaximalIndependentSet
-  : public ::testing::TestWithParam<std::tuple<MaximalIndependentSet_Usecase, input_usecase_t>> {
- public:
-  Tests_SGMaximalIndependentSet() {}
-
-  static void SetUpTestCase() {}
-  static void TearDownTestCase() {}
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <cuda/functional>
+#include <cuda/std/iterator>
+#include <cuda/std/tuple>
+#include <thrust/count.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/merge.h>
+#include <thrust/remove.h>
+#include <thrust/set_operations.h>
+#include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
+
+#include <cmath>
+#include <optional>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_set(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
 
-  virtual void SetUp() {}
-  virtual void TearDown() {}
+  using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
 
-  template <typename vertex_t, typename edge_t, typename weight_t>
-  void run_current_test(std::tuple<MaximalIndependentSet_Usecase, input_usecase_t> const& param)
-  {
+  vertex_t local_vtx_partition_size = graph_view.local_vertex_partition_range_size();
   
-    auto [mis_usecase, input_usecase] = param;
+  auto vertex_begin =
+    thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
 
-    raft::handle_t handle{};
-    HighResTimer hr_timer{};
+  auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
 
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      hr_timer.start("Construct graph");
-    }
-
-    constexpr bool multi_gpu = false;
+  // Set ID of each vertex as its rank
+  rmm::device_uvector<vertex_t> ranks(local_vtx_partition_size, handle.get_stream());
 
-    bool test_weighted = false;
-    bool renumber = true;
-    bool drop_self_loops  = true;
-    bool drop_multi_edges = true;
+  auto segment_offsets = graph_view.local_vertex_partition_segment_offsets();
 
-    auto [sg_graph, sg_edge_weights, sg_renumber_map] =
-      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
-        handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
+  vertex_t isolated_v_start = multi_gpu ? segment_offsets->data()[4] : segment_offsets->data()[3];
+  // Only the non zero degree vertices are part of the initial 'remaining_vertices' list
+  rmm::device_uvector<vertex_t> remaining_vertices(isolated_v_start, handle.get_stream());
 
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator<size_t>(0),
+    thrust::make_counting_iterator<size_t>(graph_view.local_vertex_partition_range_size()),
+    [
+      isolated_v_start = multi_gpu ? segment_offsets->data()[4] : segment_offsets->data()[3],
+      ranks = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+      remaining_vertices = raft::device_span<vertex_t>(remaining_vertices.data(), remaining_vertices.size()),
+      v_first = graph_view.local_vertex_partition_range_first()] __device__(auto idx) {
+      
+      
+      if (idx < isolated_v_start) {
+        // initializing the ranks array
+        ranks[idx] = v_first + idx;
+        // initializing the remaining vertices array
+        remaining_vertices[idx] = v_first + idx;
+      } else {
+        // zero-degree vertices are always part of the MIS
+        ranks[idx] = std::numeric_limits<vertex_t>::max();
+      }
+
+  });
+
+  auto num_buckets = 1;
+
+  vertex_frontier_t<vertex_t, void, GraphViewType::is_multi_gpu, true> vertex_frontier(handle,
+                                                                                       num_buckets);
+  
+  size_t loop_counter = 0;
+  vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
+  vertex_t nr_remaining_local_vertices_to_check = remaining_vertices.size();
+  edge_dst_property_t<vertex_t, vertex_t> dst_rank_cache(handle);
+
+
+  while (true) {
+    loop_counter++;
+
+    auto num_processed_vertices = remaining_vertices.size() - nr_remaining_local_vertices_to_check;
+
+    if constexpr (multi_gpu) {
+      if (loop_counter == 1) {
+        // Update the property of all edge endpoints during the
+        // first iteration
+        dst_rank_cache = edge_dst_property_t<vertex_t, vertex_t>(handle, graph_view);
+
+        update_edge_dst_property(
+          handle, graph_view, ranks.begin(), dst_rank_cache.mutable_view());
+
+      } else {
+        // Only update the property of endpoints that had their ranks modified       
+        rmm::device_uvector<vertex_t> processed_ranks(
+          num_processed_vertices, handle.get_stream());
+
+        auto pair_idx_processed_vertex_first = thrust::make_zip_iterator(
+          thrust::make_counting_iterator<size_t>(0),
+          remaining_vertices.begin() + nr_remaining_local_vertices_to_check
+        );
+        
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          pair_idx_processed_vertex_first,
+          pair_idx_processed_vertex_first + num_processed_vertices,
+          [processed_ranks =
+            raft::device_span<vertex_t>(processed_ranks.data(), processed_ranks.size()),
+          ranks =
+            raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+          v_first = graph_view.local_vertex_partition_range_first()] __device__(auto pair_idx_v) {
+            
+            auto idx = thrust::get<0>(pair_idx_v);
+            auto v = thrust::get<1>(pair_idx_v);
+            auto v_offset          = v - v_first;
+
+            processed_ranks[idx] = ranks[v_offset];
+        });
 
-      hr_timer.stop();
-      hr_timer.display_and_clear(std::cout);
+        // Only update a subset of the graph edge dst property values
+
+        // FIXME: Since we know that the property being updated are either
+        // std::numeric_limits<vertex_t>::max() or std::numeric_limits<vertex_t>::min(),
+        // explore 'fill_edge_dst_property' which is faster
+        update_edge_dst_property(
+          handle,
+          graph_view,
+          remaining_vertices.begin() + nr_remaining_local_vertices_to_check,
+          remaining_vertices.end(),
+          processed_ranks.begin(),
+          dst_rank_cache.mutable_view()
+        );
+
+      }
     }
-    
-    auto sg_graph_view = sg_graph.view();
-    auto sg_edge_weight_view =
-      sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
 
-    auto edge_partition = sg_graph_view.local_edge_partition_view(0);
-    
-    auto number_of_local_edges = edge_partition.number_of_edges();
+    rmm::device_uvector<vertex_t> max_outgoing_ranks(
+      nr_remaining_local_vertices_to_check, handle.get_stream());
 
-    raft::random::RngState rng_state(0);
-    auto d_mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
-      handle, sg_graph_view, rng_state);
+    remaining_vertices.resize(nr_remaining_local_vertices_to_check,
+                              handle.get_stream());
+    remaining_vertices.shrink_to_fit(handle.get_stream());
 
-    // Test MIS
-    if (mis_usecase.check_correctness) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      std::vector<vertex_t> h_mis(d_mis.size());
-      raft::update_host(h_mis.data(), d_mis.data(), d_mis.size(), handle.get_stream());
+    vertex_frontier.bucket(0).clear();
 
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-
-      auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
-      auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
-
-      std::for_each(h_mis.begin(), h_mis.end(), [vertex_first, vertex_last](vertex_t v) {
-        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
-      });
-
-      // If a vertex is included in MIS, then none of its neighbor should be
-
-      vertex_t local_vtx_partitoin_size = sg_graph_view.local_vertex_partition_range_size();
-      rmm::device_uvector<vertex_t> d_any_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
-                                                                       handle.get_stream());
-
-      rmm::device_uvector<vertex_t> inclusiong_flags(local_vtx_partitoin_size, handle.get_stream());
-
-      thrust::uninitialized_fill(
-        handle.get_thrust_policy(), inclusiong_flags.begin(), inclusiong_flags.end(), vertex_t{0});
-
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        d_mis.begin(),
-        d_mis.end(),
-        [inclusiong_flags =
-           raft::device_span<vertex_t>(inclusiong_flags.data(), inclusiong_flags.size()),
-         v_first = sg_graph_view.local_vertex_partition_range_first()] __device__(auto v) {
-          auto v_offset              = v - v_first;
-          inclusiong_flags[v_offset] = vertex_t{1};
-        });
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    vertex_frontier.bucket(0).insert(remaining_vertices.begin(), remaining_vertices.end());
 
+    if (loop_counter == 1) {
+      // FIXME: The optimization below is not appropriate for the current
+      // implementation since the neighbor with the highest priority
+      // needs to be retrieved to update its rank if possible in the 'rank'
+      // array. When using this primitive, it will stop once the first higher
+      // priority neighbor is found which may not be the highest. This
+      // will lead to more iterations until the highest priority neighbor
+      // is found if any (this will increase the overall runtime).
       per_v_transform_reduce_if_outgoing_e(
         handle,
-        sg_graph_view,
-        cugraph::make_edge_src_property_view<vertex_t, vertex_t>(
-          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
-        cugraph::make_edge_dst_property_view<vertex_t, vertex_t>(
-          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
-        cugraph::edge_dummy_property_t{}.view(),
-        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
-        vertex_t{0},
-        cugraph::reduce_op::any<vertex_t>(),
+        graph_view,
+        vertex_frontier.bucket(0),
+        edge_src_dummy_property_t{}.view(),
+        multi_gpu ? dst_rank_cache.view()
+                  : make_edge_dst_property_view<vertex_t, vertex_t>(
+                      graph_view, ranks.begin(), ranks.size()),
+        edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
+        std::numeric_limits<vertex_t>::lowest(),
+        reduce_op::any<vertex_t>(),
         // just use auto, auto remove src_rank and wt # FIXME: address this.
-        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
-          // Adjacent vertices are in the MIS
-          return (src_included == dst_included) && (src_included == 1);
-          },
-        d_any_outgoing_nbrs_included_mis.begin(),
-        false);
-      
-      auto num_invalid_vertices_in_mis = thrust::reduce(
-        handle.get_thrust_policy(),
-        d_any_outgoing_nbrs_included_mis.begin(),
-        d_any_outgoing_nbrs_included_mis.end());
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-
-      ASSERT_TRUE(num_invalid_vertices_in_mis == 0);
-
-      auto vertex_begin =
-        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first());
-
-      auto vertex_end = 
-        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last());
-
-      rmm::device_uvector<vertex_t> vertices(local_vtx_partitoin_size,
-                                              handle.get_stream());
-      
-      thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, vertices.begin());
-
-      rmm::device_uvector<vertex_t> non_candidate_vertices(
-        vertices.size() - d_mis.size(), handle.get_stream());
-      
-      thrust::set_difference(handle.get_thrust_policy(),
-                             vertices.begin(),
-                             vertices.end(),
-                             d_mis.begin(),
-                             d_mis.end(),
-                             non_candidate_vertices.begin());
-      
-      cugraph::vertex_frontier_t<vertex_t, void, /*GraphViewType::is_multi_gpu*/false, true> vertex_frontier(
+        [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return src < dst_rank; },
+        max_outgoing_ranks.begin(),
+        false); // FIXME: Set expensive check to False
+    } else {
+      per_v_transform_reduce_outgoing_e(
         handle,
-        1);
-      
-      vertex_frontier.bucket(0).insert(non_candidate_vertices.begin(), non_candidate_vertices.end());
-      
-      d_any_outgoing_nbrs_included_mis.resize(non_candidate_vertices.size(), handle.get_stream());
-
-      per_v_transform_reduce_if_outgoing_e(
-        handle,
-        sg_graph_view,
+        graph_view,
         vertex_frontier.bucket(0),
-        cugraph::make_edge_src_property_view<vertex_t, vertex_t>(
-          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
-        cugraph::make_edge_dst_property_view<vertex_t, vertex_t>(
-          sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
-        cugraph::edge_dummy_property_t{}.view(),
-        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
-        vertex_t{0},
-        cugraph::reduce_op::any<vertex_t>(),
-        // just use auto, auto remove src_rank and wt # FIXME: address this.
-        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
-          // Adjacent vertices are in the MIS
-          return dst_included == 1;
-          },
-        d_any_outgoing_nbrs_included_mis.begin(),
-        false);
-
-      auto num_invalid_non_candidate_vertices_out_mis = thrust::reduce(
-        handle.get_thrust_policy(),
-        d_any_outgoing_nbrs_included_mis.begin(),
-        d_any_outgoing_nbrs_included_mis.end());
+        edge_src_dummy_property_t{}.view(),
+        
+        multi_gpu ? dst_rank_cache.view()
+                  : make_edge_dst_property_view<vertex_t, vertex_t>(
+                      graph_view, ranks.begin(), ranks.size()),
       
-      // FIXME: Add an error message
-      ASSERT_TRUE(
-        num_invalid_non_candidate_vertices_out_mis == d_any_outgoing_nbrs_included_mis.size());
+        edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
+        std::numeric_limits<vertex_t>::lowest(),
+        cugraph::reduce_op::maximum<vertex_t>{},
+        max_outgoing_ranks.begin(),
+        false);
     }
+  
+    auto pair_idx_vertex_first = thrust::make_zip_iterator( // FIXME: rename this.
+      thrust::make_counting_iterator<size_t>(0),
+      remaining_vertices.begin()
+      );
     
-  }
-};
 
-using Tests_SGMaximalIndependentSet_File =
-  Tests_SGMaximalIndependentSet<cugraph::test::File_Usecase>;
-using Tests_SGMaximalIndependentSet_Rmat =
-  Tests_SGMaximalIndependentSet<cugraph::test::Rmat_Usecase>;
+    //
+    // If the max neighbor of a vertex is already in MIS (i.e. has rank
+    // std::numeric_limits<vertex_t>::max()), discard it, otherwise,
+    // include the vertex if it has larger rank than its maximum rank neighbor
+    //
+
+    // Use thrust::stable_partition to keep track of vertices that only needs to have
+    // their property updated
+    auto last = thrust::stable_partition(
+      handle.get_thrust_policy(),
+      pair_idx_vertex_first, // FIXME: Same here, no need to pass an iterator, just follow the pseudo code
+      pair_idx_vertex_first + remaining_vertices.size(),
+      [max_rank_neighbor_first = max_outgoing_ranks.begin(),
+      ranks                   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+      v_first = graph_view.local_vertex_partition_range_first()] __device__(auto pair_vidx_v_priority) {
+
+        auto vidx = thrust::get<0>(pair_vidx_v_priority);
+        auto v = thrust::get<1>(pair_vidx_v_priority);
+        auto v_offset          = v - v_first;
+        auto max_neighbor_rank = *(max_rank_neighbor_first + vidx);
+        auto rank_of_v         = ranks[v_offset];
+
+        if (max_neighbor_rank >= std::numeric_limits<vertex_t>::max()) {
+          ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
+          return false;
+        }
+
+        if (rank_of_v >= max_neighbor_rank) {
+          ranks[v_offset] = std::numeric_limits<vertex_t>::max();
+          return false;
+        }
+        return true;
+      });
 
-TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt32Int32FloatFloat)
-{
-  run_current_test<int32_t, int32_t, float, int>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
+    nr_remaining_local_vertices_to_check = cuda::std::distance(pair_idx_vertex_first, last),
+                          handle.get_stream();
+
+    max_outgoing_ranks.resize(0, handle.get_stream());
+    max_outgoing_ranks.shrink_to_fit(handle.get_stream());
+
+    if (multi_gpu) {
+      // FIXME: rename to 'nr_remaining_vertices_to_check' to
+      // 'nr_remaining_global_vertices_to_check'
+      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
+                                                             nr_remaining_local_vertices_to_check,
+                                                             raft::comms::op_t::SUM,
+                                                             handle.get_stream());
+    } else {
+      nr_remaining_vertices_to_check = nr_remaining_local_vertices_to_check;
+    }
 
-TEST_P(Tests_SGMaximalIndependentSet_File, CheckInt64Int64FloatFloat)
-{
-  run_current_test<int64_t, int64_t, float, int>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
+    if (nr_remaining_vertices_to_check == 0) { break; }
 
-TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt32Int32FloatFloat)
-{
-  run_current_test<int32_t, int32_t, float, int>(
-    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+  }
+  
+  vertex_t nr_vertices_included_in_mis = thrust::count_if(
+    handle.get_thrust_policy(), ranks.begin(), ranks.end(), [] __device__(auto v_rank) {
+      return v_rank >= std::numeric_limits<vertex_t>::max();
+    });
+
+  // Build MIS and return
+
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto start_build_mis = high_resolution_clock::now();
+
+  rmm::device_uvector<vertex_t> mis(nr_vertices_included_in_mis, handle.get_stream());
+  thrust::copy_if(
+    handle.get_thrust_policy(),
+    vertex_begin,
+    vertex_end,
+    ranks.begin(),
+    mis.begin(),
+    [] __device__(auto v_rank) { return v_rank >= std::numeric_limits<vertex_t>::max(); });
+
+  ranks.resize(0, handle.get_stream());
+  ranks.shrink_to_fit(handle.get_stream());
+
+  if (multi_gpu) {
+      // FIXME: rename to 'nr_remaining_vertices_to_check' to
+      // 'nr_remaining_global_vertices_to_check'
+      mis_size = host_scalar_allreduce(handle.get_comms(),
+                                       mis_size,
+                                       raft::comms::op_t::SUM,
+                                       handle.get_stream());
+      
+      mis_size_perc = host_scalar_allreduce(handle.get_comms(),
+                                            mis_size_perc,
+                                            raft::comms::op_t::SUM,
+                                            handle.get_stream());
+  }
+
+  return mis;
 }
+}  // namespace detail
 
-TEST_P(Tests_SGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_set(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
 {
-  run_current_test<int64_t, int64_t, float, int>(
-    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+  return detail::maximal_independent_set(handle, graph_view, rng_state);
 }
 
-bool constexpr check_correctness = false;
-INSTANTIATE_TEST_SUITE_P(
-  file_test,
-  Tests_SGMaximalIndependentSet_File,
-  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
-                                       MaximalIndependentSet_Usecase{check_correctness}),
-                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
-
-INSTANTIATE_TEST_SUITE_P(
-  rmat_small_test,
-  Tests_SGMaximalIndependentSet_Rmat,
-  ::testing::Combine(
-    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness}),
-    ::testing::Values(cugraph::test::Rmat_Usecase(3, 4, 0.57, 0.19, 0.19, 0, true, false))));
-
-INSTANTIATE_TEST_SUITE_P(
-  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
-                          --gtest_filter to select only the rmat_benchmark_test with a specific
-                          vertex & edge type combination) by command line arguments and do not
-                          include more than one Rmat_Usecase that differ only in scale or edge
-                          factor (to avoid running same benchmarks more than once) */
-  Tests_SGMaximalIndependentSet_Rmat,
-  ::testing::Combine(
-    ::testing::Values(MaximalIndependentSet_Usecase{check_correctness},
-                      MaximalIndependentSet_Usecase{check_correctness}),
-    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
-
-CUGRAPH_TEST_PROGRAM_MAIN()
+}  // namespace cugraph

From e2277b5f5d391f1e175f74bd9fff2ef3aacfdb58 Mon Sep 17 00:00:00 2001
From: jnke2016 <jnke2016@gmail.com>
Date: Wed, 1 Oct 2025 21:04:11 -0700
Subject: [PATCH 3/3] remove unused statement

---
 cpp/src/components/mis_impl.cuh | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/cpp/src/components/mis_impl.cuh b/cpp/src/components/mis_impl.cuh
index c567e0320d0..25fc1961642 100644
--- a/cpp/src/components/mis_impl.cuh
+++ b/cpp/src/components/mis_impl.cuh
@@ -300,20 +300,6 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   ranks.resize(0, handle.get_stream());
   ranks.shrink_to_fit(handle.get_stream());
 
-  if (multi_gpu) {
-      // FIXME: rename to 'nr_remaining_vertices_to_check' to
-      // 'nr_remaining_global_vertices_to_check'
-      mis_size = host_scalar_allreduce(handle.get_comms(),
-                                       mis_size,
-                                       raft::comms::op_t::SUM,
-                                       handle.get_stream());
-      
-      mis_size_perc = host_scalar_allreduce(handle.get_comms(),
-                                            mis_size_perc,
-                                            raft::comms::op_t::SUM,
-                                            handle.get_stream());
-  }
-
   return mis;
 }
 }  // namespace detail