diff --git a/cpp/src/components/mis_impl.cuh b/cpp/src/components/mis_impl.cuh
index 085b4837873..25fc1961642 100644
--- a/cpp/src/components/mis_impl.cuh
+++ b/cpp/src/components/mis_impl.cuh
@@ -17,7 +17,8 @@
 #pragma once
 
 #include "prims/fill_edge_src_dst_property.cuh"
-#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"  // FIXME: remove if unused
+#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
 #include
@@ -51,263 +52,242 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   raft::random::RngState& rng_state)
 {
-  using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
-
-  vertex_t local_vtx_partitoin_size = graph_view.local_vertex_partition_range_size();
-  rmm::device_uvector<vertex_t> remaining_vertices(local_vtx_partitoin_size, handle.get_stream());
+  using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+  vertex_t local_vtx_partition_size = graph_view.local_vertex_partition_range_size();
+
+  auto vertex_begin =
+    thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first());
-  auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
-  auto out_degrees = graph_view.compute_out_degrees(handle);
-  auto in_degrees = graph_view.compute_in_degrees(handle);
-
-  // Vertices with degree zero are always part of MIS
-  remaining_vertices.resize(
-    cuda::std::distance(
-      remaining_vertices.begin(),
-      thrust::copy_if(handle.get_thrust_policy(),
-                      vertex_begin,
-                      vertex_end,
-                      thrust::make_zip_iterator(out_degrees.begin(), in_degrees.begin()),
-                      remaining_vertices.begin(),
-                      [] __device__(auto out_deg_and_in_deg) {
-                        return !((cuda::std::get<0>(out_deg_and_in_deg) == 0) &&
-                                 (cuda::std::get<1>(out_deg_and_in_deg) == 0));
-                      })),
-    handle.get_stream());
+  auto vertex_end = thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
 
   // Set ID of each vertex as its rank
-  rmm::device_uvector<vertex_t> ranks(local_vtx_partitoin_size, handle.get_stream());
-  thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
-
-  // Set ranks of zero degree vetices to std::numeric_limits<vertex_t>::max()
-  thrust::transform_if(handle.get_thrust_policy(),
-                       thrust::make_zip_iterator(out_degrees.begin(), in_degrees.begin()),
-                       thrust::make_zip_iterator(out_degrees.end(), in_degrees.end()),
-                       ranks.begin(),
-                       cuda::proclaim_return_type<vertex_t>(
-                         [] __device__(auto) { return std::numeric_limits<vertex_t>::max(); }),
-                       [] __device__(auto in_out_degree) {
-                         return (cuda::std::get<0>(in_out_degree) == 0) &&
-                                (cuda::std::get<1>(in_out_degree) == 0);
-                       });
-
-  out_degrees.resize(0, handle.get_stream());
-  out_degrees.shrink_to_fit(handle.get_stream());
-
-  in_degrees.resize(0, handle.get_stream());
-  in_degrees.shrink_to_fit(handle.get_stream());
-
-  size_t loop_counter = 0;
-  while (true) {
-    loop_counter++;
+  rmm::device_uvector<vertex_t> ranks(local_vtx_partition_size, handle.get_stream());
 
-    // Copy ranks into temporary vector to begin with
+  auto segment_offsets = graph_view.local_vertex_partition_segment_offsets();
 
-    rmm::device_uvector<vertex_t> temporary_ranks(local_vtx_partitoin_size, handle.get_stream());
-    thrust::copy(handle.get_thrust_policy(), ranks.begin(), ranks.end(), temporary_ranks.begin());
+  vertex_t isolated_v_start = multi_gpu ?
+    segment_offsets->data()[4] : segment_offsets->data()[3];
+
+  // Only the non-zero-degree vertices are part of the initial 'remaining_vertices' list
+  rmm::device_uvector<vertex_t> remaining_vertices(isolated_v_start, handle.get_stream());
-    // Select a random set of candidate vertices
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator(vertex_t{0}),
+    thrust::make_counting_iterator(graph_view.local_vertex_partition_range_size()),
+    [isolated_v_start = multi_gpu ? segment_offsets->data()[4] : segment_offsets->data()[3],
+     ranks = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+     remaining_vertices =
+       raft::device_span<vertex_t>(remaining_vertices.data(), remaining_vertices.size()),
+     v_first = graph_view.local_vertex_partition_range_first()] __device__(auto idx) {
+      if (idx < isolated_v_start) {
+        // initializing the ranks array
+        ranks[idx] = v_first + idx;
+        // initializing the remaining vertices array
+        remaining_vertices[idx] = v_first + idx;
+      } else {
+        // zero-degree vertices are always part of the MIS
+        ranks[idx] = std::numeric_limits<vertex_t>::max();
+      }
+    });
+
+  auto num_buckets = 1;
+
+  vertex_frontier_t vertex_frontier(handle, num_buckets);
+
+  size_t loop_counter = 0;
+  vertex_t nr_remaining_vertices_to_check       = remaining_vertices.size();
+  vertex_t nr_remaining_local_vertices_to_check = remaining_vertices.size();
+  edge_dst_property_t<GraphViewType, vertex_t> dst_rank_cache(handle);
 
-    vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
-    if (multi_gpu) {
-      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
-                                                             nr_remaining_vertices_to_check,
-                                                             raft::comms::op_t::SUM,
-                                                             handle.get_stream());
-    }
-    vertex_t nr_candidates = (nr_remaining_vertices_to_check < 1024)
-                               ? nr_remaining_vertices_to_check
-                               : std::min(static_cast<vertex_t>((0.50 + 0.25 * loop_counter) *
-                                                                nr_remaining_vertices_to_check),
-                                          nr_remaining_vertices_to_check);
-
-    // FIXME: Can we improve performance here?
-    // FIXME: if(nr_remaining_vertices_to_check < 1024), may avoid calling select_random_vertices
-    auto d_sampled_vertices =
-      cugraph::select_random_vertices(handle,
-                                      graph_view,
-                                      std::make_optional(raft::device_span<vertex_t const>{
-                                        remaining_vertices.data(), remaining_vertices.size()}),
-                                      rng_state,
-                                      nr_candidates,
-                                      false,
-                                      true);
-
-    rmm::device_uvector<vertex_t> non_candidate_vertices(
-      remaining_vertices.size() - d_sampled_vertices.size(), handle.get_stream());
-
-    thrust::set_difference(handle.get_thrust_policy(),
-                           remaining_vertices.begin(),
-                           remaining_vertices.end(),
-                           d_sampled_vertices.begin(),
-                           d_sampled_vertices.end(),
-                           non_candidate_vertices.begin());
-
-    // Set temporary ranks of non-candidate vertices to std::numeric_limits<vertex_t>::lowest()
-    thrust::for_each(
-      handle.get_thrust_policy(),
-      non_candidate_vertices.begin(),
-      non_candidate_vertices.end(),
-      [temporary_ranks =
-         raft::device_span<vertex_t>(temporary_ranks.data(), temporary_ranks.size()),
-       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
-        //
-        // if rank of a non-candidate vertex is not std::numeric_limits<vertex_t>::max() (i.e. the
-        // vertex is not already in MIS), set it to std::numeric_limits<vertex_t>::lowest()
-        //
-        auto v_offset = v - v_first;
-        if (temporary_ranks[v_offset] < std::numeric_limits<vertex_t>::max()) {
-          temporary_ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
-        }
-      });
+  while (true) {
+    loop_counter++;
 
-    // Caches for ranks
-    edge_src_property_t<GraphViewType, vertex_t> src_rank_cache(handle);
-    edge_dst_property_t<GraphViewType, vertex_t> dst_rank_cache(handle);
+    auto num_processed_vertices = remaining_vertices.size() - nr_remaining_local_vertices_to_check;
 
-    // Update rank caches with temporary ranks
     if constexpr (multi_gpu) {
-      src_rank_cache = edge_src_property_t<GraphViewType, vertex_t>(handle, graph_view);
-      dst_rank_cache = edge_dst_property_t<GraphViewType, vertex_t>(handle, graph_view);
-      update_edge_src_property(
-        handle, graph_view, temporary_ranks.begin(), src_rank_cache.mutable_view());
-      update_edge_dst_property(
-        handle, graph_view, temporary_ranks.begin(), dst_rank_cache.mutable_view());
+      if (loop_counter == 1) {
+        // Update the property of all edge endpoints during the
+        // first iteration
+        dst_rank_cache = edge_dst_property_t<GraphViewType, vertex_t>(handle, graph_view);
+
+        update_edge_dst_property(
+          handle, graph_view, ranks.begin(), dst_rank_cache.mutable_view());
+      } else {
+        // Only update the property of endpoints that had their ranks modified
+        rmm::device_uvector<vertex_t> processed_ranks(num_processed_vertices, handle.get_stream());
+
+        auto pair_idx_processed_vertex_first = thrust::make_zip_iterator(
+          thrust::make_counting_iterator(vertex_t{0}),
+          remaining_vertices.begin() + nr_remaining_local_vertices_to_check);
+
+        thrust::for_each(
+          handle.get_thrust_policy(),
+          pair_idx_processed_vertex_first,
+          pair_idx_processed_vertex_first + num_processed_vertices,
+          [processed_ranks =
+             raft::device_span<vertex_t>(processed_ranks.data(), processed_ranks.size()),
+           ranks   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+           v_first = graph_view.local_vertex_partition_range_first()] __device__(auto pair_idx_v) {
+            auto idx      = thrust::get<0>(pair_idx_v);
+            auto v        = thrust::get<1>(pair_idx_v);
+            auto v_offset = v - v_first;
+
+            processed_ranks[idx] = ranks[v_offset];
+          });
+
+        // Only update a subset of the graph edge dst property values
+
+        // FIXME: Since the values being updated are either std::numeric_limits<vertex_t>::max()
+        // or std::numeric_limits<vertex_t>::lowest(), explore 'fill_edge_dst_property',
+        // which is faster
+        update_edge_dst_property(
+          handle,
+          graph_view,
+          remaining_vertices.begin() + nr_remaining_local_vertices_to_check,
+          remaining_vertices.end(),
+          processed_ranks.begin(),
+          dst_rank_cache.mutable_view());
+      }
     }
 
-    //
-    // Find maximum rank outgoing neighbor for each vertex
-    //
-
-    rmm::device_uvector<vertex_t> max_outgoing_ranks(local_vtx_partitoin_size, handle.get_stream());
-
-    per_v_transform_reduce_outgoing_e(
-      handle,
-      graph_view,
-      multi_gpu ? src_rank_cache.view()
-                : make_edge_src_property_view(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      multi_gpu ?
-        dst_rank_cache.view()
-          : make_edge_dst_property_view(
-              graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      edge_dummy_property_t{}.view(),
-      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
-      std::numeric_limits<vertex_t>::lowest(),
-      cugraph::reduce_op::maximum<vertex_t>{},
-      max_outgoing_ranks.begin());
+    rmm::device_uvector<vertex_t> max_outgoing_ranks(
+      nr_remaining_local_vertices_to_check, handle.get_stream());
 
-    //
-    // Find maximum rank incoming neighbor for each vertex
-    //
-
-    rmm::device_uvector<vertex_t> max_incoming_ranks(local_vtx_partitoin_size, handle.get_stream());
-
-    per_v_transform_reduce_incoming_e(
-      handle,
-      graph_view,
-      multi_gpu ? src_rank_cache.view()
-                : make_edge_src_property_view(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      multi_gpu ? dst_rank_cache.view()
-                : make_edge_dst_property_view(
-                    graph_view, temporary_ranks.begin(), temporary_ranks.size()),
-      edge_dummy_property_t{}.view(),
-      [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return src_rank; },
-      std::numeric_limits<vertex_t>::lowest(),
-      cugraph::reduce_op::maximum<vertex_t>{},
-      max_incoming_ranks.begin());
-
-    temporary_ranks.resize(0, handle.get_stream());
-    temporary_ranks.shrink_to_fit(handle.get_stream());
-
-    //
-    // Compute max of outgoing and incoming neighbors
-    //
-    thrust::transform(handle.get_thrust_policy(),
-                      max_incoming_ranks.begin(),
-                      max_incoming_ranks.end(),
-                      max_outgoing_ranks.begin(),
-                      max_outgoing_ranks.begin(),
-                      thrust::maximum<vertex_t>());
+    remaining_vertices.resize(nr_remaining_local_vertices_to_check,
+                              handle.get_stream());
-    max_incoming_ranks.resize(0, handle.get_stream());
-    max_incoming_ranks.shrink_to_fit(handle.get_stream());
+    remaining_vertices.shrink_to_fit(handle.get_stream());
+
+    vertex_frontier.bucket(0).clear();
+
+    vertex_frontier.bucket(0).insert(remaining_vertices.begin(), remaining_vertices.end());
+
+    if (loop_counter == 1) {
+      // FIXME: The optimization below is not appropriate for the current
+      // implementation since the neighbor with the highest priority
+      // needs to be retrieved to update its rank if possible in the 'rank'
+      // array. When using this primitive, it will stop once the first higher
+      // priority neighbor is found, which may not be the highest. This
+      // will lead to more iterations until the highest priority neighbor
+      // is found, if any (this will increase the overall runtime).
+      per_v_transform_reduce_if_outgoing_e(
+        handle,
+        graph_view,
+        vertex_frontier.bucket(0),
+        edge_src_dummy_property_t{}.view(),
+        multi_gpu ? dst_rank_cache.view()
+                  : make_edge_dst_property_view(
+                      graph_view, ranks.begin(), ranks.size()),
+        edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
+        std::numeric_limits<vertex_t>::lowest(),
+        reduce_op::any<vertex_t>(),
+        // FIXME: use auto parameters and drop the unused src_rank and wt arguments
+        [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return src < dst_rank; },
+        max_outgoing_ranks.begin(),
+        false);  // FIXME: set expensive_check to false
+    } else {
+      per_v_transform_reduce_outgoing_e(
+        handle,
+        graph_view,
+        vertex_frontier.bucket(0),
+        edge_src_dummy_property_t{}.view(),
+
+        multi_gpu ?
+          dst_rank_cache.view()
+            : make_edge_dst_property_view(
+                graph_view, ranks.begin(), ranks.size()),
+
+        edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_rank, auto dst_rank, auto wt) { return dst_rank; },
+        std::numeric_limits<vertex_t>::lowest(),
+        cugraph::reduce_op::maximum<vertex_t>{},
+        max_outgoing_ranks.begin(),
+        false);
+    }
+
+    auto pair_idx_vertex_first = thrust::make_zip_iterator(  // FIXME: rename this.
+      thrust::make_counting_iterator(vertex_t{0}),
+      remaining_vertices.begin());
+
     //
     // If the max neighbor of a vertex is already in MIS (i.e. has rank
     // std::numeric_limits<vertex_t>::max()), discard it, otherwise,
     // include the vertex if it has larger rank than its maximum rank neighbor
     //
-    auto last = thrust::remove_if(
+
+    // Use thrust::stable_partition to keep track of vertices that only need to have
+    // their property updated
+    auto last = thrust::stable_partition(
       handle.get_thrust_policy(),
-      d_sampled_vertices.begin(),
-      d_sampled_vertices.end(),
+      pair_idx_vertex_first,  // FIXME: Same here, no need to pass an iterator, just follow the pseudo code
+      pair_idx_vertex_first + remaining_vertices.size(),
       [max_rank_neighbor_first = max_outgoing_ranks.begin(),
-       ranks   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
-       v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+       ranks   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
+       v_first =
+         graph_view.local_vertex_partition_range_first()] __device__(auto pair_vidx_v_priority) {
+        auto vidx     = thrust::get<0>(pair_vidx_v_priority);
+        auto v        = thrust::get<1>(pair_vidx_v_priority);
         auto v_offset = v - v_first;
-        auto max_neighbor_rank = *(max_rank_neighbor_first + v_offset);
+        auto max_neighbor_rank = *(max_rank_neighbor_first + vidx);
         auto rank_of_v = ranks[v_offset];
 
         if (max_neighbor_rank >= std::numeric_limits<vertex_t>::max()) {
-          // Maximum rank neighbor is alreay in MIS
-          // Discard current vertex by setting its rank to
-          // std::numeric_limits<vertex_t>::lowest()
           ranks[v_offset] = std::numeric_limits<vertex_t>::lowest();
-          return true;
+          return false;
         }
 
         if (rank_of_v >= max_neighbor_rank) {
-          // Include v and set its rank to std::numeric_limits<vertex_t>::max()
           ranks[v_offset] = std::numeric_limits<vertex_t>::max();
-          return true;
+          return false;
         }
-        return false;
+        return true;
       });
 
+    nr_remaining_local_vertices_to_check = cuda::std::distance(pair_idx_vertex_first, last);
+
     max_outgoing_ranks.resize(0, handle.get_stream());
     max_outgoing_ranks.shrink_to_fit(handle.get_stream());
 
-    d_sampled_vertices.resize(cuda::std::distance(d_sampled_vertices.begin(), last),
-                              handle.get_stream());
-    d_sampled_vertices.shrink_to_fit(handle.get_stream());
-
-    remaining_vertices.resize(non_candidate_vertices.size() + d_sampled_vertices.size(),
-                              handle.get_stream());
-    remaining_vertices.shrink_to_fit(handle.get_stream());
-
-    // merge non-candidate and remaining candidate vertices
-    thrust::merge(handle.get_thrust_policy(),
-                  non_candidate_vertices.begin(),
-                  non_candidate_vertices.end(),
-                  d_sampled_vertices.begin(),
-                  d_sampled_vertices.end(),
-                  remaining_vertices.begin());
-
-    nr_remaining_vertices_to_check = remaining_vertices.size();
     if (multi_gpu) {
+      // FIXME: rename 'nr_remaining_vertices_to_check' to
+      // 'nr_remaining_global_vertices_to_check'
       nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
-                                                             nr_remaining_vertices_to_check,
+                                                             nr_remaining_local_vertices_to_check,
                                                              raft::comms::op_t::SUM,
                                                              handle.get_stream());
+    } else {
+      nr_remaining_vertices_to_check = nr_remaining_local_vertices_to_check;
     }
 
     if (nr_remaining_vertices_to_check == 0) { break; }
-  }
-
-  // Count number of vertices included in MIS
+  }
+
   vertex_t nr_vertices_included_in_mis = thrust::count_if(
     handle.get_thrust_policy(), ranks.begin(), ranks.end(), [] __device__(auto v_rank) {
       return v_rank >= std::numeric_limits<vertex_t>::max();
     });
 
   // Build MIS and return
+
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto start_build_mis = high_resolution_clock::now();
+
   rmm::device_uvector<vertex_t> mis(nr_vertices_included_in_mis, handle.get_stream());
   thrust::copy_if(
     handle.get_thrust_policy(),
@@ -319,6 +299,7 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   ranks.resize(0, handle.get_stream());
   ranks.shrink_to_fit(handle.get_stream());
+
   return mis;
 }
 }  // namespace detail
diff --git a/cpp/tests/components/mis_test.cu b/cpp/tests/components/mis_test.cu
index 0edbccfc356..8390dd30678 100644
--- a/cpp/tests/components/mis_test.cu
+++ b/cpp/tests/components/mis_test.cu
@@ -15,6 +15,7 @@
  */
 
 #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh"
+#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh"
 #include "prims/reduce_op.cuh"
 #include "utilities/base_fixture.hpp"
 #include "utilities/test_graphs.hpp"
@@ -25,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -51,9 +53,10 @@ class Tests_SGMaximalIndependentSet
   virtual void SetUp() {}
   virtual void TearDown() {}
 
-  template
+  template
   void run_current_test(std::tuple const& param)
   {
+
     auto [mis_usecase, input_usecase] = param;
 
     raft::handle_t handle{};
@@ -66,9 +69,14 @@ class Tests_SGMaximalIndependentSet
 
     constexpr bool multi_gpu = false;
 
+    bool test_weighted    = false;
+    bool renumber         = true;
+    bool drop_self_loops  = true;
+    bool drop_multi_edges = true;
+
     auto [sg_graph, sg_edge_weights, sg_renumber_map] = cugraph::test::construct_graph(
-      handle, input_usecase, false, true);
+      handle, input_usecase, test_weighted, renumber, drop_self_loops, drop_multi_edges);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
@@ -76,11 +84,15 @@ class Tests_SGMaximalIndependentSet
       hr_timer.stop();
       hr_timer.display_and_clear(std::cout);
     }
-
+
     auto sg_graph_view       = sg_graph.view();
     auto sg_edge_weight_view = sg_edge_weights ?
      std::make_optional((*sg_edge_weights).view()) : std::nullopt;
 
+    auto edge_partition = sg_graph_view.local_edge_partition_view(0);
+
+    auto number_of_local_edges = edge_partition.number_of_edges();
+
     raft::random::RngState rng_state(0);
     auto d_mis = cugraph::maximal_independent_set(
       handle, sg_graph_view, rng_state);
@@ -103,7 +115,7 @@ class Tests_SGMaximalIndependentSet
     // If a vertex is included in MIS, then none of its neighbor should be
 
     vertex_t local_vtx_partitoin_size = sg_graph_view.local_vertex_partition_range_size();
-    rmm::device_uvector<vertex_t> d_total_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
+    rmm::device_uvector<vertex_t> d_any_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
                                                                      handle.get_stream());
 
     rmm::device_uvector<vertex_t> inclusiong_flags(local_vtx_partitoin_size, handle.get_stream());
@@ -124,47 +136,93 @@ class Tests_SGMaximalIndependentSet
 
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
-    per_v_transform_reduce_outgoing_e(
+    per_v_transform_reduce_if_outgoing_e(
       handle,
       sg_graph_view,
       cugraph::make_edge_src_property_view(
-        sg_graph_view, inclusiong_flags.data(), 1),
+        sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
       cugraph::make_edge_dst_property_view(
-        sg_graph_view, inclusiong_flags.data(), 1),
+        sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
       cugraph::edge_dummy_property_t{}.view(),
-      [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
-        return (src == dst) ? 0 : dst_included;
-      },
+      [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
       vertex_t{0},
-      cugraph::reduce_op::plus<vertex_t>{},
-      d_total_outgoing_nbrs_included_mis.begin());
-
-    RAFT_CUDA_TRY(cudaDeviceSynchronize());
-
-    std::vector<vertex_t> h_total_outgoing_nbrs_included_mis(
-      d_total_outgoing_nbrs_included_mis.size());
-    raft::update_host(h_total_outgoing_nbrs_included_mis.data(),
-                      d_total_outgoing_nbrs_included_mis.data(),
-                      d_total_outgoing_nbrs_included_mis.size(),
-                      handle.get_stream());
+      cugraph::reduce_op::any<vertex_t>(),
+      // FIXME: use auto parameters and drop the unused arguments
+      [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+        // both src and dst are in the MIS (an invalid pair of adjacent MIS vertices)
+        return (src_included == dst_included) && (src_included == 1);
+      },
+      d_any_outgoing_nbrs_included_mis.begin(),
+      false);
+
+    auto num_invalid_vertices_in_mis = thrust::reduce(
+      handle.get_thrust_policy(),
+      d_any_outgoing_nbrs_included_mis.begin(),
+      d_any_outgoing_nbrs_included_mis.end());
 
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
 
-    {
-      auto vertex_first = sg_graph_view.local_vertex_partition_range_first();
-      auto vertex_last  = sg_graph_view.local_vertex_partition_range_last();
+    ASSERT_TRUE(num_invalid_vertices_in_mis == 0);
+
+    auto vertex_begin =
+      thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first());
+
+    auto vertex_end =
+      thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last());
+
+    rmm::device_uvector<vertex_t> vertices(local_vtx_partitoin_size, handle.get_stream());
+
+    thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, vertices.begin());
+
+    rmm::device_uvector<vertex_t> non_candidate_vertices(
+      vertices.size() - d_mis.size(), handle.get_stream());
+
+    thrust::set_difference(handle.get_thrust_policy(),
+                           vertices.begin(),
+                           vertices.end(),
+                           d_mis.begin(),
+                           d_mis.end(),
+                           non_candidate_vertices.begin());
+
+    cugraph::vertex_frontier_t vertex_frontier(handle, 1);
+
+    vertex_frontier.bucket(0).insert(non_candidate_vertices.begin(), non_candidate_vertices.end());
+
+    d_any_outgoing_nbrs_included_mis.resize(non_candidate_vertices.size(), handle.get_stream());
 
-      std::for_each(h_mis.begin(),
-                    h_mis.end(),
-                    [vertex_first, vertex_last, &h_total_outgoing_nbrs_included_mis](vertex_t v) {
-                      ASSERT_TRUE((v >= vertex_first) && (v < vertex_last))
-                        << v << " is not within vertex parition range" << std::endl;
+    per_v_transform_reduce_if_outgoing_e(
+      handle,
+      sg_graph_view,
+      vertex_frontier.bucket(0),
+      cugraph::make_edge_src_property_view(
+        sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+      cugraph::make_edge_dst_property_view(
+        sg_graph_view, inclusiong_flags.data(), inclusiong_flags.size()),
+      cugraph::edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) { return vertex_t{1}; },
+      vertex_t{0},
+      cugraph::reduce_op::any<vertex_t>(),
+      // FIXME: use auto parameters and drop the unused arguments
+      [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+        // this neighbor (dst) of the non-candidate vertex is in the MIS
+        return dst_included == 1;
+      },
+      d_any_outgoing_nbrs_included_mis.begin(),
+      false);
 
-                      ASSERT_TRUE(h_total_outgoing_nbrs_included_mis[v - vertex_first] == 0)
-                        << v << "'s neighbor is included in MIS" << std::endl;
-                    });
-    }
+    auto num_invalid_non_candidate_vertices_out_mis = thrust::reduce(
+      handle.get_thrust_policy(),
+      d_any_outgoing_nbrs_included_mis.begin(),
+      d_any_outgoing_nbrs_included_mis.end());
+
+    // FIXME: Add an error message
+    ASSERT_TRUE(
+      num_invalid_non_candidate_vertices_out_mis == d_any_outgoing_nbrs_included_mis.size());
   }
+
 }
 };
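
For reference, the per-iteration selection rule that the new loop in mis_impl.cuh applies can be
illustrated with a small single-threaded sketch over a plain adjacency list. This is only an
illustration of the rank-based rule, not the cugraph API; the names adj, ranks, remaining,
included, and discarded are made up for this sketch.

// mis_sketch.cpp -- single-threaded illustration of the rank-based MIS rule
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

int main()
{
  using vertex_t = std::int32_t;
  vertex_t constexpr included  = std::numeric_limits<vertex_t>::max();
  vertex_t constexpr discarded = std::numeric_limits<vertex_t>::lowest();

  // Undirected toy graph: 0-1, 1-2, 2-3
  std::vector<std::vector<vertex_t>> adj{{1}, {0, 2}, {1, 3}, {2}};
  std::vector<vertex_t> ranks{0, 1, 2, 3};      // as in the first iteration, rank of v == v
  std::vector<vertex_t> remaining{0, 1, 2, 3};  // vertices that are still undecided

  while (!remaining.empty()) {
    std::vector<vertex_t> still_remaining;
    for (auto v : remaining) {
      // max rank over v's neighbors (what per_v_transform_reduce_outgoing_e computes on the GPU)
      vertex_t max_nbr_rank = discarded;
      for (auto u : adj[v]) { max_nbr_rank = std::max(max_nbr_rank, ranks[u]); }

      if (max_nbr_rank == included) {
        ranks[v] = discarded;          // a neighbor already joined the MIS -> discard v
      } else if (ranks[v] >= max_nbr_rank) {
        ranks[v] = included;           // v outranks all undecided neighbors -> include v
      } else {
        still_remaining.push_back(v);  // v waits for its higher-ranked neighbors to decide
      }
    }
    remaining.swap(still_remaining);
  }
  // Vertices with ranks[v] == included form the maximal independent set (here {1, 3}).
  return 0;
}

Relative to the removed implementation, the patch drops the random candidate sampling, keeps every
undecided vertex in play each round, and in the multi-GPU path refreshes only the edge-destination
rank cache entries of vertices whose ranks changed, instead of re-pushing the whole temporary rank
array every iteration.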