@@ -528,6 +528,110 @@ void MPMesh::assembleField(int vtxPerElm, int nCells, int nVerticesSolve, int nV
   kkDblViewHostU arrayHost(array_full, nVertices);
   Kokkos::deep_copy(arrayHost, array_full_d);
   pumipic::RecordTime("polyMPOgetAssemblyField", timer2.seconds());
+
+  startCommunication(nVerticesSolve);
+}
+
+// Start communication routine
+void MPMesh::startCommunication(int nVerticesSolve) {
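+  // Overview: count the off-rank (owned by another process) cells per owner,
+  // exchange those counts with MPI_Alltoall, pack per-vertex metadata for each
+  // off-rank cell into per-destination buffers, and post non-blocking
+  // sends/receives of those buffers.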
+
+  MPI_Comm comm = p_MPs->getMPIComm();
+  int comm_rank, nProcs;
+  MPI_Comm_rank(comm, &comm_rank);
+  MPI_Comm_size(comm, &nProcs);
+
+  int nCells = p_mesh->getNumElements();
+  int numVertices = p_mesh->getNumVertices();
+
+  // Owning process of each element on the GPU; copy to the host
+  auto elmOwners = p_mesh->getElm2Process();
+  auto elmOwners_host = Kokkos::create_mirror_view_and_copy(
+      Kokkos::DefaultHostExecutionSpace::memory_space(), elmOwners);
+
+  auto vtxGlobal = p_mesh->getVtxGlobal();
+  auto vtxGlobal_host = Kokkos::create_mirror_view_and_copy(
+      Kokkos::DefaultHostExecutionSpace::memory_space(), vtxGlobal);
+
+  // Element-to-vertex connectivity on the GPU; copy to the host
+  auto elm2VtxConn = p_mesh->getElm2VtxConn();
+  auto elm2VtxConn_host = Kokkos::create_mirror_view_and_copy(
+      Kokkos::DefaultHostExecutionSpace::memory_space(), elm2VtxConn);
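+  // The packing below is done on the host with std::map/std::vector, so host
+  // mirrors of the owner, global-ID, and connectivity views are needed.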
+  // Debugging: print the owner of the first and last element
+  for (size_t k = 0; k < elmOwners_host.size(); k++)
+    if (k == 0 || k == elmOwners_host.size() - 1)
+      printf("Rank %d Owning Proc %d\n", comm_rank, elmOwners_host(k));
+
+  // Find adjacent processes and the number of off-rank cells owned by each
+  std::map<int, int> adjProcsForCells;
+  std::vector<int> send_each_proc(nProcs, 0);
+  for (int iCell = 0; iCell < nCells; iCell++) {
+    if (elmOwners_host(iCell) != comm_rank) {
+      auto ownerProc = elmOwners_host(iCell);
+      adjProcsForCells[ownerProc]++;
+      send_each_proc[ownerProc]++;
+    }
+  }
+  // Debugging
+  printf("Size adjProcs %zu\n", adjProcsForCells.size());
+  for (const auto& [key, value] : adjProcsForCells) {
+    printf("Process %d sends to %d size %d\n", comm_rank, key, value);
+  }
+
+  // Find how many off-rank cells this process receives from each other process
+  std::vector<int> total_recv_each_proc(nProcs, 0);
+  MPI_Alltoall(send_each_proc.data(), 1, MPI_INT, total_recv_each_proc.data(), 1, MPI_INT, comm);
+  // Debugging for receiving
+  for (int i = 0; i < nProcs; i++)
+    printf("Rank %d receiving from %d size %d\n", comm_rank, i, total_recv_each_proc[i]);
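+  // After the all-to-all, total_recv_each_proc[p] is the number of off-rank
+  // cells that rank p will send to this rank (counted in cells, not ints).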
+
+  // Create per-destination send buffers
+  std::map<int, std::vector<int>> cellDataToSend;
+  std::map<int, int> counter;
+  for (const auto& [key, value] : adjProcsForCells) {
+    cellDataToSend[key].resize(4 * value * maxVtxsPerElm);
+    counter[key] = 0;
+  }
+
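+  // Packed layout per vertex slot (4 ints): [0] 0 if the vertex is locally
+  // owned (vID < nVerticesSolve), else 1; [1] sending rank; [2] local vertex
+  // ID; [3] global vertex ID. Each cell occupies 4*maxVtxsPerElm ints.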
+  for (int iCell = 0; iCell < nCells; iCell++) {
+    if (elmOwners_host(iCell) != comm_rank) {
+      int ownerProc = elmOwners_host(iCell);
+      auto idx_start = counter[ownerProc] * 4 * maxVtxsPerElm;
+      int nVtxE = elm2VtxConn_host(iCell, 0);
+      for (int v = 0; v < nVtxE; v++) {
+        int vID = elm2VtxConn_host(iCell, v + 1) - 1;
+        int idx = idx_start + v * 4;
+        if (vID < nVerticesSolve)
+          cellDataToSend[ownerProc][idx + 0] = 0; // TODO: better way
+        else
+          cellDataToSend[ownerProc][idx + 0] = 1;
+        cellDataToSend[ownerProc][idx + 1] = comm_rank; // sending proc; TODO: not needed
+        cellDataToSend[ownerProc][idx + 2] = vID; // local ID
+        cellDataToSend[ownerProc][idx + 3] = vtxGlobal_host(vID); // global ID
+      }
+      counter[ownerProc]++;
+      // assert(counter[ownerProc] <= adjProcsForCells[ownerProc]);
+    }
+  }
+
+  // Post non-blocking sends of the packed cell data.
+  // Use a fixed tag: MPI_ANY_TAG is only valid for receives, not sends.
+  std::vector<MPI_Request> s_requests(cellDataToSend.size());
+  int count_s_request = 0;
+  for (auto& [proc, vec] : cellDataToSend) {
+    MPI_Isend(vec.data(), vec.size(), MPI_INT, proc, 0, comm, &s_requests[count_s_request]);
+    count_s_request++;
+  }
+
+  // Allocate receive buffers: the exchanged counts are in cells, and each cell
+  // carries 4*maxVtxsPerElm ints
+  std::vector<std::vector<int>> cellDataToReceive(nProcs);
+  for (int iProc = 0; iProc < nProcs; iProc++)
+    cellDataToReceive[iProc].resize(4 * maxVtxsPerElm * total_recv_each_proc[iProc]);
+
+  // Post non-blocking receives only from processes that actually send here;
+  // the remaining requests stay MPI_REQUEST_NULL
+  std::vector<MPI_Request> r_requests(nProcs, MPI_REQUEST_NULL);
+  for (int iProc = 0; iProc < nProcs; iProc++)
+    if (total_recv_each_proc[iProc] > 0)
+      MPI_Irecv(cellDataToReceive[iProc].data(), 4 * maxVtxsPerElm * total_recv_each_proc[iProc],
+                MPI_INT, iProc, 0, comm, &r_requests[iProc]);
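+  // Note (not added in this commit): the outstanding requests must be completed
+  // before cellDataToSend/cellDataToReceive are read or go out of scope, e.g.
+  //   MPI_Waitall(count_s_request, s_requests.data(), MPI_STATUSES_IGNORE);
+  //   MPI_Waitall(nProcs, r_requests.data(), MPI_STATUSES_IGNORE);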
+
 }
 
 template <MeshFieldIndex meshFieldIndex>