Commit 8881e16

Merge pull request #140 from DrTimothyAldenDavis/master
Master
2 parents 4bb44bb + 3f7827f commit 8881e16

31 files changed: 2892 additions, 169 deletions

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -26,10 +26,10 @@ endif ( )
 set ( CMAKE_MACOSX_RPATH TRUE )

 # version of SuiteSparse:GraphBLAS
-set ( GraphBLAS_DATE "Apr 8, 2022" )
+set ( GraphBLAS_DATE "Apr 25, 2022" )
 set ( GraphBLAS_VERSION_MAJOR 7 )
 set ( GraphBLAS_VERSION_MINOR 0 )
-set ( GraphBLAS_VERSION_SUB 3 )
+set ( GraphBLAS_VERSION_SUB 4 )

 message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

CUDA/GB_AxB_dot3_cuda.cpp

Lines changed: 17 additions & 18 deletions
@@ -206,7 +206,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
     phase2endlaunchFactory p2elf;

-    // # of threads in phase1 and phase2 kernel launches must be the same
+    // # of threads in phase1 and phase2 kernel launches are related
+    // # by the size of the warp. ph2_task = ph1_task/32 for example
     int nthrd = p2lf.get_threads_per_block();
     int ntasks = p2elf.get_number_of_blocks(M);

@@ -267,21 +268,22 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

     GBURBLE ("(GPU phase1 done) ") ;

-    print_array<int64_t>(Nanobuckets, nanobuckets_size, "Nanobuckets");
-    print_array<int64_t>(Blockbucket, blockbuckets_size , "Blockbucket");
+    //print_array<int64_t>(Nanobuckets, nanobuckets_size, "Nanobuckets");
+    printf(" using %ld blockbuckets \n", blockbuckets_size);
+    //print_array<int64_t>(Blockbucket, blockbuckets_size , "Blockbucket");

     //----------------------------------------------------------------------
     // phase2: cumsum across the blockbuckets, propagate to thread level
     //----------------------------------------------------------------------

     GBURBLE ("(GPU phase1 start) ") ;

-    p2lf.jitGridBlockLaunch(Blockbucket, offset, M);
+    p2lf.jitGridBlockLaunch(Blockbucket, offset, M );

-    int64_t s= 0;
-    for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
+    int64_t s= offset[0];
+    for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
     {
-      Bucketp[bucket] = s;
+        Bucketp[bucket] = s;
         s+= offset[bucket];
         printf("bucketp[%d] = %ld, offset=%ld\n", bucket, Bucketp[bucket], offset[bucket]);
     }

@@ -295,38 +297,35 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

     GBURBLE ("(GPU phase2end done) ") ;

-    print_array<int64_t>(Bucket, mnz , "Bucket");
-    print_array<int64_t>(M->i, mnz , "M->i");
-    print_array<int64_t>(C->i, mnz , "C->i");
+    //print_array<int64_t>(Bucket, mnz , "Bucket");
+    //print_array<int64_t>(M->i, mnz , "M->i");
+    //print_array<int64_t>(C->i, mnz , "C->i");

     //----------------------------------------------------------------------
     // phase3: do the numerical work
     //----------------------------------------------------------------------

     print_array<int64_t>(Bucketp, NBUCKETS + 1 , "Bucketp");
-    C->nzombies = Bucketp[1]; //set pre-zombie counts
-    printf("pre-kernel C->nzombies=%ld\n", C->nzombies);
+    printf("pre-phase3 kernel C->nzombies=%ld\n", C->nzombies);

     for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
     {
         int64_t start = Bucketp[bucket];
-        int64_t end = Bucketp[bucket+1];
-
+        int64_t end = Bucketp[bucket + 1 ];

         if(end - start > 0) {
             printf("Executing bucket: %d with %ld edges\n", bucket, end-start);
             // TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?)
             phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket);
-            p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
+            p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
         } else {
             printf("Skipping bucket %d, no work to do\n", bucket);
         }

         GBURBLE ("(GPU phase3 done ) ") ;
     }
-    C->nzombies += Bucketp[1];
-    printf("C->p[0]=%ld\n", C->p[0]);
-    printf("C->p[1]=%ld\n", C->p[1]);
+    //printf("C->p[0]=%ld\n", C->p[0]);
+    //printf("C->p[1]=%ld\n", C->p[1]);
     printf("C->nzombies=%ld\n", C->nzombies);

     GB_FREE_WORKSPACE ;
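
The cumsum above changed in a subtle way: the running sum s is now seeded with offset[0] and the loop starts at bucket 1, so bucket 0 (the zombie dot products, which need no phase3 kernel) is folded into the base offset instead of being emitted as launchable work, and C->nzombies is no longer set or incremented from Bucketp[1] here. A minimal standalone sketch of this prefix sum; NBUCKETS and the per-bucket counts are illustrative stand-ins, not values taken from the library:

    #include <cstdint>
    #include <cstdio>

    // Hedged sketch of the phase2 host-side cumsum shown in the diff above.
    // offset[b] counts the dot products that landed in bucket b; Bucketp[b]
    // becomes the start of bucket b inside the shared Bucket array.
    int main (void)
    {
        const int NBUCKETS = 4 ;                        // illustrative value
        int64_t offset  [NBUCKETS+1] = {3, 5, 0, 2, 4} ;
        int64_t Bucketp [NBUCKETS+1] ;
        int64_t s = offset [0] ;    // bucket 0 (zombies) shifts all the rest
        for (int bucket = 1 ; bucket < NBUCKETS+1 ; ++bucket)
        {
            Bucketp [bucket] = s ;
            s += offset [bucket] ;
        }
        // bucket b then owns Bucket [Bucketp [b] .. Bucketp [b+1] - 1] ;
        // this example prints 3, 8, 8 for buckets 1, 2, 3
        for (int b = 1 ; b < NBUCKETS ; ++b)
        {
            printf ("bucketp[%d] = %ld\n", b, (long) Bucketp [b]) ;
        }
        return (0) ;
    }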

CUDA/GB_cuda_semiring_factory.hpp

Lines changed: 7 additions & 1 deletion
@@ -22,12 +22,15 @@ extern "C"
 //std::istream* (*file_callback)(std::string, std::iostream&);

 // Define a factory class for building any semiring text definitions
+
+// FIXME: Rename to GrB_MxM_problem_spec and delegate problem generation to data factory
 class GB_cuda_semiring_factory: public jit::File_Desc {

   public:
-
+    uint32_t mask_ecode;
     uint64_t sr_code;
     bool mask_struct;
+    bool mask_comp;

     // file ptr
     FILE *fp;

@@ -55,6 +58,8 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
     // input:
     GrB_Semiring semiring,      // the semiring to enumify
     bool flipxy,                // multiplier is: mult(a,b) or mult(b,a)
+
+    // FIXME: Just use GrB_Matrix here
     GrB_Type ctype,             // the type of C
     GrB_Type mtype,             // the type of M, or NULL if no mask
     GrB_Type atype,             // the type of A

@@ -109,6 +114,7 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
 // operators, datatypes, sparsity formats and produces a character buffer.
 //------------------------------------------------------------------------------

+// FIXME: Also need mask code macrofication
 void macrofy ( ) override
 {
     std::cout<<" calling macrofy semiring. sr_code="<< this->sr_code << std::endl;
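
The new members record how the mask M is applied, alongside the existing mask_struct flag: in GraphBLAS terms, mask_struct means only the pattern of M matters, mask_comp means the mask is complemented, and mask_ecode presumably enumerates the mask case for the generated kernel. A hedged sketch of the predicate the two boolean flags encode; the function and parameter names are illustrative, not the library's actual macros:

    // Hedged sketch: the entry-level mask test that mask_struct / mask_comp
    // select between (standard GraphBLAS semantics, illustrative names).
    static inline bool mask_allows_entry
    (
        bool M_present,     // is M(i,j) present in the pattern of M?
        bool M_value,       // M(i,j) cast to boolean, if present
        bool mask_struct,   // use only the pattern of M, ignoring values
        bool mask_comp      // complement the mask
    )
    {
        bool m = mask_struct ? M_present : (M_present && M_value) ;
        return (mask_comp ? !m : m) ;
    }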

CUDA/GB_reduce_to_scalar_cuda.cpp

Lines changed: 0 additions & 2 deletions
@@ -37,8 +37,6 @@ GrB_Info GB_reduce_to_scalar_cuda

     GB_cuda_reduce( A, s, reduce);

-    printf("num_triangles = %d\n", s[0] );
-
     return GrB_SUCCESS ;
 }

CUDA/jitFactory.hpp

Lines changed: 27 additions & 12 deletions
@@ -142,7 +142,7 @@ class phase1launchFactory
     std::cout << "B TYpe: " << B->type << std::endl;
     // // (1) create the semiring code and name

-    // // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
+    // // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
     jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
     filecache.getFile (semiring_factory_) ;

@@ -162,6 +162,11 @@
     dim3 grid(get_number_of_blocks(M));
     dim3 block(get_threads_per_block());

+    // for (auto s:compiler_flags)
+    // {
+    //     std::cout << "Compiler Flags: " << s << std::endl ;
+    // }
+
     jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
                    string_to_be_jitted.str(),
                    header_names,

@@ -199,6 +204,13 @@
         return (ntasks + threads_per_block - 1) / threads_per_block ;
     }

+    int get_number_of_phase1_blocks( GrB_Matrix M){
+        const int64_t mnz = GB_nnz (M) ;
+        int number_of_sms = GB_Global_gpu_sm_get (0);
+        int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size;
+        return GB_IMIN( nblks, 128 * number_of_sms);
+    }
+
     bool jitGridBlockLaunch(// parameters to AxB_phase2:
                             int64_t *blockBucket, int64_t *offset, GrB_Matrix M) {

@@ -224,7 +236,7 @@
         .set_kernel_inst( kernel_name, {})
         .configure(grid, block)
         // parameters to AxB_phase2:
-        .launch( blockBucket, offset, get_number_of_blocks(M));
+        .launch( blockBucket, offset, get_number_of_phase1_blocks(M));

     checkCudaErrors( cudaDeviceSynchronize() );
     result= true;

@@ -319,9 +331,9 @@
     //----------------------------------------------------------------------
     // phase3: do the numerical work
     //----------------------------------------------------------------------
+
     C->jumbled = true;
-    C->nzombies = bucketp[1]; //set pre-zombie counts
-    const int64_t Cnz = GB_nnz (C) ;
+    const int64_t nz = end - start; // number of dots in this bucket
     const int64_t mnvec = M->nvec ;

     int gridsz, blocksz, sz = 4;

@@ -332,10 +344,13 @@
     /**
      * Configure geometry and kernel function name based on sparsity of C and number of vectors in M
      */
-    configure(Cnz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz);
+    configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz);
+
+    auto sr_code = std::to_string(semiring_factory_.sr_code);

     std::string hashable_name = base_name + "_" + final_kernel_name_ss.str();
     std::stringstream string_to_be_jitted ;
+    std::vector<std::string> template_types = {C->type->name, A->type->name, B->type->name};

     jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
     filecache.getFile (semiring_factory_) ;

@@ -347,17 +362,16 @@
     dim3 grid(gridsz);
     dim3 block(blocksz);

-    C->nzombies = 0;
     GBURBLE ("(GPU phase3 launch st,end=%ld,%ld nblocks,blocksize= %d,%d )\n",start,end,gridsz,blocksz) ;
-    jit::launcher( hashable_name,
+    jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
                    string_to_be_jitted.str(),
                    header_names,
                    compiler_flags,
                    file_callback)
-       .set_kernel_inst(final_kernel_name_ss.str(),
-                        { C->type->name,
-                          A->type->name,
-                          B->type->name })
+       .set_kernel_inst(final_kernel_name_ss.str(), template_types )
+                        // { C->type->name,
+                        //   A->type->name,
+                        //   B->type->name })
        .configure(grid, block) //if commented, use implicit 1D configure in launch
        .launch(
                 start, // input/output:

@@ -386,6 +400,7 @@
     int number_of_sms = GB_Global_gpu_sm_get (0) ;

     std::string Opname;
+    // TODO: make sure this works with different geometry

     printf("LAUNCHING BUCKET CODE: %d\n", (int)bucket_code_);
     switch (bucket_code_)

@@ -706,4 +721,4 @@ inline bool GB_cuda_reduce(GrB_Matrix A, void *output, GrB_Monoid op) {
 //
 //
 #endif  // C++11
-#endif
+#endif
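
Two changes above are worth spelling out. First, phase2's launch now passes the phase1 grid size, recomputed by the new get_number_of_phase1_blocks, evidently because the Blockbucket counts it reduces were written by phase1's grid rather than phase2's. Second, the phase3 launcher key now appends the mask type name and sr_code, so differently typed or differently masked instantiations no longer collide in the JIT cache. A standalone sketch of the grid computation, with illustrative numbers:

    #include <cstdint>
    #include <cstdio>
    #include <algorithm>

    // Hedged sketch of get_number_of_phase1_blocks from the diff above:
    // one block per chunk_size mask entries, capped at 128 blocks per SM.
    // chunk_size and the SM count below are illustrative stand-ins.
    static int phase1_blocks (int64_t mnz, int64_t chunk_size, int number_of_sms)
    {
        int nblks = (int) ((mnz + chunk_size - 1) / chunk_size) ; // ceil
        return std::min (nblks, 128 * number_of_sms) ;            // GB_IMIN
    }

    int main (void)
    {
        // e.g. 1e6 mask entries, 128-entry chunks, 80 SMs:
        // ceil(1e6/128) = 7813 <= 128*80 = 10240, so 7813 blocks
        printf ("%d blocks\n", phase1_blocks (1000000, 128, 80)) ;
        return (0) ;
    }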

CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh

Lines changed: 17 additions & 14 deletions
@@ -72,7 +72,7 @@ T block_ReduceSum(thread_block g, T val, T Ident)

     //tile.sync(); // Wait for all partial reductions

-    if (wid > 0 || gridDim.x == 1 ) return val;
+    if (wid > 0 ) return val;

     //read from shared memory only if that warp existed
     val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Ident ;

@@ -88,7 +88,7 @@ __global__ void AxB_dot3_phase3_dndn
 (
     int64_t start,
     int64_t end,
-    int64_t *Bucket,
+    int64_t *Bucket,            // do the work in Bucket [start:end-1]
     GrB_Matrix C,
     GrB_Matrix M,
     GrB_Matrix A,

@@ -108,6 +108,7 @@ __global__ void AxB_dot3_phase3_dndn

     // zombie count
     int zc = 0;
+    // dot pair and index in bucket
     int64_t pair_id;

     // total items to be inspected

@@ -116,26 +117,27 @@ __global__ void AxB_dot3_phase3_dndn
     int s = blockDim.x;

     // Main loop over pairs
-    for (pair_id = start + blockIdx.x; //warp per pair
-         pair_id < end;
-         pair_id += gridDim.x ){
+    for ( int64_t kk = start + blockIdx.x; //warp per pair
+          kk < end;
+          kk += gridDim.x ){

+        pair_id = Bucket [ kk ];
         int64_t i = Mi[pair_id];
         int64_t j = Ci[pair_id] >> 4;

-        int64_t pA = Ap[i];
-        int64_t xend = Ap[i+1];
+        int64_t pA   = Ap[i];
+        int64_t xend = Ap[i+1];
         nnzA = xend - pA;

-        int64_t pB = Bp[j];
-        int64_t yend = Bp[j+1];
+        int64_t pB   = Bp[j];
+        int64_t yend = Bp[j+1];
         nnzB = yend - pB;

-        if (threadIdx.x == 0 ){
-            printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n",
-                   threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB);
-        }
-        __syncthreads();
+        if (threadIdx.x == 0 ){
+            printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n",
+                    threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB);
+        }
+        __syncthreads();


     // convert global data pointer to the local pointer of this block

@@ -170,6 +172,7 @@ __global__ void AxB_dot3_phase3_dndn
             GB_PUTC( Ci[pair_id]=i ) ;
         }
         //__syncthreads ( ) ;
+        // FIXME: add atomics to sum up block zombies to C->nzombies
     }

 }
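
Two things changed in this kernel. block_ReduceSum no longer returns early when gridDim.x == 1, so warp 0 always completes the cross-warp step; and the main loop now strides over positions kk in the Bucket array, loading pair_id = Bucket[kk], instead of treating pair_id itself as the loop index. A self-contained sketch of the two-level shuffle reduction pattern that block_ReduceSum follows; this is plain CUDA without the library's cooperative-groups wrapper, and the names and int payload are illustrative:

    // Hedged sketch: warps reduce privately with shuffles, lane 0 of each
    // warp publishes its partial to shared memory, then warp 0 reduces the
    // partials. The fix above makes warp 0 take that final step in every
    // launch, including single-block (gridDim.x == 1) launches.
    __inline__ __device__ int warp_sum (int val)
    {
        for (int offset = 16 ; offset > 0 ; offset >>= 1)
        {
            val += __shfl_down_sync (0xffffffff, val, offset) ;
        }
        return (val) ;                  // lane 0 holds the warp total
    }

    __inline__ __device__ int block_sum (int val)
    {
        __shared__ int shared [32] ;    // one partial per warp
        int lane = threadIdx.x & 31 ;
        int wid  = threadIdx.x >> 5 ;
        val = warp_sum (val) ;
        if (lane == 0) shared [wid] = val ;
        __syncthreads ( ) ;
        if (wid > 0) return (val) ;     // only warp 0 reduces the partials
        val = (threadIdx.x < blockDim.x / 32) ? shared [lane] : 0 ;
        return (warp_sum (val)) ;
    }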

CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh

Lines changed: 6 additions & 4 deletions
@@ -77,7 +77,7 @@ __global__ void AxB_dot3_phase3_mp
 (
     int64_t start,
     int64_t end,
-    int64_t *Bucket,
+    int64_t *Bucket,            // do the work in Bucket [start:end-1]
     GrB_Matrix C,
     GrB_Matrix M,
     GrB_Matrix A,

@@ -120,11 +120,13 @@ __global__ void AxB_dot3_phase3_mp
     // int has_zombies = 0 ;

     // Main loop over pairs
-    for (pair_id = start+ blockIdx.x; //warp per pair
-         pair_id < end;
-         pair_id += gridDim.x )
+    int64_t kk ;
+    for (kk = start+ blockIdx.x; //warp per pair
+         kk < end;
+         kk += gridDim.x )
     {

+        pair_id = Bucket [kk] ;
         int64_t i = Mi[pair_id];
         int64_t j = Ci[pair_id] >> 4;

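The same Bucket indirection lands here: each bucket is a contiguous slice Bucket[start..end-1] of one permutation of the mask entries, and the kernel grid-strides over that slice, one block (and its warp) per dot product. A minimal kernel-shaped sketch of the pattern; the names mirror the kernels above, and the printf exists only to make the assignment observable:

    #include <cstdint>
    #include <cstdio>

    // Hedged sketch: grid-stride loop over one bucket's slice of the shared
    // Bucket permutation, as in AxB_dot3_phase3_mp above.
    __global__ void bucket_slice_loop
    (
        int64_t start,              // first position of this bucket in Bucket
        int64_t end,                // one past the last position
        const int64_t *Bucket       // permutation of all mask-entry positions
    )
    {
        for (int64_t kk = start + blockIdx.x ; kk < end ; kk += gridDim.x)
        {
            int64_t pair_id = Bucket [kk] ;     // the dot product to work on
            if (threadIdx.x == 0)
            {
                printf ("block %d handles pair %lld\n",
                        blockIdx.x, (long long) pair_id) ;
            }
            // ... the per-pair numeric work goes here ...
        }
    }
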
CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh

Lines changed: 8 additions & 8 deletions
@@ -50,7 +50,7 @@ __global__ void AxB_dot3_phase3_spdn
 (
     int64_t start,
     int64_t end,
-    int64_t *Bucket,
+    int64_t *Bucket,            // do the work in Bucket [start:end-1]
     GrB_Matrix C,
     GrB_Matrix M,
     GrB_Matrix A,

@@ -87,27 +87,27 @@ __global__ void AxB_dot3_phase3_spdn
     for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;
           tid < dots;
           tid += blockDim.x * gridDim.x) {
-        int pair_id, im;
+        int64_t kk, pair_id, im;
         // if (threadIdx.x ==0)
         //   printf("thd%u pi=%lld\n",tid, start+threadIdx.x);
         // __syncthreads();

-        for (pair_id = start+tid, im = 0;
-             im < m && pair_id < end;
-             ++im, pair_id += dots ){
+        for (int64_t kk = start+tid, im = 0;
+             kk < end && im < m ;
+             kk += dots, ++im ){

+            pair_id = Bucket[ kk ] ;
             int64_t i = Mi[pair_id];  // cols from mask

-            // TODO: column of Ci / 16?
-            int64_t j = Ci[pair_id] >> 4;  // row number of C
+            int64_t j = Ci[pair_id] >> 4;  // row number of C previously encoded in phase1

             //printf("tid=%d, i=%lu, j=%lu\n", threadIdx.x, i, j);

             // if (threadIdx.x ==0)
             //   printf("thd%u i,j=%lld,%lld\n",tid, i,j);
             // __syncthreads();

-            // Prime row offsets for both A and B
+            // Prep row offsets for both A and B
             int64_t pA = Ap[i];  // row of C
             int64_t pA_end = Ap[i+1];
             int64_t nnzA = pA_end - pA;

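Every phase3 kernel recovers the column index with Ci[pair_id] >> 4, as the new comment notes. The implied phase1 encoding, inferred from these shifts rather than stated anywhere in this diff, packs the bucket code into the low 4 bits of each C->i entry and the column index into the remaining bits. A hedged sketch of that encode/decode pair:

    #include <cstdint>

    // Hedged sketch of the C->i packing the kernels above appear to decode:
    // low 4 bits = bucket code, high bits = column index j. An inference
    // from "Ci[pair_id] >> 4", not a statement of the library's contract.
    static inline int64_t encode_ci (int64_t j, int bucket_code)
    {
        return ((j << 4) | (int64_t) (bucket_code & 0xF)) ;
    }

    static inline int64_t decode_column (int64_t ci) { return (ci >> 4) ; }
    static inline int     decode_bucket (int64_t ci) { return (int) (ci & 0xF) ; }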