Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LG_CC_FastSV5: connected components
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Yongzhe Zhang; modified by Tim Davis, Texas A&M University
15 :
16 : //------------------------------------------------------------------------------
17 :
18 : // Code is based on the algorithm described in the following paper
19 : // Zhang, Azad, Hu. FastSV: FastSV: A Distributed-Memory Connected Component
20 : // Algorithm with Fast Convergence (SIAM PP20)
21 :
22 : // A subsequent update to the algorithm is here (which might not be reflected
23 : // in this code):
24 : //
25 : // Yongzhe Zhang, Ariful Azad, Aydin Buluc: Parallel algorithms for finding
26 : // connected components using linear algebra. J. Parallel Distributed Comput.
27 : // 144: 14-27 (2020).
28 :
29 : // Modified by Tim Davis, Texas A&M University
30 :
31 : // The input matrix A must be symmetric. Self-edges (diagonal entries) are
32 : // OK, and are ignored. The values and type of A are ignored; just its
33 : // structure is accessed.
34 :
35 : // The matrix A must have dimension 2^32 or less.
36 : // todo: Need a 64-bit version of this method.
37 :
38 : // todo: this function is not thread-safe, since it exports G->A and then
39 : // reimports it back. G->A is unchanged when the function returns, but during
40 : // execution G->A is invalid.
41 :
42 : // A note about "todo" and "fixme" in this file: these do not need to be fixed
43 : // or changed for this method, since the revised version appears as
44 : // src/algorithm/LG_CC_FastSV6.c. They have thus been changed here to lower
45 : // case.
46 :
47 : // todo: free all workspace in LG_FREE_ALL
48 : #define LG_FREE_ALL ;
49 :
50 : #include "LG_internal.h"
51 :
52 : #if LAGRAPH_SUITESPARSE
53 :
54 : //------------------------------------------------------------------------------
55 : // hash functions: todo describe me
56 : //------------------------------------------------------------------------------
57 :
58 : // hash table size must be a power of 2
59 : #define HASH_SIZE 1024
60 :
61 : // number of samples to insert into the hash table
62 : // todo: this seems to be a lot of entries for a HASH_SIZE of 1024.
63 : // There could be lots of collisions.
64 : #define HASH_SAMPLES 864
65 :
66 : #define HASH(x) (((x << 4) + x) & (HASH_SIZE-1))
67 : #define NEXT(x) ((x + 23) & (HASH_SIZE-1))
68 :
69 : //------------------------------------------------------------------------------
70 : // ht_init: todo describe me
71 : //------------------------------------------------------------------------------
72 :
73 : // Clear the hash table counts (ht_val [0:HASH_SIZE-1] = 0), and set all hash
74 : // table entries as empty (ht_key [0:HASH_SIZE-1] =-1).
75 :
76 : // todo: the memset of ht_key is confusing
77 :
78 : // todo: the name "ht_val" is confusing. It is not a value, but a count of
79 : // the number of times the value x = ht_key [h] has been inserted into the
80 : // hth position in the hash table. It should be renamed ht_cnt.
81 :
82 48 : static inline void ht_init
83 : (
84 : int32_t *ht_key,
85 : int32_t *ht_val
86 : )
87 : {
88 48 : memset (ht_key, -1, sizeof (int32_t) * HASH_SIZE) ;
89 48 : memset (ht_val, 0, sizeof (int32_t) * HASH_SIZE) ;
90 48 : }
91 :
92 : //------------------------------------------------------------------------------
93 : // ht_sample: todo describe me
94 : //------------------------------------------------------------------------------
95 :
96 : //
97 :
98 48 : static inline void ht_sample
99 : (
100 : uint32_t *V32, // array of size n (todo: this is a bad variable name)
101 : int32_t n,
102 : int32_t samples, // number of samples to take from V32
103 : int32_t *ht_key,
104 : int32_t *ht_val,
105 : uint64_t *seed
106 : )
107 : {
108 41520 : for (int32_t k = 0 ; k < samples ; k++)
109 : {
110 : // select an entry from V32 at random
111 41472 : int32_t x = V32 [LG_Random60 (seed) % n] ;
112 :
113 : // find x in the hash table
114 : // todo: make this loop a static inline function (see also below)
115 41472 : int32_t h = HASH (x) ;
116 46722 : while (ht_key [h] != -1 && ht_key [h] != x)
117 : {
118 5250 : h = NEXT (h) ;
119 : }
120 :
121 41472 : ht_key [h] = x ;
122 41472 : ht_val [h]++ ;
123 : }
124 48 : }
125 :
126 : //------------------------------------------------------------------------------
127 : // ht_most_frequent: todo describe me
128 : //------------------------------------------------------------------------------
129 :
130 : // todo what if key is returned as -1? Code breaks. todo: handle this case
131 :
132 4 : static inline int32_t ht_most_frequent
133 : (
134 : int32_t *ht_key,
135 : int32_t *ht_val
136 : )
137 : {
138 4 : int32_t key = -1 ;
139 4 : int32_t val = 0 ; // max (ht_val [0:HASH_SIZE-1])
140 4100 : for (int32_t h = 0 ; h < HASH_SIZE ; h++)
141 : {
142 4096 : if (ht_val [h] > val)
143 : {
144 10 : key = ht_key [h] ;
145 10 : val = ht_val [h] ;
146 : }
147 : }
148 4 : return (key) ; // return most frequent key
149 : }
150 :
151 : //------------------------------------------------------------------------------
152 : // Reduce_assign32: w (index) += s, using MIN as the "+=" accum operator
153 : //------------------------------------------------------------------------------
154 :
155 : // mask = NULL, accumulator = GrB_MIN_UINT32, descriptor = NULL.
156 : // Duplicates are summed with the accumulator, which differs from how
157 : // GrB_assign works. GrB_assign states that the presence of duplicates results
158 : // in undefined behavior. GrB_assign in SuiteSparse:GraphBLAS follows the
159 : // MATLAB rule, which discards all but the first of the duplicates.
160 :
161 : // todo: add this to GraphBLAS as a variant of GrB_assign, either as
162 : // GxB_assign_accum (or another name), or as a GxB_* descriptor setting.
163 :
164 90 : static inline int Reduce_assign32
165 : (
166 : GrB_Vector *w_handle, // vector of size n, all entries present
167 : GrB_Vector *s_handle, // vector of size n, all entries present
168 : uint32_t *index, // array of size n, can have duplicates
169 : GrB_Index n,
170 : int nthreads,
171 : int32_t *ht_key, // hash table
172 : int32_t *ht_val, // hash table (count of # of entries)
173 : uint64_t *seed, // random
174 : char *msg
175 : )
176 : {
177 :
178 : GrB_Type w_type, s_type ;
179 : GrB_Index w_n, s_n, w_nvals, s_nvals, *w_i, *s_i, w_size, s_size ;
180 : uint32_t *w_x, *s_x ;
181 90 : bool s_iso = false ;
182 :
183 : //--------------------------------------------------------------------------
184 : // export w and s
185 : //--------------------------------------------------------------------------
186 :
187 : // export the GrB_Vectors w and s as full arrays, to get direct access to
188 : // their contents. Note that this would fail if w or s are not full, with
189 : // all entries present.
190 90 : GRB_TRY (GxB_Vector_export_Full (w_handle, &w_type, &w_n, (void **) &w_x,
191 : &w_size, NULL, NULL)) ;
192 90 : GRB_TRY (GxB_Vector_export_Full (s_handle, &s_type, &s_n, (void **) &s_x,
193 : &s_size, &s_iso, NULL)) ;
194 :
195 : #if defined ( COVERAGE )
196 90 : if (n >= 200) // for test coverage only; do not use in production!!
197 : #else
198 : if (nthreads >= 4)
199 : #endif
200 : {
201 :
202 : // allocate a buf array for each thread, of size HASH_SIZE
203 : uint32_t *mem ;
204 44 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &mem, nthreads*HASH_SIZE,
205 : sizeof (uint32_t), msg)) ;
206 : // todo: check out-of-memory condition here
207 :
208 : // todo why is hashing needed here? hashing is slow for what needs
209 : // to be computed here. GraphBLAS has fast MIN atomic monoids that
210 : // do not require hashing.
211 44 : ht_init (ht_key, ht_val) ;
212 44 : ht_sample (index, n, HASH_SAMPLES, ht_key, ht_val, seed) ;
213 :
214 : int tid;
215 : #pragma omp parallel for num_threads(nthreads) schedule(static)
216 88 : for (tid = 0 ; tid < nthreads ; tid++)
217 : {
218 : // get the thread-specific buf array of size HASH_SIZE
219 : // todo: buf is a bad variable name; it's not a "buffer",
220 : // but a local workspace to compute the local version of w_x.
221 44 : uint32_t *buf = mem + tid * HASH_SIZE ;
222 :
223 : // copy the values from the global hash table into buf
224 45100 : for (int32_t h = 0 ; h < HASH_SIZE ; h++)
225 : {
226 45056 : if (ht_key [h] != -1)
227 : {
228 15386 : buf [h] = w_x [ht_key [h]] ;
229 : }
230 : }
231 :
232 : // this thread works on index [kstart:kend]
233 44 : int32_t kstart = (n * tid + nthreads - 1) / nthreads ;
234 44 : int32_t kend = (n * tid + n + nthreads - 1) / nthreads ;
235 89968 : for (int32_t k = kstart ; k < kend ; k++)
236 : {
237 89924 : uint32_t i = index [k] ;
238 :
239 : // todo: make this loop a static inline function
240 89924 : int32_t h = HASH (i) ;
241 126336 : while (ht_key [h] != -1 && ht_key [h] != i)
242 : {
243 36412 : h = NEXT (h) ;
244 : }
245 :
246 89924 : if (ht_key [h] == -1)
247 : {
248 : // todo is this a race condition?
249 27790 : w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
250 : }
251 : else
252 : {
253 62134 : buf [h] = LAGRAPH_MIN (buf [h], s_x [s_iso?0:k]) ;
254 : }
255 : }
256 : }
257 :
258 : // combine intermediate results from each thread
259 45100 : for (int32_t h = 0 ; h < HASH_SIZE ; h++)
260 : {
261 45056 : int32_t i = ht_key [h] ;
262 45056 : if (i != -1)
263 : {
264 30772 : for (tid = 0 ; tid < nthreads ; tid++)
265 : {
266 15386 : w_x [i] = LAGRAPH_MIN (w_x [i], mem [tid * HASH_SIZE + h]) ;
267 : }
268 : }
269 : }
270 :
271 44 : LAGraph_Free ((void **) &mem, NULL) ;
272 : }
273 : else
274 : {
275 : // sequential version
276 728 : for (GrB_Index k = 0 ; k < n ; k++)
277 : {
278 682 : uint32_t i = index [k] ;
279 682 : w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
280 : }
281 : }
282 :
283 : //--------------------------------------------------------------------------
284 : // reimport w and s back into GrB_Vectors, and return result
285 : //--------------------------------------------------------------------------
286 :
287 : // s is unchanged. It was exported only to compute w (index) += s
288 :
289 90 : GRB_TRY (GxB_Vector_import_Full (w_handle, w_type, w_n, (void **) &w_x,
290 : w_size, false, NULL)) ;
291 90 : GRB_TRY (GxB_Vector_import_Full (s_handle, s_type, s_n, (void **) &s_x,
292 : s_size, s_iso, NULL)) ;
293 :
294 90 : return (GrB_SUCCESS) ;
295 : }
296 :
297 : //------------------------------------------------------------------------------
298 : // LG_CC_FastSV5
299 : //------------------------------------------------------------------------------
300 :
301 : // The output of LG_CC_FastSV5 is a vector component, where
302 : // component(i)=s if node i is in the connected compononent whose
303 : // representative node is node s. If s is a representative, then
304 : // component(s)=s. The number of connected components in the graph G is the
305 : // number of representatives.
306 :
307 : #undef LG_FREE_ALL
308 : #define LG_FREE_ALL \
309 : { \
310 : LAGraph_Free ((void **) &I, NULL) ; \
311 : LAGraph_Free ((void **) &V32, NULL) ; \
312 : LAGraph_Free ((void **) &ht_key, NULL) ; \
313 : LAGraph_Free ((void **) &ht_val, NULL) ; \
314 : /* todo why is T not freed?? */ \
315 : GrB_free (&f) ; \
316 : GrB_free (&gp) ; \
317 : GrB_free (&mngp) ; \
318 : GrB_free (&gp_new) ; \
319 : GrB_free (&mod) ; \
320 : }
321 :
322 : #endif
323 :
324 24 : int LG_CC_FastSV5 // SuiteSparse:GraphBLAS method, with GxB extensions
325 : (
326 : // output
327 : GrB_Vector *component, // component(i)=s if node is in the component s
328 : // inputs
329 : LAGraph_Graph G, // input graph, G->A can change
330 : char *msg
331 : )
332 : {
333 :
334 : //--------------------------------------------------------------------------
335 : // check inputs
336 : //--------------------------------------------------------------------------
337 :
338 24 : LG_CLEAR_MSG ;
339 :
340 : #if !LAGRAPH_SUITESPARSE
341 : LG_ASSERT_MSG (false, GrB_NOT_IMPLEMENTED, "SuiteSparse required") ;
342 : #else
343 :
344 24 : uint32_t *V32 = NULL ;
345 24 : int32_t *ht_key = NULL, *ht_val = NULL ;
346 24 : GrB_Index n, nnz, *I = NULL ;
347 24 : GrB_Vector f = NULL, gp_new = NULL, mngp = NULL, mod = NULL, gp = NULL ;
348 24 : GrB_Matrix T = NULL ;
349 :
350 24 : LG_TRY (LAGraph_CheckGraph (G, msg)) ;
351 24 : LG_ASSERT (component != NULL, GrB_NULL_POINTER) ;
352 :
353 24 : LG_ASSERT_MSG ((G->kind == LAGraph_ADJACENCY_UNDIRECTED ||
354 : (G->kind == LAGraph_ADJACENCY_DIRECTED &&
355 : G->is_symmetric_structure == LAGraph_TRUE)),
356 : -1001, "G->A must be known to be symmetric") ;
357 :
358 24 : GrB_Matrix S = G->A ;
359 24 : GRB_TRY (GrB_Matrix_nrows (&n, S)) ;
360 24 : GRB_TRY (GrB_Matrix_nvals (&nnz, S)) ;
361 :
362 24 : LG_ASSERT_MSG (n <= UINT32_MAX, -1, "problem too large (fixme)") ;
363 :
364 : #define FASTSV_SAMPLES 4
365 :
366 24 : bool sampling = (n * FASTSV_SAMPLES * 2 < nnz) ;
367 :
368 : // random number seed
369 24 : uint64_t seed = n ;
370 :
371 : //--------------------------------------------------------------------------
372 : // initializations
373 : //--------------------------------------------------------------------------
374 :
375 : // determine # of threads to use for Reduce_assign
376 : int nthreads, nthreads_outer, nthreads_inner ;
377 24 : LG_TRY (LAGraph_GetNumThreads (&nthreads_outer, &nthreads_inner, msg)) ;
378 24 : nthreads = nthreads_outer * nthreads_inner ;
379 :
380 24 : nthreads = LAGRAPH_MIN (nthreads, n / 16) ;
381 24 : nthreads = LAGRAPH_MAX (nthreads, 1) ;
382 :
383 : // # of threads to use for typecast
384 24 : int nthreads2 = n / (64*1024) ;
385 24 : nthreads2 = LAGRAPH_MIN (nthreads2, nthreads) ;
386 24 : nthreads2 = LAGRAPH_MAX (nthreads2, 1) ;
387 :
388 : // vectors
389 24 : GRB_TRY (GrB_Vector_new (&f, GrB_UINT32, n)) ;
390 24 : GRB_TRY (GrB_Vector_new (&gp_new, GrB_UINT32, n)) ;
391 24 : GRB_TRY (GrB_Vector_new (&mod, GrB_BOOL, n)) ;
392 :
393 : // temporary arrays
394 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &I , n, sizeof (GrB_Index), msg)) ;
395 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &V32, n, sizeof (uint32_t), msg)) ;
396 :
397 : // prepare vectors
398 : int64_t i;
399 : #pragma omp parallel for num_threads(nthreads2) schedule(static)
400 16284 : for (i = 0 ; i < n ; i++)
401 : {
402 16260 : I [i] = i ;
403 16260 : V32 [i] = (uint32_t) i ;
404 : }
405 :
406 24 : GRB_TRY (GrB_Vector_build (f, I, V32, n, GrB_PLUS_UINT32)) ;
407 24 : GRB_TRY (GrB_Vector_dup (&gp, f)) ;
408 24 : GRB_TRY (GrB_Vector_dup (&mngp, f)) ;
409 :
410 : // allocate the hash table
411 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_key, HASH_SIZE, sizeof (int32_t),
412 : msg)) ;
413 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_val, HASH_SIZE, sizeof (int32_t),
414 : msg)) ;
415 :
416 : //--------------------------------------------------------------------------
417 : // sample phase
418 : //--------------------------------------------------------------------------
419 :
420 24 : if (sampling)
421 : {
422 :
423 : //----------------------------------------------------------------------
424 : // export S = G->A in CSR format
425 : //----------------------------------------------------------------------
426 :
427 : // S is not modified. It is only exported so that its contents can be
428 : // read by the parallel loops below.
429 :
430 : GrB_Type type ;
431 : GrB_Index nrows, ncols, nvals ;
432 : size_t typesize ;
433 : int64_t nonempty ;
434 : GrB_Index *Sp, *Sj ;
435 : void *Sx ;
436 4 : bool S_jumbled = false ;
437 : GrB_Index Sp_size, Sj_size, Sx_size ;
438 4 : bool S_iso = false ;
439 :
440 4 : GRB_TRY (GrB_Matrix_nvals (&nvals, S)) ;
441 4 : GRB_TRY (GxB_Matrix_export_CSR (&S, &type, &nrows, &ncols, &Sp, &Sj,
442 : &Sx, &Sp_size, &Sj_size, &Sx_size,
443 : &S_iso, &S_jumbled, NULL)) ;
444 4 : GRB_TRY (GxB_Type_size (&typesize, type)) ;
445 4 : G->A = NULL ;
446 :
447 : //----------------------------------------------------------------------
448 : // allocate space to construct T
449 : //----------------------------------------------------------------------
450 :
451 4 : GrB_Index Tp_len = nrows+1, Tp_size = Tp_len*sizeof(GrB_Index);
452 4 : GrB_Index Tj_len = nvals, Tj_size = Tj_len*sizeof(GrB_Index);
453 4 : GrB_Index Tx_len = nvals ;
454 :
455 4 : GrB_Index *Tp = NULL, *Tj = NULL ;
456 4 : GrB_Index Tx_size = typesize ;
457 4 : void *Tx = NULL ;
458 4 : int32_t *range = NULL ;
459 4 : GrB_Index *count = NULL ;
460 :
461 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tp, Tp_len,
462 : sizeof (GrB_Index), msg)) ;
463 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tj, Tj_len,
464 : sizeof (GrB_Index), msg)) ;
465 4 : LAGRAPH_TRY (LAGraph_Calloc (&Tx, 1, typesize, msg)) ; // T is iso
466 :
467 : //----------------------------------------------------------------------
468 : // allocate workspace
469 : //----------------------------------------------------------------------
470 :
471 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &range, nthreads + 1,
472 : sizeof (int32_t), msg)) ;
473 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &count, nthreads + 1,
474 : sizeof (GrB_Index), msg)) ;
475 :
476 4 : memset (count, 0, sizeof (GrB_Index) * (nthreads + 1)) ;
477 :
478 : //----------------------------------------------------------------------
479 : // define parallel tasks to construct T
480 : //----------------------------------------------------------------------
481 :
482 : // thread tid works on rows range[tid]:range[tid+1]-1 of S and T
483 12 : for (int tid = 0 ; tid <= nthreads ; tid++)
484 : {
485 8 : range [tid] = (n * tid + nthreads - 1) / nthreads ;
486 : }
487 :
488 : //----------------------------------------------------------------------
489 : // determine the number entries to be constructed in T for each thread
490 : //----------------------------------------------------------------------
491 :
492 : int tid;
493 : #pragma omp parallel for num_threads(nthreads) schedule(static)
494 8 : for (tid = 0 ; tid < nthreads ; tid++)
495 : {
496 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
497 : {
498 9752 : int32_t deg = Sp [i + 1] - Sp [i] ;
499 9752 : count [tid + 1] += LAGRAPH_MIN (FASTSV_SAMPLES, deg) ;
500 : }
501 : }
502 :
503 : //----------------------------------------------------------------------
504 : // count = cumsum (count)
505 : //----------------------------------------------------------------------
506 :
507 8 : for (tid = 0 ; tid < nthreads ; tid++)
508 : {
509 4 : count [tid + 1] += count [tid] ;
510 : }
511 :
512 : //----------------------------------------------------------------------
513 : // construct T
514 : //----------------------------------------------------------------------
515 :
516 : // T (i,:) consists of the first FASTSV_SAMPLES of S (i,:).
517 :
518 : // todo: this could be done by GxB_Select, using a new operator. Need
519 : // to define a set of GxB_SelectOp operators that would allow for this.
520 :
521 : // Note that Tx is not modified. Only Tp and Tj are constructed.
522 :
523 : #pragma omp parallel for num_threads(nthreads) schedule(static)
524 8 : for (tid = 0 ; tid < nthreads ; tid++)
525 : {
526 4 : GrB_Index p = count [tid] ;
527 4 : Tp [range [tid]] = p ;
528 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
529 : {
530 : // construct T (i,:) from the first entries in S (i,:)
531 9752 : for (int32_t j = 0 ;
532 40390 : j < FASTSV_SAMPLES && Sp [i] + j < Sp [i + 1] ; j++)
533 : {
534 30638 : Tj [p++] = Sj [Sp [i] + j] ;
535 : }
536 9752 : Tp [i + 1] = p ;
537 : }
538 : }
539 :
540 : //----------------------------------------------------------------------
541 : // import the result into the GrB_Matrix T
542 : //----------------------------------------------------------------------
543 :
544 : // Note that Tx is unmodified.
545 :
546 : // in SuiteSparse:GraphBLAS v5, sizes are in bytes, not entries
547 4 : GrB_Index Tp_siz = Tp_size ;
548 4 : GrB_Index Tj_siz = Tj_size ;
549 4 : GrB_Index Tx_siz = Tx_size ;
550 :
551 4 : GrB_Index t_nvals = Tp [nrows] ;
552 4 : GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
553 : &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
554 : true, // T is iso
555 : S_jumbled, NULL)) ;
556 :
557 : //----------------------------------------------------------------------
558 : // find the connected components of T
559 : //----------------------------------------------------------------------
560 :
561 : // todo: this is nearly identical to the final phase below.
562 : // Make this a function
563 :
564 4 : bool change = true, is_first = true ;
565 26 : while (change)
566 : {
567 : // hooking & shortcutting
568 22 : GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
569 : GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
570 22 : if (!is_first)
571 : {
572 18 : LG_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads,
573 : ht_key, ht_val, &seed, msg)) ;
574 : }
575 22 : GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
576 : mngp, gp, NULL)) ;
577 :
578 : // calculate grandparent
579 : // fixme: NULL parameter is SS:GrB extension
580 22 : GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
581 : int32_t i;
582 : #pragma omp parallel for num_threads(nthreads2) schedule(static)
583 54528 : for (i = 0 ; i < n ; i++)
584 : {
585 54506 : I [i] = (GrB_Index) V32 [i] ;
586 : }
587 22 : GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
588 :
589 : // todo: GrB_Vector_extract should have a variant where the index
590 : // list is not given by an array I, but as a GrB_Vector of type
591 : // GrB_UINT64 (or which can be typecast to GrB_UINT64). This is a
592 : // common issue that arises in other algorithms as well.
593 : // Likewise GrB_Matrix_extract, and all forms of GrB_assign.
594 :
595 : // check termination
596 22 : GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new,
597 : gp, NULL)) ;
598 22 : GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod,
599 : NULL)) ;
600 :
601 : // swap gp and gp_new
602 22 : GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
603 22 : is_first = false ;
604 : }
605 :
606 : //----------------------------------------------------------------------
607 : // todo: describe me
608 : //----------------------------------------------------------------------
609 :
610 4 : ht_init (ht_key, ht_val) ;
611 4 : ht_sample (V32, n, HASH_SAMPLES, ht_key, ht_val, &seed) ;
612 4 : int32_t key = ht_most_frequent (ht_key, ht_val) ;
613 : // todo: what if key is returned as -1? Then T below is invalid.
614 :
615 4 : int64_t t_nonempty = -1 ;
616 4 : bool T_jumbled = false, T_iso = true ;
617 :
618 : // export T
619 4 : GRB_TRY (GxB_Matrix_export_CSR (&T, &type, &nrows, &ncols, &Tp, &Tj,
620 : &Tx, &Tp_siz, &Tj_siz, &Tx_siz,
621 : &T_iso, &T_jumbled, NULL)) ;
622 :
623 : // todo what is this phase doing? It is constructing a matrix T that
624 : // depends only on S, key, and V32. T contains a subset of the entries
625 : // in S, except that T (i,:) is empty if
626 :
627 : // The prior content of T is ignored; it is exported from the earlier
628 : // phase, only to reuse the allocated space for T. However, T_jumbled
629 : // is preserved from the prior matrix T, which doesn't make sense.
630 :
631 : // This parallel loop is badly load balanced. Each thread operates on
632 : // the same number of rows of S, regardless of how many entries appear
633 : // in each set of rows. It uses one thread per task, statically
634 : // scheduled.
635 :
636 : #pragma omp parallel for num_threads(nthreads) schedule(static)
637 8 : for (tid = 0 ; tid < nthreads ; tid++)
638 : {
639 4 : GrB_Index ptr = Sp [range [tid]] ;
640 : // thread tid scans S (range [tid]:range [tid+1]-1,:),
641 : // and constructs T(i,:) for all rows in this range.
642 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
643 : {
644 9752 : int32_t pv = V32 [i] ; // what is pv?
645 9752 : Tp [i] = ptr ; // start the construction of T(i,:)
646 : // T(i,:) is empty if pv == key
647 9752 : if (pv != key)
648 : {
649 : // scan S(i,:)
650 46550 : for (GrB_Index p = Sp [i] ; p < Sp [i+1] ; p++)
651 : {
652 : // get S(i,j)
653 41274 : int32_t j = Sj [p] ;
654 41274 : if (V32 [j] != key)
655 : {
656 : // add the entry T(i,j) to T, but skip it if
657 : // V32 [j] is equal to key
658 41264 : Tj [ptr++] = j ;
659 : }
660 : }
661 : // add the entry T(i,key) if there is room for it in T(i,:)
662 5276 : if (ptr - Tp [i] < Sp [i+1] - Sp [i])
663 : {
664 2 : Tj [ptr++] = key ;
665 : }
666 : }
667 : }
668 : // count the number of entries inserted into T by this thread?
669 4 : count [tid] = ptr - Tp [range [tid]] ;
670 : }
671 :
672 : // Compact empty space out of Tj not filled in from the above phase.
673 : // This is a lot of work and should be done in parallel.
674 4 : GrB_Index offset = 0 ;
675 8 : for (tid = 0 ; tid < nthreads ; tid++)
676 : {
677 :
678 : // this memcpy is not safe (src/dest can overlap)
679 : // memcpy (Tj + offset, Tj + Tp [range [tid]],
680 : // sizeof (GrB_Index) * count [tid]) ;
681 :
682 : // // using a for loop instead:
683 : // GrB_Index *Tj_dest = Tj + offset ;
684 : // GrB_Index *Tj_src = Tj + Tp [range [tid]] ;
685 : // for (int64_t k = 0 ; k < count [tid] ; k++)
686 : // {
687 : // Tj_dest [k] = Tj_src [k] ;
688 : // }
689 :
690 : // this is safe (memmove_s not necessary):
691 4 : memmove (Tj + offset, Tj + Tp [range [tid]],
692 4 : sizeof (GrB_Index) * count [tid]) ;
693 :
694 4 : offset += count [tid] ;
695 4 : count [tid] = offset - count [tid] ;
696 : }
697 :
698 : // Compact empty space out of Tp
699 : #pragma omp parallel for num_threads(nthreads) schedule(static)
700 8 : for (tid = 0 ; tid < nthreads ; tid++)
701 : {
702 4 : GrB_Index ptr = Tp [range [tid]] ;
703 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
704 : {
705 9752 : Tp [i] -= ptr - count [tid] ;
706 : }
707 : }
708 :
709 : // finalize T
710 4 : Tp [n] = offset ;
711 :
712 : // free workspace
713 4 : LAGraph_Free ((void **) &count, NULL) ;
714 4 : LAGraph_Free ((void **) &range, NULL) ;
715 :
716 : // import S (unchanged since last export)
717 4 : GRB_TRY (GxB_Matrix_import_CSR (&S, type, nrows, ncols,
718 : &Sp, &Sj, &Sx, Sp_size, Sj_size, Sx_size,
719 : S_iso, S_jumbled, NULL)) ;
720 :
721 : // import T for the final phase
722 4 : GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
723 : &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
724 : T_iso, /* T is jumbled: */ true, NULL)) ;
725 :
726 : // restore G->A
727 4 : G->A = S ;
728 :
729 : }
730 : else
731 : {
732 :
733 : // no sampling; the final phase operates on the whole graph
734 20 : T = S ;
735 :
736 : }
737 :
738 : //--------------------------------------------------------------------------
739 : // final phase
740 : //--------------------------------------------------------------------------
741 :
742 24 : GRB_TRY (GrB_Matrix_nvals (&nnz, T)) ;
743 :
744 24 : bool change = true ;
745 96 : while (change && nnz > 0)
746 : {
747 : // hooking & shortcutting
748 72 : GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
749 : GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
750 72 : GRB_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads, ht_key,
751 : ht_val, &seed, msg)) ;
752 72 : GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
753 : mngp, gp, NULL)) ;
754 :
755 : // calculate grandparent
756 : // fixme: NULL parameter is SS:GrB extension
757 72 : GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
758 : int32_t k;
759 : #pragma omp parallel for num_threads(nthreads2) schedule(static)
760 45924 : for (k = 0 ; k < n ; k++)
761 : {
762 45852 : I [k] = (GrB_Index) V32 [k] ;
763 : }
764 72 : GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
765 :
766 : // check termination
767 72 : GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new, gp,
768 : NULL)) ;
769 72 : GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod, NULL)) ;
770 :
771 : // swap gp and gp_new
772 72 : GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
773 : }
774 :
775 : //--------------------------------------------------------------------------
776 : // free workspace and return result
777 : //--------------------------------------------------------------------------
778 :
779 24 : (*component) = f ;
780 24 : f = NULL ;
781 24 : if (sampling)
782 : {
783 4 : GrB_free (&T) ;
784 : }
785 24 : LG_FREE_ALL ;
786 24 : return (GrB_SUCCESS) ;
787 : #endif
788 : }
|