Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LG_CC_FastSV5: connected components
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2023 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Yongzhe Zhang; modified by Tim Davis, Texas A&M University
15 :
16 : //------------------------------------------------------------------------------
17 :
18 : // Code is based on the algorithm described in the following paper
19 : // Zhang, Azad, Hu. FastSV: FastSV: A Distributed-Memory Connected Component
20 : // Algorithm with Fast Convergence (SIAM PP20)
21 :
22 : // A subsequent update to the algorithm is here (which might not be reflected
23 : // in this code):
24 : //
25 : // Yongzhe Zhang, Ariful Azad, Aydin Buluc: Parallel algorithms for finding
26 : // connected components using linear algebra. J. Parallel Distributed Comput.
27 : // 144: 14-27 (2020).
28 :
29 : // Modified by Tim Davis, Texas A&M University
30 :
31 : // The input matrix A must be symmetric. Self-edges (diagonal entries) are
32 : // OK, and are ignored. The values and type of A are ignored; just its
33 : // structure is accessed.
34 :
35 : // The matrix A must have dimension 2^32 or less.
36 : // todo: Need a 64-bit version of this method.
37 :
38 : // todo: this function is not thread-safe, since it exports G->A and then
39 : // reimports it back. G->A is unchanged when the function returns, but during
40 : // execution G->A is invalid.
41 :
42 : // A note about "todo" and "fixme" in this file: these do not need to be fixed
43 : // or changed for this method, since the revised version appears as
44 : // src/algorithm/LG_CC_FastSV6.c. They have thus been changed here to lower
45 : // case.
46 :
47 : // todo: free all workspace in LG_FREE_ALL
48 : #define LG_FREE_ALL ;
49 :
50 : #include "LG_internal.h"
51 :
52 : // TODO: not ready for src; need a vanilla method with no GxB
53 :
54 : #if LAGRAPH_SUITESPARSE
55 :
56 : //------------------------------------------------------------------------------
57 : // hash functions: todo describe me
58 : //------------------------------------------------------------------------------
59 :
60 : // hash table size must be a power of 2
61 : #define HASH_SIZE 1024
62 :
63 : // number of samples to insert into the hash table
64 : // todo: this seems to be a lot of entries for a HASH_SIZE of 1024.
65 : // There could be lots of collisions.
66 : #define HASH_SAMPLES 864
67 :
68 : #define HASH(x) (((x << 4) + x) & (HASH_SIZE-1))
69 : #define NEXT(x) ((x + 23) & (HASH_SIZE-1))
70 :
71 : //------------------------------------------------------------------------------
72 : // ht_init: todo describe me
73 : //------------------------------------------------------------------------------
74 :
75 : // Clear the hash table counts (ht_val [0:HASH_SIZE-1] = 0), and set all hash
76 : // table entries as empty (ht_key [0:HASH_SIZE-1] =-1).
77 :
78 : // todo: the memset of ht_key is confusing
79 :
80 : // todo: the name "ht_val" is confusing. It is not a value, but a count of
81 : // the number of times the value x = ht_key [h] has been inserted into the
82 : // hth position in the hash table. It should be renamed ht_cnt.
83 :
84 48 : static inline void ht_init
85 : (
86 : int32_t *ht_key,
87 : int32_t *ht_val
88 : )
89 : {
90 48 : memset (ht_key, -1, sizeof (int32_t) * HASH_SIZE) ;
91 48 : memset (ht_val, 0, sizeof (int32_t) * HASH_SIZE) ;
92 48 : }
93 :
94 : //------------------------------------------------------------------------------
95 : // ht_sample: todo describe me
96 : //------------------------------------------------------------------------------
97 :
98 : //
99 :
100 48 : static inline void ht_sample
101 : (
102 : uint32_t *V32, // array of size n (todo: this is a bad variable name)
103 : int32_t n,
104 : int32_t samples, // number of samples to take from V32
105 : int32_t *ht_key,
106 : int32_t *ht_val,
107 : uint64_t *seed
108 : )
109 : {
110 41520 : for (int32_t k = 0 ; k < samples ; k++)
111 : {
112 : // select an entry from V32 at random
113 41472 : int32_t x = V32 [LG_Random64 (seed) % n] ;
114 :
115 : // find x in the hash table
116 : // todo: make this loop a static inline function (see also below)
117 41472 : int32_t h = HASH (x) ;
118 46012 : while (ht_key [h] != -1 && ht_key [h] != x)
119 : {
120 4540 : h = NEXT (h) ;
121 : }
122 :
123 41472 : ht_key [h] = x ;
124 41472 : ht_val [h]++ ;
125 : }
126 48 : }
127 :
128 : //------------------------------------------------------------------------------
129 : // ht_most_frequent: todo describe me
130 : //------------------------------------------------------------------------------
131 :
132 : // todo what if key is returned as -1? Code breaks. todo: handle this case
133 :
134 4 : static inline int32_t ht_most_frequent
135 : (
136 : int32_t *ht_key,
137 : int32_t *ht_val
138 : )
139 : {
140 4 : int32_t key = -1 ;
141 4 : int32_t val = 0 ; // max (ht_val [0:HASH_SIZE-1])
142 4100 : for (int32_t h = 0 ; h < HASH_SIZE ; h++)
143 : {
144 4096 : if (ht_val [h] > val)
145 : {
146 8 : key = ht_key [h] ;
147 8 : val = ht_val [h] ;
148 : }
149 : }
150 4 : return (key) ; // return most frequent key
151 : }
152 :
153 : //------------------------------------------------------------------------------
154 : // Reduce_assign32: w (index) += s, using MIN as the "+=" accum operator
155 : //------------------------------------------------------------------------------
156 :
157 : // mask = NULL, accumulator = GrB_MIN_UINT32, descriptor = NULL.
158 : // Duplicates are summed with the accumulator, which differs from how
159 : // GrB_assign works. GrB_assign states that the presence of duplicates results
160 : // in undefined behavior. GrB_assign in SuiteSparse:GraphBLAS follows the
161 : // MATLAB rule, which discards all but the first of the duplicates.
162 :
163 : // TODO: Reduce_assign32 is slow. See src/algorithm/LG_CC_FastSV6.
164 :
165 90 : static inline int Reduce_assign32
166 : (
167 : GrB_Vector *w_handle, // vector of size n, all entries present
168 : GrB_Vector *s_handle, // vector of size n, all entries present
169 : uint32_t *index, // array of size n, can have duplicates
170 : GrB_Index n,
171 : int nthreads,
172 : int32_t *ht_key, // hash table
173 : int32_t *ht_val, // hash table (count of # of entries)
174 : uint64_t *seed, // random
175 : char *msg
176 : )
177 : {
178 :
179 : GrB_Type w_type, s_type ;
180 : GrB_Index w_n, s_n, w_nvals, s_nvals, *w_i, *s_i, w_size, s_size ;
181 : uint32_t *w_x, *s_x ;
182 90 : bool s_iso = false ;
183 :
184 : //--------------------------------------------------------------------------
185 : // export w and s
186 : //--------------------------------------------------------------------------
187 :
188 : // export the GrB_Vectors w and s as full arrays, to get direct access to
189 : // their contents. Note that this would fail if w or s are not full, with
190 : // all entries present.
191 90 : GRB_TRY (GxB_Vector_export_Full (w_handle, &w_type, &w_n, (void **) &w_x,
192 : &w_size, NULL, NULL)) ;
193 90 : GRB_TRY (GxB_Vector_export_Full (s_handle, &s_type, &s_n, (void **) &s_x,
194 : &s_size, &s_iso, NULL)) ;
195 :
196 : #if defined ( COVERAGE )
197 90 : if (n >= 200) // for test coverage only; do not use in production!!
198 : #else
199 : if (nthreads >= 4)
200 : #endif
201 : {
202 :
203 : // allocate a buf array for each thread, of size HASH_SIZE
204 : uint32_t *mem ;
205 44 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &mem, nthreads*HASH_SIZE,
206 : sizeof (uint32_t), msg)) ;
207 : // todo: check out-of-memory condition here
208 :
209 : // todo why is hashing needed here? hashing is slow for what needs
210 : // to be computed here. GraphBLAS has fast MIN atomic monoids that
211 : // do not require hashing.
212 44 : ht_init (ht_key, ht_val) ;
213 44 : ht_sample (index, n, HASH_SAMPLES, ht_key, ht_val, seed) ;
214 :
215 : int tid ;
216 : #pragma omp parallel for num_threads(nthreads) schedule(static)
217 88 : for (tid = 0 ; tid < nthreads ; tid++)
218 : {
219 : // get the thread-specific buf array of size HASH_SIZE
220 : // todo: buf is a bad variable name; it's not a "buffer",
221 : // but a local workspace to compute the local version of w_x.
222 44 : uint32_t *buf = mem + tid * HASH_SIZE ;
223 :
224 : // copy the values from the global hash table into buf
225 45100 : for (int32_t h = 0 ; h < HASH_SIZE ; h++)
226 : {
227 45056 : if (ht_key [h] != -1)
228 : {
229 15352 : buf [h] = w_x [ht_key [h]] ;
230 : }
231 : }
232 :
233 : // this thread works on index [kstart:kend]
234 44 : int32_t kstart = (n * tid + nthreads - 1) / nthreads ;
235 44 : int32_t kend = (n * tid + n + nthreads - 1) / nthreads ;
236 89968 : for (int32_t k = kstart ; k < kend ; k++)
237 : {
238 89924 : uint32_t i = index [k] ;
239 :
240 : // todo: make this loop a static inline function
241 89924 : int32_t h = HASH (i) ;
242 122754 : while (ht_key [h] != -1 && ht_key [h] != i)
243 : {
244 32830 : h = NEXT (h) ;
245 : }
246 :
247 89924 : if (ht_key [h] == -1)
248 : {
249 : // todo is this a race condition?
250 27746 : w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
251 : }
252 : else
253 : {
254 62178 : buf [h] = LAGRAPH_MIN (buf [h], s_x [s_iso?0:k]) ;
255 : }
256 : }
257 : }
258 :
259 : // combine intermediate results from each thread
260 45100 : for (int32_t h = 0 ; h < HASH_SIZE ; h++)
261 : {
262 45056 : int32_t i = ht_key [h] ;
263 45056 : if (i != -1)
264 : {
265 30704 : for (tid = 0 ; tid < nthreads ; tid++)
266 : {
267 15352 : w_x [i] = LAGRAPH_MIN (w_x [i], mem [tid * HASH_SIZE + h]) ;
268 : }
269 : }
270 : }
271 :
272 44 : LAGraph_Free ((void **) &mem, NULL) ;
273 : }
274 : else
275 : {
276 : // sequential version
277 728 : for (int64_t k = 0 ; k < n ; k++)
278 : {
279 682 : uint32_t i = index [k] ;
280 682 : w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
281 : }
282 : }
283 :
284 : //--------------------------------------------------------------------------
285 : // reimport w and s back into GrB_Vectors, and return result
286 : //--------------------------------------------------------------------------
287 :
288 : // s is unchanged. It was exported only to compute w (index) += s
289 :
290 90 : GRB_TRY (GxB_Vector_import_Full (w_handle, w_type, w_n, (void **) &w_x,
291 : w_size, false, NULL)) ;
292 90 : GRB_TRY (GxB_Vector_import_Full (s_handle, s_type, s_n, (void **) &s_x,
293 : s_size, s_iso, NULL)) ;
294 :
295 90 : return (GrB_SUCCESS) ;
296 : }
297 :
298 : #endif
299 :
300 : //------------------------------------------------------------------------------
301 : // LG_CC_FastSV5
302 : //------------------------------------------------------------------------------
303 :
304 : // The output of LG_CC_FastSV5 is a vector component, where
305 : // component(i)=s if node i is in the connected compononent whose
306 : // representative node is node s. If s is a representative, then
307 : // component(s)=s. The number of connected components in the graph G is the
308 : // number of representatives.
309 :
310 : #undef LG_FREE_ALL
311 : #define LG_FREE_ALL \
312 : { \
313 : LAGraph_Free ((void **) &I, NULL) ; \
314 : LAGraph_Free ((void **) &V32, NULL) ; \
315 : LAGraph_Free ((void **) &ht_key, NULL) ; \
316 : LAGraph_Free ((void **) &ht_val, NULL) ; \
317 : /* todo why is T not freed?? */ \
318 : GrB_free (&f) ; \
319 : GrB_free (&gp) ; \
320 : GrB_free (&mngp) ; \
321 : GrB_free (&gp_new) ; \
322 : GrB_free (&mod) ; \
323 : }
324 :
325 24 : int LG_CC_FastSV5 // SuiteSparse:GraphBLAS method
326 : (
327 : // output
328 : GrB_Vector *component, // component(i)=s if node is in the component s
329 : // inputs
330 : LAGraph_Graph G, // input graph, G->A can change
331 : char *msg
332 : )
333 : {
334 : #if LAGRAPH_SUITESPARSE
335 :
336 : //--------------------------------------------------------------------------
337 : // check inputs
338 : //--------------------------------------------------------------------------
339 :
340 24 : LG_CLEAR_MSG ;
341 :
342 24 : uint32_t *V32 = NULL ;
343 24 : int32_t *ht_key = NULL, *ht_val = NULL ;
344 24 : GrB_Index n, nnz, *I = NULL ;
345 24 : GrB_Vector f = NULL, gp_new = NULL, mngp = NULL, mod = NULL, gp = NULL ;
346 24 : GrB_Matrix T = NULL ;
347 :
348 24 : LG_TRY (LAGraph_CheckGraph (G, msg)) ;
349 24 : LG_ASSERT (component != NULL, GrB_NULL_POINTER) ;
350 :
351 24 : LG_ASSERT_MSG ((G->kind == LAGraph_ADJACENCY_UNDIRECTED ||
352 : (G->kind == LAGraph_ADJACENCY_DIRECTED &&
353 : G->is_symmetric_structure == LAGraph_TRUE)),
354 : -1001, "G->A must be known to be symmetric") ;
355 :
356 24 : GrB_Matrix S = G->A ;
357 24 : GRB_TRY (GrB_Matrix_nrows (&n, S)) ;
358 24 : GRB_TRY (GrB_Matrix_nvals (&nnz, S)) ;
359 :
360 24 : LG_ASSERT_MSG (n <= UINT32_MAX, -1, "problem too large (fixme)") ;
361 :
362 : #define FASTSV_SAMPLES 4
363 :
364 24 : bool sampling = (n * FASTSV_SAMPLES * 2 < nnz) ;
365 :
366 : // random number seed
367 24 : uint64_t seed = n ;
368 :
369 : //--------------------------------------------------------------------------
370 : // initializations
371 : //--------------------------------------------------------------------------
372 :
373 : // determine # of threads to use for Reduce_assign
374 : int nthreads, nthreads_outer, nthreads_inner ;
375 24 : LG_TRY (LAGraph_GetNumThreads (&nthreads_outer, &nthreads_inner, msg)) ;
376 24 : nthreads = nthreads_outer * nthreads_inner ;
377 :
378 24 : nthreads = LAGRAPH_MIN (nthreads, n / 16) ;
379 24 : nthreads = LAGRAPH_MAX (nthreads, 1) ;
380 :
381 : // # of threads to use for typecast
382 24 : int nthreads2 = n / (64*1024) ;
383 24 : nthreads2 = LAGRAPH_MIN (nthreads2, nthreads) ;
384 24 : nthreads2 = LAGRAPH_MAX (nthreads2, 1) ;
385 :
386 : // vectors
387 24 : GRB_TRY (GrB_Vector_new (&f, GrB_UINT32, n)) ;
388 24 : GRB_TRY (GrB_Vector_new (&gp_new, GrB_UINT32, n)) ;
389 24 : GRB_TRY (GrB_Vector_new (&mod, GrB_BOOL, n)) ;
390 :
391 : // temporary arrays
392 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &I , n, sizeof (GrB_Index), msg)) ;
393 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &V32, n, sizeof (uint32_t), msg)) ;
394 :
395 : // prepare vectors
396 : int64_t i ;
397 : #pragma omp parallel for num_threads(nthreads2) schedule(static)
398 16284 : for (i = 0 ; i < n ; i++)
399 : {
400 16260 : I [i] = i ;
401 16260 : V32 [i] = (uint32_t) i ;
402 : }
403 :
404 24 : GRB_TRY (GrB_Vector_build (f, I, V32, n, GrB_PLUS_UINT32)) ;
405 24 : GRB_TRY (GrB_Vector_dup (&gp, f)) ;
406 24 : GRB_TRY (GrB_Vector_dup (&mngp, f)) ;
407 :
408 : // allocate the hash table
409 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_key, HASH_SIZE, sizeof (int32_t),
410 : msg)) ;
411 24 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_val, HASH_SIZE, sizeof (int32_t),
412 : msg)) ;
413 :
414 : //--------------------------------------------------------------------------
415 : // sample phase
416 : //--------------------------------------------------------------------------
417 :
418 24 : if (sampling)
419 : {
420 :
421 : //----------------------------------------------------------------------
422 : // export S = G->A in CSR format
423 : //----------------------------------------------------------------------
424 :
425 : // S is not modified. It is only exported so that its contents can be
426 : // read by the parallel loops below.
427 :
428 : GrB_Type type ;
429 : GrB_Index nrows, ncols, nvals ;
430 : size_t typesize ;
431 : int64_t nonempty ;
432 : GrB_Index *Sp, *Sj ;
433 : void *Sx ;
434 4 : bool S_jumbled = false ;
435 : GrB_Index Sp_size, Sj_size, Sx_size ;
436 4 : bool S_iso = false ;
437 :
438 4 : GRB_TRY (GrB_Matrix_nvals (&nvals, S)) ;
439 4 : GRB_TRY (GxB_Matrix_export_CSR (&S, &type, &nrows, &ncols, &Sp, &Sj,
440 : &Sx, &Sp_size, &Sj_size, &Sx_size,
441 : &S_iso, &S_jumbled, NULL)) ;
442 4 : LAGRAPH_TRY (LAGraph_SizeOfType (&typesize, type, msg)) ;
443 4 : G->A = NULL ;
444 :
445 : //----------------------------------------------------------------------
446 : // allocate space to construct T
447 : //----------------------------------------------------------------------
448 :
449 4 : GrB_Index Tp_len = nrows+1, Tp_size = Tp_len*sizeof(GrB_Index);
450 4 : GrB_Index Tj_len = nvals, Tj_size = Tj_len*sizeof(GrB_Index);
451 4 : GrB_Index Tx_len = nvals ;
452 :
453 4 : GrB_Index *Tp = NULL, *Tj = NULL ;
454 4 : GrB_Index Tx_size = typesize ;
455 4 : void *Tx = NULL ;
456 4 : int32_t *range = NULL ;
457 4 : GrB_Index *count = NULL ;
458 :
459 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tp, Tp_len,
460 : sizeof (GrB_Index), msg)) ;
461 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tj, Tj_len,
462 : sizeof (GrB_Index), msg)) ;
463 4 : LAGRAPH_TRY (LAGraph_Calloc (&Tx, 1, typesize, msg)) ; // T is iso
464 :
465 : //----------------------------------------------------------------------
466 : // allocate workspace
467 : //----------------------------------------------------------------------
468 :
469 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &range, nthreads + 1,
470 : sizeof (int32_t), msg)) ;
471 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &count, nthreads + 1,
472 : sizeof (GrB_Index), msg)) ;
473 :
474 4 : memset (count, 0, sizeof (GrB_Index) * (nthreads + 1)) ;
475 :
476 : //----------------------------------------------------------------------
477 : // define parallel tasks to construct T
478 : //----------------------------------------------------------------------
479 :
480 : // thread tid works on rows range[tid]:range[tid+1]-1 of S and T
481 : int tid ;
482 12 : for (tid = 0 ; tid <= nthreads ; tid++)
483 : {
484 8 : range [tid] = (n * tid + nthreads - 1) / nthreads ;
485 : }
486 :
487 : //----------------------------------------------------------------------
488 : // determine the number entries to be constructed in T for each thread
489 : //----------------------------------------------------------------------
490 :
491 : #pragma omp parallel for num_threads(nthreads) schedule(static)
492 8 : for (tid = 0 ; tid < nthreads ; tid++)
493 : {
494 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
495 : {
496 9752 : int32_t deg = Sp [i + 1] - Sp [i] ;
497 9752 : count [tid + 1] += LAGRAPH_MIN (FASTSV_SAMPLES, deg) ;
498 : }
499 : }
500 :
501 : //----------------------------------------------------------------------
502 : // count = cumsum (count)
503 : //----------------------------------------------------------------------
504 :
505 8 : for (tid = 0 ; tid < nthreads ; tid++)
506 : {
507 4 : count [tid + 1] += count [tid] ;
508 : }
509 :
510 : //----------------------------------------------------------------------
511 : // construct T
512 : //----------------------------------------------------------------------
513 :
514 : // T (i,:) consists of the first FASTSV_SAMPLES of S (i,:).
515 :
516 : // todo: this could be done by GrB_Select, using a new operator.
517 :
518 : // Note that Tx is not modified. Only Tp and Tj are constructed.
519 :
520 : #pragma omp parallel for num_threads(nthreads) schedule(static)
521 8 : for (tid = 0 ; tid < nthreads ; tid++)
522 : {
523 4 : GrB_Index p = count [tid] ;
524 4 : Tp [range [tid]] = p ;
525 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
526 : {
527 : // construct T (i,:) from the first entries in S (i,:)
528 9752 : for (int32_t j = 0 ;
529 40390 : j < FASTSV_SAMPLES && Sp [i] + j < Sp [i + 1] ; j++)
530 : {
531 30638 : Tj [p++] = Sj [Sp [i] + j] ;
532 : }
533 9752 : Tp [i + 1] = p ;
534 : }
535 : }
536 :
537 : //----------------------------------------------------------------------
538 : // import the result into the GrB_Matrix T
539 : //----------------------------------------------------------------------
540 :
541 : // Note that Tx is unmodified.
542 :
543 : // in SuiteSparse:GraphBLAS v5, sizes are in bytes, not entries
544 4 : GrB_Index Tp_siz = Tp_size ;
545 4 : GrB_Index Tj_siz = Tj_size ;
546 4 : GrB_Index Tx_siz = Tx_size ;
547 :
548 4 : GrB_Index t_nvals = Tp [nrows] ;
549 4 : GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
550 : &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
551 : true, // T is iso
552 : S_jumbled, NULL)) ;
553 :
554 : //----------------------------------------------------------------------
555 : // find the connected components of T
556 : //----------------------------------------------------------------------
557 :
558 : // todo: this is nearly identical to the final phase below.
559 : // Make this a function
560 :
561 4 : bool change = true, is_first = true ;
562 26 : while (change)
563 : {
564 : // hooking & shortcutting
565 22 : GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
566 : GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
567 22 : if (!is_first)
568 : {
569 18 : LG_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads,
570 : ht_key, ht_val, &seed, msg)) ;
571 : }
572 22 : GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
573 : mngp, gp, NULL)) ;
574 :
575 : // calculate grandparent
576 : // fixme: NULL parameter is SS:GrB extension
577 22 : GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
578 : int64_t i ;
579 : #pragma omp parallel for num_threads(nthreads2) schedule(static)
580 54528 : for (i = 0 ; i < n ; i++)
581 : {
582 54506 : I [i] = (GrB_Index) V32 [i] ;
583 : }
584 22 : GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
585 :
586 : // todo: GrB_Vector_extract should have a variant where the index
587 : // list is not given by an array I, but as a GrB_Vector of type
588 : // GrB_UINT64 (or which can be typecast to GrB_UINT64). This is a
589 : // common issue that arises in other algorithms as well.
590 : // Likewise GrB_Matrix_extract, and all forms of GrB_assign.
591 :
592 : // check termination
593 22 : GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new,
594 : gp, NULL)) ;
595 22 : GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod,
596 : NULL)) ;
597 :
598 : // swap gp and gp_new
599 22 : GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
600 22 : is_first = false ;
601 : }
602 :
603 : //----------------------------------------------------------------------
604 : // todo: describe me
605 : //----------------------------------------------------------------------
606 :
607 4 : ht_init (ht_key, ht_val) ;
608 4 : ht_sample (V32, n, HASH_SAMPLES, ht_key, ht_val, &seed) ;
609 4 : int32_t key = ht_most_frequent (ht_key, ht_val) ;
610 : // todo: what if key is returned as -1? Then T below is invalid.
611 :
612 4 : int64_t t_nonempty = -1 ;
613 4 : bool T_jumbled = false, T_iso = true ;
614 :
615 : // export T
616 4 : GRB_TRY (GxB_Matrix_export_CSR (&T, &type, &nrows, &ncols, &Tp, &Tj,
617 : &Tx, &Tp_siz, &Tj_siz, &Tx_siz,
618 : &T_iso, &T_jumbled, NULL)) ;
619 :
620 : // todo what is this phase doing? It is constructing a matrix T that
621 : // depends only on S, key, and V32. T contains a subset of the entries
622 : // in S, except that T (i,:) is empty if
623 :
624 : // The prior content of T is ignored; it is exported from the earlier
625 : // phase, only to reuse the allocated space for T. However, T_jumbled
626 : // is preserved from the prior matrix T, which doesn't make sense.
627 :
628 : // This parallel loop is badly load balanced. Each thread operates on
629 : // the same number of rows of S, regardless of how many entries appear
630 : // in each set of rows. It uses one thread per task, statically
631 : // scheduled.
632 :
633 : #pragma omp parallel for num_threads(nthreads) schedule(static)
634 8 : for (tid = 0 ; tid < nthreads ; tid++)
635 : {
636 4 : GrB_Index ptr = Sp [range [tid]] ;
637 : // thread tid scans S (range [tid]:range [tid+1]-1,:),
638 : // and constructs T(i,:) for all rows in this range.
639 9756 : for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
640 : {
641 9752 : int32_t pv = V32 [i] ; // what is pv?
642 9752 : Tp [i] = ptr ; // start the construction of T(i,:)
643 : // T(i,:) is empty if pv == key
644 9752 : if (pv != key)
645 : {
646 : // scan S(i,:)
647 46550 : for (GrB_Index p = Sp [i] ; p < Sp [i+1] ; p++)
648 : {
649 : // get S(i,j)
650 41274 : int32_t j = Sj [p] ;
651 41274 : if (V32 [j] != key)
652 : {
653 : // add the entry T(i,j) to T, but skip it if
654 : // V32 [j] is equal to key
655 41264 : Tj [ptr++] = j ;
656 : }
657 : }
658 : // add the entry T(i,key) if there is room for it in T(i,:)
659 5276 : if (ptr - Tp [i] < Sp [i+1] - Sp [i])
660 : {
661 2 : Tj [ptr++] = key ;
662 : }
663 : }
664 : }
665 : // count the number of entries inserted into T by this thread?
666 4 : count [tid] = ptr - Tp [range [tid]] ;
667 : }
668 :
669 : // Compact empty space out of Tj not filled in from the above phase.
670 : // This is a lot of work and should be done in parallel.
671 4 : GrB_Index offset = 0 ;
672 8 : for (tid = 0 ; tid < nthreads ; tid++)
673 : {
674 : // the source and destination can overlap;
675 : // this is safe (memmove_s not necessary):
676 4 : memmove (Tj + offset, Tj + Tp [range [tid]],
677 4 : sizeof (GrB_Index) * count [tid]) ;
678 4 : offset += count [tid] ;
679 4 : count [tid] = offset - count [tid] ;
680 : }
681 :
682 : // Compact empty space out of Tp
683 : #pragma omp parallel for num_threads(nthreads) schedule(static)
684 8 : for (tid = 0 ; tid < nthreads ; tid++)
685 : {
686 4 : GrB_Index ptr = Tp [range [tid]] ;
687 9756 : for (int64_t i = range [tid] ; i < range [tid+1] ; i++)
688 : {
689 9752 : Tp [i] -= ptr - count [tid] ;
690 : }
691 : }
692 :
693 : // finalize T
694 4 : Tp [n] = offset ;
695 :
696 : // free workspace
697 4 : LAGraph_Free ((void **) &count, NULL) ;
698 4 : LAGraph_Free ((void **) &range, NULL) ;
699 :
700 : // import S (unchanged since last export)
701 4 : GRB_TRY (GxB_Matrix_import_CSR (&S, type, nrows, ncols,
702 : &Sp, &Sj, &Sx, Sp_size, Sj_size, Sx_size,
703 : S_iso, S_jumbled, NULL)) ;
704 :
705 : // import T for the final phase
706 4 : GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
707 : &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
708 : T_iso, /* T is jumbled: */ true, NULL)) ;
709 :
710 : // restore G->A
711 4 : G->A = S ;
712 :
713 : }
714 : else
715 : {
716 :
717 : // no sampling; the final phase operates on the whole graph
718 20 : T = S ;
719 :
720 : }
721 :
722 : //--------------------------------------------------------------------------
723 : // final phase
724 : //--------------------------------------------------------------------------
725 :
726 24 : GRB_TRY (GrB_Matrix_nvals (&nnz, T)) ;
727 :
728 24 : bool change = true ;
729 96 : while (change && nnz > 0)
730 : {
731 : // hooking & shortcutting
732 72 : GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
733 : GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
734 72 : GRB_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads, ht_key,
735 : ht_val, &seed, msg)) ;
736 72 : GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
737 : mngp, gp, NULL)) ;
738 :
739 : // calculate grandparent
740 : // fixme: NULL parameter is SS:GrB extension
741 72 : GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
742 : int64_t k ;
743 : #pragma omp parallel for num_threads(nthreads2) schedule(static)
744 45924 : for (k = 0 ; k < n ; k++)
745 : {
746 45852 : I [k] = (GrB_Index) V32 [k] ;
747 : }
748 72 : GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
749 :
750 : // check termination
751 72 : GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new, gp,
752 : NULL)) ;
753 72 : GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod, NULL)) ;
754 :
755 : // swap gp and gp_new
756 72 : GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
757 : }
758 :
759 : //--------------------------------------------------------------------------
760 : // free workspace and return result
761 : //--------------------------------------------------------------------------
762 :
763 24 : (*component) = f ;
764 24 : f = NULL ;
765 24 : if (sampling)
766 : {
767 4 : GrB_free (&T) ;
768 : }
769 24 : LG_FREE_ALL ;
770 24 : return (GrB_SUCCESS) ;
771 : #else
772 : return (GrB_NOT_IMPLEMENTED) ;
773 : #endif
774 : }
|