LCOV - code coverage report
Current view: top level - experimental/algorithm - LG_CC_FastSV5.c (source / functions) Hit Total Coverage
Test: LAGraph code coverage report. Commit id: 7b9d2ee. Current time (UTC): 2025-06-03T21:57:17Z Lines: 203 203 100.0 %
Date: 2025-06-03 22:02:40 Functions: 5 5 100.0 %

          Line data    Source code
       1             : //------------------------------------------------------------------------------
       2             : // LG_CC_FastSV5: connected components
       3             : //------------------------------------------------------------------------------
       4             : 
       5             : // LAGraph, (c) 2019-2023 by The LAGraph Contributors, All Rights Reserved.
       6             : // SPDX-License-Identifier: BSD-2-Clause
       7             : //
       8             : // For additional details (including references to third party source code and
       9             : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
      10             : // Contributors.txt for a full list of contributors. Created, in part, with
      11             : // funding and support from the U.S. Government (see Acknowledgments.txt file).
      12             : // DM22-0790
      13             : 
      14             : // Contributed by Yongzhe Zhang; modified by Tim Davis, Texas A&M University
      15             : 
      16             : //------------------------------------------------------------------------------
      17             : 
      18             : // Code is based on the algorithm described in the following paper
      19             : // Zhang, Azad, Hu. FastSV: FastSV: A Distributed-Memory Connected Component
      20             : // Algorithm with Fast Convergence (SIAM PP20)
      21             : 
      22             : // A subsequent update to the algorithm is here (which might not be reflected
      23             : // in this code):
      24             : //
      25             : // Yongzhe Zhang, Ariful Azad, Aydin Buluc: Parallel algorithms for finding
      26             : // connected components using linear algebra. J. Parallel Distributed Comput.
      27             : // 144: 14-27 (2020).
      28             : 
      29             : // Modified by Tim Davis, Texas A&M University
      30             : 
      31             : // The input matrix A must be symmetric.  Self-edges (diagonal entries) are
      32             : // OK, and are ignored.  The values and type of A are ignored; just its
      33             : // structure is accessed.
      34             : 
      35             : // The matrix A must have dimension 2^32 or less.
      36             : // todo: Need a 64-bit version of this method.
      37             : 
      38             : // todo: this function is not thread-safe, since it exports G->A and then
      39             : // reimports it back.  G->A is unchanged when the function returns, but during
      40             : // execution G->A is invalid.
      41             : 
      42             : // A note about "todo" and "fixme" in this file:  these do not need to be fixed
      43             : // or changed for this method, since the revised version appears as
      44             : // src/algorithm/LG_CC_FastSV6.c.  They have thus been changed here to lower
      45             : // case.
      46             : 
      47             : // todo: free all workspace in LG_FREE_ALL
      48             : #define LG_FREE_ALL ;
      49             : 
      50             : #include "LG_internal.h"
      51             : 
      52             : // TODO: not ready for src; need a vanilla method with no GxB
      53             : 
      54             : #if LAGRAPH_SUITESPARSE
      55             : 
      56             : //------------------------------------------------------------------------------
      57             : // hash functions: todo describe me
      58             : //------------------------------------------------------------------------------
      59             : 
      60             : // hash table size must be a power of 2
      61             : #define HASH_SIZE 1024
      62             : 
      63             : // number of samples to insert into the hash table
      64             : // todo: this seems to be a lot of entries for a HASH_SIZE of 1024.
      65             : // There could be lots of collisions.
      66             : #define HASH_SAMPLES 864
      67             : 
      68             : #define HASH(x) (((x << 4) + x) & (HASH_SIZE-1))
      69             : #define NEXT(x) ((x + 23) & (HASH_SIZE-1))
      70             : 
      71             : //------------------------------------------------------------------------------
      72             : // ht_init: todo describe me
      73             : //------------------------------------------------------------------------------
      74             : 
      75             : // Clear the hash table counts (ht_val [0:HASH_SIZE-1] = 0), and set all hash
      76             : // table entries as empty (ht_key [0:HASH_SIZE-1] =-1).
      77             : 
      78             : // todo: the memset of ht_key is confusing
      79             : 
      80             : // todo: the name "ht_val" is confusing.  It is not a value, but a count of
      81             : // the number of times the value x = ht_key [h] has been inserted into the
      82             : // hth position in the hash table.  It should be renamed ht_cnt.
      83             : 
      84          48 : static inline void ht_init
      85             : (
      86             :     int32_t *ht_key,
      87             :     int32_t *ht_val
      88             : )
      89             : {
      90          48 :     memset (ht_key, -1, sizeof (int32_t) * HASH_SIZE) ;
      91          48 :     memset (ht_val,  0, sizeof (int32_t) * HASH_SIZE) ;
      92          48 : }
      93             : 
      94             : //------------------------------------------------------------------------------
      95             : // ht_sample: todo describe me
      96             : //------------------------------------------------------------------------------
      97             : 
      98             : //
      99             : 
     100          48 : static inline void ht_sample
     101             : (
     102             :     uint32_t *V32,      // array of size n (todo: this is a bad variable name)
     103             :     int32_t n,
     104             :     int32_t samples,    // number of samples to take from V32
     105             :     int32_t *ht_key,
     106             :     int32_t *ht_val,
     107             :     uint64_t *seed
     108             : )
     109             : {
     110       41520 :     for (int32_t k = 0 ; k < samples ; k++)
     111             :     {
     112             :         // select an entry from V32 at random
     113       41472 :         int32_t x = V32 [LG_Random64 (seed) % n] ;
     114             : 
     115             :         // find x in the hash table
     116             :         // todo: make this loop a static inline function (see also below)
     117       41472 :         int32_t h = HASH (x) ;
     118       46012 :         while (ht_key [h] != -1 && ht_key [h] != x)
     119             :         {
     120        4540 :             h = NEXT (h) ;
     121             :         }
     122             : 
     123       41472 :         ht_key [h] = x ;
     124       41472 :         ht_val [h]++ ;
     125             :     }
     126          48 : }
     127             : 
     128             : //------------------------------------------------------------------------------
     129             : // ht_most_frequent: todo describe me
     130             : //------------------------------------------------------------------------------
     131             : 
     132             : // todo what if key is returned as -1?  Code breaks.  todo: handle this case
     133             : 
     134           4 : static inline int32_t ht_most_frequent
     135             : (
     136             :     int32_t *ht_key,
     137             :     int32_t *ht_val
     138             : )
     139             : {
     140           4 :     int32_t key = -1 ;
     141           4 :     int32_t val = 0 ;                       // max (ht_val [0:HASH_SIZE-1])
     142        4100 :     for (int32_t h = 0 ; h < HASH_SIZE ; h++)
     143             :     {
     144        4096 :         if (ht_val [h] > val)
     145             :         {
     146           8 :             key = ht_key [h] ;
     147           8 :             val = ht_val [h] ;
     148             :         }
     149             :     }
     150           4 :     return (key) ;      // return most frequent key
     151             : }
     152             : 
     153             : //------------------------------------------------------------------------------
     154             : // Reduce_assign32:  w (index) += s, using MIN as the "+=" accum operator
     155             : //------------------------------------------------------------------------------
     156             : 
     157             : // mask = NULL, accumulator = GrB_MIN_UINT32, descriptor = NULL.
     158             : // Duplicates are summed with the accumulator, which differs from how
     159             : // GrB_assign works.  GrB_assign states that the presence of duplicates results
     160             : // in undefined behavior.  GrB_assign in SuiteSparse:GraphBLAS follows the
     161             : // MATLAB rule, which discards all but the first of the duplicates.
     162             : 
     163             : // TODO: Reduce_assign32 is slow.  See src/algorithm/LG_CC_FastSV6.
     164             : 
     165          90 : static inline int Reduce_assign32
     166             : (
     167             :     GrB_Vector *w_handle,   // vector of size n, all entries present
     168             :     GrB_Vector *s_handle,   // vector of size n, all entries present
     169             :     uint32_t *index,        // array of size n, can have duplicates
     170             :     GrB_Index n,
     171             :     int nthreads,
     172             :     int32_t *ht_key,        // hash table
     173             :     int32_t *ht_val,        // hash table (count of # of entries)
     174             :     uint64_t *seed,         // random
     175             :     char *msg
     176             : )
     177             : {
     178             : 
     179             :     GrB_Type w_type, s_type ;
     180             :     GrB_Index w_n, s_n, w_nvals, s_nvals, *w_i, *s_i, w_size, s_size ;
     181             :     uint32_t *w_x, *s_x ;
     182          90 :     bool s_iso = false ;
     183             : 
     184             :     //--------------------------------------------------------------------------
     185             :     // export w and s
     186             :     //--------------------------------------------------------------------------
     187             : 
     188             :     // export the GrB_Vectors w and s as full arrays, to get direct access to
     189             :     // their contents.  Note that this would fail if w or s are not full, with
     190             :     // all entries present.
     191          90 :     GRB_TRY (GxB_Vector_export_Full (w_handle, &w_type, &w_n, (void **) &w_x,
     192             :         &w_size, NULL, NULL)) ;
     193          90 :     GRB_TRY (GxB_Vector_export_Full (s_handle, &s_type, &s_n, (void **) &s_x,
     194             :         &s_size, &s_iso, NULL)) ;
     195             : 
     196             :     #if defined ( COVERAGE )
     197          90 :     if (n >= 200)   // for test coverage only; do not use in production!!
     198             :     #else
     199             :     if (nthreads >= 4)
     200             :     #endif
     201             :     {
     202             : 
     203             :         // allocate a buf array for each thread, of size HASH_SIZE
     204             :         uint32_t *mem ;
     205          44 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &mem, nthreads*HASH_SIZE,
     206             :             sizeof (uint32_t), msg)) ;
     207             :         // todo: check out-of-memory condition here
     208             : 
     209             :         // todo why is hashing needed here?  hashing is slow for what needs
     210             :         // to be computed here.  GraphBLAS has fast MIN atomic monoids that
     211             :         // do not require hashing.
     212          44 :         ht_init (ht_key, ht_val) ;
     213          44 :         ht_sample (index, n, HASH_SAMPLES, ht_key, ht_val, seed) ;
     214             : 
     215             :         int tid ;
     216             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     217          88 :         for (tid = 0 ; tid < nthreads ; tid++)
     218             :         {
     219             :             // get the thread-specific buf array of size HASH_SIZE
     220             :             // todo: buf is a bad variable name; it's not a "buffer",
     221             :             // but a local workspace to compute the local version of w_x.
     222          44 :             uint32_t *buf = mem + tid * HASH_SIZE ;
     223             : 
     224             :             // copy the values from the global hash table into buf
     225       45100 :             for (int32_t h = 0 ; h < HASH_SIZE ; h++)
     226             :             {
     227       45056 :                 if (ht_key [h] != -1)
     228             :                 {
     229       15352 :                     buf [h] = w_x [ht_key [h]] ;
     230             :                 }
     231             :             }
     232             : 
     233             :             // this thread works on index [kstart:kend]
     234          44 :             int32_t kstart = (n * tid + nthreads - 1) / nthreads ;
     235          44 :             int32_t kend = (n * tid + n + nthreads - 1) / nthreads ;
     236       89968 :             for (int32_t k = kstart ; k < kend ; k++)
     237             :             {
     238       89924 :                 uint32_t i = index [k] ;
     239             : 
     240             :                 // todo: make this loop a static inline function
     241       89924 :                 int32_t h = HASH (i) ;
     242      122754 :                 while (ht_key [h] != -1 && ht_key [h] != i)
     243             :                 {
     244       32830 :                     h = NEXT (h) ;
     245             :                 }
     246             : 
     247       89924 :                 if (ht_key [h] == -1)
     248             :                 {
     249             :                     // todo is this a race condition?
     250       27746 :                     w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
     251             :                 }
     252             :                 else
     253             :                 {
     254       62178 :                     buf [h] = LAGRAPH_MIN (buf [h], s_x [s_iso?0:k]) ;
     255             :                 }
     256             :             }
     257             :         }
     258             : 
     259             :         // combine intermediate results from each thread
     260       45100 :         for (int32_t h = 0 ; h < HASH_SIZE ; h++)
     261             :         {
     262       45056 :             int32_t i = ht_key [h] ;
     263       45056 :             if (i != -1)
     264             :             {
     265       30704 :                 for (tid = 0 ; tid < nthreads ; tid++)
     266             :                 {
     267       15352 :                     w_x [i] = LAGRAPH_MIN (w_x [i], mem [tid * HASH_SIZE + h]) ;
     268             :                 }
     269             :             }
     270             :         }
     271             : 
     272          44 :         LAGraph_Free ((void **) &mem, NULL) ;
     273             :     }
     274             :     else
     275             :     {
     276             :         // sequential version
     277         728 :         for (int64_t k = 0 ; k < n ; k++)
     278             :         {
     279         682 :             uint32_t i = index [k] ;
     280         682 :             w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
     281             :         }
     282             :     }
     283             : 
     284             :     //--------------------------------------------------------------------------
     285             :     // reimport w and s back into GrB_Vectors, and return result
     286             :     //--------------------------------------------------------------------------
     287             : 
     288             :     // s is unchanged.  It was exported only to compute w (index) += s
     289             : 
     290          90 :     GRB_TRY (GxB_Vector_import_Full (w_handle, w_type, w_n, (void **) &w_x,
     291             :         w_size, false, NULL)) ;
     292          90 :     GRB_TRY (GxB_Vector_import_Full (s_handle, s_type, s_n, (void **) &s_x,
     293             :         s_size, s_iso, NULL)) ;
     294             : 
     295          90 :     return (GrB_SUCCESS) ;
     296             : }
     297             : 
     298             : #endif
     299             : 
     300             : //------------------------------------------------------------------------------
     301             : // LG_CC_FastSV5
     302             : //------------------------------------------------------------------------------
     303             : 
     304             : // The output of LG_CC_FastSV5 is a vector component, where
     305             : // component(i)=s if node i is in the connected compononent whose
     306             : // representative node is node s.  If s is a representative, then
     307             : // component(s)=s.  The number of connected components in the graph G is the
     308             : // number of representatives.
     309             : 
     310             : #undef  LG_FREE_ALL
     311             : #define LG_FREE_ALL                             \
     312             : {                                               \
     313             :     LAGraph_Free ((void **) &I, NULL) ;         \
     314             :     LAGraph_Free ((void **) &V32, NULL) ;       \
     315             :     LAGraph_Free ((void **) &ht_key, NULL) ;    \
     316             :     LAGraph_Free ((void **) &ht_val, NULL) ;    \
     317             :     /* todo why is T not freed?? */             \
     318             :     GrB_free (&f) ;                             \
     319             :     GrB_free (&gp) ;                            \
     320             :     GrB_free (&mngp) ;                          \
     321             :     GrB_free (&gp_new) ;                        \
     322             :     GrB_free (&mod) ;                           \
     323             : }
     324             : 
     325          24 : int LG_CC_FastSV5           // SuiteSparse:GraphBLAS method
     326             : (
     327             :     // output
     328             :     GrB_Vector *component,  // component(i)=s if node is in the component s
     329             :     // inputs
     330             :     LAGraph_Graph G,        // input graph, G->A can change
     331             :     char *msg
     332             : )
     333             : {
     334             : #if LAGRAPH_SUITESPARSE
     335             : 
     336             :     //--------------------------------------------------------------------------
     337             :     // check inputs
     338             :     //--------------------------------------------------------------------------
     339             : 
     340          24 :     LG_CLEAR_MSG ;
     341             : 
     342          24 :     uint32_t *V32 = NULL ;
     343          24 :     int32_t *ht_key = NULL, *ht_val = NULL ;
     344          24 :     GrB_Index n, nnz, *I = NULL ;
     345          24 :     GrB_Vector f = NULL, gp_new = NULL, mngp = NULL, mod = NULL, gp = NULL ;
     346          24 :     GrB_Matrix T = NULL ;
     347             : 
     348          24 :     LG_TRY (LAGraph_CheckGraph (G, msg)) ;
     349          24 :     LG_ASSERT (component != NULL, GrB_NULL_POINTER) ;
     350             : 
     351          24 :     LG_ASSERT_MSG ((G->kind == LAGraph_ADJACENCY_UNDIRECTED ||
     352             :        (G->kind == LAGraph_ADJACENCY_DIRECTED &&
     353             :         G->is_symmetric_structure == LAGraph_TRUE)),
     354             :         -1001, "G->A must be known to be symmetric") ;
     355             : 
     356          24 :     GrB_Matrix S = G->A ;
     357          24 :     GRB_TRY (GrB_Matrix_nrows (&n, S)) ;
     358          24 :     GRB_TRY (GrB_Matrix_nvals (&nnz, S)) ;
     359             : 
     360          24 :     LG_ASSERT_MSG (n <= UINT32_MAX, -1, "problem too large (fixme)") ;
     361             : 
     362             :     #define FASTSV_SAMPLES 4
     363             : 
     364          24 :     bool sampling = (n * FASTSV_SAMPLES * 2 < nnz) ;
     365             : 
     366             :     // random number seed
     367          24 :     uint64_t seed = n ;
     368             : 
     369             :     //--------------------------------------------------------------------------
     370             :     // initializations
     371             :     //--------------------------------------------------------------------------
     372             : 
     373             :     // determine # of threads to use for Reduce_assign
     374             :     int nthreads, nthreads_outer, nthreads_inner ;
     375          24 :     LG_TRY (LAGraph_GetNumThreads (&nthreads_outer, &nthreads_inner, msg)) ;
     376          24 :     nthreads = nthreads_outer * nthreads_inner ;
     377             : 
     378          24 :     nthreads = LAGRAPH_MIN (nthreads, n / 16) ;
     379          24 :     nthreads = LAGRAPH_MAX (nthreads, 1) ;
     380             : 
     381             :     // # of threads to use for typecast
     382          24 :     int nthreads2 = n / (64*1024) ;
     383          24 :     nthreads2 = LAGRAPH_MIN (nthreads2, nthreads) ;
     384          24 :     nthreads2 = LAGRAPH_MAX (nthreads2, 1) ;
     385             : 
     386             :     // vectors
     387          24 :     GRB_TRY (GrB_Vector_new (&f,      GrB_UINT32, n)) ;
     388          24 :     GRB_TRY (GrB_Vector_new (&gp_new, GrB_UINT32, n)) ;
     389          24 :     GRB_TRY (GrB_Vector_new (&mod,    GrB_BOOL,   n)) ;
     390             : 
     391             :     // temporary arrays
     392          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &I  , n, sizeof (GrB_Index), msg)) ;
     393          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &V32, n, sizeof (uint32_t), msg)) ;
     394             : 
     395             :     // prepare vectors
     396             :     int64_t i ;
     397             :     #pragma omp parallel for num_threads(nthreads2) schedule(static)
     398       16284 :     for (i = 0 ; i < n ; i++)
     399             :     {
     400       16260 :         I [i] = i ;
     401       16260 :         V32 [i] = (uint32_t) i ;
     402             :     }
     403             : 
     404          24 :     GRB_TRY (GrB_Vector_build (f, I, V32, n, GrB_PLUS_UINT32)) ;
     405          24 :     GRB_TRY (GrB_Vector_dup (&gp,   f)) ;
     406          24 :     GRB_TRY (GrB_Vector_dup (&mngp, f)) ;
     407             : 
     408             :     // allocate the hash table
     409          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_key, HASH_SIZE, sizeof (int32_t),
     410             :         msg)) ;
     411          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_val, HASH_SIZE, sizeof (int32_t),
     412             :         msg)) ;
     413             : 
     414             :     //--------------------------------------------------------------------------
     415             :     // sample phase
     416             :     //--------------------------------------------------------------------------
     417             : 
     418          24 :     if (sampling)
     419             :     {
     420             : 
     421             :         //----------------------------------------------------------------------
     422             :         // export S = G->A in CSR format
     423             :         //----------------------------------------------------------------------
     424             : 
     425             :         // S is not modified.  It is only exported so that its contents can be
     426             :         // read by the parallel loops below.
     427             : 
     428             :         GrB_Type type ;
     429             :         GrB_Index nrows, ncols, nvals ;
     430             :         size_t typesize ;
     431             :         int64_t nonempty ;
     432             :         GrB_Index *Sp, *Sj ;
     433             :         void *Sx ;
     434           4 :         bool S_jumbled = false ;
     435             :         GrB_Index Sp_size, Sj_size, Sx_size ;
     436           4 :         bool S_iso = false ;
     437             : 
     438           4 :         GRB_TRY (GrB_Matrix_nvals (&nvals, S)) ;
     439           4 :         GRB_TRY (GxB_Matrix_export_CSR (&S, &type, &nrows, &ncols, &Sp, &Sj,
     440             :             &Sx, &Sp_size, &Sj_size, &Sx_size,
     441             :             &S_iso, &S_jumbled, NULL)) ;
     442           4 :         LAGRAPH_TRY (LAGraph_SizeOfType (&typesize, type, msg)) ;
     443           4 :         G->A = NULL ;
     444             : 
     445             :         //----------------------------------------------------------------------
     446             :         // allocate space to construct T
     447             :         //----------------------------------------------------------------------
     448             : 
     449           4 :         GrB_Index Tp_len = nrows+1, Tp_size = Tp_len*sizeof(GrB_Index);
     450           4 :         GrB_Index Tj_len = nvals,   Tj_size = Tj_len*sizeof(GrB_Index);
     451           4 :         GrB_Index Tx_len = nvals ;
     452             : 
     453           4 :         GrB_Index *Tp = NULL, *Tj = NULL ;
     454           4 :         GrB_Index Tx_size = typesize ;
     455           4 :         void *Tx = NULL ;
     456           4 :         int32_t *range = NULL ;
     457           4 :         GrB_Index *count = NULL ;
     458             : 
     459           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tp, Tp_len,
     460             :             sizeof (GrB_Index), msg)) ;
     461           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tj, Tj_len,
     462             :             sizeof (GrB_Index), msg)) ;
     463           4 :         LAGRAPH_TRY (LAGraph_Calloc (&Tx, 1, typesize, msg)) ;   // T is iso
     464             : 
     465             :         //----------------------------------------------------------------------
     466             :         // allocate workspace
     467             :         //----------------------------------------------------------------------
     468             : 
     469           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &range, nthreads + 1,
     470             :             sizeof (int32_t), msg)) ;
     471           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &count, nthreads + 1,
     472             :             sizeof (GrB_Index), msg)) ;
     473             : 
     474           4 :         memset (count, 0, sizeof (GrB_Index) * (nthreads + 1)) ;
     475             : 
     476             :         //----------------------------------------------------------------------
     477             :         // define parallel tasks to construct T
     478             :         //----------------------------------------------------------------------
     479             : 
     480             :         // thread tid works on rows range[tid]:range[tid+1]-1 of S and T
     481             :         int tid ;
     482          12 :         for (tid = 0 ; tid <= nthreads ; tid++)
     483             :         {
     484           8 :             range [tid] = (n * tid + nthreads - 1) / nthreads ;
     485             :         }
     486             : 
     487             :         //----------------------------------------------------------------------
     488             :         // determine the number entries to be constructed in T for each thread
     489             :         //----------------------------------------------------------------------
     490             : 
     491             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     492           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     493             :         {
     494        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     495             :             {
     496        9752 :                 int32_t deg = Sp [i + 1] - Sp [i] ;
     497        9752 :                 count [tid + 1] += LAGRAPH_MIN (FASTSV_SAMPLES, deg) ;
     498             :             }
     499             :         }
     500             : 
     501             :         //----------------------------------------------------------------------
     502             :         // count = cumsum (count)
     503             :         //----------------------------------------------------------------------
     504             : 
     505           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     506             :         {
     507           4 :             count [tid + 1] += count [tid] ;
     508             :         }
     509             : 
     510             :         //----------------------------------------------------------------------
     511             :         // construct T
     512             :         //----------------------------------------------------------------------
     513             : 
     514             :         // T (i,:) consists of the first FASTSV_SAMPLES of S (i,:).
     515             : 
     516             :         // todo: this could be done by GrB_Select, using a new operator.
     517             : 
     518             :         // Note that Tx is not modified.  Only Tp and Tj are constructed.
     519             : 
     520             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     521           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     522             :         {
     523           4 :             GrB_Index p = count [tid] ;
     524           4 :             Tp [range [tid]] = p ;
     525        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     526             :             {
     527             :                 // construct T (i,:) from the first entries in S (i,:)
     528        9752 :                 for (int32_t j = 0 ;
     529       40390 :                     j < FASTSV_SAMPLES && Sp [i] + j < Sp [i + 1] ; j++)
     530             :                 {
     531       30638 :                     Tj [p++] = Sj [Sp [i] + j] ;
     532             :                 }
     533        9752 :                 Tp [i + 1] = p ;
     534             :             }
     535             :         }
     536             : 
     537             :         //----------------------------------------------------------------------
     538             :         // import the result into the GrB_Matrix T
     539             :         //----------------------------------------------------------------------
     540             : 
     541             :         // Note that Tx is unmodified.
     542             : 
     543             :         // in SuiteSparse:GraphBLAS v5, sizes are in bytes, not entries
     544           4 :         GrB_Index Tp_siz = Tp_size ;
     545           4 :         GrB_Index Tj_siz = Tj_size ;
     546           4 :         GrB_Index Tx_siz = Tx_size ;
     547             : 
     548           4 :         GrB_Index t_nvals = Tp [nrows] ;
     549           4 :         GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
     550             :                 &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
     551             :                 true,   // T is iso
     552             :                 S_jumbled, NULL)) ;
     553             : 
     554             :         //----------------------------------------------------------------------
     555             :         // find the connected components of T
     556             :         //----------------------------------------------------------------------
     557             : 
     558             :         // todo: this is nearly identical to the final phase below.
     559             :         // Make this a function
     560             : 
     561           4 :         bool change = true, is_first = true ;
     562          26 :         while (change)
     563             :         {
     564             :             // hooking & shortcutting
     565          22 :             GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
     566             :                 GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
     567          22 :             if (!is_first)
     568             :             {
     569          18 :                 LG_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads,
     570             :                     ht_key, ht_val, &seed, msg)) ;
     571             :             }
     572          22 :             GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
     573             :                 mngp, gp, NULL)) ;
     574             : 
     575             :             // calculate grandparent
     576             :             // fixme: NULL parameter is SS:GrB extension
     577          22 :             GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
     578             :             int64_t i ;
     579             :             #pragma omp parallel for num_threads(nthreads2) schedule(static)
     580       54528 :             for (i = 0 ; i < n ; i++)
     581             :             {
     582       54506 :                 I [i] = (GrB_Index) V32 [i] ;
     583             :             }
     584          22 :             GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
     585             : 
     586             :             // todo: GrB_Vector_extract should have a variant where the index
     587             :             // list is not given by an array I, but as a GrB_Vector of type
     588             :             // GrB_UINT64 (or which can be typecast to GrB_UINT64).  This is a
     589             :             // common issue that arises in other algorithms as well.
     590             :             // Likewise GrB_Matrix_extract, and all forms of GrB_assign.
     591             : 
     592             :             // check termination
     593          22 :             GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new,
     594             :                 gp, NULL)) ;
     595          22 :             GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod,
     596             :                 NULL)) ;
     597             : 
     598             :             // swap gp and gp_new
     599          22 :             GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
     600          22 :             is_first = false ;
     601             :         }
     602             : 
     603             :         //----------------------------------------------------------------------
     604             :         // todo: describe me
     605             :         //----------------------------------------------------------------------
     606             : 
     607           4 :         ht_init (ht_key, ht_val) ;
     608           4 :         ht_sample (V32, n, HASH_SAMPLES, ht_key, ht_val, &seed) ;
     609           4 :         int32_t key = ht_most_frequent (ht_key, ht_val) ;
     610             :         // todo: what if key is returned as -1?  Then T below is invalid.
     611             : 
     612           4 :         int64_t t_nonempty = -1 ;
     613           4 :         bool T_jumbled = false, T_iso = true ;
     614             : 
     615             :         // export T
     616           4 :         GRB_TRY (GxB_Matrix_export_CSR (&T, &type, &nrows, &ncols, &Tp, &Tj,
     617             :             &Tx, &Tp_siz, &Tj_siz, &Tx_siz,
     618             :             &T_iso, &T_jumbled, NULL)) ;
     619             : 
     620             :         // todo what is this phase doing?  It is constructing a matrix T that
     621             :         // depends only on S, key, and V32.  T contains a subset of the entries
     622             :         // in S, except that T (i,:) is empty if
     623             : 
     624             :         // The prior content of T is ignored; it is exported from the earlier
     625             :         // phase, only to reuse the allocated space for T.  However, T_jumbled
     626             :         // is preserved from the prior matrix T, which doesn't make sense.
     627             : 
     628             :         // This parallel loop is badly load balanced.  Each thread operates on
     629             :         // the same number of rows of S, regardless of how many entries appear
     630             :         // in each set of rows.  It uses one thread per task, statically
     631             :         // scheduled.
     632             : 
     633             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     634           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     635             :         {
     636           4 :             GrB_Index ptr = Sp [range [tid]] ;
     637             :             // thread tid scans S (range [tid]:range [tid+1]-1,:),
     638             :             // and constructs T(i,:) for all rows in this range.
     639        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     640             :             {
     641        9752 :                 int32_t pv = V32 [i] ;  // what is pv?
     642        9752 :                 Tp [i] = ptr ;          // start the construction of T(i,:)
     643             :                 // T(i,:) is empty if pv == key
     644        9752 :                 if (pv != key)
     645             :                 {
     646             :                     // scan S(i,:)
     647       46550 :                     for (GrB_Index p = Sp [i] ; p < Sp [i+1] ; p++)
     648             :                     {
     649             :                         // get S(i,j)
     650       41274 :                         int32_t j = Sj [p] ;
     651       41274 :                         if (V32 [j] != key)
     652             :                         {
     653             :                             // add the entry T(i,j) to T, but skip it if
     654             :                             // V32 [j] is equal to key
     655       41264 :                             Tj [ptr++] = j ;
     656             :                         }
     657             :                     }
     658             :                     // add the entry T(i,key) if there is room for it in T(i,:)
     659        5276 :                     if (ptr - Tp [i] < Sp [i+1] - Sp [i])
     660             :                     {
     661           2 :                         Tj [ptr++] = key ;
     662             :                     }
     663             :                 }
     664             :             }
     665             :             // count the number of entries inserted into T by this thread?
     666           4 :             count [tid] = ptr - Tp [range [tid]] ;
     667             :         }
     668             : 
     669             :         // Compact empty space out of Tj not filled in from the above phase.
     670             :         // This is a lot of work and should be done in parallel.
     671           4 :         GrB_Index offset = 0 ;
     672           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     673             :         {
     674             :             // the source and destination can overlap;
     675             :             // this is safe (memmove_s not necessary):
     676           4 :             memmove (Tj + offset, Tj + Tp [range [tid]],
     677           4 :                 sizeof (GrB_Index) * count [tid]) ;
     678           4 :             offset += count [tid] ;
     679           4 :             count [tid] = offset - count [tid] ;
     680             :         }
     681             : 
     682             :         // Compact empty space out of Tp
     683             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     684           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     685             :         {
     686           4 :             GrB_Index ptr = Tp [range [tid]] ;
     687        9756 :             for (int64_t i = range [tid] ; i < range [tid+1] ; i++)
     688             :             {
     689        9752 :                 Tp [i] -= ptr - count [tid] ;
     690             :             }
     691             :         }
     692             : 
     693             :         // finalize T
     694           4 :         Tp [n] = offset ;
     695             : 
     696             :         // free workspace
     697           4 :         LAGraph_Free ((void **) &count, NULL) ;
     698           4 :         LAGraph_Free ((void **) &range, NULL) ;
     699             : 
     700             :         // import S (unchanged since last export)
     701           4 :         GRB_TRY (GxB_Matrix_import_CSR (&S, type, nrows, ncols,
     702             :                 &Sp, &Sj, &Sx, Sp_size, Sj_size, Sx_size,
     703             :                 S_iso, S_jumbled, NULL)) ;
     704             : 
     705             :         // import T for the final phase
     706           4 :         GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
     707             :                 &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
     708             :                 T_iso, /* T is jumbled: */ true, NULL)) ;
     709             : 
     710             :         // restore G->A
     711           4 :         G->A = S ;
     712             : 
     713             :     }
     714             :     else
     715             :     {
     716             : 
     717             :         // no sampling; the final phase operates on the whole graph
     718          20 :         T = S ;
     719             : 
     720             :     }
     721             : 
     722             :     //--------------------------------------------------------------------------
     723             :     // final phase
     724             :     //--------------------------------------------------------------------------
     725             : 
     726          24 :     GRB_TRY (GrB_Matrix_nvals (&nnz, T)) ;
     727             : 
     728          24 :     bool change = true ;
     729          96 :     while (change && nnz > 0)
     730             :     {
     731             :         // hooking & shortcutting
     732          72 :         GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
     733             :                           GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
     734          72 :         GRB_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads, ht_key,
     735             :                                   ht_val, &seed, msg)) ;
     736          72 :         GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
     737             :                                mngp, gp, NULL)) ;
     738             : 
     739             :         // calculate grandparent
     740             :         // fixme: NULL parameter is SS:GrB extension
     741          72 :         GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
     742             :         int64_t k ;
     743             :         #pragma omp parallel for num_threads(nthreads2) schedule(static)
     744       45924 :         for (k = 0 ; k < n ; k++)
     745             :         {
     746       45852 :             I [k] = (GrB_Index) V32 [k] ;
     747             :         }
     748          72 :         GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
     749             : 
     750             :         // check termination
     751          72 :         GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new, gp,
     752             :             NULL)) ;
     753          72 :         GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod, NULL)) ;
     754             : 
     755             :         // swap gp and gp_new
     756          72 :         GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
     757             :     }
     758             : 
     759             :     //--------------------------------------------------------------------------
     760             :     // free workspace and return result
     761             :     //--------------------------------------------------------------------------
     762             : 
     763          24 :     (*component) = f ;
     764          24 :     f = NULL ;
     765          24 :     if (sampling)
     766             :     {
     767           4 :         GrB_free (&T) ;
     768             :     }
     769          24 :     LG_FREE_ALL ;
     770          24 :     return (GrB_SUCCESS) ;
     771             : #else
     772             :     return (GrB_NOT_IMPLEMENTED) ;
     773             : #endif
     774             : }

Generated by: LCOV version 1.14