LCOV - code coverage report
Current view: top level - experimental/algorithm - LG_CC_FastSV5.c (source / functions) Hit Total Coverage
Test: LAGraph code coverage report. Commit id: cc56ed4. Current time (UTC): 2024-08-30T17:14:30Z Lines: 203 203 100.0 %
Date: 2024-08-30 17:16:41 Functions: 5 5 100.0 %

          Line data    Source code
       1             : //------------------------------------------------------------------------------
       2             : // LG_CC_FastSV5: connected components
       3             : //------------------------------------------------------------------------------
       4             : 
       5             : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
       6             : // SPDX-License-Identifier: BSD-2-Clause
       7             : //
       8             : // For additional details (including references to third party source code and
       9             : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
      10             : // Contributors.txt for a full list of contributors. Created, in part, with
      11             : // funding and support from the U.S. Government (see Acknowledgments.txt file).
      12             : // DM22-0790
      13             : 
      14             : // Contributed by Yongzhe Zhang; modified by Tim Davis, Texas A&M University
      15             : 
      16             : //------------------------------------------------------------------------------
      17             : 
      18             : // Code is based on the algorithm described in the following paper
      19             : // Zhang, Azad, Hu. FastSV: FastSV: A Distributed-Memory Connected Component
      20             : // Algorithm with Fast Convergence (SIAM PP20)
      21             : 
      22             : // A subsequent update to the algorithm is here (which might not be reflected
      23             : // in this code):
      24             : //
      25             : // Yongzhe Zhang, Ariful Azad, Aydin Buluc: Parallel algorithms for finding
      26             : // connected components using linear algebra. J. Parallel Distributed Comput.
      27             : // 144: 14-27 (2020).
      28             : 
      29             : // Modified by Tim Davis, Texas A&M University
      30             : 
      31             : // The input matrix A must be symmetric.  Self-edges (diagonal entries) are
      32             : // OK, and are ignored.  The values and type of A are ignored; just its
      33             : // structure is accessed.
      34             : 
      35             : // The matrix A must have dimension 2^32 or less.
      36             : // todo: Need a 64-bit version of this method.
      37             : 
      38             : // todo: this function is not thread-safe, since it exports G->A and then
      39             : // reimports it back.  G->A is unchanged when the function returns, but during
      40             : // execution G->A is invalid.
      41             : 
      42             : // A note about "todo" and "fixme" in this file:  these do not need to be fixed
      43             : // or changed for this method, since the revised version appears as
      44             : // src/algorithm/LG_CC_FastSV6.c.  They have thus been changed here to lower
      45             : // case.
      46             : 
      47             : // todo: free all workspace in LG_FREE_ALL
      48             : #define LG_FREE_ALL ;
      49             : 
      50             : #include "LG_internal.h"
      51             : 
      52             : #if LAGRAPH_SUITESPARSE
      53             : 
      54             : //------------------------------------------------------------------------------
      55             : // hash functions: todo describe me
      56             : //------------------------------------------------------------------------------
      57             : 
      58             : // hash table size must be a power of 2
      59             : #define HASH_SIZE 1024
      60             : 
      61             : // number of samples to insert into the hash table
      62             : // todo: this seems to be a lot of entries for a HASH_SIZE of 1024.
      63             : // There could be lots of collisions.
      64             : #define HASH_SAMPLES 864
      65             : 
      66             : #define HASH(x) (((x << 4) + x) & (HASH_SIZE-1))
      67             : #define NEXT(x) ((x + 23) & (HASH_SIZE-1))
      68             : 
      69             : //------------------------------------------------------------------------------
      70             : // ht_init: todo describe me
      71             : //------------------------------------------------------------------------------
      72             : 
      73             : // Clear the hash table counts (ht_val [0:HASH_SIZE-1] = 0), and set all hash
      74             : // table entries as empty (ht_key [0:HASH_SIZE-1] =-1).
      75             : 
      76             : // todo: the memset of ht_key is confusing
      77             : 
      78             : // todo: the name "ht_val" is confusing.  It is not a value, but a count of
      79             : // the number of times the value x = ht_key [h] has been inserted into the
      80             : // hth position in the hash table.  It should be renamed ht_cnt.
      81             : 
      82          48 : static inline void ht_init
      83             : (
      84             :     int32_t *ht_key,
      85             :     int32_t *ht_val
      86             : )
      87             : {
      88          48 :     memset (ht_key, -1, sizeof (int32_t) * HASH_SIZE) ;
      89          48 :     memset (ht_val,  0, sizeof (int32_t) * HASH_SIZE) ;
      90          48 : }
      91             : 
      92             : //------------------------------------------------------------------------------
      93             : // ht_sample: todo describe me
      94             : //------------------------------------------------------------------------------
      95             : 
      96             : //
      97             : 
      98          48 : static inline void ht_sample
      99             : (
     100             :     uint32_t *V32,      // array of size n (todo: this is a bad variable name)
     101             :     int32_t n,
     102             :     int32_t samples,    // number of samples to take from V32
     103             :     int32_t *ht_key,
     104             :     int32_t *ht_val,
     105             :     uint64_t *seed
     106             : )
     107             : {
     108       41520 :     for (int32_t k = 0 ; k < samples ; k++)
     109             :     {
     110             :         // select an entry from V32 at random
     111       41472 :         int32_t x = V32 [LG_Random60 (seed) % n] ;
     112             : 
     113             :         // find x in the hash table
     114             :         // todo: make this loop a static inline function (see also below)
     115       41472 :         int32_t h = HASH (x) ;
     116       46722 :         while (ht_key [h] != -1 && ht_key [h] != x)
     117             :         {
     118        5250 :             h = NEXT (h) ;
     119             :         }
     120             : 
     121       41472 :         ht_key [h] = x ;
     122       41472 :         ht_val [h]++ ;
     123             :     }
     124          48 : }
     125             : 
     126             : //------------------------------------------------------------------------------
     127             : // ht_most_frequent: todo describe me
     128             : //------------------------------------------------------------------------------
     129             : 
     130             : // todo what if key is returned as -1?  Code breaks.  todo: handle this case
     131             : 
     132           4 : static inline int32_t ht_most_frequent
     133             : (
     134             :     int32_t *ht_key,
     135             :     int32_t *ht_val
     136             : )
     137             : {
     138           4 :     int32_t key = -1 ;
     139           4 :     int32_t val = 0 ;                       // max (ht_val [0:HASH_SIZE-1])
     140        4100 :     for (int32_t h = 0 ; h < HASH_SIZE ; h++)
     141             :     {
     142        4096 :         if (ht_val [h] > val)
     143             :         {
     144          10 :             key = ht_key [h] ;
     145          10 :             val = ht_val [h] ;
     146             :         }
     147             :     }
     148           4 :     return (key) ;      // return most frequent key
     149             : }
     150             : 
     151             : //------------------------------------------------------------------------------
     152             : // Reduce_assign32:  w (index) += s, using MIN as the "+=" accum operator
     153             : //------------------------------------------------------------------------------
     154             : 
     155             : // mask = NULL, accumulator = GrB_MIN_UINT32, descriptor = NULL.
     156             : // Duplicates are summed with the accumulator, which differs from how
     157             : // GrB_assign works.  GrB_assign states that the presence of duplicates results
     158             : // in undefined behavior.  GrB_assign in SuiteSparse:GraphBLAS follows the
     159             : // MATLAB rule, which discards all but the first of the duplicates.
     160             : 
     161             : // todo: add this to GraphBLAS as a variant of GrB_assign, either as
     162             : // GxB_assign_accum (or another name), or as a GxB_* descriptor setting.
     163             : 
     164          90 : static inline int Reduce_assign32
     165             : (
     166             :     GrB_Vector *w_handle,   // vector of size n, all entries present
     167             :     GrB_Vector *s_handle,   // vector of size n, all entries present
     168             :     uint32_t *index,        // array of size n, can have duplicates
     169             :     GrB_Index n,
     170             :     int nthreads,
     171             :     int32_t *ht_key,        // hash table
     172             :     int32_t *ht_val,        // hash table (count of # of entries)
     173             :     uint64_t *seed,         // random
     174             :     char *msg
     175             : )
     176             : {
     177             : 
     178             :     GrB_Type w_type, s_type ;
     179             :     GrB_Index w_n, s_n, w_nvals, s_nvals, *w_i, *s_i, w_size, s_size ;
     180             :     uint32_t *w_x, *s_x ;
     181          90 :     bool s_iso = false ;
     182             : 
     183             :     //--------------------------------------------------------------------------
     184             :     // export w and s
     185             :     //--------------------------------------------------------------------------
     186             : 
     187             :     // export the GrB_Vectors w and s as full arrays, to get direct access to
     188             :     // their contents.  Note that this would fail if w or s are not full, with
     189             :     // all entries present.
     190          90 :     GRB_TRY (GxB_Vector_export_Full (w_handle, &w_type, &w_n, (void **) &w_x,
     191             :         &w_size, NULL, NULL)) ;
     192          90 :     GRB_TRY (GxB_Vector_export_Full (s_handle, &s_type, &s_n, (void **) &s_x,
     193             :         &s_size, &s_iso, NULL)) ;
     194             : 
     195             :     #if defined ( COVERAGE )
     196          90 :     if (n >= 200)   // for test coverage only; do not use in production!!
     197             :     #else
     198             :     if (nthreads >= 4)
     199             :     #endif
     200             :     {
     201             : 
     202             :         // allocate a buf array for each thread, of size HASH_SIZE
     203             :         uint32_t *mem ;
     204          44 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &mem, nthreads*HASH_SIZE,
     205             :             sizeof (uint32_t), msg)) ;
     206             :         // todo: check out-of-memory condition here
     207             : 
     208             :         // todo why is hashing needed here?  hashing is slow for what needs
     209             :         // to be computed here.  GraphBLAS has fast MIN atomic monoids that
     210             :         // do not require hashing.
     211          44 :         ht_init (ht_key, ht_val) ;
     212          44 :         ht_sample (index, n, HASH_SAMPLES, ht_key, ht_val, seed) ;
     213             : 
     214             :         int tid;
     215             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     216          88 :         for (tid = 0 ; tid < nthreads ; tid++)
     217             :         {
     218             :             // get the thread-specific buf array of size HASH_SIZE
     219             :             // todo: buf is a bad variable name; it's not a "buffer",
     220             :             // but a local workspace to compute the local version of w_x.
     221          44 :             uint32_t *buf = mem + tid * HASH_SIZE ;
     222             : 
     223             :             // copy the values from the global hash table into buf
     224       45100 :             for (int32_t h = 0 ; h < HASH_SIZE ; h++)
     225             :             {
     226       45056 :                 if (ht_key [h] != -1)
     227             :                 {
     228       15386 :                     buf [h] = w_x [ht_key [h]] ;
     229             :                 }
     230             :             }
     231             : 
     232             :             // this thread works on index [kstart:kend]
     233          44 :             int32_t kstart = (n * tid + nthreads - 1) / nthreads ;
     234          44 :             int32_t kend = (n * tid + n + nthreads - 1) / nthreads ;
     235       89968 :             for (int32_t k = kstart ; k < kend ; k++)
     236             :             {
     237       89924 :                 uint32_t i = index [k] ;
     238             : 
     239             :                 // todo: make this loop a static inline function
     240       89924 :                 int32_t h = HASH (i) ;
     241      126336 :                 while (ht_key [h] != -1 && ht_key [h] != i)
     242             :                 {
     243       36412 :                     h = NEXT (h) ;
     244             :                 }
     245             : 
     246       89924 :                 if (ht_key [h] == -1)
     247             :                 {
     248             :                     // todo is this a race condition?
     249       27790 :                     w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
     250             :                 }
     251             :                 else
     252             :                 {
     253       62134 :                     buf [h] = LAGRAPH_MIN (buf [h], s_x [s_iso?0:k]) ;
     254             :                 }
     255             :             }
     256             :         }
     257             : 
     258             :         // combine intermediate results from each thread
     259       45100 :         for (int32_t h = 0 ; h < HASH_SIZE ; h++)
     260             :         {
     261       45056 :             int32_t i = ht_key [h] ;
     262       45056 :             if (i != -1)
     263             :             {
     264       30772 :                 for (tid = 0 ; tid < nthreads ; tid++)
     265             :                 {
     266       15386 :                     w_x [i] = LAGRAPH_MIN (w_x [i], mem [tid * HASH_SIZE + h]) ;
     267             :                 }
     268             :             }
     269             :         }
     270             : 
     271          44 :         LAGraph_Free ((void **) &mem, NULL) ;
     272             :     }
     273             :     else
     274             :     {
     275             :         // sequential version
     276         728 :         for (GrB_Index k = 0 ; k < n ; k++)
     277             :         {
     278         682 :             uint32_t i = index [k] ;
     279         682 :             w_x [i] = LAGRAPH_MIN (w_x [i], s_x [s_iso?0:k]) ;
     280             :         }
     281             :     }
     282             : 
     283             :     //--------------------------------------------------------------------------
     284             :     // reimport w and s back into GrB_Vectors, and return result
     285             :     //--------------------------------------------------------------------------
     286             : 
     287             :     // s is unchanged.  It was exported only to compute w (index) += s
     288             : 
     289          90 :     GRB_TRY (GxB_Vector_import_Full (w_handle, w_type, w_n, (void **) &w_x,
     290             :         w_size, false, NULL)) ;
     291          90 :     GRB_TRY (GxB_Vector_import_Full (s_handle, s_type, s_n, (void **) &s_x,
     292             :         s_size, s_iso, NULL)) ;
     293             : 
     294          90 :     return (GrB_SUCCESS) ;
     295             : }
     296             : 
     297             : //------------------------------------------------------------------------------
     298             : // LG_CC_FastSV5
     299             : //------------------------------------------------------------------------------
     300             : 
     301             : // The output of LG_CC_FastSV5 is a vector component, where
     302             : // component(i)=s if node i is in the connected compononent whose
     303             : // representative node is node s.  If s is a representative, then
     304             : // component(s)=s.  The number of connected components in the graph G is the
     305             : // number of representatives.
     306             : 
     307             : #undef  LG_FREE_ALL
     308             : #define LG_FREE_ALL                             \
     309             : {                                               \
     310             :     LAGraph_Free ((void **) &I, NULL) ;         \
     311             :     LAGraph_Free ((void **) &V32, NULL) ;       \
     312             :     LAGraph_Free ((void **) &ht_key, NULL) ;    \
     313             :     LAGraph_Free ((void **) &ht_val, NULL) ;    \
     314             :     /* todo why is T not freed?? */             \
     315             :     GrB_free (&f) ;                             \
     316             :     GrB_free (&gp) ;                            \
     317             :     GrB_free (&mngp) ;                          \
     318             :     GrB_free (&gp_new) ;                        \
     319             :     GrB_free (&mod) ;                           \
     320             : }
     321             : 
     322             : #endif
     323             : 
     324          24 : int LG_CC_FastSV5           // SuiteSparse:GraphBLAS method, with GxB extensions
     325             : (
     326             :     // output
     327             :     GrB_Vector *component,  // component(i)=s if node is in the component s
     328             :     // inputs
     329             :     LAGraph_Graph G,        // input graph, G->A can change
     330             :     char *msg
     331             : )
     332             : {
     333             : 
     334             :     //--------------------------------------------------------------------------
     335             :     // check inputs
     336             :     //--------------------------------------------------------------------------
     337             : 
     338          24 :     LG_CLEAR_MSG ;
     339             : 
     340             : #if !LAGRAPH_SUITESPARSE
     341             :     LG_ASSERT_MSG (false, GrB_NOT_IMPLEMENTED, "SuiteSparse required") ;
     342             : #else
     343             : 
     344          24 :     uint32_t *V32 = NULL ;
     345          24 :     int32_t *ht_key = NULL, *ht_val = NULL ;
     346          24 :     GrB_Index n, nnz, *I = NULL ;
     347          24 :     GrB_Vector f = NULL, gp_new = NULL, mngp = NULL, mod = NULL, gp = NULL ;
     348          24 :     GrB_Matrix T = NULL ;
     349             : 
     350          24 :     LG_TRY (LAGraph_CheckGraph (G, msg)) ;
     351          24 :     LG_ASSERT (component != NULL, GrB_NULL_POINTER) ;
     352             : 
     353          24 :     LG_ASSERT_MSG ((G->kind == LAGraph_ADJACENCY_UNDIRECTED ||
     354             :        (G->kind == LAGraph_ADJACENCY_DIRECTED &&
     355             :         G->is_symmetric_structure == LAGraph_TRUE)),
     356             :         -1001, "G->A must be known to be symmetric") ;
     357             : 
     358          24 :     GrB_Matrix S = G->A ;
     359          24 :     GRB_TRY (GrB_Matrix_nrows (&n, S)) ;
     360          24 :     GRB_TRY (GrB_Matrix_nvals (&nnz, S)) ;
     361             : 
     362          24 :     LG_ASSERT_MSG (n <= UINT32_MAX, -1, "problem too large (fixme)") ;
     363             : 
     364             :     #define FASTSV_SAMPLES 4
     365             : 
     366          24 :     bool sampling = (n * FASTSV_SAMPLES * 2 < nnz) ;
     367             : 
     368             :     // random number seed
     369          24 :     uint64_t seed = n ;
     370             : 
     371             :     //--------------------------------------------------------------------------
     372             :     // initializations
     373             :     //--------------------------------------------------------------------------
     374             : 
     375             :     // determine # of threads to use for Reduce_assign
     376             :     int nthreads, nthreads_outer, nthreads_inner ;
     377          24 :     LG_TRY (LAGraph_GetNumThreads (&nthreads_outer, &nthreads_inner, msg)) ;
     378          24 :     nthreads = nthreads_outer * nthreads_inner ;
     379             : 
     380          24 :     nthreads = LAGRAPH_MIN (nthreads, n / 16) ;
     381          24 :     nthreads = LAGRAPH_MAX (nthreads, 1) ;
     382             : 
     383             :     // # of threads to use for typecast
     384          24 :     int nthreads2 = n / (64*1024) ;
     385          24 :     nthreads2 = LAGRAPH_MIN (nthreads2, nthreads) ;
     386          24 :     nthreads2 = LAGRAPH_MAX (nthreads2, 1) ;
     387             : 
     388             :     // vectors
     389          24 :     GRB_TRY (GrB_Vector_new (&f,      GrB_UINT32, n)) ;
     390          24 :     GRB_TRY (GrB_Vector_new (&gp_new, GrB_UINT32, n)) ;
     391          24 :     GRB_TRY (GrB_Vector_new (&mod,    GrB_BOOL,   n)) ;
     392             : 
     393             :     // temporary arrays
     394          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &I  , n, sizeof (GrB_Index), msg)) ;
     395          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &V32, n, sizeof (uint32_t), msg)) ;
     396             : 
     397             :     // prepare vectors
     398             :     int64_t i;
     399             :     #pragma omp parallel for num_threads(nthreads2) schedule(static)
     400       16284 :     for (i = 0 ; i < n ; i++)
     401             :     {
     402       16260 :         I [i] = i ;
     403       16260 :         V32 [i] = (uint32_t) i ;
     404             :     }
     405             : 
     406          24 :     GRB_TRY (GrB_Vector_build (f, I, V32, n, GrB_PLUS_UINT32)) ;
     407          24 :     GRB_TRY (GrB_Vector_dup (&gp,   f)) ;
     408          24 :     GRB_TRY (GrB_Vector_dup (&mngp, f)) ;
     409             : 
     410             :     // allocate the hash table
     411          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_key, HASH_SIZE, sizeof (int32_t),
     412             :         msg)) ;
     413          24 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &ht_val, HASH_SIZE, sizeof (int32_t),
     414             :         msg)) ;
     415             : 
     416             :     //--------------------------------------------------------------------------
     417             :     // sample phase
     418             :     //--------------------------------------------------------------------------
     419             : 
     420          24 :     if (sampling)
     421             :     {
     422             : 
     423             :         //----------------------------------------------------------------------
     424             :         // export S = G->A in CSR format
     425             :         //----------------------------------------------------------------------
     426             : 
     427             :         // S is not modified.  It is only exported so that its contents can be
     428             :         // read by the parallel loops below.
     429             : 
     430             :         GrB_Type type ;
     431             :         GrB_Index nrows, ncols, nvals ;
     432             :         size_t typesize ;
     433             :         int64_t nonempty ;
     434             :         GrB_Index *Sp, *Sj ;
     435             :         void *Sx ;
     436           4 :         bool S_jumbled = false ;
     437             :         GrB_Index Sp_size, Sj_size, Sx_size ;
     438           4 :         bool S_iso = false ;
     439             : 
     440           4 :         GRB_TRY (GrB_Matrix_nvals (&nvals, S)) ;
     441           4 :         GRB_TRY (GxB_Matrix_export_CSR (&S, &type, &nrows, &ncols, &Sp, &Sj,
     442             :             &Sx, &Sp_size, &Sj_size, &Sx_size,
     443             :             &S_iso, &S_jumbled, NULL)) ;
     444           4 :         GRB_TRY (GxB_Type_size (&typesize, type)) ;
     445           4 :         G->A = NULL ;
     446             : 
     447             :         //----------------------------------------------------------------------
     448             :         // allocate space to construct T
     449             :         //----------------------------------------------------------------------
     450             : 
     451           4 :         GrB_Index Tp_len = nrows+1, Tp_size = Tp_len*sizeof(GrB_Index);
     452           4 :         GrB_Index Tj_len = nvals,   Tj_size = Tj_len*sizeof(GrB_Index);
     453           4 :         GrB_Index Tx_len = nvals ;
     454             : 
     455           4 :         GrB_Index *Tp = NULL, *Tj = NULL ;
     456           4 :         GrB_Index Tx_size = typesize ;
     457           4 :         void *Tx = NULL ;
     458           4 :         int32_t *range = NULL ;
     459           4 :         GrB_Index *count = NULL ;
     460             : 
     461           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tp, Tp_len,
     462             :             sizeof (GrB_Index), msg)) ;
     463           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &Tj, Tj_len,
     464             :             sizeof (GrB_Index), msg)) ;
     465           4 :         LAGRAPH_TRY (LAGraph_Calloc (&Tx, 1, typesize, msg)) ;   // T is iso
     466             : 
     467             :         //----------------------------------------------------------------------
     468             :         // allocate workspace
     469             :         //----------------------------------------------------------------------
     470             : 
     471           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &range, nthreads + 1,
     472             :             sizeof (int32_t), msg)) ;
     473           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &count, nthreads + 1,
     474             :             sizeof (GrB_Index), msg)) ;
     475             : 
     476           4 :         memset (count, 0, sizeof (GrB_Index) * (nthreads + 1)) ;
     477             : 
     478             :         //----------------------------------------------------------------------
     479             :         // define parallel tasks to construct T
     480             :         //----------------------------------------------------------------------
     481             : 
     482             :         // thread tid works on rows range[tid]:range[tid+1]-1 of S and T
     483          12 :         for (int tid = 0 ; tid <= nthreads ; tid++)
     484             :         {
     485           8 :             range [tid] = (n * tid + nthreads - 1) / nthreads ;
     486             :         }
     487             : 
     488             :         //----------------------------------------------------------------------
     489             :         // determine the number entries to be constructed in T for each thread
     490             :         //----------------------------------------------------------------------
     491             : 
     492             :         int tid;
     493             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     494           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     495             :         {
     496        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     497             :             {
     498        9752 :                 int32_t deg = Sp [i + 1] - Sp [i] ;
     499        9752 :                 count [tid + 1] += LAGRAPH_MIN (FASTSV_SAMPLES, deg) ;
     500             :             }
     501             :         }
     502             : 
     503             :         //----------------------------------------------------------------------
     504             :         // count = cumsum (count)
     505             :         //----------------------------------------------------------------------
     506             : 
     507           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     508             :         {
     509           4 :             count [tid + 1] += count [tid] ;
     510             :         }
     511             : 
     512             :         //----------------------------------------------------------------------
     513             :         // construct T
     514             :         //----------------------------------------------------------------------
     515             : 
     516             :         // T (i,:) consists of the first FASTSV_SAMPLES of S (i,:).
     517             : 
     518             :         // todo: this could be done by GxB_Select, using a new operator.  Need
     519             :         // to define a set of GxB_SelectOp operators that would allow for this.
     520             : 
     521             :         // Note that Tx is not modified.  Only Tp and Tj are constructed.
     522             : 
     523             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     524           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     525             :         {
     526           4 :             GrB_Index p = count [tid] ;
     527           4 :             Tp [range [tid]] = p ;
     528        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     529             :             {
     530             :                 // construct T (i,:) from the first entries in S (i,:)
     531        9752 :                 for (int32_t j = 0 ;
     532       40390 :                     j < FASTSV_SAMPLES && Sp [i] + j < Sp [i + 1] ; j++)
     533             :                 {
     534       30638 :                     Tj [p++] = Sj [Sp [i] + j] ;
     535             :                 }
     536        9752 :                 Tp [i + 1] = p ;
     537             :             }
     538             :         }
     539             : 
     540             :         //----------------------------------------------------------------------
     541             :         // import the result into the GrB_Matrix T
     542             :         //----------------------------------------------------------------------
     543             : 
     544             :         // Note that Tx is unmodified.
     545             : 
     546             :         // in SuiteSparse:GraphBLAS v5, sizes are in bytes, not entries
     547           4 :         GrB_Index Tp_siz = Tp_size ;
     548           4 :         GrB_Index Tj_siz = Tj_size ;
     549           4 :         GrB_Index Tx_siz = Tx_size ;
     550             : 
     551           4 :         GrB_Index t_nvals = Tp [nrows] ;
     552           4 :         GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
     553             :                 &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
     554             :                 true,   // T is iso
     555             :                 S_jumbled, NULL)) ;
     556             : 
     557             :         //----------------------------------------------------------------------
     558             :         // find the connected components of T
     559             :         //----------------------------------------------------------------------
     560             : 
     561             :         // todo: this is nearly identical to the final phase below.
     562             :         // Make this a function
     563             : 
     564           4 :         bool change = true, is_first = true ;
     565          26 :         while (change)
     566             :         {
     567             :             // hooking & shortcutting
     568          22 :             GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
     569             :                 GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
     570          22 :             if (!is_first)
     571             :             {
     572          18 :                 LG_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads,
     573             :                     ht_key, ht_val, &seed, msg)) ;
     574             :             }
     575          22 :             GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
     576             :                 mngp, gp, NULL)) ;
     577             : 
     578             :             // calculate grandparent
     579             :             // fixme: NULL parameter is SS:GrB extension
     580          22 :             GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
     581             :             int32_t i;
     582             :             #pragma omp parallel for num_threads(nthreads2) schedule(static)
     583       54528 :             for (i = 0 ; i < n ; i++)
     584             :             {
     585       54506 :                 I [i] = (GrB_Index) V32 [i] ;
     586             :             }
     587          22 :             GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
     588             : 
     589             :             // todo: GrB_Vector_extract should have a variant where the index
     590             :             // list is not given by an array I, but as a GrB_Vector of type
     591             :             // GrB_UINT64 (or which can be typecast to GrB_UINT64).  This is a
     592             :             // common issue that arises in other algorithms as well.
     593             :             // Likewise GrB_Matrix_extract, and all forms of GrB_assign.
     594             : 
     595             :             // check termination
     596          22 :             GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new,
     597             :                 gp, NULL)) ;
     598          22 :             GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod,
     599             :                 NULL)) ;
     600             : 
     601             :             // swap gp and gp_new
     602          22 :             GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
     603          22 :             is_first = false ;
     604             :         }
     605             : 
     606             :         //----------------------------------------------------------------------
     607             :         // todo: describe me
     608             :         //----------------------------------------------------------------------
     609             : 
     610           4 :         ht_init (ht_key, ht_val) ;
     611           4 :         ht_sample (V32, n, HASH_SAMPLES, ht_key, ht_val, &seed) ;
     612           4 :         int32_t key = ht_most_frequent (ht_key, ht_val) ;
     613             :         // todo: what if key is returned as -1?  Then T below is invalid.
     614             : 
     615           4 :         int64_t t_nonempty = -1 ;
     616           4 :         bool T_jumbled = false, T_iso = true ;
     617             : 
     618             :         // export T
     619           4 :         GRB_TRY (GxB_Matrix_export_CSR (&T, &type, &nrows, &ncols, &Tp, &Tj,
     620             :             &Tx, &Tp_siz, &Tj_siz, &Tx_siz,
     621             :             &T_iso, &T_jumbled, NULL)) ;
     622             : 
     623             :         // todo what is this phase doing?  It is constructing a matrix T that
     624             :         // depends only on S, key, and V32.  T contains a subset of the entries
     625             :         // in S, except that T (i,:) is empty if
     626             : 
     627             :         // The prior content of T is ignored; it is exported from the earlier
     628             :         // phase, only to reuse the allocated space for T.  However, T_jumbled
     629             :         // is preserved from the prior matrix T, which doesn't make sense.
     630             : 
     631             :         // This parallel loop is badly load balanced.  Each thread operates on
     632             :         // the same number of rows of S, regardless of how many entries appear
     633             :         // in each set of rows.  It uses one thread per task, statically
     634             :         // scheduled.
     635             : 
     636             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     637           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     638             :         {
     639           4 :             GrB_Index ptr = Sp [range [tid]] ;
     640             :             // thread tid scans S (range [tid]:range [tid+1]-1,:),
     641             :             // and constructs T(i,:) for all rows in this range.
     642        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     643             :             {
     644        9752 :                 int32_t pv = V32 [i] ;  // what is pv?
     645        9752 :                 Tp [i] = ptr ;          // start the construction of T(i,:)
     646             :                 // T(i,:) is empty if pv == key
     647        9752 :                 if (pv != key)
     648             :                 {
     649             :                     // scan S(i,:)
     650       46550 :                     for (GrB_Index p = Sp [i] ; p < Sp [i+1] ; p++)
     651             :                     {
     652             :                         // get S(i,j)
     653       41274 :                         int32_t j = Sj [p] ;
     654       41274 :                         if (V32 [j] != key)
     655             :                         {
     656             :                             // add the entry T(i,j) to T, but skip it if
     657             :                             // V32 [j] is equal to key
     658       41264 :                             Tj [ptr++] = j ;
     659             :                         }
     660             :                     }
     661             :                     // add the entry T(i,key) if there is room for it in T(i,:)
     662        5276 :                     if (ptr - Tp [i] < Sp [i+1] - Sp [i])
     663             :                     {
     664           2 :                         Tj [ptr++] = key ;
     665             :                     }
     666             :                 }
     667             :             }
     668             :             // count the number of entries inserted into T by this thread?
     669           4 :             count [tid] = ptr - Tp [range [tid]] ;
     670             :         }
     671             : 
     672             :         // Compact empty space out of Tj not filled in from the above phase.
     673             :         // This is a lot of work and should be done in parallel.
     674           4 :         GrB_Index offset = 0 ;
     675           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     676             :         {
     677             : 
     678             : //          this memcpy is not safe (src/dest can overlap)
     679             : //          memcpy (Tj + offset, Tj + Tp [range [tid]],
     680             : //              sizeof (GrB_Index) * count [tid]) ;
     681             : 
     682             : //          // using a for loop instead:
     683             : //          GrB_Index *Tj_dest = Tj + offset ;
     684             : //          GrB_Index *Tj_src  = Tj + Tp [range [tid]] ;
     685             : //          for (int64_t k = 0 ; k < count [tid] ; k++)
     686             : //          {
     687             : //              Tj_dest [k] = Tj_src [k] ;
     688             : //          }
     689             : 
     690             : //          this is safe (memmove_s not necessary):
     691           4 :             memmove (Tj + offset, Tj + Tp [range [tid]],
     692           4 :                 sizeof (GrB_Index) * count [tid]) ;
     693             : 
     694           4 :             offset += count [tid] ;
     695           4 :             count [tid] = offset - count [tid] ;
     696             :         }
     697             : 
     698             :         // Compact empty space out of Tp
     699             :         #pragma omp parallel for num_threads(nthreads) schedule(static)
     700           8 :         for (tid = 0 ; tid < nthreads ; tid++)
     701             :         {
     702           4 :             GrB_Index ptr = Tp [range [tid]] ;
     703        9756 :             for (int32_t i = range [tid] ; i < range [tid+1] ; i++)
     704             :             {
     705        9752 :                 Tp [i] -= ptr - count [tid] ;
     706             :             }
     707             :         }
     708             : 
     709             :         // finalize T
     710           4 :         Tp [n] = offset ;
     711             : 
     712             :         // free workspace
     713           4 :         LAGraph_Free ((void **) &count, NULL) ;
     714           4 :         LAGraph_Free ((void **) &range, NULL) ;
     715             : 
     716             :         // import S (unchanged since last export)
     717           4 :         GRB_TRY (GxB_Matrix_import_CSR (&S, type, nrows, ncols,
     718             :                 &Sp, &Sj, &Sx, Sp_size, Sj_size, Sx_size,
     719             :                 S_iso, S_jumbled, NULL)) ;
     720             : 
     721             :         // import T for the final phase
     722           4 :         GRB_TRY (GxB_Matrix_import_CSR (&T, type, nrows, ncols,
     723             :                 &Tp, &Tj, &Tx, Tp_siz, Tj_siz, Tx_siz,
     724             :                 T_iso, /* T is jumbled: */ true, NULL)) ;
     725             : 
     726             :         // restore G->A
     727           4 :         G->A = S ;
     728             : 
     729             :     }
     730             :     else
     731             :     {
     732             : 
     733             :         // no sampling; the final phase operates on the whole graph
     734          20 :         T = S ;
     735             : 
     736             :     }
     737             : 
     738             :     //--------------------------------------------------------------------------
     739             :     // final phase
     740             :     //--------------------------------------------------------------------------
     741             : 
     742          24 :     GRB_TRY (GrB_Matrix_nvals (&nnz, T)) ;
     743             : 
     744          24 :     bool change = true ;
     745          96 :     while (change && nnz > 0)
     746             :     {
     747             :         // hooking & shortcutting
     748          72 :         GRB_TRY (GrB_mxv (mngp, NULL, GrB_MIN_UINT32,
     749             :                           GrB_MIN_SECOND_SEMIRING_UINT32, T, gp, NULL)) ;
     750          72 :         GRB_TRY (Reduce_assign32 (&f, &mngp, V32, n, nthreads, ht_key,
     751             :                                   ht_val, &seed, msg)) ;
     752          72 :         GRB_TRY (GrB_eWiseAdd (f, NULL, GrB_MIN_UINT32, GrB_MIN_UINT32,
     753             :                                mngp, gp, NULL)) ;
     754             : 
     755             :         // calculate grandparent
     756             :         // fixme: NULL parameter is SS:GrB extension
     757          72 :         GRB_TRY (GrB_Vector_extractTuples (NULL, V32, &n, f)) ; // fixme
     758             :         int32_t k;
     759             :         #pragma omp parallel for num_threads(nthreads2) schedule(static)
     760       45924 :         for (k = 0 ; k < n ; k++)
     761             :         {
     762       45852 :             I [k] = (GrB_Index) V32 [k] ;
     763             :         }
     764          72 :         GRB_TRY (GrB_extract (gp_new, NULL, NULL, f, I, n, NULL)) ;
     765             : 
     766             :         // check termination
     767          72 :         GRB_TRY (GrB_eWiseMult (mod, NULL, NULL, GrB_NE_UINT32, gp_new, gp,
     768             :             NULL)) ;
     769          72 :         GRB_TRY (GrB_reduce (&change, NULL, GrB_LOR_MONOID_BOOL, mod, NULL)) ;
     770             : 
     771             :         // swap gp and gp_new
     772          72 :         GrB_Vector t = gp ; gp = gp_new ; gp_new = t ;
     773             :     }
     774             : 
     775             :     //--------------------------------------------------------------------------
     776             :     // free workspace and return result
     777             :     //--------------------------------------------------------------------------
     778             : 
     779          24 :     (*component) = f ;
     780          24 :     f = NULL ;
     781          24 :     if (sampling)
     782             :     {
     783           4 :         GrB_free (&T) ;
     784             :     }
     785          24 :     LG_FREE_ALL ;
     786          24 :     return (GrB_SUCCESS) ;
     787             : #endif
     788             : }

Generated by: LCOV version 1.14