LCOV - code coverage report
Current view: top level - experimental/algorithm - LAGraph_cdlp.c (source / functions) Hit Total Coverage
Test: LAGraph code coverage report. Commit id: cc56ed4. Current time (UTC): 2024-08-30T17:14:30Z Lines: 93 93 100.0 %
Date: 2024-08-30 17:16:41 Functions: 1 1 100.0 %

          Line data    Source code
       1             : //------------------------------------------------------------------------------
       2             : // LAGraph_cdlp: community detection using label propagation
       3             : //------------------------------------------------------------------------------
       4             : 
       5             : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
       6             : // SPDX-License-Identifier: BSD-2-Clause
       7             : //
       8             : // For additional details (including references to third party source code and
       9             : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
      10             : // Contributors.txt for a full list of contributors. Created, in part, with
      11             : // funding and support from the U.S. Government (see Acknowledgments.txt file).
      12             : // DM22-0790
      13             : 
      14             : // Contributed by Gabor Szarnyas and Balint Hegyi, Budapest University of
      15             : // Technology and Economics (with accented characters: G\'{a}bor Sz\'{a}rnyas
      16             : // and B\'{a}lint Hegyi, using LaTeX syntax).
      17             : // https://inf.mit.bme.hu/en/members/szarnyasg .
      18             : 
      19             : //------------------------------------------------------------------------------
      20             : 
      21             : // ## Background
      22             : //
      23             : // This function was originally written for the LDBC Graphalytics benchmark.
      24             : //
      25             : // The community detection using label propagation (CDLP) algorithm is
      26             : // defined both for directed and undirected graphs.
      27             : //
      28             : // The definition implemented here is described in the following document:
      29             : // https://ldbc.github.io/ldbc_graphalytics_docs/graphalytics_spec.pdf
      30             : //
      31             : // The algorithm is based on the one given in the following paper:
      32             : //
      33             : // Usha Raghavan, Reka Albert, and Soundar Kumara. "Near linear time algorithm
      34             : // to detect community structures in large-scale networks". In: Physical
      35             : // Review E 76.3 (2007), p. 036106, https://arxiv.org/abs/0709.2938
      36             : //
      37             : // The key idea of the algorithm is that each vertex is assigned the label
      38             : // that is most frequent among its neighbors. To allow reproducible
      39             : // experiments, the algorithm is modified to guarantee deterministic behavior:
      40             : // it always picks the smallest label in case of a tie:
      41             : //
      42             : // min ( argmax_{l} (#neighbors with label l) )
      43             : //
      44             : // In other words, we need to compute the *minimum mode value* (minmode) for
      45             : // the labels among the neighbors.
      46             : //
      47             : // For directed graphs, a label on a neighbor that is connected through both
      48             : // an outgoing and on an incoming edge counts twice:
      49             : //
      50             : // min ( argmax_{l} (#incoming neighbors with l + #outgoing neighbors with l) )
      51             : //
      52             : // ## Example (undirected)
      53             : //
      54             : // For an example, let's assume an undirected graph where vertex 1 has four
      55             : // neighbors {2, 3, 4, 5}, and the current labels in the graph are
      56             : // L = [3, 5, 4, 5, 4].
      57             : //
      58             : // In this example, the distribution of labels among the neighbors of vertex 1
      59             : // is {4 => 2, 5 => 2}, therefore, the minimum mode value is 4.
      60             : //
      61             : // Next, we capture this operation using GraphBLAS operations and
      62             : // data structures. Notice that the neighbors of vertex 1 are encoded
      63             : // as a sparse vector in the adjacency matrix:
      64             : //
      65             : // A = | 0 1 1 1 1 |
      66             : //     | 1 . . .   |
      67             : //     | 1 .       |
      68             : //     | 1 .       |
      69             : //     | 1         |
      70             : //
      71             : // To allow propagating the labels along edges, we use a diagonal matrix
      72             : // with the elements of the diagonal set to the values of L:
      73             : //
      74             : // diag(L) = | 3 0 0 0 0 |
      75             : //           | 0 5 0 0 0 |
      76             : //           | 0 0 4 0 0 |
      77             : //           | 0 0 0 5 0 |
      78             : //           | 0 0 0 0 4 |
      79             : //
      80             : // If we multiply adjacency matrix with diag(L), we get a matrix
      81             : // containing the labels of the neighbor nodes. We use the 'sel2nd' operator
      82             : // for multiplication to avoid having to lookup the value on the left.
      83             : // The conventional plus.times semiring would also work: 1 * y = sel2nd(1, y).
      84             : // Note that we multiply with a diagonal matrix so the addition operator
      85             : // is not used. In the implementation, we use "min" so the semiring is
      86             : // "min.sel2nd" on uint64 values.
      87             : //
      88             : // In the example, this gives the following:
      89             : //
      90             : // AL = A min.sel2nd diag(L) = | 0 5 4 5 4 |
      91             : //                             | 3 . . . . |
      92             : //
      93             : // ## Selecting the minimum mode value
      94             : //
      95             : // Next, we need to compute the minimum mode value for each row. As it is
      96             : // difficult to capture this operation as a monoid, we use a sort operation
      97             : // on each row. In the undirected case, we extract tuples <I, _, X> from the
      98             : // matrix, then use <I, X> for sorting. In the directed case, we extract
      99             : // tuples <I1, _, X1> and <I2, _, X2>, then use <I1+I2, X1+X2>,
     100             : // where '+' denotes concatenation. Column indices (J) are not used.
     101             : //
     102             : // The resulting two-tuples are sorted using a parallel merge sort.
     103             : // Finally, we use the sorted arrays compute the minimum mode value for each
     104             : // row.
     105             : //
     106             : // ## Fixed point
     107             : //
     108             : // At the end of each iteration, we check whether L[i-1] == L[i] and
     109             : // terminate if we reached a fixed point.
     110             : //
     111             : // ## Further optimizations
     112             : //
     113             : // A possible optimization is that the first iteration is rather trivial:
     114             : //
     115             : // * in the undirected case, each vertex gets the minimal initial label (=id)
     116             : //   of its neighbors.
     117             : // * in the directed case, each vertex gets the minimal initial label (=id)
     118             : //   of its neighbors which are doubly-linked (on an incoming and on an
     119             : //   outgoing edge). In the absence of such a neighbor, it picks the minimal
     120             : //   label of its neighbors (connected through either an incoming or through
     121             : //   an outgoing edge).
     122             : 
     123             : #define LG_FREE_ALL                                                     \
     124             : {                                                                       \
     125             :     LAGraph_Free ((void *) &I, NULL) ;                                  \
     126             :     LAGraph_Free ((void *) &X, NULL) ;                                  \
     127             :     LAGraph_Free ((void *) &AI, NULL) ;                                 \
     128             :     LAGraph_Free ((void *) &AJ, NULL) ;                                 \
     129             :     LAGraph_Free ((void *) &AX, NULL) ;                                 \
     130             :     LAGraph_Free ((void *) &X, NULL) ;                                  \
     131             :     LAGraph_Free ((void *) &X, NULL) ;                                  \
     132             :     GrB_free (&L) ;                                                     \
     133             :     GrB_free (&L_prev) ;                                                \
     134             :     if (sanitize) GrB_free (&S) ;                                       \
     135             :     GrB_free (&AT) ;                                                    \
     136             : }
     137             : 
     138             : #include <LAGraph.h>
     139             : #include <LAGraphX.h>
     140             : #include "LG_internal.h"
     141             : 
     142             : //****************************************************************************
     143          12 : int LAGraph_cdlp
     144             : (
     145             :     GrB_Vector *CDLP_handle, // output vector
     146             :     const GrB_Matrix A,      // input matrix
     147             :     bool symmetric,          // denote whether the matrix is symmetric
     148             :     bool sanitize,           // if true, ensure A is binary
     149             :     int itermax,             // max number of iterations,
     150             :     double *t,               // t [0] = sanitize time, t [1] = cdlp time,
     151             :                              // in seconds
     152             :     char *msg
     153             : )
     154             : {
     155             :     GrB_Info info;
     156          12 :     LG_CLEAR_MSG ;
     157             : 
     158             :     // Diagonal label matrix
     159          12 :     GrB_Matrix L = NULL;
     160          12 :     GrB_Matrix L_prev = NULL;
     161             :     // Source adjacency matrix
     162          12 :     GrB_Matrix S = NULL;
     163             :     // Transposed matrix for the unsymmetric case
     164          12 :     GrB_Matrix AT = NULL;
     165             :     // Result CDLP vector
     166          12 :     GrB_Vector CDLP = NULL;
     167             : 
     168             :     // Arrays holding extracted tuples if the matrix needs to be copied
     169          12 :     GrB_Index *AI = NULL;
     170          12 :     GrB_Index *AJ = NULL;
     171          12 :     GrB_Index *AX = NULL;
     172             :     // Arrays holding extracted tuples during the algorithm
     173          12 :     GrB_Index *I = NULL;
     174          12 :     GrB_Index *X = NULL;
     175             : 
     176             :     //--------------------------------------------------------------------------
     177             :     // check inputs
     178             :     //--------------------------------------------------------------------------
     179             : 
     180          12 :     if (CDLP_handle == NULL || t == NULL)
     181             :     {
     182           1 :         return GrB_NULL_POINTER;
     183             :     }
     184             : 
     185             :     //--------------------------------------------------------------------------
     186             :     // ensure input is binary and has no self-edges
     187             :     //--------------------------------------------------------------------------
     188             : 
     189          11 :     t [0] = 0;         // sanitize time
     190          11 :     t [1] = 0;         // CDLP time
     191             : 
     192             :     // n = size of A (# of nodes in the graph)
     193             :     // nz = # of non-zero elements in the matrix
     194             :     // nnz = # of non-zero elements used in the computations
     195             :     //   (twice as many for directed graphs)
     196             :     GrB_Index n, nz, nnz;
     197          11 :     GRB_TRY (GrB_Matrix_nrows(&n, A))
     198          11 :     GRB_TRY (GrB_Matrix_nvals(&nz, A))
     199          11 :     if (!symmetric)
     200             :     {
     201           1 :         nnz = 2 * nz;
     202             :     }
     203             :     else
     204             :     {
     205          10 :         nnz = nz;
     206             :     }
     207             : 
     208          11 :     if (sanitize)
     209             :     {
     210           4 :         t [0] = LAGraph_WallClockTime ( ) ;
     211             : 
     212           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &AI, nz, sizeof(GrB_Index),msg));
     213           4 :         LAGRAPH_TRY (LAGraph_Malloc ((void **) &AJ, nz, sizeof(GrB_Index),msg));
     214           4 :         GRB_TRY (GrB_Matrix_extractTuples_UINT64(AI, AJ, GrB_NULL, &nz, A))
     215             : 
     216           4 :         LAGRAPH_TRY (LAGraph_Calloc ((void **) &AX, nz, sizeof(GrB_Index),msg));
     217           4 :         GRB_TRY (GrB_Matrix_new(&S, GrB_UINT64, n, n));
     218           4 :         GRB_TRY (GrB_Matrix_build(S, AI, AJ, AX, nz, GrB_PLUS_UINT64));
     219             : 
     220           4 :         t [0] = LAGraph_WallClockTime ( ) - t [0] ;
     221             :     }
     222             :     else
     223             :     {
     224             :         // Use the input as-is, and assume it is UINT64(!) with no self edges.
     225             :         // Results are undefined if this condition does not hold.
     226           7 :         S = A;
     227             :     }
     228             : 
     229          11 :     t [1] = LAGraph_WallClockTime ( ) ;
     230             : 
     231             : #ifdef LAGRAPH_SUITESPARSE
     232          11 :     GxB_Format_Value A_format = -1, global_format = -1 ;
     233          11 :     GRB_TRY (GxB_get(A, GxB_FORMAT, &A_format))
     234          11 :     GRB_TRY (GxB_get(GxB_FORMAT, &global_format))
     235          11 :     if (A_format != GxB_BY_ROW || global_format != GxB_BY_ROW)
     236             :     {
     237           1 :         LG_FREE_ALL;
     238           1 :         return (GrB_INVALID_VALUE) ;
     239             :     }
     240             : #endif
     241             : 
     242             :     // Initialize L with diagonal elements 1..n
     243          10 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &I, n, sizeof (GrB_Index), msg)) ;
     244          10 :     LAGRAPH_TRY (LAGraph_Malloc ((void **) &X, n, sizeof (GrB_Index), msg)) ;
     245        3304 :     for (GrB_Index i = 0; i < n; i++) {
     246        3294 :         I[i] = i;
     247        3294 :         X[i] = i;
     248             :     }
     249          10 :     GRB_TRY (GrB_Matrix_new (&L, GrB_UINT64, n, n)) ;
     250          10 :     GRB_TRY (GrB_Matrix_build (L, I, I, X, n, GrB_PLUS_UINT64)) ;
     251          10 :     LAGraph_Free ((void **) &I, NULL) ;
     252          10 :     LAGraph_Free ((void **) &X, NULL) ;
     253             : 
     254             :     // Initialize matrix for storing previous labels
     255          10 :     GRB_TRY (GrB_Matrix_new(&L_prev, GrB_UINT64, n, n))
     256             : 
     257          10 :     if (!symmetric)
     258             :     {
     259             :         // compute AT for the unsymmetric case as it will be used
     260             :         // to compute A' = A' min.2nd L in each iteration
     261           1 :         GRB_TRY (GrB_Matrix_new (&AT, GrB_UINT64, n, n)) ;
     262           1 :         GRB_TRY (GrB_transpose (AT, NULL, NULL, A, NULL)) ;
     263             :     }
     264             : 
     265         259 :     for (int iteration = 0; iteration < itermax; iteration++)
     266             :     {
     267             :         // Initialize data structures for extraction from 'AL_in' and (for directed graphs) 'AL_out'
     268         257 :         LAGRAPH_TRY (LAGraph_Malloc((void **) &I, nnz, sizeof(GrB_Index), msg));
     269         257 :         LAGRAPH_TRY (LAGraph_Malloc((void **) &X, nnz, sizeof(GrB_Index), msg));
     270             : 
     271             :         // A = A min.2nd L
     272             :         // (using the "push" (saxpy) method)
     273         257 :         GRB_TRY (GrB_mxm(S, GrB_NULL, GrB_NULL,
     274             :                            GrB_MIN_SECOND_SEMIRING_UINT64, S, L, NULL));
     275         257 :         GRB_TRY (GrB_Matrix_extractTuples_UINT64(I, GrB_NULL, X, &nz, S));
     276             : 
     277         257 :         if (!symmetric)
     278             :         {
     279             :             // A' = A' min.2nd L
     280             :             // (using the "push" (saxpy) method)
     281           6 :             GRB_TRY (GrB_mxm(AT, GrB_NULL, GrB_NULL,
     282             :                                GrB_MIN_SECOND_SEMIRING_UINT64, AT, L, NULL));
     283           6 :             GRB_TRY (GrB_Matrix_extractTuples_UINT64(&I[nz],
     284             :                                                        GrB_NULL, &X[nz], &nz, AT));
     285             :         }
     286             : 
     287         257 :         LG_msort2((int64_t *) I, (int64_t *) X, nnz, NULL);
     288             : 
     289             :         // save current labels for comparison by swapping L and L_prev
     290         257 :         GrB_Matrix L_swap = L;
     291         257 :         L = L_prev;
     292         257 :         L_prev = L_swap;
     293             : 
     294         257 :         GrB_Index mode_value = -1;
     295         257 :         GrB_Index mode_length = 0;
     296         257 :         GrB_Index run_length = 1;
     297             : 
     298             :         // I[k] is the current row index
     299             :         // X[k] is the current value
     300             :         // we iterate in range 1..nnz and use the last index (nnz) to process the last row of the matrix
     301     1524968 :         for (GrB_Index k = 1; k <= nnz; k++)
     302             :         {
     303             :             // check if we have a reason to recompute the mode value
     304     1524711 :             if (k == nnz           // we surpassed the last element
     305     1524454 :                 || I[k-1] != I[k]  // the row index has changed
     306     1474530 :                 || X[k-1] != X[k]) // the run value has changed
     307             :             {
     308      260160 :                 if (run_length > mode_length)
     309             :                 {
     310       84859 :                     mode_value = X[k-1];
     311       84859 :                     mode_length = run_length;
     312             :                 }
     313      260160 :                 run_length = 0;
     314             :             }
     315     1524711 :             run_length++;
     316             : 
     317             :             // check if we passed a row
     318     1524711 :             if (k == nnz           // we surpassed the last element
     319     1524454 :                 || I[k-1] != I[k]) // or the row index has changed
     320             :             {
     321       50181 :                 GrB_Matrix_setElement(L, mode_value, I[k-1], I[k-1]);
     322       50181 :                 mode_length = 0;
     323             :             }
     324             :         }
     325             : 
     326         257 :         LAGraph_Free ((void **) &I, NULL) ;
     327         257 :         LAGraph_Free ((void **) &X, NULL) ;
     328             : 
     329             :         bool isequal;
     330         257 :         LAGraph_Matrix_IsEqual (&isequal, L_prev, L, NULL);
     331         257 :         if (isequal) {
     332           8 :             break;
     333             :         }
     334             :     }
     335             : 
     336             :     //--------------------------------------------------------------------------
     337             :     // extract final labels to the result vector
     338             :     //--------------------------------------------------------------------------
     339             : 
     340          10 :     GRB_TRY (GrB_Vector_new(&CDLP, GrB_UINT64, n))
     341        3304 :     for (GrB_Index i = 0; i < n; i++)
     342             :     {
     343             :         uint64_t x;
     344        3294 :         GRB_TRY (GrB_Matrix_extractElement(&x, L, i, i))
     345        3294 :         GRB_TRY (GrB_Vector_setElement(CDLP, x, i))
     346             :     }
     347             : 
     348             :     //--------------------------------------------------------------------------
     349             :     // free workspace and return result
     350             :     //--------------------------------------------------------------------------
     351             : 
     352          10 :     (*CDLP_handle) = CDLP;
     353          10 :     CDLP = NULL;            // set to NULL so LG_FREE_ALL doesn't free it
     354          10 :     LG_FREE_ALL;
     355             : 
     356          10 :     t [1] = LAGraph_WallClockTime ( ) - t [1] ;
     357             : 
     358          10 :     return (GrB_SUCCESS);
     359             : }

Generated by: LCOV version 1.14