Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LAGraph_cdlp_withsort: community detection using label propagation
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Gabor Szarnyas and Balint Hegyi, Budapest University of
15 : // Technology and Economics (with accented characters: G\'{a}bor Sz\'{a}rnyas
16 : // and B\'{a}lint Hegyi, using LaTeX syntax).
17 : // https://inf.mit.bme.hu/en/members/szarnyasg .
18 :
19 : //------------------------------------------------------------------------------
20 :
21 : // ## Background
22 : //
23 : // This function was originally written for the LDBC Graphalytics benchmark.
24 : //
25 : // The community detection using label propagation (CDLP) algorithm is
26 : // defined both for directed and undirected graphs.
27 : //
28 : // The definition implemented here is described in the following document:
29 : // https://ldbc.github.io/ldbc_graphalytics_docs/graphalytics_spec.pdf
30 : //
31 : // The algorithm is based on the one given in the following paper:
32 : //
33 : // Usha Raghavan, Reka Albert, and Soundar Kumara. "Near linear time algorithm
34 : // to detect community structures in large-scale networks". In: Physical
35 : // Review E 76.3 (2007), p. 036106, https://arxiv.org/abs/0709.2938
36 : //
37 : // The key idea of the algorithm is that each vertex is assigned the label
38 : // that is most frequent among its neighbors. To allow reproducible
39 : // experiments, the algorithm is modified to guarantee deterministic behavior:
40 : // it always picks the smallest label in case of a tie:
41 : //
42 : // min ( argmax_{l} (#neighbors with label l) )
43 : //
44 : // In other words, we need to compute the *minimum mode value* (minmode) for
45 : // the labels among the neighbors.
46 : //
47 : // For directed graphs, a label on a neighbor that is connected through both
48 : // an outgoing and on an incoming edge counts twice:
49 : //
50 : // min ( argmax_{l} (#incoming neighbors with l + #outgoing neighbors with l) )
51 : //
52 : // ## Example (undirected)
53 : //
54 : // For an example, let's assume an undirected graph where vertex 1 has four
55 : // neighbors {2, 3, 4, 5}, and the current labels in the graph are
56 : // L = [3, 5, 4, 5, 4].
57 : //
58 : // In this example, the distribution of labels among the neighbors of vertex 1
59 : // is {4 => 2, 5 => 2}, therefore, the minimum mode value is 4.
60 : //
61 : // Next, we capture this operation using GraphBLAS operations and
62 : // data structures. Notice that the neighbors of vertex 1 are encoded
63 : // as a sparse vector in the adjacency matrix:
64 : //
65 : // A = | 0 1 1 1 1 |
66 : // | 1 . . . |
67 : // | 1 . |
68 : // | 1 . |
69 : // | 1 |
70 : //
71 : // To allow propagating the labels along edges, we use a diagonal matrix
72 : // with the elements of the diagonal set to the values of L:
73 : //
74 : // diag(L) = | 3 0 0 0 0 |
75 : // | 0 5 0 0 0 |
76 : // | 0 0 4 0 0 |
77 : // | 0 0 0 5 0 |
78 : // | 0 0 0 0 4 |
79 : //
80 : // If we multiply adjacency matrix with diag(L), we get a matrix
81 : // containing the labels of the neighbor nodes. We use the 'sel2nd' operator
82 : // for multiplication to avoid having to lookup the value on the left.
83 : // The conventional plus.times semiring would also work: 1 * y = sel2nd(1, y).
84 : // Note that we multiply with a diagonal matrix so the addition operator
85 : // is not used. In the implementation, we use "min" so the semiring is
86 : // "min.sel2nd" on uint64 values.
87 : //
88 : // In the example, this gives the following:
89 : //
90 : // AL = A min.sel2nd diag(L) = | 0 5 4 5 4 |
91 : // | 3 . . . . |
92 : //
93 : // ## Selecting the minimum mode value
94 : //
95 : // Next, we need to compute the minimum mode value for each row. As it is
96 : // difficult to capture this operation as a monoid, we use a sort operation
97 : // on each row. In the undirected case, we extract tuples <I, _, X> from the
98 : // matrix, then use <I, X> for sorting. In the directed case, we extract
99 : // tuples <I1, _, X1> and <I2, _, X2>, then use <I1+I2, X1+X2>,
100 : // where '+' denotes concatenation. Column indices (J) are not used.
101 : //
102 : // The resulting two-tuples are sorted using a parallel merge sort.
103 : // Finally, we use the sorted arrays compute the minimum mode value for each
104 : // row.
105 : //
106 : // ## Fixed point
107 : //
108 : // At the end of each iteration, we check whether L[i-1] == L[i] and
109 : // terminate if we reached a fixed point.
110 : //
111 : // ## Further optimizations
112 : //
113 : // A possible optimization is that the first iteration is rather trivial:
114 : //
115 : // * in the undirected case, each vertex gets the minimal initial label (=id)
116 : // of its neighbors.
117 : // * in the directed case, each vertex gets the minimal initial label (=id)
118 : // of its neighbors which are doubly-linked (on an incoming and on an
119 : // outgoing edge). In the absence of such a neighbor, it picks the minimal
120 : // label of its neighbors (connected through either an incoming or through
121 : // an outgoing edge).
122 :
123 : #define LG_FREE_ALL \
124 : { \
125 : LAGraph_Free ((void *) &I, NULL) ; \
126 : LAGraph_Free ((void *) &X, NULL) ; \
127 : LAGraph_Free ((void *) &LP, NULL) ; \
128 : LAGraph_Free ((void *) &LI, NULL) ; \
129 : LAGraph_Free ((void *) &LX, NULL) ; \
130 : GrB_free (&L) ; \
131 : GrB_free (&L_prev) ; \
132 : GrB_free (&S) ; \
133 : GrB_free (&AT) ; \
134 : }
135 :
136 : #include <LAGraph.h>
137 : #include <LAGraphX.h>
138 : #include "LG_internal.h"
139 :
140 : //****************************************************************************
141 10 : int LAGraph_cdlp_withsort
142 : (
143 : GrB_Vector *CDLP_handle, // output vector
144 : LAGraph_Graph G, // input graph
145 : int itermax, // max number of iterations,
146 : char *msg
147 : )
148 : {
149 : GrB_Info info;
150 10 : LG_CLEAR_MSG ;
151 :
152 10 : GrB_Matrix A = G->A ;
153 20 : bool symmetric = (G->kind == LAGraph_ADJACENCY_UNDIRECTED) ||
154 10 : ((G->kind == LAGraph_ADJACENCY_DIRECTED) &&
155 10 : (G->is_symmetric_structure == LAGraph_TRUE)) ;
156 :
157 : // Diagonal label matrix
158 10 : GrB_Matrix L = NULL;
159 10 : GrB_Matrix L_prev = NULL;
160 : // Source adjacency matrix
161 10 : GrB_Matrix S = NULL;
162 : // Transposed matrix for the unsymmetric case
163 10 : GrB_Matrix AT = NULL;
164 : // Result CDLP vector
165 10 : GrB_Vector CDLP = NULL;
166 :
167 : // Arrays for constructing initial labels
168 10 : GrB_Index *LP = NULL, *LI = NULL, *LX = NULL ;
169 :
170 : // Arrays holding extracted tuples during the algorithm
171 10 : GrB_Index *I = NULL;
172 10 : GrB_Index *X = NULL;
173 :
174 : //--------------------------------------------------------------------------
175 : // check inputs
176 : //--------------------------------------------------------------------------
177 :
178 10 : LG_ASSERT (CDLP_handle != NULL, GrB_NULL_POINTER) ;
179 :
180 : //--------------------------------------------------------------------------
181 : // ensure input is binary and has no self-edges
182 : //--------------------------------------------------------------------------
183 :
184 : // n = size of A (# of nodes in the graph)
185 : // nz = # of non-zero elements in the matrix
186 : // nnz = # of non-zero elements used in the computations
187 : // (twice as many for directed graphs)
188 : GrB_Index n, nz, nnz;
189 10 : GRB_TRY (GrB_Matrix_nrows(&n, A))
190 10 : GRB_TRY (GrB_Matrix_nvals(&nz, A))
191 10 : if (!symmetric)
192 : {
193 1 : nnz = 2 * nz;
194 : }
195 : else
196 : {
197 9 : nnz = nz;
198 : }
199 :
200 10 : GRB_TRY (GrB_Matrix_new (&S, GrB_UINT64, n, n)) ;
201 10 : GRB_TRY (GrB_apply (S, GrB_NULL, GrB_NULL, GrB_ONEB_UINT64, A, 0, GrB_NULL)) ;
202 :
203 : // Initialize L with diagonal elements 1..n
204 10 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &LP, n+1, sizeof (GrB_Index), msg)) ;
205 3314 : for (GrB_Index i = 0; i <= n; i++) {
206 3304 : LP[i] = i ;
207 : }
208 10 : LAGRAPH_TRY (LAGraph_Malloc((void **) &LI, n, sizeof (GrB_Index), msg)) ;
209 10 : LAGRAPH_TRY (LAGraph_Malloc((void **) &LX, n, sizeof (GrB_Index), msg)) ;
210 3304 : for (GrB_Index i = 0; i < n; i++) {
211 3294 : LI[i] = i ;
212 3294 : LX[i] = i ;
213 : }
214 : #if LAGRAPH_SUITESPARSE
215 10 : GRB_TRY (GrB_Matrix_new (&L, GrB_UINT64, n, n)) ;
216 10 : GRB_TRY (GxB_Matrix_pack_CSC (L, &LP, &LI, (void **) &LX, (n+1)*sizeof(GrB_Index), n*sizeof(GrB_Index), n*sizeof(GrB_Index), false, false, GrB_NULL)) ;
217 : #else
218 : GRB_TRY (GrB_Matrix_import (&L, GrB_UINT64, n, n, LP, LI, LX, n+1, n, n, GrB_CSC_FORMAT)) ;
219 : LAGRAPH_TRY (LAGraph_Free ((void **) &LP, NULL)) ;
220 : LAGRAPH_TRY (LAGraph_Free ((void **) &LI, NULL)) ;
221 : LAGRAPH_TRY (LAGraph_Free ((void **) &LX, NULL)) ;
222 : #endif
223 :
224 : // Initialize matrix for storing previous labels
225 10 : GRB_TRY (GrB_Matrix_new(&L_prev, GrB_UINT64, n, n))
226 :
227 10 : if (!symmetric)
228 : {
229 : // compute AT for the unsymmetric case as it will be used
230 : // to compute A' = A' min.2nd L in each iteration
231 1 : GRB_TRY (GrB_Matrix_new (&AT, GrB_UINT64, n, n)) ;
232 1 : GRB_TRY (GrB_transpose (AT, NULL, NULL, S, NULL)) ;
233 : }
234 :
235 : // Initialize data structures for extraction from 'AL_in' and (for directed graphs) 'AL_out'
236 10 : LAGRAPH_TRY (LAGraph_Malloc((void **) &I, nnz, sizeof(GrB_Index), msg));
237 10 : LAGRAPH_TRY (LAGraph_Malloc((void **) &X, nnz, sizeof(GrB_Index), msg));
238 :
239 353 : for (int iteration = 0; iteration < itermax; iteration++)
240 : {
241 : // A = A min.2nd L
242 : // (using the "push" (saxpy) method)
243 350 : GRB_TRY (GrB_mxm(S, GrB_NULL, GrB_NULL,
244 : GrB_MIN_SECOND_SEMIRING_UINT64, S, L, NULL));
245 350 : GRB_TRY (GrB_Matrix_extractTuples_UINT64(I, GrB_NULL, X, &nz, S));
246 :
247 350 : if (!symmetric)
248 : {
249 : // A' = A' min.2nd L
250 : // (using the "push" (saxpy) method)
251 6 : GRB_TRY (GrB_mxm(AT, GrB_NULL, GrB_NULL,
252 : GrB_MIN_SECOND_SEMIRING_UINT64, AT, L, NULL));
253 6 : GRB_TRY (GrB_Matrix_extractTuples_UINT64(&I[nz],
254 : GrB_NULL, &X[nz], &nz, AT));
255 : }
256 :
257 350 : LG_msort2((int64_t *) I, (int64_t *) X, nnz, NULL);
258 :
259 : // save current labels for comparison by swapping L and L_prev
260 350 : GrB_Matrix L_swap = L;
261 350 : L = L_prev;
262 350 : L_prev = L_swap;
263 :
264 350 : GrB_Index mode_value = -1;
265 350 : GrB_Index mode_length = 0;
266 350 : GrB_Index run_length = 1;
267 :
268 : // I[k] is the current row index
269 : // X[k] is the current value
270 : // we iterate in range 1..nnz and use the last index (nnz) to process the last row of the matrix
271 8828598 : for (GrB_Index k = 1; k <= nnz; k++)
272 : {
273 : // check if we have a reason to recompute the mode value
274 8828248 : if (k == nnz // we surpassed the last element
275 8827898 : || I[k-1] != I[k] // the row index has changed
276 8512372 : || X[k-1] != X[k]) // the run value has changed
277 : {
278 727057 : if (run_length > mode_length)
279 : {
280 431414 : mode_value = X[k-1];
281 431414 : mode_length = run_length;
282 : }
283 727057 : run_length = 0;
284 : }
285 8828248 : run_length++;
286 :
287 : // check if we passed a row
288 8828248 : if (k == nnz // we surpassed the last element
289 8827898 : || I[k-1] != I[k]) // or the row index has changed
290 : {
291 315876 : GrB_Matrix_setElement(L, mode_value, I[k-1], I[k-1]);
292 315876 : mode_length = 0;
293 : }
294 : }
295 :
296 : bool isequal;
297 350 : LAGraph_Matrix_IsEqual (&isequal, L_prev, L, NULL);
298 350 : if (isequal) {
299 7 : break;
300 : }
301 : }
302 :
303 10 : LAGraph_Free ((void **) &I, NULL) ;
304 10 : LAGraph_Free ((void **) &X, NULL) ;
305 :
306 : //--------------------------------------------------------------------------
307 : // extract final labels to the result vector
308 : //--------------------------------------------------------------------------
309 :
310 10 : GRB_TRY (GrB_Vector_new(&CDLP, GrB_UINT64, n)) ;
311 : #if LAGRAPH_SUITESPARSE
312 10 : GRB_TRY (GxB_Vector_diag (CDLP, L, 0, GrB_NULL)) ;
313 : #else
314 : for (GrB_Index i = 0; i < n; i++)
315 : {
316 : uint64_t x;
317 : GRB_TRY (GrB_Matrix_extractElement(&x, L, i, i))
318 : GRB_TRY (GrB_Vector_setElement(CDLP, x, i))
319 : }
320 : #endif
321 :
322 : //--------------------------------------------------------------------------
323 : // free workspace and return result
324 : //--------------------------------------------------------------------------
325 :
326 10 : (*CDLP_handle) = CDLP;
327 10 : CDLP = NULL; // set to NULL so LG_FREE_ALL doesn't free it
328 10 : LG_FREE_ALL;
329 :
330 10 : return (GrB_SUCCESS);
331 : }
|