Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LAGraph_cdlp: community detection using label propagation
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Gabor Szarnyas and Balint Hegyi, Budapest University of
15 : // Technology and Economics (with accented characters: G\'{a}bor Sz\'{a}rnyas
16 : // and B\'{a}lint Hegyi, using LaTeX syntax).
17 : // https://inf.mit.bme.hu/en/members/szarnyasg .
18 :
19 : //------------------------------------------------------------------------------
20 :
21 : // ## Background
22 : //
23 : // This function was originally written for the LDBC Graphalytics benchmark.
24 : //
25 : // The community detection using label propagation (CDLP) algorithm is
26 : // defined both for directed and undirected graphs.
27 : //
28 : // The definition implemented here is described in the following document:
29 : // https://ldbc.github.io/ldbc_graphalytics_docs/graphalytics_spec.pdf
30 : //
31 : // The algorithm is based on the one given in the following paper:
32 : //
33 : // Usha Raghavan, Reka Albert, and Soundar Kumara. "Near linear time algorithm
34 : // to detect community structures in large-scale networks". In: Physical
35 : // Review E 76.3 (2007), p. 036106, https://arxiv.org/abs/0709.2938
36 : //
37 : // The key idea of the algorithm is that each vertex is assigned the label
38 : // that is most frequent among its neighbors. To allow reproducible
39 : // experiments, the algorithm is modified to guarantee deterministic behavior:
40 : // it always picks the smallest label in case of a tie:
41 : //
42 : // min ( argmax_{l} (#neighbors with label l) )
43 : //
44 : // In other words, we need to compute the *minimum mode value* (minmode) for
45 : // the labels among the neighbors.
46 : //
47 : // For directed graphs, a label on a neighbor that is connected through both
48 : // an outgoing and on an incoming edge counts twice:
49 : //
50 : // min ( argmax_{l} (#incoming neighbors with l + #outgoing neighbors with l) )
51 : //
52 : // ## Example (undirected)
53 : //
54 : // For an example, let's assume an undirected graph where vertex 1 has four
55 : // neighbors {2, 3, 4, 5}, and the current labels in the graph are
56 : // L = [3, 5, 4, 5, 4].
57 : //
58 : // In this example, the distribution of labels among the neighbors of vertex 1
59 : // is {4 => 2, 5 => 2}, therefore, the minimum mode value is 4.
60 : //
61 : // Next, we capture this operation using GraphBLAS operations and
62 : // data structures. Notice that the neighbors of vertex 1 are encoded
63 : // as a sparse vector in the adjacency matrix:
64 : //
65 : // A = | 0 1 1 1 1 |
66 : // | 1 . . . |
67 : // | 1 . |
68 : // | 1 . |
69 : // | 1 |
70 : //
71 : // To allow propagating the labels along edges, we use a diagonal matrix
72 : // with the elements of the diagonal set to the values of L:
73 : //
74 : // diag(L) = | 3 0 0 0 0 |
75 : // | 0 5 0 0 0 |
76 : // | 0 0 4 0 0 |
77 : // | 0 0 0 5 0 |
78 : // | 0 0 0 0 4 |
79 : //
80 : // If we multiply adjacency matrix with diag(L), we get a matrix
81 : // containing the labels of the neighbor nodes. We use the 'sel2nd' operator
82 : // for multiplication to avoid having to lookup the value on the left.
83 : // The conventional plus.times semiring would also work: 1 * y = sel2nd(1, y).
84 : // Note that we multiply with a diagonal matrix so the addition operator
85 : // is not used. In the implementation, we use "min" so the semiring is
86 : // "min.sel2nd" on uint64 values.
87 : //
88 : // In the example, this gives the following:
89 : //
90 : // AL = A min.sel2nd diag(L) = | 0 5 4 5 4 |
91 : // | 3 . . . . |
92 : //
93 : // ## Selecting the minimum mode value
94 : //
95 : // Next, we need to compute the minimum mode value for each row. As it is
96 : // difficult to capture this operation as a monoid, we use a sort operation
97 : // on each row. In the undirected case, we extract tuples <I, _, X> from the
98 : // matrix, then use <I, X> for sorting. In the directed case, we extract
99 : // tuples <I1, _, X1> and <I2, _, X2>, then use <I1+I2, X1+X2>,
100 : // where '+' denotes concatenation. Column indices (J) are not used.
101 : //
102 : // The resulting two-tuples are sorted using a parallel merge sort.
103 : // Finally, we use the sorted arrays compute the minimum mode value for each
104 : // row.
105 : //
106 : // ## Fixed point
107 : //
108 : // At the end of each iteration, we check whether L[i-1] == L[i] and
109 : // terminate if we reached a fixed point.
110 : //
111 : // ## Further optimizations
112 : //
113 : // A possible optimization is that the first iteration is rather trivial:
114 : //
115 : // * in the undirected case, each vertex gets the minimal initial label (=id)
116 : // of its neighbors.
117 : // * in the directed case, each vertex gets the minimal initial label (=id)
118 : // of its neighbors which are doubly-linked (on an incoming and on an
119 : // outgoing edge). In the absence of such a neighbor, it picks the minimal
120 : // label of its neighbors (connected through either an incoming or through
121 : // an outgoing edge).
122 :
123 : #define LG_FREE_ALL \
124 : { \
125 : LAGraph_Free ((void *) &I, NULL) ; \
126 : LAGraph_Free ((void *) &X, NULL) ; \
127 : LAGraph_Free ((void *) &AI, NULL) ; \
128 : LAGraph_Free ((void *) &AJ, NULL) ; \
129 : LAGraph_Free ((void *) &AX, NULL) ; \
130 : LAGraph_Free ((void *) &X, NULL) ; \
131 : LAGraph_Free ((void *) &X, NULL) ; \
132 : GrB_free (&L) ; \
133 : GrB_free (&L_prev) ; \
134 : if (sanitize) GrB_free (&S) ; \
135 : GrB_free (&AT) ; \
136 : }
137 :
138 : #include <LAGraph.h>
139 : #include <LAGraphX.h>
140 : #include "LG_internal.h"
141 :
142 : //****************************************************************************
143 12 : int LAGraph_cdlp
144 : (
145 : GrB_Vector *CDLP_handle, // output vector
146 : const GrB_Matrix A, // input matrix
147 : bool symmetric, // denote whether the matrix is symmetric
148 : bool sanitize, // if true, ensure A is binary
149 : int itermax, // max number of iterations,
150 : double *t, // t [0] = sanitize time, t [1] = cdlp time,
151 : // in seconds
152 : char *msg
153 : )
154 : {
155 : GrB_Info info;
156 12 : LG_CLEAR_MSG ;
157 :
158 : // Diagonal label matrix
159 12 : GrB_Matrix L = NULL;
160 12 : GrB_Matrix L_prev = NULL;
161 : // Source adjacency matrix
162 12 : GrB_Matrix S = NULL;
163 : // Transposed matrix for the unsymmetric case
164 12 : GrB_Matrix AT = NULL;
165 : // Result CDLP vector
166 12 : GrB_Vector CDLP = NULL;
167 :
168 : // Arrays holding extracted tuples if the matrix needs to be copied
169 12 : GrB_Index *AI = NULL;
170 12 : GrB_Index *AJ = NULL;
171 12 : GrB_Index *AX = NULL;
172 : // Arrays holding extracted tuples during the algorithm
173 12 : GrB_Index *I = NULL;
174 12 : GrB_Index *X = NULL;
175 :
176 : //--------------------------------------------------------------------------
177 : // check inputs
178 : //--------------------------------------------------------------------------
179 :
180 12 : if (CDLP_handle == NULL || t == NULL)
181 : {
182 1 : return GrB_NULL_POINTER;
183 : }
184 :
185 : //--------------------------------------------------------------------------
186 : // ensure input is binary and has no self-edges
187 : //--------------------------------------------------------------------------
188 :
189 11 : t [0] = 0; // sanitize time
190 11 : t [1] = 0; // CDLP time
191 :
192 : // n = size of A (# of nodes in the graph)
193 : // nz = # of non-zero elements in the matrix
194 : // nnz = # of non-zero elements used in the computations
195 : // (twice as many for directed graphs)
196 : GrB_Index n, nz, nnz;
197 11 : GRB_TRY (GrB_Matrix_nrows(&n, A))
198 11 : GRB_TRY (GrB_Matrix_nvals(&nz, A))
199 11 : if (!symmetric)
200 : {
201 1 : nnz = 2 * nz;
202 : }
203 : else
204 : {
205 10 : nnz = nz;
206 : }
207 :
208 11 : if (sanitize)
209 : {
210 4 : t [0] = LAGraph_WallClockTime ( ) ;
211 :
212 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &AI, nz, sizeof(GrB_Index),msg));
213 4 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &AJ, nz, sizeof(GrB_Index),msg));
214 4 : GRB_TRY (GrB_Matrix_extractTuples_UINT64(AI, AJ, GrB_NULL, &nz, A))
215 :
216 4 : LAGRAPH_TRY (LAGraph_Calloc ((void **) &AX, nz, sizeof(GrB_Index),msg));
217 4 : GRB_TRY (GrB_Matrix_new(&S, GrB_UINT64, n, n));
218 4 : GRB_TRY (GrB_Matrix_build(S, AI, AJ, AX, nz, GrB_PLUS_UINT64));
219 :
220 4 : t [0] = LAGraph_WallClockTime ( ) - t [0] ;
221 : }
222 : else
223 : {
224 : // Use the input as-is, and assume it is UINT64(!) with no self edges.
225 : // Results are undefined if this condition does not hold.
226 7 : S = A;
227 : }
228 :
229 11 : t [1] = LAGraph_WallClockTime ( ) ;
230 :
231 : #ifdef LAGRAPH_SUITESPARSE
232 11 : GxB_Format_Value A_format = -1, global_format = -1 ;
233 11 : GRB_TRY (GxB_get(A, GxB_FORMAT, &A_format))
234 11 : GRB_TRY (GxB_get(GxB_FORMAT, &global_format))
235 11 : if (A_format != GxB_BY_ROW || global_format != GxB_BY_ROW)
236 : {
237 1 : LG_FREE_ALL;
238 1 : return (GrB_INVALID_VALUE) ;
239 : }
240 : #endif
241 :
242 : // Initialize L with diagonal elements 1..n
243 10 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &I, n, sizeof (GrB_Index), msg)) ;
244 10 : LAGRAPH_TRY (LAGraph_Malloc ((void **) &X, n, sizeof (GrB_Index), msg)) ;
245 3304 : for (GrB_Index i = 0; i < n; i++) {
246 3294 : I[i] = i;
247 3294 : X[i] = i;
248 : }
249 10 : GRB_TRY (GrB_Matrix_new (&L, GrB_UINT64, n, n)) ;
250 10 : GRB_TRY (GrB_Matrix_build (L, I, I, X, n, GrB_PLUS_UINT64)) ;
251 10 : LAGraph_Free ((void **) &I, NULL) ;
252 10 : LAGraph_Free ((void **) &X, NULL) ;
253 :
254 : // Initialize matrix for storing previous labels
255 10 : GRB_TRY (GrB_Matrix_new(&L_prev, GrB_UINT64, n, n))
256 :
257 10 : if (!symmetric)
258 : {
259 : // compute AT for the unsymmetric case as it will be used
260 : // to compute A' = A' min.2nd L in each iteration
261 1 : GRB_TRY (GrB_Matrix_new (&AT, GrB_UINT64, n, n)) ;
262 1 : GRB_TRY (GrB_transpose (AT, NULL, NULL, A, NULL)) ;
263 : }
264 :
265 259 : for (int iteration = 0; iteration < itermax; iteration++)
266 : {
267 : // Initialize data structures for extraction from 'AL_in' and (for directed graphs) 'AL_out'
268 257 : LAGRAPH_TRY (LAGraph_Malloc((void **) &I, nnz, sizeof(GrB_Index), msg));
269 257 : LAGRAPH_TRY (LAGraph_Malloc((void **) &X, nnz, sizeof(GrB_Index), msg));
270 :
271 : // A = A min.2nd L
272 : // (using the "push" (saxpy) method)
273 257 : GRB_TRY (GrB_mxm(S, GrB_NULL, GrB_NULL,
274 : GrB_MIN_SECOND_SEMIRING_UINT64, S, L, NULL));
275 257 : GRB_TRY (GrB_Matrix_extractTuples_UINT64(I, GrB_NULL, X, &nz, S));
276 :
277 257 : if (!symmetric)
278 : {
279 : // A' = A' min.2nd L
280 : // (using the "push" (saxpy) method)
281 6 : GRB_TRY (GrB_mxm(AT, GrB_NULL, GrB_NULL,
282 : GrB_MIN_SECOND_SEMIRING_UINT64, AT, L, NULL));
283 6 : GRB_TRY (GrB_Matrix_extractTuples_UINT64(&I[nz],
284 : GrB_NULL, &X[nz], &nz, AT));
285 : }
286 :
287 257 : LG_msort2((int64_t *) I, (int64_t *) X, nnz, NULL);
288 :
289 : // save current labels for comparison by swapping L and L_prev
290 257 : GrB_Matrix L_swap = L;
291 257 : L = L_prev;
292 257 : L_prev = L_swap;
293 :
294 257 : GrB_Index mode_value = -1;
295 257 : GrB_Index mode_length = 0;
296 257 : GrB_Index run_length = 1;
297 :
298 : // I[k] is the current row index
299 : // X[k] is the current value
300 : // we iterate in range 1..nnz and use the last index (nnz) to process the last row of the matrix
301 1524968 : for (GrB_Index k = 1; k <= nnz; k++)
302 : {
303 : // check if we have a reason to recompute the mode value
304 1524711 : if (k == nnz // we surpassed the last element
305 1524454 : || I[k-1] != I[k] // the row index has changed
306 1474530 : || X[k-1] != X[k]) // the run value has changed
307 : {
308 260160 : if (run_length > mode_length)
309 : {
310 84859 : mode_value = X[k-1];
311 84859 : mode_length = run_length;
312 : }
313 260160 : run_length = 0;
314 : }
315 1524711 : run_length++;
316 :
317 : // check if we passed a row
318 1524711 : if (k == nnz // we surpassed the last element
319 1524454 : || I[k-1] != I[k]) // or the row index has changed
320 : {
321 50181 : GrB_Matrix_setElement(L, mode_value, I[k-1], I[k-1]);
322 50181 : mode_length = 0;
323 : }
324 : }
325 :
326 257 : LAGraph_Free ((void **) &I, NULL) ;
327 257 : LAGraph_Free ((void **) &X, NULL) ;
328 :
329 : bool isequal;
330 257 : LAGraph_Matrix_IsEqual (&isequal, L_prev, L, NULL);
331 257 : if (isequal) {
332 8 : break;
333 : }
334 : }
335 :
336 : //--------------------------------------------------------------------------
337 : // extract final labels to the result vector
338 : //--------------------------------------------------------------------------
339 :
340 10 : GRB_TRY (GrB_Vector_new(&CDLP, GrB_UINT64, n))
341 3304 : for (GrB_Index i = 0; i < n; i++)
342 : {
343 : uint64_t x;
344 3294 : GRB_TRY (GrB_Matrix_extractElement(&x, L, i, i))
345 3294 : GRB_TRY (GrB_Vector_setElement(CDLP, x, i))
346 : }
347 :
348 : //--------------------------------------------------------------------------
349 : // free workspace and return result
350 : //--------------------------------------------------------------------------
351 :
352 10 : (*CDLP_handle) = CDLP;
353 10 : CDLP = NULL; // set to NULL so LG_FREE_ALL doesn't free it
354 10 : LG_FREE_ALL;
355 :
356 10 : t [1] = LAGraph_WallClockTime ( ) - t [1] ;
357 :
358 10 : return (GrB_SUCCESS);
359 : }
|