Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LAGraph_SquareClustering: vertex square-clustering
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Erik Welch, NVIDIA.
15 :
16 : //------------------------------------------------------------------------------
17 :
18 : // TODO: ready for src, except this is advanced; need basic method.
19 :
20 : // Compute the square clustering coefficient for each node of an undirected
21 : // graph, which is the fraction of possible squares that exist at each node.
22 : // It is a clustering coefficient suitable for bipartite graphs and is fully
23 : // described here:
24 : // https://arxiv.org/pdf/0710.0117v1.pdf
25 : // which uses a different denominator than the original definition:
26 : // https://arxiv.org/pdf/cond-mat/0504241.pdf
27 : // Furthermore, we count squares based on
28 : // https://arxiv.org/pdf/2007.11111.pdf (sigma_12, c_4)
29 : // which is implemented in LAGraph_FastGraphletTransform.c (thanks Tim Davis
30 : // for mentioning this to me!).
31 :
32 : // The NetworkX implementation of square clustering was used heavily during
33 : // development. I used it to determine the contributions to the denominator
34 : // and to verify correctness (including on larger graphs).
35 : // https://networkx.org/documentation/stable/reference/algorithms/\
36 : // generated/networkx.algorithms.cluster.square_clustering.html
37 :
38 : // Pseudocode (doesn't show dropping 0s in the final result):
39 : //
40 : // P2(~degrees.diag().S) = plus_pair(A @ A.T)
41 : // tri = first(P2 & A).reduce_rowwise()
42 : // squares = (P2 * (P2 - 1)).reduce_rowwise() / 2
43 : // uw_count = degrees * (degrees - 1)
44 : // uw_degrees = plus_times(A @ degrees) * (degrees - 1)
45 : // square_clustering = squares / (uw_degrees - uw_count - tri - squares)
46 :
47 : // The coefficient as described in https://arxiv.org/pdf/0710.0117v1.pdf
48 : // where m and n are different neighbors of node i.
49 : // Note that summations over mn are implied in the numerator and denominator:
50 : //
51 : // C_{4,mn}(i) = q_imn / ((k_m - eta_imn) + (k_n - eta_imn) + q_imn)
52 : // q_imn = # of common neighbors between m and n (i.e., squares)
53 : // k_m = number of neighbors of m (i.e., degrees[m])
54 : // eta_imn = 1 + q_imn + theta_mn
55 : // theta_mn = 1 if m and n are connected, otherwise 0 (i.e., triangles)
56 :
57 : // Here are the corresponding terms between the equation and pseudocode:
58 : // theta_mn <--> tri
59 : // q_imn <--> squares
60 : // eta_imn = 1 + ... <--> uw_count
61 : // k_m <--> uw_degrees
62 :
63 : // I first implemented this in the Python library graphblas-algorithms
64 : // https://github.com/python-graphblas/graphblas-algorithms/\
65 : // blob/main/graphblas_algorithms/algorithms/cluster.py
66 : // and I copy/pasted C code generated from the Recorder in Python-graphblas
67 : // https://github.com/python-graphblas/python-graphblas
68 :
69 : // This implementation requires that `out_degree` property is already cached.
70 : // 0 values are omitted from the result (i.e., missing values <--> zero).
71 : // Also, it computes `P2 = A @ A.T`, which may be very large. We could modify
72 : // the algorithm to compute coefficients for a subset of nodes, which would
73 : // allow expert users to compute in batches. Also, since this algorithm only
74 : // operates on undirected or symmetric graphs, we only need to compute the
75 : // upper (or lower) triangle of P2, which should reduce memory by about half.
76 : // However, this is not easy to do, and would complicate the implementation.
77 :
78 : //------------------------------------------------------------------------------
79 :
80 : #define LG_FREE_WORK \
81 : { \
82 : GrB_free (&squares) ; \
83 : GrB_free (&denom) ; \
84 : GrB_free (&neg_denom) ; \
85 : GrB_free (&P2) ; \
86 : GrB_free (&D) ; \
87 : }
88 :
89 : #define LG_FREE_ALL \
90 : { \
91 : LG_FREE_WORK ; \
92 : GrB_free (&r) ; \
93 : }
94 :
95 : #include <LAGraph.h>
96 : #include <LAGraphX.h>
97 : #include <LG_internal.h> // from src/utility
98 :
99 1 : int LAGraph_SquareClustering
100 : (
101 : // outputs:
102 : GrB_Vector *square_clustering,
103 : // inputs:
104 : LAGraph_Graph G,
105 : char *msg
106 : )
107 : {
108 1 : LG_CLEAR_MSG ;
109 :
110 : // The number of squares each node is part of
111 1 : GrB_Vector squares = NULL ;
112 :
113 : // Thought of as the total number of possible squares for each node
114 1 : GrB_Vector denom = NULL ;
115 :
116 : // Negative contributions to the denominator
117 1 : GrB_Vector neg_denom = NULL ;
118 :
119 : // Final result: the square coefficients for each node (squares / denom)
120 1 : GrB_Vector r = NULL ;
121 :
122 : // out_degrees assigned to diagonal matrix
123 : // Then used as triangles: first(P2 & A)
124 1 : GrB_Matrix D = NULL ;
125 :
126 : // P2 = plus_pair(A @ A.T).new(mask=~D.S)
127 : // Then used as a temporary workspace matrix (int64)
128 1 : GrB_Matrix P2 = NULL ;
129 :
130 1 : GrB_Vector deg = G->out_degree ;
131 1 : GrB_Matrix A = G->A ;
132 1 : GrB_Index n = 0 ;
133 :
134 : //--------------------------------------------------------------------------
135 : // check inputs
136 : //--------------------------------------------------------------------------
137 :
138 1 : LG_ASSERT (square_clustering != NULL, GrB_NULL_POINTER) ;
139 1 : (*square_clustering) = NULL ;
140 :
141 1 : LG_ASSERT_MSG (deg != NULL,
142 : LAGRAPH_NOT_CACHED, "G->out_degree is required") ;
143 :
144 1 : LG_TRY (LAGraph_CheckGraph (G, msg)) ;
145 :
146 1 : LG_ASSERT_MSG ((G->kind == LAGraph_ADJACENCY_UNDIRECTED ||
147 : (G->kind == LAGraph_ADJACENCY_DIRECTED &&
148 : G->is_symmetric_structure == LAGraph_TRUE)),
149 : LAGRAPH_SYMMETRIC_STRUCTURE_REQUIRED,
150 : "G->A must be known to be symmetric") ;
151 :
152 : // # of nodes
153 1 : GRB_TRY (GrB_Matrix_nrows (&n, A)) ;
154 :
155 : // out_degrees as a diagonal matrix.
156 1 : GRB_TRY (GrB_Matrix_diag(&D, deg, 0)) ;
157 :
158 : // We use ~D.S as a mask so P2 won't have values along the diagonal.
159 : // P2(~D.S) = plus_pair(A @ A.T)
160 1 : GRB_TRY (GrB_Matrix_new (&P2, GrB_INT64, n, n)) ;
161 1 : GRB_TRY (GrB_mxm (P2, D, NULL, LAGraph_plus_one_int64, A, A, GrB_DESC_SCT1)) ;
162 :
163 : // Denominator is thought of as total number of squares that could exist.
164 : // It has four terms (indicated below), and we use the definition from:
165 : // https://arxiv.org/pdf/0710.0117v1.pdf.
166 : //
167 : // (1) tri = first(P2 & A).reduce_rowwise()
168 : // Subtract 1 for each edge where u-w or w-u are connected.
169 : // In other words, triangles. Use P2, since we already have it.
170 : // D = first(P2 & A)
171 : // neg_denom = D.reduce_rowwise()
172 1 : GRB_TRY (GrB_Matrix_eWiseMult_BinaryOp (D, NULL, NULL, GrB_FIRST_INT64, P2,
173 : A, NULL)) ;
174 1 : GRB_TRY (GrB_Vector_new (&neg_denom, GrB_INT64, n)) ;
175 1 : GRB_TRY (GrB_Matrix_reduce_Monoid (neg_denom, NULL, NULL,
176 : GrB_PLUS_MONOID_INT64, D, NULL)) ;
177 1 : GrB_free (&D) ;
178 :
179 : // squares = (P2 * (P2 - 1)).reduce_rowwise() / 2
180 : // Now compute the number of squares (the numerator). We count squares
181 : // based on https://arxiv.org/pdf/2007.11111.pdf (sigma_12, c_4).
182 : // P2 *= P2 - 1
183 : // squares = P2.reduce_rowwise() / 2 (and drop zeros)
184 1 : GRB_TRY (GrB_Matrix_apply_BinaryOp2nd_INT64 (P2, NULL, GrB_TIMES_INT64,
185 : GrB_MINUS_INT64, P2, 1, NULL)) ;
186 1 : GRB_TRY (GrB_Vector_new (&squares, GrB_INT64, n)) ;
187 1 : GRB_TRY (GrB_Matrix_reduce_Monoid (squares, NULL, NULL,
188 : GrB_PLUS_MONOID_INT64, P2, NULL)) ;
189 1 : GrB_free (&P2) ;
190 : // Divide by 2, and use squares as value mask to drop zeros
191 1 : GRB_TRY (GrB_Vector_apply_BinaryOp2nd_INT64 (squares, squares, NULL,
192 : GrB_DIV_INT64, squares, 2, GrB_DESC_R)) ;
193 :
194 : // (2) uw_count = degrees * (degrees - 1).
195 : // Subtract 1 for each u and 1 for each w for all combos.
196 : // denom(squares.S) = degrees - 1
197 1 : GRB_TRY (GrB_Vector_new (&denom, GrB_INT64, n)) ;
198 1 : GRB_TRY (GrB_Vector_apply_BinaryOp2nd_INT64(denom, squares, NULL,
199 : GrB_MINUS_INT64, deg, 1, GrB_DESC_S)) ;
200 : // neg_denom += degrees * (degrees - 1)
201 1 : GRB_TRY (GrB_Vector_eWiseMult_BinaryOp(neg_denom, NULL, GrB_PLUS_INT64,
202 : GrB_TIMES_INT64, deg, denom, NULL)) ;
203 :
204 : // (3) uw_degrees = plus_times(A @ degrees) * (degrees - 1).
205 : // The main contribution to (and only positive term of) the denominator:
206 : // degrees[u] + degrees[w] for each u-w combo.
207 : // Recall that `denom = degrees - 1` from above.
208 : // denom(denom.S) *= plus_times(A @ deg)
209 1 : GRB_TRY (GrB_mxv(denom, denom, GrB_TIMES_INT64,
210 : GrB_PLUS_TIMES_SEMIRING_INT64, A, deg, GrB_DESC_S)) ;
211 :
212 : // (4) squares. Subtract the number of squares
213 : // denom -= neg_denom + squares
214 1 : GRB_TRY (GrB_Vector_eWiseMult_BinaryOp(denom, NULL, GrB_MINUS_INT64,
215 : GrB_PLUS_INT64, neg_denom, squares, NULL)) ;
216 :
217 : // square_clustering = squares / (uw_degrees - uw_count - tri - squares)
218 : // Almost done! Now compute the final result:
219 : // square_clustering = r = squares / denom
220 1 : GRB_TRY (GrB_Vector_new (&r, GrB_FP64, n)) ;
221 1 : GRB_TRY (GrB_Vector_eWiseMult_BinaryOp (r, NULL, NULL, GrB_DIV_FP64,
222 : squares, denom, NULL)) ;
223 :
224 1 : (*square_clustering) = r ;
225 1 : LG_FREE_WORK ;
226 1 : return (GrB_SUCCESS) ;
227 : }
|