Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LAGr_PartitionQuality: computes coverage and performance of a clustering
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Cameron Quilici, Texas A&M University
15 :
16 : //------------------------------------------------------------------------------
17 :
18 : // TODO: ready to consider for src
19 : // TODO: define the input vector c that defines the cluster assignment
20 :
21 : // The coverage of a graph clustering C ( Cov(C) ) is defined as the ratio of
22 : // intra-cluster edges to the total edges in a graph. The performance of a graph
23 : // clustering C ( Perf(C) ) is defined as the ratio of intra-cluster edges and
24 : // inter-cluster non-edges to the total number of edges in a graph. These are
25 : // very simple cluster quality metrics which can be used to evaluate the quality
26 : // of a clustering, primarily based on the idea of intra-cluster density.
27 :
28 : // Note that coverage and performance are counting problems. Therefore, if a
29 : // weighted graph is passed to this function, edge weights are ignored.
30 :
31 : // https://arxiv.org/abs/0906.0612 pp. 15
32 :
33 : #define LG_FREE_WORK \
34 : { \
35 : GrB_free(&trace); \
36 : GrB_free(&k); \
37 : GrB_free(&C); \
38 : GrB_free(&CA); \
39 : GrB_free(&ONE_INT64); \
40 : }
41 :
42 : #define LG_FREE_ALL \
43 : { \
44 : LG_FREE_WORK; \
45 : }
46 :
47 : #include "LG_internal.h"
48 : #include <LAGraphX.h>
49 :
50 43 : int LAGr_PartitionQuality(
51 : // Outputs
52 : double *cov, // coverage output, can be NULL
53 : double *perf, // performance output, can be NULL
54 : // Inputs
55 : GrB_Vector c, // input cluster vector
56 : LAGraph_Graph G, // original graph from which the clustering was obtained
57 : char *msg)
58 : {
59 : #if LAGRAPH_SUITESPARSE
60 43 : GrB_Vector trace = NULL;
61 43 : GrB_Vector k = NULL;
62 43 : GrB_Matrix C = NULL;
63 43 : GrB_Matrix CA = NULL;
64 :
65 43 : GrB_Scalar ONE_INT64 = NULL;
66 :
67 : GrB_Index n, nedges;
68 : GrB_Index n_intraEdges, n_interEdges, n_interNonEdges, sum_k2;
69 :
70 : //--------------------------------------------------------------------------
71 : // check inputs
72 : //--------------------------------------------------------------------------
73 :
74 43 : LG_CLEAR_MSG;
75 :
76 43 : LG_ASSERT_MSG(cov != NULL || perf != NULL, GrB_NULL_POINTER,
77 : "cov and perf "
78 : "cannot both be NULL");
79 :
80 42 : LG_TRY(LAGraph_CheckGraph(G, msg));
81 :
82 41 : LG_ASSERT_MSG (G->is_symmetric_structure != LAGRAPH_UNKNOWN,
83 : LAGRAPH_NOT_CACHED,
84 : "G->is_symmetric_structure is required") ;
85 :
86 40 : GrB_Matrix A = G->A;
87 :
88 : // Delete self-edges, not relevant to these clustering metrics
89 40 : GRB_TRY(GrB_select(A, NULL, NULL, GrB_OFFDIAG, A, 0, NULL));
90 :
91 : #if 0
92 : FILE *f = fopen("./data/pp_sanitized_data.mtx", "w");
93 : LAGRAPH_TRY(LAGraph_MMWrite(A, f, NULL, msg));
94 : fclose(f);
95 : #endif
96 :
97 40 : GRB_TRY(GrB_Matrix_nrows(&n, A));
98 40 : GRB_TRY(GrB_Matrix_nvals(&nedges, A));
99 :
100 40 : GRB_TRY(GrB_Matrix_new(&C, GrB_INT64, n, n));
101 40 : GRB_TRY(GrB_Matrix_new(&CA, GrB_INT64, n, n));
102 40 : GRB_TRY(GrB_Vector_new(&trace, GrB_INT64, n));
103 40 : GRB_TRY(GrB_Vector_new(&k, GrB_INT64, n));
104 40 : GRB_TRY(GrB_Scalar_new(&ONE_INT64, GrB_INT64));
105 :
106 40 : GRB_TRY(GrB_Scalar_setElement_BOOL(ONE_INT64, (uint64_t)1));
107 :
108 : // convert the cluster vector to a boolean matrix C where
109 : // C(i, j) = 1 if and only if vertex j is in cluster i
110 : GrB_Index *cI, *cX;
111 40 : LAGRAPH_TRY(LAGraph_Malloc((void **)&cI, n, sizeof(GrB_Index), msg));
112 40 : LAGRAPH_TRY(LAGraph_Malloc((void **)&cX, n, sizeof(GrB_Index), msg));
113 40 : GRB_TRY(GrB_Vector_extractTuples_INT64(cI, (int64_t *) cX, &n, c));
114 40 : GRB_TRY(GxB_Matrix_build_Scalar(C, cX, cI, ONE_INT64, n));
115 40 : GrB_Matrix_wait(C, GrB_MATERIALIZE);
116 40 : LAGraph_Free((void **)&cI, NULL);
117 40 : LAGraph_Free((void **)&cX, NULL);
118 :
119 40 : bool is_undirected = (G->is_symmetric_structure == LAGraph_TRUE);
120 :
121 : // k = sum(C) .^ 2
122 40 : GRB_TRY(GrB_reduce(k, NULL, NULL, GrB_PLUS_MONOID_INT64, C, NULL));
123 40 : GRB_TRY(GrB_Vector_apply_BinaryOp2nd_INT64(k, NULL, NULL, GxB_POW_INT64, k,
124 : 2, NULL));
125 : // sum_k2 = total number of possible intra-cluster edges
126 40 : GRB_TRY(GrB_reduce(&sum_k2, NULL, GrB_PLUS_MONOID_INT64, k, NULL));
127 :
128 : // Calculate actual number of intra-cluster edges, if A is weighted
129 : // then we ignore the weights and just count the number of edges as
130 : // performance and coverage are inherently counting problems
131 40 : GRB_TRY(GrB_mxm(CA, NULL, NULL, LAGraph_plus_one_int64, C, A, NULL));
132 40 : GRB_TRY(GrB_mxm(CA, NULL, NULL, GrB_PLUS_TIMES_SEMIRING_INT64, CA, C,
133 : GrB_DESC_T1));
134 40 : GRB_TRY(GxB_Vector_diag(trace, CA, 0, NULL));
135 :
136 40 : GRB_TRY(
137 : GrB_reduce(&n_intraEdges, NULL, GrB_PLUS_MONOID_INT64, trace, NULL));
138 :
139 40 : if (perf)
140 : {
141 : // undirected case
142 34 : if (is_undirected)
143 : {
144 22 : nedges /= 2;
145 22 : n_intraEdges /= 2;
146 22 : n_interEdges = nedges - n_intraEdges;
147 : // All possible edges - intra cluster non-edges gives space of
148 : // possible inter-cluster edges. Then subtract the actual
149 : // inter-cluster edges to get the number of inter-cluster non-edges.
150 22 : n_interNonEdges =
151 22 : (n * (n - 1) / 2) - ((sum_k2 - n) / 2) - n_interEdges;
152 22 : (*perf) =
153 22 : (n_intraEdges + n_interNonEdges) * 1.0 / (n * (n - 1) / 2);
154 : }
155 : // directed case
156 : else
157 : {
158 12 : n_interEdges = nedges - n_intraEdges;
159 12 : n_interNonEdges = n * (n - 1) - (sum_k2 - n) - n_interEdges;
160 12 : (*perf) = (n_intraEdges + n_interNonEdges) * 1.0 / (n * (n - 1));
161 : }
162 : }
163 :
164 40 : if (cov)
165 : {
166 34 : (*cov) = n_intraEdges * 1.0 / nedges;
167 : }
168 :
169 40 : LG_FREE_WORK;
170 40 : return (GrB_SUCCESS);
171 : #else
172 : return (GrB_NOT_IMPLEMENTED);
173 : #endif
174 : }
|