Line data Source code
1 : //------------------------------------------------------------------------------
2 : // LG_msort3: sort a 3-by-n list of integers, using A[0:2][ ] as the key
3 : //------------------------------------------------------------------------------
4 :
5 : // LAGraph, (c) 2019-2022 by The LAGraph Contributors, All Rights Reserved.
6 : // SPDX-License-Identifier: BSD-2-Clause
7 : //
8 : // For additional details (including references to third party source code and
9 : // other files) see the LICENSE file or contact permission@sei.cmu.edu. See
10 : // Contributors.txt for a full list of contributors. Created, in part, with
11 : // funding and support from the U.S. Government (see Acknowledgments.txt file).
12 : // DM22-0790
13 :
14 : // Contributed by Timothy A. Davis, Texas A&M University
15 :
16 : //------------------------------------------------------------------------------
17 :
18 : // A parallel mergesort of an array of 3-by-n integers. Each key
19 : // consists of three integers.
20 :
21 : #define LG_FREE_ALL LAGraph_Free ((void **) &W, NULL) ;
22 :
23 : #include "LG_internal.h"
24 :
25 : //------------------------------------------------------------------------------
26 : // prototype only needed for LG_msort3
27 : //------------------------------------------------------------------------------
28 :
29 : void LG_msort_3b_create_merge_tasks
30 : (
31 : // output:
32 : int64_t *LG_RESTRICT L_task, // L_task [t0...t0+ntasks-1] computed
33 : int64_t *LG_RESTRICT L_len, // L_len [t0...t0+ntasks-1] computed
34 : int64_t *LG_RESTRICT R_task, // R_task [t0...t0+ntasks-1] computed
35 : int64_t *LG_RESTRICT R_len, // R_len [t0...t0+ntasks-1] computed
36 : int64_t *LG_RESTRICT S_task, // S_task [t0...t0+ntasks-1] computed
37 : // input:
38 : const int t0, // first task tid to create
39 : const int ntasks, // # of tasks to create
40 : const int64_t pS_start, // merge into S [pS_start...]
41 : const int64_t *LG_RESTRICT L_0, // Left = L [pL_start...pL_end-1]
42 : const int64_t *LG_RESTRICT L_1,
43 : const int64_t *LG_RESTRICT L_2,
44 : const int64_t pL_start,
45 : const int64_t pL_end,
46 : const int64_t *LG_RESTRICT R_0, // Right = R [pR_start...pR_end-1]
47 : const int64_t *LG_RESTRICT R_1,
48 : const int64_t *LG_RESTRICT R_2,
49 : const int64_t pR_start,
50 : const int64_t pR_end
51 : ) ;
52 :
53 : //------------------------------------------------------------------------------
54 : // LG_msort_3b_binary_search: binary search for the pivot
55 : //------------------------------------------------------------------------------
56 :
57 : // The Pivot value is Y [pivot], and a binary search for the Pivot is made in
58 : // the array X [p_pstart...p_end-1], which is sorted in non-decreasing order on
59 : // input. The return value is pleft, where
60 : //
61 : // X [p_start ... pleft-1] <= Pivot and
62 : // X [pleft ... p_end-1] >= Pivot holds.
63 : //
64 : // pleft is returned in the range p_start to p_end. If pleft is p_start, then
65 : // the Pivot is smaller than all entries in X [p_start...p_end-1], and the left
66 : // list X [p_start...pleft-1] is empty. If pleft is p_end, then the Pivot is
67 : // larger than all entries in X [p_start...p_end-1], and the right list X
68 : // [pleft...p_end-1] is empty.
69 :
70 196 : static int64_t LG_msort_3b_binary_search // return pleft
71 : (
72 : const int64_t *LG_RESTRICT Y_0, // Pivot is Y [pivot]
73 : const int64_t *LG_RESTRICT Y_1,
74 : const int64_t *LG_RESTRICT Y_2,
75 : const int64_t pivot,
76 : const int64_t *LG_RESTRICT X_0, // search in X [p_start..p_end_-1]
77 : const int64_t *LG_RESTRICT X_1,
78 : const int64_t *LG_RESTRICT X_2,
79 : const int64_t p_start,
80 : const int64_t p_end
81 : )
82 : {
83 :
84 : //--------------------------------------------------------------------------
85 : // find where the Pivot appears in X
86 : //--------------------------------------------------------------------------
87 :
88 : // binary search of X [p_start...p_end-1] for the Pivot
89 196 : int64_t pleft = p_start ;
90 196 : int64_t pright = p_end - 1 ;
91 3030 : while (pleft < pright)
92 : {
93 2834 : int64_t pmiddle = (pleft + pright) >> 1 ;
94 : // less = (X [pmiddle] < Pivot)
95 2834 : bool less = LG_lt_3 (X_0, X_1, X_2, pmiddle,
96 : Y_0, Y_1, Y_2, pivot) ;
97 2834 : pleft = less ? (pmiddle+1) : pleft ;
98 2834 : pright = less ? pright : pmiddle ;
99 : }
100 :
101 : // binary search is narrowed down to a single item
102 : // or it has found the list is empty:
103 : ASSERT (pleft == pright || pleft == pright + 1) ;
104 :
105 : // If found is true then X [pleft == pright] == Pivot. If duplicates
106 : // appear then X [pleft] is any one of the entries equal to the Pivot
107 : // in the list. If found is false then
108 : // X [p_start ... pleft-1] < Pivot and
109 : // X [pleft+1 ... p_end-1] > Pivot holds.
110 : // The value X [pleft] may be either < or > Pivot.
111 196 : bool found = (pleft == pright) && LG_eq_3 (X_0, X_1, X_2, pleft,
112 : Y_0, Y_1, Y_2, pivot) ;
113 :
114 : // Modify pleft and pright:
115 196 : if (!found && (pleft == pright))
116 : {
117 100 : if (LG_lt_3 (X_0, X_1, X_2, pleft,
118 : Y_0, Y_1, Y_2, pivot))
119 : {
120 2 : pleft++ ;
121 : }
122 : else
123 : {
124 : // pright++ ; // (not needed)
125 : }
126 : }
127 :
128 : //--------------------------------------------------------------------------
129 : // return result
130 : //--------------------------------------------------------------------------
131 :
132 : // If found is false then
133 : // X [p_start ... pleft-1] < Pivot and
134 : // X [pleft ... p_end-1] > Pivot holds,
135 : // and pleft-1 == pright
136 :
137 : // If X has no duplicates, then whether or not Pivot is found,
138 : // X [p_start ... pleft-1] < Pivot and
139 : // X [pleft ... p_end-1] >= Pivot holds.
140 :
141 : // If X has duplicates, then whether or not Pivot is found,
142 : // X [p_start ... pleft-1] <= Pivot and
143 : // X [pleft ... p_end-1] >= Pivot holds.
144 :
145 196 : return (pleft) ;
146 : }
147 :
148 : //------------------------------------------------------------------------------
149 : // LG_msort_3b_create_merge_tasks
150 : //------------------------------------------------------------------------------
151 :
152 : // Recursively constructs ntasks tasks to merge two arrays, Left and Right,
153 : // into Sresult, where Left is L [pL_start...pL_end-1], Right is R
154 : // [pR_start...pR_end-1], and Sresult is S [pS_start...pS_start+total_work-1],
155 : // and where total_work is the total size of Left and Right.
156 : //
157 : // Task tid will merge L [L_task [tid] ... L_task [tid] + L_len [tid] - 1] and
158 : // R [R_task [tid] ... R_task [tid] + R_len [tid] -1] into the merged output
159 : // array S [S_task [tid] ... ]. The task tids created are t0 to
160 : // t0+ntasks-1.
161 :
162 452 : void LG_msort_3b_create_merge_tasks
163 : (
164 : // output:
165 : int64_t *LG_RESTRICT L_task, // L_task [t0...t0+ntasks-1] computed
166 : int64_t *LG_RESTRICT L_len, // L_len [t0...t0+ntasks-1] computed
167 : int64_t *LG_RESTRICT R_task, // R_task [t0...t0+ntasks-1] computed
168 : int64_t *LG_RESTRICT R_len, // R_len [t0...t0+ntasks-1] computed
169 : int64_t *LG_RESTRICT S_task, // S_task [t0...t0+ntasks-1] computed
170 : // input:
171 : const int t0, // first task tid to create
172 : const int ntasks, // # of tasks to create
173 : const int64_t pS_start, // merge into S [pS_start...]
174 : const int64_t *LG_RESTRICT L_0, // Left = L [pL_start...pL_end-1]
175 : const int64_t *LG_RESTRICT L_1,
176 : const int64_t *LG_RESTRICT L_2,
177 : const int64_t pL_start,
178 : const int64_t pL_end,
179 : const int64_t *LG_RESTRICT R_0, // Right = R [pR_start...pR_end-1]
180 : const int64_t *LG_RESTRICT R_1,
181 : const int64_t *LG_RESTRICT R_2,
182 : const int64_t pR_start,
183 : const int64_t pR_end
184 : )
185 : {
186 :
187 : //--------------------------------------------------------------------------
188 : // get problem size
189 : //--------------------------------------------------------------------------
190 :
191 452 : int64_t nleft = pL_end - pL_start ; // size of Left array
192 452 : int64_t nright = pR_end - pR_start ; // size of Right array
193 452 : int64_t total_work = nleft + nright ; // total work to do
194 : ASSERT (ntasks >= 1) ;
195 : ASSERT (total_work > 0) ;
196 :
197 : //--------------------------------------------------------------------------
198 : // create the tasks
199 : //--------------------------------------------------------------------------
200 :
201 452 : if (ntasks == 1)
202 : {
203 :
204 : //----------------------------------------------------------------------
205 : // a single task will merge all of Left and Right into Sresult
206 : //----------------------------------------------------------------------
207 :
208 256 : L_task [t0] = pL_start ; L_len [t0] = nleft ;
209 256 : R_task [t0] = pR_start ; R_len [t0] = nright ;
210 256 : S_task [t0] = pS_start ;
211 :
212 : }
213 : else
214 : {
215 :
216 : //----------------------------------------------------------------------
217 : // partition the Left and Right arrays for multiple merge tasks
218 : //----------------------------------------------------------------------
219 :
220 : int64_t pleft, pright ;
221 196 : if (nleft >= nright)
222 : {
223 : // split Left in half, and search for its pivot in Right
224 144 : pleft = (pL_end + pL_start) >> 1 ;
225 144 : pright = LG_msort_3b_binary_search (
226 : L_0, L_1, L_2, pleft,
227 : R_0, R_1, R_2, pR_start, pR_end) ;
228 : }
229 : else
230 : {
231 : // split Right in half, and search for its pivot in Left
232 52 : pright = (pR_end + pR_start) >> 1 ;
233 52 : pleft = LG_msort_3b_binary_search (
234 : R_0, R_1, R_2, pright,
235 : L_0, L_1, L_2, pL_start, pL_end) ;
236 : }
237 :
238 : //----------------------------------------------------------------------
239 : // partition the tasks according to the work of each partition
240 : //----------------------------------------------------------------------
241 :
242 : // work0 is the total work in the first partition
243 196 : int64_t work0 = (pleft - pL_start) + (pright - pR_start) ;
244 196 : int ntasks0 = (int) round ((double) ntasks *
245 196 : (((double) work0) / ((double) total_work))) ;
246 :
247 : // ensure at least one task is assigned to each partition
248 196 : ntasks0 = LAGRAPH_MAX (ntasks0, 1) ;
249 196 : ntasks0 = LAGRAPH_MIN (ntasks0, ntasks-1) ;
250 196 : int ntasks1 = ntasks - ntasks0 ;
251 :
252 : //----------------------------------------------------------------------
253 : // assign ntasks0 to the first half
254 : //----------------------------------------------------------------------
255 :
256 : // ntasks0 tasks merge L [pL_start...pleft-1] and R [pR_start..pright-1]
257 : // into the result S [pS_start...work0-1].
258 :
259 196 : LG_msort_3b_create_merge_tasks (
260 : L_task, L_len, R_task, R_len, S_task, t0, ntasks0, pS_start,
261 : L_0, L_1, L_2, pL_start, pleft,
262 : R_0, R_1, R_2, pR_start, pright) ;
263 :
264 : //----------------------------------------------------------------------
265 : // assign ntasks1 to the second half
266 : //----------------------------------------------------------------------
267 :
268 : // ntasks1 tasks merge L [pleft...pL_end-1] and R [pright...pR_end-1]
269 : // into the result S [pS_start+work0...pS_start+total_work].
270 :
271 196 : int t1 = t0 + ntasks0 ; // first task id of the second set of tasks
272 196 : int64_t pS_start1 = pS_start + work0 ; // 2nd set starts here in S
273 196 : LG_msort_3b_create_merge_tasks (
274 : L_task, L_len, R_task, R_len, S_task, t1, ntasks1, pS_start1,
275 : L_0, L_1, L_2, pleft, pL_end,
276 : R_0, R_1, R_2, pright, pR_end) ;
277 : }
278 452 : }
279 :
280 : //------------------------------------------------------------------------------
281 : // LG_msort_3b_merge: merge two sorted lists via a single thread
282 : //------------------------------------------------------------------------------
283 :
284 : // merge Left [0..nleft-1] and Right [0..nright-1] into S [0..nleft+nright-1] */
285 :
286 256 : static void LG_msort_3b_merge
287 : (
288 : int64_t *LG_RESTRICT S_0, // output of length nleft + nright
289 : int64_t *LG_RESTRICT S_1,
290 : int64_t *LG_RESTRICT S_2,
291 : const int64_t *LG_RESTRICT Left_0, // left input of length nleft
292 : const int64_t *LG_RESTRICT Left_1,
293 : const int64_t *LG_RESTRICT Left_2,
294 : const int64_t nleft,
295 : const int64_t *LG_RESTRICT Right_0, // right input of length nright
296 : const int64_t *LG_RESTRICT Right_1,
297 : const int64_t *LG_RESTRICT Right_2,
298 : const int64_t nright
299 : )
300 : {
301 : int64_t p, pleft, pright ;
302 :
303 : // merge the two inputs, Left and Right, while both inputs exist
304 3688060 : for (p = 0, pleft = 0, pright = 0 ; pleft < nleft && pright < nright ; p++)
305 : {
306 3687804 : if (LG_lt_3 (Left_0, Left_1, Left_2, pleft,
307 : Right_0, Right_1, Right_2, pright))
308 : {
309 : // S [p] = Left [pleft++]
310 1668324 : S_0 [p] = Left_0 [pleft] ;
311 1668324 : S_1 [p] = Left_1 [pleft] ;
312 1668324 : S_2 [p] = Left_2 [pleft] ;
313 1668324 : pleft++ ;
314 : }
315 : else
316 : {
317 : // S [p] = Right [pright++]
318 2019480 : S_0 [p] = Right_0 [pright] ;
319 2019480 : S_1 [p] = Right_1 [pright] ;
320 2019480 : S_2 [p] = Right_2 [pright] ;
321 2019480 : pright++ ;
322 : }
323 : }
324 :
325 : // either input is exhausted; copy the remaining list into S
326 256 : if (pleft < nleft)
327 : {
328 176 : int64_t nremaining = (nleft - pleft) ;
329 176 : memcpy (S_0 + p, Left_0 + pleft, nremaining * sizeof (int64_t)) ;
330 176 : memcpy (S_1 + p, Left_1 + pleft, nremaining * sizeof (int64_t)) ;
331 176 : memcpy (S_2 + p, Left_2 + pleft, nremaining * sizeof (int64_t)) ;
332 : }
333 80 : else if (pright < nright)
334 : {
335 80 : int64_t nremaining = (nright - pright) ;
336 80 : memcpy (S_0 + p, Right_0 + pright, nremaining * sizeof (int64_t)) ;
337 80 : memcpy (S_1 + p, Right_1 + pright, nremaining * sizeof (int64_t)) ;
338 80 : memcpy (S_2 + p, Right_2 + pright, nremaining * sizeof (int64_t)) ;
339 : }
340 256 : }
341 :
342 : //------------------------------------------------------------------------------
343 : // LG_msort3: parallel mergesort
344 : //------------------------------------------------------------------------------
345 :
346 111 : int LG_msort3
347 : (
348 : // input/output:
349 : int64_t *A_0, // size n array
350 : int64_t *A_1, // size n array
351 : int64_t *A_2, // size n array
352 : // input:
353 : const int64_t n,
354 : char *msg
355 : )
356 : {
357 :
358 : //--------------------------------------------------------------------------
359 : // check inputs
360 : //--------------------------------------------------------------------------
361 :
362 111 : LG_CLEAR_MSG ;
363 111 : int64_t *LG_RESTRICT W = NULL ;
364 111 : LG_ASSERT (A_0 != NULL, GrB_NULL_POINTER) ;
365 111 : LG_ASSERT (A_1 != NULL, GrB_NULL_POINTER) ;
366 111 : LG_ASSERT (A_2 != NULL, GrB_NULL_POINTER) ;
367 :
368 : //--------------------------------------------------------------------------
369 : // handle small problems with a single thread
370 : //--------------------------------------------------------------------------
371 :
372 111 : int nthreads = LG_nthreads_outer * LG_nthreads_inner ; // # threads to use
373 111 : if (nthreads <= 1 || n <= LG_BASECASE)
374 : {
375 : // sequential quicksort
376 105 : LG_qsort_3 (A_0, A_1, A_2, n) ;
377 105 : return (GrB_SUCCESS) ;
378 : }
379 :
380 : //--------------------------------------------------------------------------
381 : // determine # of tasks
382 : //--------------------------------------------------------------------------
383 :
384 : // determine the number of levels to create, which must always be an
385 : // even number. The # of levels is chosen to ensure that the # of leaves
386 : // of the task tree is between 4*nthreads and 16*nthreads.
387 :
388 : // 2 to 4 threads: 4 levels, 16 qsort leaves
389 : // 5 to 16 threads: 6 levels, 64 qsort leaves
390 : // 17 to 64 threads: 8 levels, 256 qsort leaves
391 : // 65 to 256 threads: 10 levels, 1024 qsort leaves
392 : // 256 to 1024 threads: 12 levels, 4096 qsort leaves
393 : // ...
394 :
395 6 : int k = (int) (2 + 2 * ceil (log2 ((double) nthreads) / 2)) ;
396 6 : int ntasks = 1 << k ;
397 :
398 : //--------------------------------------------------------------------------
399 : // allocate workspace
400 : //--------------------------------------------------------------------------
401 :
402 6 : LG_TRY (LAGraph_Malloc ((void **) &W, 3*n + 6*ntasks + 1, sizeof (int64_t),
403 : msg)) ;
404 :
405 4 : int64_t *T = W ;
406 4 : int64_t *LG_RESTRICT W_0 = T ; T += n ;
407 4 : int64_t *LG_RESTRICT W_1 = T ; T += n ;
408 4 : int64_t *LG_RESTRICT W_2 = T ; T += n ;
409 4 : int64_t *LG_RESTRICT L_task = T ; T += ntasks ;
410 4 : int64_t *LG_RESTRICT L_len = T ; T += ntasks ;
411 4 : int64_t *LG_RESTRICT R_task = T ; T += ntasks ;
412 4 : int64_t *LG_RESTRICT R_len = T ; T += ntasks ;
413 4 : int64_t *LG_RESTRICT S_task = T ; T += ntasks ;
414 4 : int64_t *LG_RESTRICT Slice = T ; T += (ntasks+1) ;
415 :
416 : //--------------------------------------------------------------------------
417 : // partition and sort the leaves
418 : //--------------------------------------------------------------------------
419 :
420 4 : LG_eslice (Slice, n, ntasks) ;
421 : int tid ;
422 : #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
423 68 : for (tid = 0 ; tid < ntasks ; tid++)
424 : {
425 64 : int64_t leaf = Slice [tid] ;
426 64 : int64_t leafsize = Slice [tid+1] - leaf ;
427 64 : LG_qsort_3 (A_0 + leaf, A_1 + leaf, A_2 + leaf, leafsize) ;
428 : }
429 :
430 : //--------------------------------------------------------------------------
431 : // merge each level
432 : //--------------------------------------------------------------------------
433 :
434 4 : int nt = 1 ;
435 12 : for ( ; k >= 2 ; k -= 2)
436 : {
437 :
438 : //----------------------------------------------------------------------
439 : // merge level k into level k-1, from A into W
440 : //----------------------------------------------------------------------
441 :
442 : // this could be done in parallel if ntasks was large
443 48 : for (tid = 0 ; tid < ntasks ; tid += 2*nt)
444 : {
445 : // create 2*nt tasks to merge two A sublists into one W sublist
446 40 : LG_msort_3b_create_merge_tasks (
447 40 : L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
448 40 : A_0, A_1, A_2, Slice [tid], Slice [tid+nt],
449 40 : A_0, A_1, A_2, Slice [tid+nt], Slice [tid+2*nt]) ;
450 : }
451 :
452 : #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
453 136 : for (tid = 0 ; tid < ntasks ; tid++)
454 : {
455 : // merge A [pL...pL+nL-1] and A [pR...pR+nR-1] into W [pS..]
456 128 : int64_t pL = L_task [tid], nL = L_len [tid] ;
457 128 : int64_t pR = R_task [tid], nR = R_len [tid] ;
458 128 : int64_t pS = S_task [tid] ;
459 :
460 128 : LG_msort_3b_merge (
461 128 : W_0 + pS, W_1 + pS, W_2 + pS,
462 128 : A_0 + pL, A_1 + pL, A_2 + pL, nL,
463 128 : A_0 + pR, A_1 + pR, A_2 + pR, nR) ;
464 : }
465 8 : nt = 2*nt ;
466 :
467 : //----------------------------------------------------------------------
468 : // merge level k-1 into level k-2, from W into A
469 : //----------------------------------------------------------------------
470 :
471 : // this could be done in parallel if ntasks was large
472 28 : for (tid = 0 ; tid < ntasks ; tid += 2*nt)
473 : {
474 : // create 2*nt tasks to merge two W sublists into one A sublist
475 20 : LG_msort_3b_create_merge_tasks (
476 20 : L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
477 20 : W_0, W_1, W_2, Slice [tid], Slice [tid+nt],
478 20 : W_0, W_1, W_2, Slice [tid+nt], Slice [tid+2*nt]) ;
479 : }
480 :
481 : #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
482 136 : for (tid = 0 ; tid < ntasks ; tid++)
483 : {
484 : // merge A [pL...pL+nL-1] and A [pR...pR+nR-1] into W [pS..]
485 128 : int64_t pL = L_task [tid], nL = L_len [tid] ;
486 128 : int64_t pR = R_task [tid], nR = R_len [tid] ;
487 128 : int64_t pS = S_task [tid] ;
488 128 : LG_msort_3b_merge (
489 128 : A_0 + pS, A_1 + pS, A_2 + pS,
490 128 : W_0 + pL, W_1 + pL, W_2 + pL, nL,
491 128 : W_0 + pR, W_1 + pR, W_2 + pR, nR) ;
492 : }
493 8 : nt = 2*nt ;
494 : }
495 :
496 : //--------------------------------------------------------------------------
497 : // free workspace and return result
498 : //--------------------------------------------------------------------------
499 :
500 4 : LG_FREE_ALL ;
501 4 : return (GrB_SUCCESS) ;
502 : }
|