forked from trilinos/Trilinos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsave_Zoltan2_AlgMultiJagged.hpp
9407 lines (8371 loc) · 370 KB
/
save_Zoltan2_AlgMultiJagged.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// @HEADER
//
// ***********************************************************************
//
// Zoltan2: A package of combinatorial algorithms for scientific computing
// Copyright 2012 Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Karen Devine ([email protected])
// Erik Boman ([email protected])
// Siva Rajamanickam ([email protected])
//
// ***********************************************************************
//
// @HEADER
/*! \file Zoltan2_AlgMultiJagged.hpp
\brief Contains the Multi-jagged algorithm.
*/
#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
#define _ZOLTAN2_ALGMultiJagged_HPP_
#include <Zoltan2_MultiJagged_ReductionOps.hpp>
#include <Zoltan2_CoordinateModel.hpp>
#include <Zoltan2_Parameters.hpp>
#include <Zoltan2_Algorithm.hpp>
#include <Zoltan2_IntegerRangeList.hpp>
#include <Zoltan2_CoordinatePartitioningGraph.hpp>
#include <Zoltan2_Util.hpp>
#include <Tpetra_Distributor.hpp>
#include <Teuchos_StandardParameterEntryValidators.hpp>
#include <Teuchos_ParameterList.hpp>
#include <algorithm> // std::sort
#include <vector>
#include <unordered_map>
#ifdef ZOLTAN2_USEZOLTANCOMM
#ifdef HAVE_ZOLTAN2_MPI
#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
#include "zoltan_comm_cpp.h"
#include "zoltan_types.h" // for error codes
#endif
#endif
namespace Teuchos{
/*! \brief Zoltan2_BoxBoundaries is a reduction operation
* to all reduce the all box boundaries.
*/
template <typename Ordinal, typename T>
class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal, T>
{
private:
  Ordinal size;  // number of boundary entries (stored; not referenced by reduce)
  T epsilon;     // magnitudes at or below this are treated as "no contribution"
public:
  /*! \brief Default Constructor: zero-sized reduction op.
   */
  Zoltan2_BoxBoundaries() :
    size(0),
    epsilon(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Constructor
   *  \param s_ number of boundary entries this reduction operates on.
   */
  Zoltan2_BoxBoundaries(Ordinal s_) :
    size(s_),
    epsilon(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface.
   *  For each entry, an incoming value whose magnitude exceeds epsilon
   *  overwrites the accumulated value; near-zero incoming values leave
   *  the accumulated value untouched.
   *  \param count number of entries in both buffers.
   *  \param inBuffer incoming contribution from another process.
   *  \param inoutBuffer accumulated result, updated in place.
   */
  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
    for (Ordinal idx = 0; idx < count; ++idx) {
      const T incoming = inBuffer[idx];
      if (Z2_ABS(incoming) > epsilon) {
        inoutBuffer[idx] = incoming;
      }
    }
  }
};
} // namespace Teuchos
namespace Zoltan2{
/*! \brief Class for sorting items with multiple values.
* First sorting with respect to val[0], then val[1] then ... val[count-1].
* The last tie breaking is done with index values.
* Used for task mapping partitioning where the points on a cut line needs to
* be distributed consistently.
*/
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;    // tie-breaking identifier of this item
  volatile CT count;    // number of values pointed to by val
  volatile WT *val;     // borrowed value array; this class does not own it
  volatile WT epsilon;  // tolerance: values closer than this compare equal

  /*! \brief Default constructor: empty item with no value array. */
  uMultiSortItem() :
    index(0),
    count(0),
    val(NULL),
    epsilon(std::numeric_limits<WT>::epsilon() * 100) {}

  /*! \brief Constructor.
   *  \param index_ tie-breaking identifier.
   *  \param count_ number of values in vals_.
   *  \param vals_ value array; ownership remains with the caller.
   */
  uMultiSortItem(IT index_ ,CT count_, WT *vals_) :
    index(index_),
    count(count_),
    val(vals_),
    epsilon(std::numeric_limits<WT>::epsilon() * 100) {}

  ~uMultiSortItem() {
  }

  /*! \brief Re-point this item at new data; epsilon is left unchanged. */
  void set(IT index_ ,CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  /*! \brief Lexicographic comparison over val[0..count-1].
   *  Values within epsilon of each other are considered equal and the
   *  next value decides; a full tie is broken by index, which keeps the
   *  ordering deterministic across processes.
   */
  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for (CT pos = 0; pos < this->count; ++pos) {
      const WT diff = this->val[pos] - other.val[pos];
      // values are effectively equal: defer to the next component
      if (std::abs(diff) < this->epsilon) {
        continue;
      }
      // strictly smaller wins; strictly larger loses
      return diff < 0;
    }
    // all components effectively equal: break the tie with the index
    return this->index < other.index;
  }
};
/*! \brief Sort items for quick sort function.
*/
template <class IT, class WT>
struct uSortItem
{
IT id; // identifier carried along with the sort key (e.g. a coordinate index)
WT val; // sort key; uqsort orders items by increasing val
};
/*! \brief Quick sort function.
* Sorts the arr of uSortItems, with respect to increasing vals.
* DOCWORK: Document input params
*/
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
// Iterative (Numerical-Recipes-style) quicksort with median-of-three pivot
// selection and insertion sort for small subarrays. Sorts arr[0..n-1] in
// place by increasing val. Terminates the program if the explicit stack
// overflows (would require a pathological recursion depth > 25 levels).
int NSTACK = 50; // capacity of istack below; each pending range uses 2 slots
int M = 7; // subarrays smaller than M are finished with insertion sort
IT i, ir=n, j, k, l=1;
IT jstack=0, istack[50];
WT aval;
uSortItem<IT,WT> a;
// Shift the pointer so the code can use 1-based indices arr[1..n].
--arr;
for(;;) {
if(ir-l < M) {
// Small range: insertion sort of arr[l..ir].
for(j=l+1;j<=ir;j++) {
a=arr[j];
aval = a.val;
for(i=j-1;i>=1;i--) {
if(arr[i].val <= aval)
break;
arr[i+1] = arr[i];
}
arr[i+1]=a;
}
// No pending ranges left: fully sorted.
if(jstack == 0)
break;
// Pop the next pending range [l..ir] off the explicit stack.
ir=istack[jstack--];
l=istack[jstack--];
}
else {
// Median-of-three: move the middle element next to the left end, then
// order arr[l+1] <= arr[l] <= arr[ir] so arr[l] holds the median pivot.
k=(l+ir) >> 1;
std::swap(arr[k],arr[l+1]);
if(arr[l+1].val > arr[ir].val) {
std::swap(arr[l+1],arr[ir]);
}
if(arr[l].val > arr[ir].val) {
std::swap(arr[l],arr[ir]);
}
if(arr[l+1].val > arr[l].val) {
std::swap(arr[l+1],arr[l]);
}
i=l+1;
j=ir;
a=arr[l];
aval = a.val;
// Partition: advance i/j from both ends, swapping out-of-place pairs.
for(;;) {
do i++; while (arr[i].val < aval);
do j--; while (arr[j].val > aval);
if(j < i) break;
std::swap(arr[i],arr[j]);
}
// Place the pivot at its final position j.
arr[l]=arr[j];
arr[j]=a;
jstack += 2;
if(jstack > NSTACK) {
std::cout << "uqsort: NSTACK too small in sort." << std::endl;
std::terminate();
}
// Push the larger partition on the stack; iterate on the smaller one
// to bound stack growth.
if(ir-i+1 >= j-l) {
istack[jstack]=ir;
istack[jstack-1]=i;
ir=j-1;
}
else {
istack[jstack]=j-1;
istack[jstack-1]=l;
l=i;
}
}
}
}
/*! \brief Sort item with an explicit sign carried next to the magnitude.
 *  Ordering: every negative item precedes every positive one; among items
 *  of equal sign, magnitudes compare normally for positives and in reverse
 *  for negatives (larger magnitude means a smaller negative number).
 */
template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;        // identifier carried along with the sort key
  WT val;       // magnitude part of the sort key
  SIGN signbit; // 1 means positive, 0 means negative.

  /*! \brief Strict signed less-than, see class comment for semantics. */
  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both have the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) {//if my magnitude is smaller,
        return this->signbit;//then if we both are positive return true.
        //if we both are negative, return false.
      }
      else if(this->val > rhs.val) {//if my magnitude is larger,
        return !this->signbit; //then if we both are positive return false.
        //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  /*! \brief Non-strict ordering: equal (same val and signbit) or strictly
   *  less. Fix: now const-qualified, consistent with operator< — the
   *  previous non-const version could not be called through const
   *  objects or references.
   */
  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};
/*! \brief Quick sort function.
* Sorts the arr of uSignedSortItems, with respect to increasing vals.
*/
template <class IT, class WT, class SIGN>
void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr) {
// Iterative (Numerical-Recipes-style) quicksort over uSignedSortItems,
// sorting arr[0..n-1] in place into increasing order as defined by the
// items' signed operator<. Same structure as uqsort above, but element
// comparisons go through operator< / operator<= instead of comparing
// .val directly. Terminates the program on explicit-stack overflow.
IT NSTACK = 50; // capacity of istack below; each pending range uses 2 slots
IT M = 7; // subarrays smaller than M are finished with insertion sort
IT i, ir=n, j, k, l=1;
IT jstack=0, istack[50];
uSignedSortItem<IT,WT,SIGN> a;
// Shift the pointer so the code can use 1-based indices arr[1..n].
--arr;
for(;;) {
if(ir < M + l) {
// Small range: insertion sort of arr[l..ir].
for(j=l+1;j<=ir;j++) {
a=arr[j];
for(i=j-1;i>=1;i--) {
if(arr[i] <= a) {
break;
}
arr[i+1] = arr[i];
}
arr[i+1]=a;
}
// No pending ranges left: fully sorted.
if(jstack == 0) {
break;
}
// Pop the next pending range [l..ir] off the explicit stack.
ir=istack[jstack--];
l=istack[jstack--];
}
else {
// Median-of-three: move the middle element next to the left end, then
// order arr[l+1] <= arr[l] <= arr[ir] so arr[l] holds the median pivot.
k=(l+ir) >> 1;
std::swap(arr[k],arr[l+1]);
if(arr[ir] < arr[l+1]) {
std::swap(arr[l+1],arr[ir]);
}
if(arr[ir] < arr[l] ) {
std::swap(arr[l],arr[ir]);
}
if(arr[l] < arr[l+1]) {
std::swap(arr[l+1],arr[l]);
}
i=l+1;
j=ir;
a=arr[l];
// Partition: advance i/j from both ends, swapping out-of-place pairs.
for(;;) {
do i++; while (arr[i] < a);
do j--; while (a < arr[j]);
if(j < i) break;
std::swap(arr[i],arr[j]);
}
// Place the pivot at its final position j.
arr[l]=arr[j];
arr[j]=a;
jstack += 2;
if(jstack > NSTACK) {
std::cout << "uqsort: NSTACK too small in sort." << std::endl;
std::terminate();
}
// Push the larger partition on the stack; iterate on the smaller one
// to bound stack growth. (ir+l+1 >= j+i is ir-i+1 >= j-l rearranged.)
if(ir+l+1 >= j+i) {
istack[jstack]=ir;
istack[jstack-1]=i;
ir=j-1;
}
else {
istack[jstack]=j-1;
istack[jstack-1]=l;
l=i;
}
}
}
}
// This exists only so we can track how many times the MJ algorithm is
// called and put each of those into different timer names.
// Currently the MultiJaggedTest.cpp will actually call it twice.
// First time with data from a Tpetra MultiVector and then a second time using
// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
// results of the two runs are compared which helps to catch a lot of bugs. For
// profiling I'm mostly just interested in the UVM off case and need it to be
// in separate timers. Passing a value through would mess up the API. Possibly
// we could check the Adapter and use that. The statics have to be outside the
// templated class as the two called instances will be different template
// parameters. Another complication is that MultiJagged.cpp will call through
// the Zoltan2_AlgMJ class and we want to time things in both classes. However
// TaskMapper will directly call AlgMJ so I made two counters for the two
// classes to make sure it was always correct. This does not impact any
// behavior and has the sole purpose of generating unique timer names. If you
// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
// 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
struct Zoltan2_AlgMJ_TrackCallsCounter {
  /*! \brief Returns 0 on the first call, 1 on the second, and so on.
   *  Tracks how many times AlgMJ has been invoked so each run can get a
   *  unique timer name; has no effect on partitioning behavior.
   */
  static int get_counter_AlgMJ() {
    static int num_calls = 0;
    const int this_call = num_calls;
    ++num_calls;
    return this_call;
  }
  /*! \brief Same scheme as above, but counts invocations that go through
   *  the Zoltan2_AlgMJ wrapper class rather than AlgMJ directly.
   */
  static int get_counter_Zoltan2_AlgMJ() {
    static int num_calls = 0;
    const int this_call = num_calls;
    ++num_calls;
    return this_call;
  }
};
/*! \brief Multi Jagged coordinate partitioning algorithm.
*/
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
typename mj_part_t, typename mj_node_t>
class AlgMJ
{
private:
typedef typename mj_node_t::device_type device_t; // for views
typedef coordinateModelPartBox mj_partBox_t;
typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
//if the (last dimension reduce all count) x the mpi world size
//estimated to be bigger than this number then migration will be forced
//in earlier iterations.
static constexpr size_t future_reduceall_cutoff = 1500000;
//if parts right before last dimension are estimated to have less than
//MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
static constexpr mj_lno_t min_work_last_dim = 1000;
static constexpr mj_scalar_t least_signifiance = 0.0001;
static constexpr int significance_mul = 1000;
std::string mj_timer_base_string; // for convenience making timer names
RCP<const Environment> mj_env; // the environment object
RCP<const Comm<int> > mj_problemComm; // initial comm object
RCP<Comm<int> > comm; // comm object than can be altered during execution
double imbalance_tolerance; // input imbalance tolerance.
int recursion_depth; // number of steps that partitioning will be solved in.
int coord_dim; // coordinate dim
int num_weights_per_coord; // # of weights per coord
size_t initial_num_loc_coords; // initial num local coords.
global_size_t initial_num_glob_coords; // initial num global coords.
mj_lno_t num_local_coords; // number of local coords.
mj_gno_t num_global_coords; // number of global coords.
mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
// can distribute points on same coordinate to different parts.
bool distribute_points_on_cut_lines;
// how many parts we can calculate concurrently.
mj_part_t max_concurrent_part_calculation;
bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
int mj_user_recursion_depth; // the recursion depth value provided by user.
bool mj_keep_part_boxes; // if the boxes need to be kept.
// whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
int check_migrate_avoid_migration_option;
// when doing the migration, 0 will aim for perfect load-imbalance, 1 - will
// aim for minimized number of messages with possibly bad load-imbalance
int migration_type;
// when MJ decides whether to migrate, the minimum imbalance for migration.
double minimum_migration_imbalance;
mj_part_t total_num_cut ; // how many cuts will be totally
mj_part_t total_num_part; // how many parts will be totally
mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
// maximum part+cut count along a dimension.
size_t max_num_total_part_along_dim;
mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
// max no of parts that might occur during the partition before the last
// partitioning dimension.
mj_part_t last_dim_num_part;
// input part array specifying num part to divide along each dim.
Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
// two dimension coordinate array
// coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
mj_coordinates;
// two dimension weight array
Kokkos::View<mj_scalar_t **, device_t> mj_weights;
// if the target parts are uniform
Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
// target part weight sizes.
Kokkos::View<mj_scalar_t **, device_t> mj_part_sizes;
// if the coordinates have uniform weights
Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
int mj_num_teams; // the number of teams
size_t num_global_parts; // the targeted number of parts
// vector of all boxes for all parts, constructed if mj_keep_part_boxes true
RCP<mj_partBoxVector_t> kept_boxes;
RCP<mj_partBox_t> global_box;
int myRank; // processor rank
int myActualRank; // initial rank
bool divide_to_prime_first;
// initial global ids of the coordinates.
Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
// current global ids of the coordinates, might change during migration.
Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
// the actual processor owner of the coordinate, to track after migrations.
Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
// permutation of coordinates, for partitioning.
Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
// permutation work array.
Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
// the part ids assigned to coordinates.
Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
// beginning and end of each part.
Kokkos::View<mj_lno_t *, device_t> part_xadj;
// work array for beginning and end of each part.
Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
// how much weight should a MPI put left side of the each cutline
Kokkos::View<mj_scalar_t *, device_t>
process_cut_line_weight_to_put_left;
// weight percentage each thread in MPI puts left side of each cutline
Kokkos::View<mj_scalar_t *, device_t>
thread_cut_line_weight_to_put_left;
// work array to manipulate coordinate of cutlines in different iterations.
// necessary because previous cut line information is used for determining
// the next cutline information. therefore, cannot update the cut work array
// until all cutlines are determined.
Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
// Used for swapping above cut_coordinates_work_array
Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
// cumulative part weight array.
Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
// upper bound coordinate of a cut line
Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
// lower bound coordinate of a cut line
Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
// lower bound weight of a cut line
Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
// upper bound weight of a cut line
Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
// combined array to exchange the min and max coordinate, and total
// weight of part.
Kokkos::View<mj_scalar_t *, device_t>
process_local_min_max_coord_total_weight;
// global combined array with the results for min, max and total weight.
Kokkos::View<mj_scalar_t *, device_t>
global_min_max_coord_total_weight;
// isDone is used to determine if a cutline is determined already. If a cut
// line is already determined, the next iterations will skip this cut line.
Kokkos::View<bool *, device_t> is_cut_line_determined;
// incomplete_cut_count count holds the number of cutlines that have not
// been finalized for each part when concurrentPartCount>1, using this
// information, if incomplete_cut_count[x]==0, then no work is done
// for this part.
Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
typename decltype(device_incomplete_cut_count)::HostMirror
incomplete_cut_count;
// Need a quick accessor for this on host
typename decltype (part_xadj)::HostMirror host_part_xadj;
// local part weights of each thread.
Kokkos::View<double *, device_t>
thread_part_weights;
// the work manipulation array for partweights.
Kokkos::View<double *, device_t>
thread_part_weight_work;
// thread_cut_left_closest_point to hold the closest coordinate
// to a cutline from left (for each thread).
Kokkos::View<mj_scalar_t *, device_t>
thread_cut_left_closest_point;
// thread_cut_right_closest_point to hold the closest coordinate
// to a cutline from right (for each thread)
Kokkos::View<mj_scalar_t *, device_t>
thread_cut_right_closest_point;
// to store how many points in each part a thread has.
Kokkos::View<mj_lno_t *, device_t>
thread_point_counts;
Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
// for faster communication, concatenation of
// totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
// leftClosest distances sized P-1, since P-1 cut lines
// rightClosest distances size P-1, since P-1 cut lines.
Kokkos::View<mj_scalar_t *, device_t>
total_part_weight_left_right_closests;
Kokkos::View<mj_scalar_t *, device_t>
global_total_part_weight_left_right_closests;
Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
typename decltype(device_num_partitioning_in_current_dim)::HostMirror
host_num_partitioning_in_current_dim; // for quick access on host
/* \brief helper function to calculate imbalance.
* \param achieved balance we achieved.
* \param expected balance expected.
*/
KOKKOS_INLINE_FUNCTION
double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) const {
// Relative imbalance: 0.0 means perfectly balanced, positive means
// 'achieved' exceeds 'expected'. No guard against expected == 0 —
// callers are assumed to pass a nonzero expected balance.
return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
}
/* \brief Either the mj array (part_no_array) or num_global_parts should be
* provided in the input. part_no_array takes precedence if both are
* provided. Depending on these parameters, total cut/part number, maximum
* part/cut number along a dimension, estimated number of reduceAlls,
* and the number of parts before the last dimension is calculated.
* */
void set_part_specifications();
/* \brief Tries to determine the part number for current dimension,
* by trying to make the partitioning as square as possible.
* \param num_total_future how many more partitionings are required.
* \param root how many more recursion depth is left.
*/
inline mj_part_t get_part_count(
mj_part_t num_total_future,
double root);
/* \brief for part communication we keep track of the box boundaries.
* This is performed when either asked specifically, or when geometric
* mapping is performed afterwards. This function initializes a single box
* with all global min and max coordinates.
* \param initial_partitioning_boxes the input and output vector for boxes.
*/
void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
/* \brief Function returns how many parts that will be obtained after this
* dimension partitioning. It sets how many parts each current part will be
* partitioned into in this dimension to device_num_partitioning_in_current_dim
* vector, sets how many total future parts each obtained part will be
* partitioned into in next_future_num_parts_in_parts vector, If part boxes
* are kept, then sets initializes the output_part_boxes as its ancestor.
* \param future_num_part_in_parts: input, how many future parts each
* current part will be partitioned into.
* \param next_future_num_parts_in_parts: output, how many future parts
* each obtained part will be partitioned into.
* \param future_num_parts: output, max number of future parts that will be
* obtained from a single
* \param current_num_parts: input, how many parts are there currently.
* \param current_iteration: input, current dimension iteration number.
* \param input_part_boxes: input, if boxes are kept, current boxes.
* \param output_part_boxes: output, if boxes are kept, the initial box
* boundaries for obtained parts.
* \param atomic_part_count // DOCWORK: Documentation
*/
mj_part_t update_part_num_arrays(
std::vector<mj_part_t> *future_num_part_in_parts,
std::vector<mj_part_t> *next_future_num_parts_in_parts,
mj_part_t &future_num_parts,
mj_part_t current_num_parts,
int current_iteration,
RCP<mj_partBoxVector_t> input_part_boxes,
RCP<mj_partBoxVector_t> output_part_boxes,
mj_part_t atomic_part_count);
/*! \brief Function that calculates the next pivot position,
* according to given coordinates of upper bound and lower bound, the
* weights at upper and lower bounds, and the expected weight.
* \param cut_upper_bound is the upper bound coordinate of the cut.
* \param cut_lower_bound is the lower bound coordinate of the cut.
* \param cut_upper_weight is the weights at the upper bound of the cut.
* \param cut_lower_weight is the weights at the lower bound of the cut.
* \param expected_weight is the expected weight that should be placed on
* the left of the cut line.
* \param new_cut_position DOCWORK: Documentation
*/
KOKKOS_INLINE_FUNCTION
void mj_calculate_new_cut_position (
mj_scalar_t cut_upper_bound,
mj_scalar_t cut_lower_bound,
mj_scalar_t cut_upper_weight,
mj_scalar_t cut_lower_weight,
mj_scalar_t expected_weight,
mj_scalar_t &new_cut_position);
/*! \brief Function checks if should do migration or not.
* It returns true to point that migration should be done when
* -migration_reduce_all_population are higher than a predetermined value
* -num_coords_for_last_dim_part that left for the last dimension
* partitioning is less than a predetermined value - the imbalance of the
* processors on the parts are higher than given threshold.
* \param input_num_parts is the number of parts when migration is called.
* \param output_num_parts is the output number of parts after migration.
* \param next_future_num_parts_in_parts is the number of total future parts
* each part is partitioned into. Updated when migration is performed.
* \param output_part_begin_index is the number that will be used as
* beginning part number when final solution part numbers are assigned.
* \param migration_reduce_all_population is the estimated total number of
* reduceall operations multiplied with number of processors to be used for
* determining migration.
* \param num_coords_for_last_dim_part is the estimated number of points in
* each part, when last dimension partitioning is performed.
* \param iteration is the string that gives information about the dimension
* for printing purposes.
* \param input_part_boxes is the array that holds the part boxes after the
* migration. (swapped)
* \param output_part_boxes is the array that holds the part boxes before
* the migration. (swapped)
*/
bool mj_perform_migration(
mj_part_t in_num_parts, //current number of parts
mj_part_t &out_num_parts, //output number of parts.
std::vector<mj_part_t> *next_future_num_parts_in_parts,
mj_part_t &output_part_begin_index,
size_t migration_reduce_all_population,
mj_lno_t num_coords_for_last_dim_part,
std::string iteration,
RCP<mj_partBoxVector_t> &input_part_boxes,
RCP<mj_partBoxVector_t> &output_part_boxes);
/*! \brief Function checks if should do migration or not.
* It returns true to point that migration should be done when
* -migration_reduce_all_population are higher than a predetermined value
* -num_coords_for_last_dim_part that left for the last dimension
* partitioning is less than a predetermined value - the imbalance of the
* processors on the parts are higher than given threshold.
* \param migration_reduce_all_population is the multiplication of the
* number of reduceall operations estimated and the number of processors.
* \param num_coords_for_last_dim_part is the estimated number of
* coordinates in a part per processor in the last dimension partitioning.
* \param num_procs is the number of processor attending to migration
* operation.
* \param num_parts is the number of parts that exist in the current
* partitioning.
* \param num_points_in_all_processor_parts is the input array that holds
* the number of coordinates in each part in each processor.
*/
bool mj_check_to_migrate(
size_t migration_reduce_all_population,
mj_lno_t num_coords_for_last_dim_part,
mj_part_t num_procs,
mj_part_t num_parts,
mj_gno_t *num_points_in_all_processor_parts);
/*! \brief Function fills up coordinate_destinations is the output array
* that holds which part each coordinate should be sent. In addition it
* calculates the shift amount (output_part_numbering_begin_index) to be
* done when final numberings of the parts are performed.
* \param num_points_in_all_processor_parts is the array holding the num
* points in each part in each proc.
* \param num_parts is the number of parts that exist in the current
* partitioning.
* \param num_procs is the number of processor attending to migration
* operation.
* \param send_count_to_each_proc array array storing the number of points
* to be sent to each part.
* \param processor_ranks_for_subcomm is the ranks of the processors that
* will be in the subcommunicator with me.
* \param next_future_num_parts_in_parts is the vector, how many more parts
* each part will be divided into in the future.
* \param out_num_part is the number of parts assigned to the process.
* \param out_part_indices is the indices of the part to which the processor
* is assigned.
* \param output_part_numbering_begin_index is how much the numbers should
* be shifted when numbering the result parts.
* \param coordinate_destinations is the output array that holds which part
* each coordinate should be sent.
*/
void mj_migration_part_proc_assignment(
mj_gno_t * num_points_in_all_processor_parts,
mj_part_t num_parts,
mj_part_t num_procs,
mj_lno_t *send_count_to_each_proc,
std::vector<mj_part_t> &processor_ranks_for_subcomm,
std::vector<mj_part_t> *next_future_num_parts_in_parts,
mj_part_t &out_num_part,
std::vector<mj_part_t> &out_part_indices,
mj_part_t &output_part_numbering_begin_index,
int *coordinate_destinations);
/*! \brief Function that assigned the processors to parts, when there are
* more processors then parts. Sets the destination of each coordinate in
* coordinate_destinations, also edits output_part_numbering_begin_index,
* and out_part_index, and returns the processor_ranks_for_subcomm which
* represents the ranks of the processors that will be used for creating the
* subcommunicator.
* \param num_points_in_all_processor_parts is the array holding the num
* points in each part in each proc.
* \param num_parts is the number of parts that exist in the current
* partitioning.
* \param num_procs is the number of processor attending to migration
* operation.
* \param send_count_to_each_proc array array storing the number of points
* to be sent to each part.
* \param processor_ranks_for_subcomm is the ranks of the processors that
* will be in the subcommunicator with me.
* \param next_future_num_parts_in_parts is the vector, how many more parts
* each part will be divided into in the future.
* \param out_part_index is the index of the part to which the processor
* is assigned.
* \param output_part_numbering_begin_index is how much the numbers should
* be shifted when numbering the result parts.
* \param coordinate_destinations is the output array that holds which part
* each coordinate should be sent.
*/
void mj_assign_proc_to_parts(
mj_gno_t * num_points_in_all_processor_parts,
mj_part_t num_parts,
mj_part_t num_procs,
mj_lno_t *send_count_to_each_proc,
std::vector<mj_part_t> &processor_ranks_for_subcomm,
std::vector<mj_part_t> *next_future_num_parts_in_parts,
mj_part_t &out_part_index,
mj_part_t &output_part_numbering_begin_index,
int *coordinate_destinations);
/*! \brief Function fills up coordinate_destinations is the output array
* that holds which part each coordinate should be sent.
* \param num_parts is the number of parts that exist in the
* current partitioning.
* \param num_procs is the number of processors attending to
* migration operation.
* \param part_assignment_proc_begin_indices ([i]) points to the first
* processor index that part i will be sent to.
* \param processor_chains_in_parts the array that holds the linked list
* structure, started from part_assignment_proc_begin_indices ([i]).
 * \param send_count_to_each_proc is an array storing the number of points
 * to be sent to each processor.
* \param coordinate_destinations is the output array that holds which part
* each coordinate should be sent.
*/
void assign_send_destinations(
mj_part_t num_parts,
mj_part_t *part_assignment_proc_begin_indices,
mj_part_t *processor_chains_in_parts,
mj_lno_t *send_count_to_each_proc,
int *coordinate_destinations);
/*! \brief Function fills up coordinate_destinations is the output array
* that holds which part each coordinate should be sent. In addition it
* calculates the shift amount (output_part_numbering_begin_index) to be done
* when final numberings of the parts are performed.
* \param num_parts is the number of parts in the current partitioning.
* \param sort_item_part_to_proc_assignment is the sorted parts with respect
* to the assigned processors.
* \param coordinate_destinations is the output array that holds which part
* each coordinate should be sent.
* \param output_part_numbering_begin_index is how much the numbers should be
* shifted when numbering the result parts.
* \param next_future_num_parts_in_parts is the vector, how many more parts
* each part will be divided into in the future.
*/
void assign_send_destinations2(
mj_part_t num_parts,
uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
int *coordinate_destinations,
mj_part_t &output_part_numbering_begin_index,
std::vector<mj_part_t> *next_future_num_parts_in_parts);
/*! \brief Function fills up coordinate_destinations is the output array
* that holds which part each coordinate should be sent. In addition it
* calculates the shift amount (output_part_numbering_begin_index) to be done
* when final numberings of the parts are performed.
* \param num_points_in_all_processor_parts is the array holding the num
* points in each part in each proc.
* \param num_parts is the number of parts that exist in the current
* partitioning.
* \param num_procs is the number of processors attending to
* migration operation.
 * \param send_count_to_each_proc is an array storing the number of points
 * to be sent to each processor.
* \param next_future_num_parts_in_parts is the vector, how many more parts
* each part will be divided into in the future.
* \param out_num_part is the number of parts assigned to the process.
* \param out_part_indices is the indices of the part to which the processor
* is assigned.
* \param output_part_numbering_begin_index is how much the numbers should be
* shifted when numbering the result parts.
 * \param coordinate_destinations is the output array that holds the part
 * to which each coordinate should be sent.
*/
void mj_assign_parts_to_procs(
mj_gno_t * num_points_in_all_processor_parts,
mj_part_t num_parts,
mj_part_t num_procs,
mj_lno_t *send_count_to_each_proc,
std::vector<mj_part_t> *next_future_num_parts_in_parts,
mj_part_t &out_num_part,
std::vector<mj_part_t> &out_part_indices,
mj_part_t &output_part_numbering_begin_index,
int *coordinate_destinations);
/*! \brief Function fills up coordinate_destinations is the output array
* that holds which part each coordinate should be sent. In addition it
* calculates the shift amount (output_part_numbering_begin_index) to be done
* when final numberings of the parts are performed.
 * \param num_procs is the number of processors attending to the migration
 * operation.
* \param num_new_local_points is the output to represent the new number
* of local points.
* \param iteration is the string for the current iteration.
* \param coordinate_destinations is the output array that holds which part
* each coordinate should be sent.
* \param num_parts is the number of parts in the current partitioning.
*/
void mj_migrate_coords(
mj_part_t num_procs,
mj_lno_t &num_new_local_points,
std::string iteration,
int *coordinate_destinations,
mj_part_t num_parts);
/*! \brief Function creates the new subcommunicator for the processors
 * given in processor_ranks_for_subcomm.
* \param processor_ranks_for_subcomm is the vector that has the ranks of
* the processors that will be in the same group.
*/
void create_sub_communicator(
std::vector<mj_part_t> &processor_ranks_for_subcomm);
/*! \brief Function returns the largest prime factor of a given number.
 * input and output are integer-like.
 * \param num_parts the (positive) number whose largest prime factor is
 * computed; values less than 2 yield 1.
 */
mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
  mj_part_t remaining = num_parts;
  mj_part_t largest_prime = 1;
  // Trial division: each candidate factor is stripped out completely
  // before moving to the next one, so any factor recorded here is prime.
  for(mj_part_t candidate = 2;
      remaining > 1 && candidate * candidate <= remaining;
      ++candidate) {
    while(remaining % candidate == 0) {
      remaining /= candidate;
      largest_prime = candidate;
    }
  }
  // Anything left over has no factor up to its square root, so it is
  // itself prime and is the largest prime factor of the input.
  if(remaining > 1) {
    largest_prime = remaining;
  }
  return largest_prime;
}
public:
AlgMJ();
// DOCWORK: Make param documentation use : consistently
/*! \brief Multi Jagged coordinate partitioning algorithm.
* \param env: library configuration and problem parameters
* \param problemComm: the communicator for the problem
* \param imbalance_tolerance: the input provided imbalance tolerance.
* \param num_teams: number of teams for CUDA kernels.
* \param num_global_parts: number of target global parts.
* \param part_no_array: part no array, if provided this will be used
* for partitioning.
* \param recursion_depth: if part no array is provided, it is the length of
* part no array, if part no is not provided than it is the number of steps
* that algorithm will divide into num_global_parts parts.
* \param coord_dim: coordinate dimension
* \param num_local_coords: number of local coordinates
* \param num_global_coords: number of global coordinates
* \param initial_mj_gnos: the list of initial global id's
* \param mj_coordinates: the two dimensional coordinate array.
* \param num_weights_per_coord: number of weights per coordinate
* \param mj_uniform_weights: if weight index [i] has uniform weight or not.
* \param mj_weights: the two dimensional array for weights
* \param mj_uniform_parts: if the target partitioning aims uniform parts
* \param mj_part_sizes: if the target partitioning does not aim uniform
* parts, then weight of each part.
 * \param result_assigned_part_ids: Output - the result partids corresponding
 * to the coordinates given in result_mj_gnos.
* \param result_mj_gnos: Output - the result coordinate global id's
* corresponding to the part_ids array.
*/
void multi_jagged_part(
const RCP<const Environment> &env,
RCP<const Comm<int> > &problemComm,