1
1
2
- #ifndef PARLAY_TRANSPOSE_H_
3
- #define PARLAY_TRANSPOSE_H_
2
+ #ifndef PARLAY_INTERNAL_TRANSPOSE_H_
3
+ #define PARLAY_INTERNAL_TRANSPOSE_H_
4
4
5
+ #include < cassert>
6
+ #include < cstddef>
7
+
8
+ #include " ../monoid.h"
9
+ #include " ../parallel.h"
10
+ #include " ../sequence.h"
11
+ #include " ../slice.h"
5
12
#include " ../utilities.h"
6
13
14
+ #include " sequence_ops.h"
15
+
7
16
namespace parlay {
8
17
namespace internal {
9
18
@@ -21,17 +30,37 @@ constexpr const size_t NON_CACHE_OBLIVIOUS_THRESHOLD = 1 << 22;
21
30
22
31
// Returns the size of the first part when a range of n items is divided in
// two, i.e. floor(n / 2). Callers compute the second part as n - split(n).
inline size_t split(size_t n) {
  const size_t half = n >> 1;  // identical to n / 2 for an unsigned value
  return half;
}
23
32
24
- template <typename assignment_tag, typename Iterator>
33
+ // Given a flat matrix represented in row-major order (i.e., a matrix where
34
+ // each row is written one after the other in a 1D sequence), computes the
35
+ // transpose of that matrix.
36
+ //
37
+ // E.g., given [1,2,3,1,2,3,1,2,3] which represents the matrix
38
+ //
39
+ // 1 2 3
40
+ // 1 2 3
41
+ // 1 2 3
42
+ //
43
+ // it computes [1,1,1,2,2,2,3,3,3] which represents the transpose matrix
44
+ //
45
+ // 1 1 1
46
+ // 2 2 2
47
+ // 3 3 3
48
+ //
49
+ // Alternatively, you can think of it as swapping to column-major
50
+ // representation when starting from row-major representation
51
+ //
52
+ template <typename assignment_tag, typename InIterator, typename OutIterator>
25
53
struct transpose {
26
- Iterator A, B;
27
- transpose (Iterator AA, Iterator BB) : A(AA), B(BB) {}
54
+ InIterator In;
55
+ OutIterator Out;
56
+ transpose (InIterator In_, OutIterator Out_) : In(std::move(In_)), Out(std::move(Out_)) {}
28
57
29
58
void transR (size_t rStart, size_t rCount, size_t rLength, size_t cStart,
30
59
size_t cCount, size_t cLength) {
31
60
if (cCount * rCount < TRANS_THRESHHOLD) {
32
61
for (size_t i = rStart; i < rStart + rCount; i++)
33
62
for (size_t j = cStart; j < cStart + cCount; j++)
34
- assign_dispatch (B [j * cLength + i], A [i * rLength + j], assignment_tag ());
63
+ assign_dispatch (Out [j * cLength + i], In [i * rLength + j], assignment_tag ());
35
64
} else if (cCount > rCount) {
36
65
size_t l1 = split (cCount);
37
66
size_t l2 = cCount - l1;
@@ -43,7 +72,7 @@ struct transpose {
43
72
};
44
73
par_do (left, right);
45
74
} else {
46
- size_t l1 = split (cCount );
75
+ size_t l1 = split (rCount );
47
76
size_t l2 = rCount - l1;
48
77
auto left = [&]() {
49
78
transR (rStart, l1, rLength, cStart, cCount, cLength);
@@ -64,27 +93,57 @@ struct transpose {
64
93
}
65
94
};
66
95
96
+ // Given a flat matrix represented in row-major order, in which the rows are divided
97
+ // into contiguous chunks, computes the matrix resulting from transposing those chunks,
98
+ // in row-major order. Note that as the matrix is represented in row-major order, it
99
+ // may have rows of different lengths, so it might not be a real matrix.
100
+ //
101
+ // For example, consider the following matrix in which the rows are chunked
102
+ //
103
+ // [ ( 1 2) ( 3 4 5) ( 6 7 8 9) ]
104
+ // [ (10 11) (12 13 14) (15 16 17 18) ]
105
+ //
106
+ // Its block-transpose is
107
+ //
108
+ // [ ( 1 2) (10 11) ]
109
+ // [ ( 3 4 5) (12 13 14) ]
110
+ // [ ( 6 7 8 9) (15 16 17 18) ]
111
+ //
112
+ // (which of course is represented in row-major order because it isn't a real matrix
113
+ // as it has imbalanced rows)
114
+ //
115
+ // The input to the problem is given in the form of the input matrix in row-major order,
116
+ // the output destination, and the offsets that define where each chunk begins in the
117
+ // input and output. For example, the input offsets for the matrix above are
118
+ //
119
+ // [ 0 2 5 ]
120
+ // [ 9 11 14 ]
121
+ //
122
+ // again, given in row-major order. You can think of the offsets as the prefix sum of
123
+ // the chunk sizes.
124
+ //
67
125
template <typename assignment_tag, typename InIterator, typename OutIterator, typename CountIterator, typename DestIterator>
68
126
struct blockTrans {
69
- InIterator A ;
70
- OutIterator B ;
71
- CountIterator OA ;
72
- DestIterator OB ;
127
+ InIterator In ;
128
+ OutIterator Out ;
129
+ CountIterator InOffset ;
130
+ DestIterator OutOffset ;
73
131
74
- blockTrans (InIterator AA , OutIterator BB , CountIterator OOA , DestIterator OOB )
75
- : A(AA), B(BB), OA(OOA), OB(OOB ) {}
132
+ blockTrans (InIterator In_ , OutIterator Out_ , CountIterator InOffset_ , DestIterator OutOffset_ )
133
+ : In(std::move(In_)), Out(std::move(Out_)), InOffset(std::move(InOffset_)), OutOffset(std::move(OutOffset_) ) {}
76
134
77
135
void transR (size_t rStart, size_t rCount, size_t rLength, size_t cStart,
78
136
size_t cCount, size_t cLength) {
79
137
if (cCount * rCount < TRANS_THRESHHOLD * 16 ) {
80
138
parallel_for (rStart, rStart + rCount, [&](size_t i) {
81
139
for (size_t j = cStart; j < cStart + cCount; j++) {
82
- size_t sa = OA[i * rLength + j];
83
- size_t sb = OB[j * cLength + i];
84
- size_t l = OA[i * rLength + j + 1 ] - sa;
85
- for (size_t k = 0 ; k < l; k++) assign_dispatch (B[k + sb], A[k + sa], assignment_tag ());
140
+ size_t sa = InOffset[i * rLength + j];
141
+ size_t sb = OutOffset[j * cLength + i];
142
+ size_t l = InOffset[i * rLength + j + 1 ] - sa;
143
+ for (size_t k = 0 ; k < l; k++) {
144
+ assign_dispatch (Out[k + sb], In[k + sa], assignment_tag ());
145
+ }
86
146
}
87
-
88
147
});
89
148
} else if (cCount > rCount) {
90
149
size_t l1 = split (cCount);
@@ -97,7 +156,7 @@ struct blockTrans {
97
156
};
98
157
par_do (left, right);
99
158
} else {
100
- size_t l1 = split (cCount );
159
+ size_t l1 = split (rCount );
101
160
size_t l2 = rCount - l1;
102
161
auto left = [&]() {
103
162
transR (rStart, l1, rLength, cStart, cCount, cLength);
@@ -121,9 +180,9 @@ struct blockTrans {
121
180
// Moves values from blocks to buckets
122
181
// From is sorted by key within each block, in block major
123
182
// counts is the # of keys in each bucket for each block, in block major
124
- // From and To are of lenght n
183
+ // From and To are of length n
125
184
// counts is of length num_blocks * num_buckets
126
- // Data is memcpy'd into To avoiding initializers and overloaded =
185
+ //
127
186
template <typename assignment_tag, typename InIterator, typename OutIterator, typename s_size_t >
128
187
sequence<size_t > transpose_buckets (InIterator From, OutIterator To,
129
188
sequence<s_size_t >& counts, size_t n,
@@ -161,10 +220,9 @@ sequence<size_t> transpose_buckets(InIterator From, OutIterator To,
161
220
};
162
221
parallel_for (0 , num_blocks, f, 1 );
163
222
} else { // for larger input do cache efficient transpose
164
- // sequence<s_size_t> source_offsets(counts,m+1);
165
- dest_offsets = sequence<s_size_t >(m);
166
- transpose<assignment_tag, typename sequence<s_size_t >::iterator>(counts.begin (), dest_offsets.begin ())
167
- .trans (num_blocks, num_buckets);
223
+ dest_offsets = sequence<s_size_t >::uninitialized (m);
224
+ transpose<uninitialized_copy_tag, decltype (counts.begin ()), decltype (dest_offsets.begin ())>
225
+ (counts.begin (), dest_offsets.begin ()).trans (num_blocks, num_buckets);
168
226
169
227
// do both scans inplace
170
228
[[maybe_unused]] size_t total = scan_inplace (make_slice (dest_offsets), add);
@@ -189,4 +247,4 @@ sequence<size_t> transpose_buckets(InIterator From, OutIterator To,
189
247
} // namespace internal
190
248
} // namespace parlay
191
249
192
- #endif // PARLAY_TRANSPOSE_H_
250
+ #endif // PARLAY_INTERNAL_TRANSPOSE_H_
0 commit comments