Skip to content

Commit 2ab8659

Browse files
Fix transpose and blockTrans methods (fixes cmuparlay#17)
- Correct infinite recursion observed in transpose and blockTrans for inputs with substantially more rows than columns - Minor performance improvement for transpose_buckets - Add unit tests for the transpose algorithms - Fix some related and unrelated static analysis warnings
1 parent 99474c0 commit 2ab8659

File tree

7 files changed

+326
-30
lines changed

7 files changed

+326
-30
lines changed

.clang-tidy

+2
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
# value: 'some value'
1313

1414
Checks: 'bugprone-*,
15+
-bugprone-easily-swappable-parameters,
1516
clang-analyzer-*,
1617
performance-*,
1718
portability-*'
1819
WarningsAsErrors: 'bugprone-*,
20+
-bugprone-easily-swappable-parameters,
1921
clang-analyzer-*,
2022
performance-*,
2123
portability-*'

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# -------------------------------------------------------------------
77

88
cmake_minimum_required(VERSION 3.14)
9-
project(PARLAY VERSION 2.0.1
9+
project(PARLAY VERSION 2.0.2
1010
DESCRIPTION "A collection of parallel algorithms and other support for parallelism in C++"
1111
LANGUAGES CXX)
1212

analysis/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ configure_danalysis(
1515
${PARLAY_INCLUDE_DIR}/parlay/internal/sequence_base.h
1616
${PARLAY_INCLUDE_DIR}/parlay/internal/sequence_ops.h
1717
${PARLAY_INCLUDE_DIR}/parlay/internal/stream_delayed.h
18+
${PARLAY_INCLUDE_DIR}/parlay/internal/transpose.h
1819
${PARLAY_INCLUDE_DIR}/parlay/internal/work_stealing_job.h
1920
${PARLAY_INCLUDE_DIR}/parlay/internal/delayed/common.h
2021
${PARLAY_INCLUDE_DIR}/parlay/internal/delayed/filter.h

include/parlay/internal/transpose.h

+84-26
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
11

2-
#ifndef PARLAY_TRANSPOSE_H_
3-
#define PARLAY_TRANSPOSE_H_
2+
#ifndef PARLAY_INTERNAL_TRANSPOSE_H_
3+
#define PARLAY_INTERNAL_TRANSPOSE_H_
44

5+
#include <cassert>
6+
#include <cstddef>
7+
8+
#include "../monoid.h"
9+
#include "../parallel.h"
10+
#include "../sequence.h"
11+
#include "../slice.h"
512
#include "../utilities.h"
613

14+
#include "sequence_ops.h"
15+
716
namespace parlay {
817
namespace internal {
918

@@ -21,17 +30,37 @@ constexpr const size_t NON_CACHE_OBLIVIOUS_THRESHOLD = 1 << 22;
2130

2231
inline size_t split(size_t n) { return n / 2; }
2332

24-
template <typename assignment_tag, typename Iterator>
33+
// Given a flat matrix represented in row-major order (i.e., a matrix where
34+
// each row is written one after the other in a 1D sequence), computes the
35+
// transpose of that matrix.
36+
//
37+
// E.g., given [1,2,3,1,2,3,1,2,3] which represents the matrix
38+
//
39+
// 1 2 3
40+
// 1 2 3
41+
// 1 2 3
42+
//
43+
// it computes [1,1,1,2,2,2,3,3,3] which represents the transpose matrix
44+
//
45+
// 1 1 1
46+
// 2 2 2
47+
// 3 3 3
48+
//
49+
// Alternatively, you can think of it as swapping to column-major
50+
// representation when starting from row-major representation
51+
//
52+
template <typename assignment_tag, typename InIterator, typename OutIterator>
2553
struct transpose {
26-
Iterator A, B;
27-
transpose(Iterator AA, Iterator BB) : A(AA), B(BB) {}
54+
InIterator In;
55+
OutIterator Out;
56+
transpose(InIterator In_, OutIterator Out_) : In(std::move(In_)), Out(std::move(Out_)) {}
2857

2958
void transR(size_t rStart, size_t rCount, size_t rLength, size_t cStart,
3059
size_t cCount, size_t cLength) {
3160
if (cCount * rCount < TRANS_THRESHHOLD) {
3261
for (size_t i = rStart; i < rStart + rCount; i++)
3362
for (size_t j = cStart; j < cStart + cCount; j++)
34-
assign_dispatch(B[j * cLength + i], A[i * rLength + j], assignment_tag());
63+
assign_dispatch(Out[j * cLength + i], In[i * rLength + j], assignment_tag());
3564
} else if (cCount > rCount) {
3665
size_t l1 = split(cCount);
3766
size_t l2 = cCount - l1;
@@ -43,7 +72,7 @@ struct transpose {
4372
};
4473
par_do(left, right);
4574
} else {
46-
size_t l1 = split(cCount);
75+
size_t l1 = split(rCount);
4776
size_t l2 = rCount - l1;
4877
auto left = [&]() {
4978
transR(rStart, l1, rLength, cStart, cCount, cLength);
@@ -64,27 +93,57 @@ struct transpose {
6493
}
6594
};
6695

96+
// Given a flat matrix represented in row-major order, in which the rows are divided
97+
// into contiguous chunks, computes the matrix resulting from transposing those chunks,
98+
// in row-major order. Note that as the matrix is represented in row-major order, it
99+
// may have rows of different lengths, so it might not be a real matrix.
100+
//
101+
// For example, consider the following matrix in which the rows are chunked
102+
//
103+
// [ ( 1 2) ( 3 4 5) ( 6 7 8 9) ]
104+
// [ (10 11) (12 13 14) (15 16 17 18) ]
105+
//
106+
// Its block-transpose is
107+
//
108+
// [ ( 1 2) (10 11) ]
109+
// [ ( 3 4 5) (12 13 14) ]
110+
// [ (6 7 8 9) (15 16 17 18) ]
111+
//
112+
// (which of course is represented in row-major order because it isn't a real matrix
113+
// as it has imbalanced rows)
114+
//
115+
// The input to the problem is given in the form of the input matrix in row-major order,
116+
// the output destination, and the offsets that define where each chunk begins in the
117+
// input and output. For example, the input offsets for the matrix above are
118+
//
119+
// [ 0 2 5 ]
120+
// [ 0 2 5 ]
121+
//
122+
// again, given in row-major order. You can think of the offsets as the prefix sum of
123+
// the chunk sizes.
124+
//
67125
template <typename assignment_tag, typename InIterator, typename OutIterator, typename CountIterator, typename DestIterator>
68126
struct blockTrans {
69-
InIterator A;
70-
OutIterator B;
71-
CountIterator OA;
72-
DestIterator OB;
127+
InIterator In;
128+
OutIterator Out;
129+
CountIterator InOffset;
130+
DestIterator OutOffset;
73131

74-
blockTrans(InIterator AA, OutIterator BB, CountIterator OOA, DestIterator OOB)
75-
: A(AA), B(BB), OA(OOA), OB(OOB) {}
132+
blockTrans(InIterator In_, OutIterator Out_, CountIterator InOffset_, DestIterator OutOffset_)
133+
: In(std::move(In_)), Out(std::move(Out_)), InOffset(std::move(InOffset_)), OutOffset(std::move(OutOffset_)) {}
76134

77135
void transR(size_t rStart, size_t rCount, size_t rLength, size_t cStart,
78136
size_t cCount, size_t cLength) {
79137
if (cCount * rCount < TRANS_THRESHHOLD * 16) {
80138
parallel_for(rStart, rStart + rCount, [&](size_t i) {
81139
for (size_t j = cStart; j < cStart + cCount; j++) {
82-
size_t sa = OA[i * rLength + j];
83-
size_t sb = OB[j * cLength + i];
84-
size_t l = OA[i * rLength + j + 1] - sa;
85-
for (size_t k = 0; k < l; k++) assign_dispatch(B[k + sb], A[k + sa], assignment_tag());
140+
size_t sa = InOffset[i * rLength + j];
141+
size_t sb = OutOffset[j * cLength + i];
142+
size_t l = InOffset[i * rLength + j + 1] - sa;
143+
for (size_t k = 0; k < l; k++) {
144+
assign_dispatch(Out[k + sb], In[k + sa], assignment_tag());
145+
}
86146
}
87-
88147
});
89148
} else if (cCount > rCount) {
90149
size_t l1 = split(cCount);
@@ -97,7 +156,7 @@ struct blockTrans {
97156
};
98157
par_do(left, right);
99158
} else {
100-
size_t l1 = split(cCount);
159+
size_t l1 = split(rCount);
101160
size_t l2 = rCount - l1;
102161
auto left = [&]() {
103162
transR(rStart, l1, rLength, cStart, cCount, cLength);
@@ -121,9 +180,9 @@ struct blockTrans {
121180
// Moves values from blocks to buckets
122181
// From is sorted by key within each block, in block major
123182
// counts is the # of keys in each bucket for each block, in block major
124-
// From and To are of lenght n
183+
// From and To are of length n
125184
// counts is of length num_blocks * num_buckets
126-
// Data is memcpy'd into To avoiding initializers and overloaded =
185+
//
127186
template <typename assignment_tag, typename InIterator, typename OutIterator, typename s_size_t>
128187
sequence<size_t> transpose_buckets(InIterator From, OutIterator To,
129188
sequence<s_size_t>& counts, size_t n,
@@ -161,10 +220,9 @@ sequence<size_t> transpose_buckets(InIterator From, OutIterator To,
161220
};
162221
parallel_for(0, num_blocks, f, 1);
163222
} else { // for larger input do cache efficient transpose
164-
// sequence<s_size_t> source_offsets(counts,m+1);
165-
dest_offsets = sequence<s_size_t>(m);
166-
transpose<assignment_tag, typename sequence<s_size_t>::iterator>(counts.begin(), dest_offsets.begin())
167-
.trans(num_blocks, num_buckets);
223+
dest_offsets = sequence<s_size_t>::uninitialized(m);
224+
transpose<uninitialized_copy_tag, decltype(counts.begin()), decltype(dest_offsets.begin())>
225+
(counts.begin(), dest_offsets.begin()).trans(num_blocks, num_buckets);
168226

169227
// do both scans inplace
170228
[[maybe_unused]] size_t total = scan_inplace(make_slice(dest_offsets), add);
@@ -189,4 +247,4 @@ sequence<size_t> transpose_buckets(InIterator From, OutIterator To,
189247
} // namespace internal
190248
} // namespace parlay
191249

192-
#endif // PARLAY_TRANSPOSE_H_
250+
#endif // PARLAY_INTERNAL_TRANSPOSE_H_

include/parlay/scheduler.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ struct Deque {
5353
// Note: Explicit alignment specifier required
5454
// to ensure that Clang inlines atomic loads.
5555
struct alignas(int64_t) age_t {
56-
tag_t tag;
57-
qidx top;
56+
// cppcheck bug prevents it from seeing usage with braced initializer
57+
tag_t tag; // cppcheck-suppress unusedStructMember
58+
qidx top; // cppcheck-suppress unusedStructMember
5859
};
5960

6061
// align to avoid false sharing
@@ -332,7 +333,7 @@ class fork_join_scheduler {
332333
if (end <= start) return;
333334
if (granularity == 0) {
334335
size_t done = get_granularity(start, end, f);
335-
granularity = std::max(done, (end - start) / (128 * sched->num_threads));
336+
granularity = std::max(done, (end - start) / static_cast<size_t>(128 * sched->num_threads));
336337
parfor_(start + done, end, f, granularity, conservative);
337338
} else
338339
parfor_(start, end, f, granularity, conservative);

test/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ add_dtests(NAME test_sorting_primitives FILES test_sorting_primitives.cpp LIBS p
3838
add_dtests(NAME test_random FILES test_random.cpp LIBS parlay)
3939
add_dtests(NAME test_group_by FILES test_group_by.cpp LIBS parlay)
4040
add_dtests(NAME test_monoid FILES test_monoid.cpp LIBS parlay)
41+
add_dtests(NAME test_transpose FILES test_transpose.cpp LIBS parlay)
4142

4243
# -------------------------- Uninitialized memory testing ---------------------------
4344

0 commit comments

Comments
 (0)