Skip to content

Commit

Permalink
Fixed problem in load_two_streams_reg when loading from unaligned types.
Browse files Browse the repository at this point in the history
  • Loading branch information
seanbaxter committed Jun 8, 2016
1 parent 12ba631 commit 9ac571c
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 43 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
# Generate PTX for the last named architecture for future support.
ARCH=\
-gencode arch=compute_20,code=compute_20 \
-gencode arch=compute_20,code=sm_20 \
-gencode arch=compute_35,code=compute_35 \
-gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_52,code=compute_52 \
-gencode arch=compute_52,code=sm_52

Expand Down
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@ Full documentation with [github wiki](https://github.com/moderngpu/moderngpu/wik

**Latest update**:
```
2.11 2016 June 6 -
Removed decltype() calls on __device__-tagged lambdas. This introduces
two breaking changes: transform_scan and fill_function now take explicit
types as their first template arguments.
2.12 2016 June 8 -
Fixed problem in load_two_streams_reg when loading from unaligned types.
```
---
moderngpu is a productivity library for general-purpose computing on GPUs. It is a header-only C++ library written for CUDA. The unique value of the library is in its accelerated primitives for solving irregularly parallel problems.

## Release notes
```
2.11 2016 June 6 -
Removed decltype() calls on __device__-tagged lambdas. This introduces
two breaking changes: transform_scan and fill_function now take explicit
types as their first template arguments.
2.10 2016 May 15 -
Allow for non-pow2 sized launches. Rewrote cta_reduce_t to support these
sizes.
Expand Down
24 changes: 11 additions & 13 deletions src/moderngpu/cta_merge.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -58,27 +58,25 @@ MGPU_HOST_DEVICE merge_range_t compute_merge_range(int a_count, int b_count,
return merge_range_t { mp0, mp1, diag0 - mp0, diag1 - mp1 };
}

// TODO: Modify load_two_streams_mem to take merge_range_t.

// Specialization that emits just one LD instruction. Can only be reliably used
// with raw pointer types. Fixed not to use pointer arithmetic so that
// we don't get undefined behavior with unaligned types.
template<int nt, int vt, typename type_t>
MGPU_DEVICE array_t<type_t, vt> load_two_streams_reg(const type_t* a,
int a_count, const type_t* b, int b_count, int tid) {

// Locate the start of the b array from the start of the a array, and
// subtract a_count. This lets us index into a to read both a and b values.
ptrdiff_t b_offset = b - a - a_count;
int total = a_count + b_count;
MGPU_DEVICE array_t<type_t, vt>
load_two_streams_reg(const type_t* a, int a_count,
const type_t* b, int b_count, int tid) {

b -= a_count;
array_t<type_t, vt> x;
strided_iterate<nt, vt>([&](int i, int index) {
if(index >= a_count) index += b_offset;
x[i] = a[index];
}, tid, total);
const type_t* p = (index >= a_count) ? b : a;
x[i] = p[index];
}, tid, a_count + b_count);

return x;
return x;
}


template<int nt, int vt, typename type_t, typename a_it, typename b_it>
MGPU_DEVICE
enable_if_t<
Expand Down
4 changes: 1 addition & 3 deletions tests/test_merge.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ using namespace mgpu;
int main(int argc, char** argv) {
standard_context_t context;

typedef launch_params_t<32 * 5, 7> launch_t;

// Loop from 1K to 100M.
for(int count = 1000; count <= 100000000; count += count / 10) {
int a_count = count / 2;
Expand All @@ -16,7 +14,7 @@ int main(int argc, char** argv) {
mem_t<int> b = fill_random(0, count, b_count, true, context);
mem_t<int> c(count, context);

merge<launch_t>(a.data(), a_count, b.data(), b_count, c.data(),
merge(a.data(), a_count, b.data(), b_count, c.data(),
mgpu::less_t<int>(), context);

// Download the results.
Expand Down
36 changes: 13 additions & 23 deletions tests/test_mergesort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,25 @@ using namespace mgpu;
int main(int argc, char** argv) {
standard_context_t context;

typedef launch_params_t<128, 7> launch_t;
// Loop from 1K to 100M.
for(int count = 2000; count <= 100000000; count += count / 10) {
for(int it = 1; it <= 5; ++it) {

enum { nt = 128, vt = 11 };
int count = 12345678;
mem_t<int> data = fill_random(0, 100000, count, false, context);

for(int it = 1; it <= 5; ++it) {
mergesort(data.data(), count, less_t<int>(), context);

mem_t<int> data = fill_random(0, 100000, count, false, context);
std::vector<int> ref = from_mem(data);
std::sort(ref.begin(), ref.end());
std::vector<int> sorted = from_mem(data);

mergesort<launch_t>(data.data(), count, less_t<int>(), context);
bool success = ref == sorted;

printf("%7d: %d %s\n", count, it, success ? "SUCCESS" : "FAILURE");

std::vector<int> ref = from_mem(data);
std::sort(ref.begin(), ref.end());
std::vector<int> sorted = from_mem(data);

bool print_sorted = ref != sorted;
if(print_sorted) {
for(int i = 0; i < div_up(count, vt); ++i) {
printf("%4d: ", vt * i);
for(int j = 0; j < vt; ++j)
if(vt * i + j < count) printf("%5d ", sorted[vt * i + j]);
printf("\n");
}
if(!success)
return 1;
}

printf("%3d %s\n", it, (ref == sorted) ? "SUCCESS" : "FAILURE");

if(ref != sorted)
return 0;
}

return 0;
Expand Down

0 comments on commit 9ac571c

Please sign in to comment.