Fixed problem in load_two_streams_reg when loading from unaligned types.

luckyq · Jun 8, 2016 · 9ac571c · 9ac571c
1 parent 12ba631
commit 9ac571c
Show file tree

Hide file tree

Showing 5 changed files with 34 additions and 43 deletions.
diff --git a/Makefile b/Makefile
@@ -5,7 +5,9 @@
 # Generate PTX for the last named architecture for future support.
 ARCH=\
   -gencode arch=compute_20,code=compute_20 \
+  -gencode arch=compute_20,code=sm_20 \
   -gencode arch=compute_35,code=compute_35 \
+  -gencode arch=compute_35,code=sm_35 \
   -gencode arch=compute_52,code=compute_52 \
   -gencode arch=compute_52,code=sm_52
 

diff --git a/README.md b/README.md
@@ -8,16 +8,19 @@ Full documentation with [github wiki](https://github.com/moderngpu/moderngpu/wik
 
 **Latest update**:
 ```
-2.11 2016 June 6 -
-  Removed decltype() calls on __device__-tagged lambdas. This introduces
-    two breaking changes: transform_scan and fill_function now take explicit
-    types as their first template arguments.
+2.12 2016 June 8 -
+  Fixed problem in load_two_streams_reg when loading from unaligned types.
 ```
 ---
 moderngpu is a productivity library for general-purpose computing on GPUs. It is a header-only C++ library written for CUDA. The unique value of the library is in its accelerated primitives for solving irregularly parallel problems. 
 
 ## Release notes
 ```
+2.11 2016 June 6 -
+  Removed decltype() calls on __device__-tagged lambdas. This introduces
+    two breaking changes: transform_scan and fill_function now take explicit
+    types as their first template arguments.
+
 2.10 2016 May 15 -
   Allow for non-pow2 sized launches. Rewrote cta_reduce_t to support these
   sizes.

diff --git a/src/moderngpu/cta_merge.hxx b/src/moderngpu/cta_merge.hxx
@@ -58,27 +58,25 @@ MGPU_HOST_DEVICE merge_range_t compute_merge_range(int a_count, int b_count,
   return merge_range_t { mp0, mp1, diag0 - mp0, diag1 - mp1 };
 }
 
-// TODO: Modify load_two_streams_mem to take merge_range_t.
 
+// Specialization that emits just one LD instruction. Can only be reliably
+// used with raw pointer types. Fixed to no longer compute the offset between
+// the a and b pointers (b - a), so that we don't get undefined behavior with
+// unaligned types.
 template<int nt, int vt, typename type_t>
-MGPU_DEVICE array_t<type_t, vt> load_two_streams_reg(const type_t* a, 
-  int a_count, const type_t* b, int b_count, int tid) {
-
-  // Locate the start of the b array from the start of the a array, and
-  // subtract a_count. This lets us index into a to read both a and b values.
-  ptrdiff_t b_offset = b - a - a_count;
-  int total = a_count + b_count;
+MGPU_DEVICE array_t<type_t, vt> 
+load_two_streams_reg(const type_t* a, int a_count, 
+  const type_t* b, int b_count, int tid) {
 
+  b -= a_count;
   array_t<type_t, vt> x;
   strided_iterate<nt, vt>([&](int i, int index) {
-    if(index >= a_count) index += b_offset;
-    x[i] = a[index];
-  }, tid, total);
+    const type_t* p = (index >= a_count) ? b : a;
+    x[i] = p[index];
+  }, tid, a_count + b_count);
 
-  return x;
+  return x;  
 }
 
-
 template<int nt, int vt, typename type_t, typename a_it, typename b_it>
 MGPU_DEVICE 
 enable_if_t<

diff --git a/tests/test_merge.cu b/tests/test_merge.cu
@@ -5,8 +5,6 @@ using namespace mgpu;
 int main(int argc, char** argv) {
   standard_context_t context;
 
-  typedef launch_params_t<32 * 5, 7> launch_t;
-
   // Loop from 1K to 100M.
   for(int count = 1000; count <= 100000000; count += count / 10) {
     int a_count = count / 2;
@@ -16,7 +14,7 @@ int main(int argc, char** argv) {
     mem_t<int> b = fill_random(0, count, b_count, true, context);
     mem_t<int> c(count, context);
 
-    merge<launch_t>(a.data(), a_count, b.data(), b_count, c.data(), 
+    merge(a.data(), a_count, b.data(), b_count, c.data(), 
       mgpu::less_t<int>(), context);
 
     // Download the results.

diff --git a/tests/test_mergesort.cu b/tests/test_mergesort.cu
@@ -5,35 +5,25 @@ using namespace mgpu;
 int main(int argc, char** argv) {
   standard_context_t context;
 
-  typedef launch_params_t<128, 7> launch_t;
+  // Loop from 2K to 100M.
+  for(int count = 2000; count <= 100000000; count += count / 10) {
+    for(int it = 1; it <= 5; ++it) {
 
-  enum { nt = 128, vt = 11 };
-  int count = 12345678;
+      mem_t<int> data = fill_random(0, 100000, count, false, context);
 
-  for(int it = 1; it <= 5; ++it) {
+      mergesort(data.data(), count, less_t<int>(), context);
 
-    mem_t<int> data = fill_random(0, 100000, count, false, context);
+      std::vector<int> ref = from_mem(data);
+      std::sort(ref.begin(), ref.end());
+      std::vector<int> sorted = from_mem(data);
 
-    mergesort<launch_t>(data.data(), count, less_t<int>(), context);
+      bool success = ref == sorted;
+
+      printf("%7d: %d %s\n", count, it, success ? "SUCCESS" : "FAILURE");
 
-    std::vector<int> ref = from_mem(data);
-    std::sort(ref.begin(), ref.end());
-    std::vector<int> sorted = from_mem(data);
-
-    bool print_sorted = ref != sorted;
-    if(print_sorted) {
-      for(int i = 0; i < div_up(count, vt); ++i) {
-         printf("%4d: ", vt * i);
-         for(int j = 0; j < vt; ++j)
-           if(vt * i + j < count) printf("%5d ", sorted[vt * i + j]);
-         printf("\n");
-      }
+      if(!success)
+        return 1;
     }
-
-    printf("%3d %s\n", it, (ref == sorted) ? "SUCCESS" : "FAILURE");
-
-    if(ref != sorted)
-      return 0;
   }
 
   return 0;