Skip to content

Commit b7cb153

Browse files
committed
Add sequential RCM on CPU.
Also change the parallel algorithm a little bit to remove a large overhead.
1 parent e7e8840 commit b7cb153

File tree

6 files changed

+497
-97
lines changed

6 files changed

+497
-97
lines changed

CMakeLists.txt

+6-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ MESSAGE(STATUS "Cuda version: ${CUDA_VERSION}")
99

1010
SET(RCM_HEADERS
1111
rcm/common.h
12+
rcm/rcm.h
1213
rcm/rcm_um.h
1314
rcm/exception.h
1415
rcm/timer.h
@@ -23,15 +24,16 @@ SET(MMIO_FILES
2324
mm_io/mm_io.c
2425
)
2526

26-
SOURCE_GROUP("Headers" FILES ${MC64_HEADERS})
27-
SOURCE_GROUP("CUDA Headers" FILES ${MC64_CUHEADERS})
27+
SOURCE_GROUP("Headers" FILES ${RCM_HEADERS})
28+
SOURCE_GROUP("CUDA Headers" FILES ${RCM_CUHEADERS})
2829
SOURCE_GROUP("MM_IO" FILES ${MMIO_FILES})
2930

3031
INCLUDE_DIRECTORIES(
3132
${CMAKE_SOURCE_DIR}
3233
)
3334

3435
IF(NOT (${CUDA_VERSION} VERSION_LESS "6.0"))
35-
cuda_add_executable(driver_um driver_um.cu ${MC64_HEADERS} ${MC64_CUHEADERS} ${MMIO_FILES})
36-
cuda_add_executable(testing testing.cu ${MC64_HEADERS} ${MC64_CUHEADERS} ${MMIO_FILES})
36+
cuda_add_executable(driver_um driver_um.cu ${RCM_HEADERS} ${RCM_CUHEADERS} ${MMIO_FILES})
37+
cuda_add_executable(testing testing.cu ${RCM_HEADERS} ${RCM_CUHEADERS} ${MMIO_FILES})
38+
cuda_add_executable(testing_um testing_um.cu ${RCM_HEADERS} ${RCM_CUHEADERS} ${MMIO_FILES})
3739
ENDIF()

rcm/device/kernels.cuh

+4-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ __global__ void achieveLevels(int N,
2727
bool* frontier,
2828
int* visited,
2929
int* updated_by,
30-
int* levels)
30+
int* levels,
31+
bool* has_frontier)
3132
{
3233
int bid = blockIdx.x + blockIdx.y * gridDim.x;
3334

@@ -48,6 +49,8 @@ __global__ void achieveLevels(int N,
4849
frontier[column] = true;
4950
updated_by[column] = bid + 1;
5051
levels[column] = cur_cost + 1;
52+
if (!(*has_frontier))
53+
*has_frontier = true;
5154
}
5255
}
5356

rcm/rcm.h

+331
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
#ifndef RCM_H
2+
#define RCM_H
3+
4+
#include <rcm/common.h>
5+
#include <rcm/exception.h>
6+
#include <rcm/device/kernels.cuh>
7+
8+
#include <thrust/scan.h>
9+
#include <thrust/functional.h>
10+
#include <thrust/sequence.h>
11+
#include <thrust/iterator/zip_iterator.h>
12+
#include <thrust/gather.h>
13+
#include <thrust/binary_search.h>
14+
#include <thrust/system/cuda/execution_policy.h>
15+
#include <thrust/logical.h>
16+
#include <thrust/host_vector.h>
17+
#include <thrust/device_vector.h>
18+
#include <thrust/adjacent_difference.h>
19+
#include <thrust/inner_product.h>
20+
21+
#include <queue>
22+
23+
extern "C" {
24+
#include "mm_io/mm_io.h"
25+
}
26+
27+
namespace rcm {
28+
29+
// Abstract base class for RCM (reverse Cuthill-McKee) reordering
// implementations.  Holds the matrix dimensions and the half-bandwidth
// before/after reordering, plus CSR conversion helpers and thrust functors
// shared by concrete implementations.
class RCM_base
{
protected:
	int           m_half_bandwidth;           // best half-bandwidth found so far
	int           m_half_bandwidth_original;  // half-bandwidth of the input matrix

	size_t        m_n;                        // matrix dimension (number of rows)
	size_t        m_nnz;                      // number of stored nonzeros

	typedef typename thrust::tuple<int, int, int, int>  IntTuple;

	// Expand CSR row offsets (size n+1) into one row index per nonzero
	// (indices must already be sized to nnz).
	template <typename IVector>
	static void offsets_to_indices(const IVector& offsets, IVector& indices)
	{
		// convert compressed row offsets into uncompressed row indices
		thrust::fill(indices.begin(), indices.end(), 0);
		// Mark the start of each row with its row number...
		thrust::scatter( thrust::counting_iterator<int>(0),
		                 thrust::counting_iterator<int>(offsets.size()-1),
		                 offsets.begin(),
		                 indices.begin());
		// ...then propagate each row number across the row with a running max.
		thrust::inclusive_scan(indices.begin(), indices.end(), indices.begin(), thrust::maximum<int>());
	}

	// Compress sorted per-nonzero row indices back into CSR row offsets
	// (offsets must already be sized to n+1).
	template <typename IVector>
	static void indices_to_offsets(const IVector& indices, IVector& offsets)
	{
		// convert uncompressed row indices into compressed row offsets
		thrust::lower_bound(indices.begin(),
		                    indices.end(),
		                    thrust::counting_iterator<int>(0),
		                    thrust::counting_iterator<int>(offsets.size()),
		                    offsets.begin());
	}

public:

	virtual ~RCM_base() {}

	// |row - col| for one entry: its contribution to the half-bandwidth.
	struct Difference: public thrust::binary_function<int, int, int>
	{
		inline
		__host__ __device__
		int operator() (const int &a, const int &b) const {
			return abs(a-b);
		}
	};

	// |perm[row] - perm[col]|: bandwidth contribution of an entry after
	// applying the permutation m_perm (raw pointer so it works in device code).
	struct ExtendedDifference: public thrust::binary_function<int, int, int>
	{
		int *m_perm;

		ExtendedDifference(int *perm): m_perm(perm) {}
		inline
		__host__ __device__
		int operator() (const int &a, const int &b) const {
			return abs(m_perm[a]-m_perm[b]);
		}
	};

	// Lexicographic "less" on (level, updated_by, third component) tuples.
	struct TupleCompare
	{
		inline
		__host__ __device__
		bool operator() (IntTuple a, IntTuple b) const
		{
			int a_level = thrust::get<0>(a), b_level = thrust::get<0>(b);
			if (a_level != b_level) return a_level < b_level;
			int a_updated_by = thrust::get<1>(a), b_updated_by = thrust::get<1>(b);
			if (a_updated_by != b_updated_by) return a_updated_by < b_updated_by;
			return thrust::get<2>(a) < thrust::get<2>(b);
		}
	};

	// Half-bandwidth achieved by the computed permutation.
	int getHalfBandwidth() const          {return m_half_bandwidth;}
	// Half-bandwidth of the matrix in its original ordering.
	int getHalfBandwidthOriginal() const  {return m_half_bandwidth_original;}

	// Run the reordering; implementations compute the permutation and update
	// m_half_bandwidth / m_half_bandwidth_original.
	virtual void execute() = 0;
};
107+
108+
// Sequential (CPU-only) RCM implementation operating on host vectors.
// The matrix is given in CSR form; execute() fills m_perm with the
// bandwidth-reducing permutation.
class RCM: public RCM_base
{
private:
	typedef typename thrust::host_vector<int>     IntVectorH;
	typedef typename thrust::host_vector<double>  DoubleVectorH;
	typedef typename thrust::host_vector<bool>    BoolVectorH;

	typedef typename IntVectorH::iterator                     IntIterator;
	typedef typename thrust::tuple<IntIterator, IntIterator>  IntIteratorTuple;
	// Iterator over (row, column) pairs of the matrix's nonzeros.
	typedef typename thrust::zip_iterator<IntIteratorTuple>   EdgeIterator;

	// (node index, node degree) pair used for frontier ordering.
	typedef typename thrust::tuple<int, int>      NodeType;

	IntVectorH    m_row_offsets;     // CSR row offsets (size n+1)
	IntVectorH    m_column_indices;  // CSR column indices (size nnz)
	DoubleVectorH m_values;          // CSR values (size nnz)

	IntVectorH    m_perm;            // computed permutation, old index -> new index

	// Build the symmetrized adjacency structure (both (u,v) and (v,u) for
	// every off-diagonal entry) in CSR form from the edge range [begin, end).
	void buildTopology(EdgeIterator&      begin,
			   EdgeIterator&      end,
			   int                node_begin,
			   int                node_end,
			   IntVectorH&        row_offsets,
			   IntVectorH&        column_indices);

	// Orders NodeType by descending degree so that a std::priority_queue
	// using it pops the smallest-degree node first.
	struct CompareValue
	{
		inline
		bool operator() (NodeType a, NodeType b) const {
			return thrust::get<1>(a) > thrust::get<1>(b);
		}
	};

public:
	// Takes the matrix in CSR form; row_offsets must have size n+1.
	RCM(const IntVectorH&    row_offsets,
	    const IntVectorH&    column_indices,
	    const DoubleVectorH& values)
	: m_row_offsets(row_offsets),
	  m_column_indices(column_indices),
	  m_values(values)
	{
		size_t n = row_offsets.size() - 1;
		m_perm.resize(n);
		m_n = n;
		m_nnz = m_values.size();
	}

	~RCM() {}

	// Compute the permutation; see RCM_base::execute.
	void execute();
};
160+
161+
// Sequential Cuthill-McKee reordering on the host.
//
// Runs up to MAX_NUM_TRIAL breadth-first searches over the symmetrized
// adjacency graph -- trial 0 starts from node 0, later trials from random
// not-yet-tried nodes.  Within each BFS the frontier is expanded in order of
// ascending node degree.  The permutation with the smallest resulting
// half-bandwidth is kept in m_perm; m_half_bandwidth and
// m_half_bandwidth_original are updated accordingly.
//
// Fixes vs. the previous revision:
//  * The deterministic fallback branch guarded by `trial_num < MAX_NUM_TRIAL`
//    was unreachable (the condition is always true inside the trial loop), and
//    as a result `while (tried[tmp_node]) tmp_node = rand() % m_n;` looped
//    forever once every node had been tried -- which happens whenever
//    m_n <= MAX_NUM_TRIAL.  We now count the used starting nodes and stop
//    cleanly when none remain.
//  * Removed the unused temporary `tmp_row_indices` and the dead `last_tried`
//    bookkeeping; added int casts to silence signed/unsigned comparisons.
void
RCM::execute()
{
	IntVectorH tmp_reordering(m_n);
	IntVectorH tmp_perm(m_n);

	thrust::sequence(tmp_reordering.begin(), tmp_reordering.end());

	IntVectorH row_indices(m_nnz);
	IntVectorH tmp_column_indices(m_nnz << 1);
	IntVectorH tmp_row_offsets(m_n + 1);
	offsets_to_indices(m_row_offsets, row_indices);

	// Half-bandwidth of the original ordering: max |row - col| over nonzeros.
	m_half_bandwidth_original = m_half_bandwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum<int>(), Difference());

	EdgeIterator begin = thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), m_column_indices.begin()));
	EdgeIterator end   = thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(),   m_column_indices.end()));
	buildTopology(begin, end, 0, m_n, tmp_row_offsets, tmp_column_indices);

	const int MAX_NUM_TRIAL = 5;

	BoolVectorH tried(m_n, false);
	tried[0] = true;    // trial 0 always starts from node 0
	int num_tried = 1;  // number of distinct starting nodes used so far

	m_perm.resize(m_n);
	thrust::sequence(m_perm.begin(), m_perm.end());

	for (int trial_num = 0; trial_num < MAX_NUM_TRIAL ; trial_num++)
	{
		std::queue<int> q;
		// Min-heap on degree (CompareValue orders by descending degree).
		std::priority_queue<NodeType, std::vector<NodeType>, CompareValue > pq;

		int tmp_node;
		BoolVectorH pushed(m_n, false);

		int left_cnt = m_n;
		int j = 0, last = 0;

		if (trial_num > 0) {
			// Stop when every node has already served as a starting point
			// (prevents the infinite rand() loop below on small matrices).
			if (num_tried >= (int)m_n) {
				fprintf(stderr, "All possible starting points have been tried in RCM\n");
				break;
			}
			tmp_node = rand() % (int)m_n;
			while (tried[tmp_node])
				tmp_node = rand() % (int)m_n;

			pushed[tmp_node] = true;
			tried[tmp_node] = true;
			num_tried++;
			q.push(tmp_node);
		}

		while(left_cnt--) {
			if(q.empty()) {
				// Disconnected graph (or empty queue on trial 0): seed the
				// BFS with the smallest-index node not yet visited.
				left_cnt++;
				int i;

				for(i = last; i < (int)m_n; i++) {
					if(!pushed[i]) {
						q.push(i);
						pushed[i] = true;
						last = i;
						break;
					}
				}
				if(i < (int)m_n) continue;
				fprintf(stderr, "Can never get here!\n");
				return;
			}

			// Place the next node in the ordering.
			tmp_node = q.front();
			tmp_reordering[j] = tmp_node;
			j++;

			q.pop();

			int start_idx = tmp_row_offsets[tmp_node], end_idx = tmp_row_offsets[tmp_node + 1];

			// Gather unvisited neighbors into the degree-ordered heap.
			for (int i = start_idx; i < end_idx; i++) {
				int target_node = tmp_column_indices[i];
				if(!pushed[target_node]) {
					pushed[target_node] = true;
					pq.push(thrust::make_tuple(target_node, tmp_row_offsets[target_node + 1] - tmp_row_offsets[target_node]));
				}
			}

			// Append neighbors to the BFS queue in ascending-degree order.
			while(!pq.empty()) {
				q.push(thrust::get<0>(pq.top()));
				pq.pop();
			}
		}

		// tmp_reordering[k] is the old index of the node placed k-th;
		// invert it to obtain the old-index -> new-index permutation.
		thrust::scatter(thrust::make_counting_iterator(0),
		                thrust::make_counting_iterator((int)(m_n)),
		                tmp_reordering.begin(),
		                tmp_perm.begin());

		{
			int *perm_array = thrust::raw_pointer_cast(&tmp_perm[0]);
			int tmp_bdwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum<int>(), ExtendedDifference(perm_array));

			// Keep the best permutation seen across all trials.
			if (m_half_bandwidth > tmp_bdwidth) {
				m_half_bandwidth = tmp_bdwidth;
				m_perm           = tmp_perm;
			}
		}
	}
}
282+
283+
// Build the symmetrized adjacency structure of the matrix in CSR form.
//
// For every off-diagonal entry (u, v) in [begin, end), both directed edges
// (u, v) and (v, u) are emitted (diagonal entries are dropped), then the edge
// list is bucketed by source node with a counting sort so that
// row_offsets/column_indices form a valid CSR adjacency list.
//
// begin, end       : zip iterators over (row, column) pairs of the nonzeros
// node_begin, node_end : node range whose offsets are accumulated by the scan
//                        (NOTE(review): counts are tallied for ALL rows, so
//                        this only looks correct for node_begin == 0; the one
//                        caller in this file passes (0, m_n) -- confirm before
//                        using other ranges)
// row_offsets      : out, resized/zeroed to m_n + 1
// column_indices   : out, resized to the number of kept (symmetrized) edges
void
RCM::buildTopology(EdgeIterator&      begin,
		   EdgeIterator&      end,
		   int                node_begin,
		   int                node_end,
		   IntVectorH&        row_offsets,
		   IntVectorH&        column_indices)
{
	// Start from all-zero offsets, reusing the caller's storage when possible.
	if (row_offsets.size() != m_n + 1)
		row_offsets.resize(m_n + 1, 0);
	else
		thrust::fill(row_offsets.begin(), row_offsets.end(), 0);

	// Worst case: every entry is off-diagonal and contributes two edges.
	IntVectorH row_indices((end - begin) << 1);
	column_indices.resize((end - begin) << 1);
	int actual_cnt = 0;

	// Emit both directions of each off-diagonal entry.
	for(EdgeIterator edgeIt = begin; edgeIt != end; edgeIt++) {
		int from = thrust::get<0>(*edgeIt), to = thrust::get<1>(*edgeIt);
		if (from != to) {
			row_indices[actual_cnt]        = from;
			column_indices[actual_cnt]     = to;
			row_indices[actual_cnt + 1]    = to;
			column_indices[actual_cnt + 1] = from;
			actual_cnt += 2;
		}
	}
	row_indices.resize(actual_cnt);
	column_indices.resize(actual_cnt);
	// thrust::sort_by_key(row_indices.begin(), row_indices.end(), column_indices.begin());
	// Counting sort replacing the sort_by_key above: cheaper because the keys
	// are small integers (node indices).
	{
		int& nnz = actual_cnt;
		IntVectorH tmp_column_indices(nnz);
		// Histogram: row_offsets[r] = number of edges leaving node r.
		for (int i = 0; i < nnz; i++)
			row_offsets[row_indices[i]] ++;

		// Inclusive scan turns counts into END offsets of each row.
		thrust::inclusive_scan(row_offsets.begin() + node_begin, row_offsets.begin() + (node_end + 1), row_offsets.begin() + node_begin);

		// Walk edges in reverse, pre-decrementing each row's cursor; this
		// fills the buckets back-to-front and leaves row_offsets[r] equal to
		// the START offset of row r (row_offsets[m_n] stays at nnz).
		for (int i = nnz - 1; i >= 0; i--) {
			int idx = (--row_offsets[row_indices[i]]);
			tmp_column_indices[idx] = column_indices[i];
		}
		column_indices = tmp_column_indices;
	}
}
328+
329+
} // end namespace rcm
330+
331+
#endif

0 commit comments

Comments
 (0)