#ifndef RCM_H
#define RCM_H

#include <rcm/common.h>
#include <rcm/exception.h>
#include <rcm/device/kernels.cuh>

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/gather.h>
#include <thrust/scatter.h>
#include <thrust/fill.h>
#include <thrust/binary_search.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/logical.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/adjacent_difference.h>
#include <thrust/inner_product.h>

#include <queue>
#include <vector>
#include <cstdio>
#include <cstdlib>

extern "C" {
#include "mm_io/mm_io.h"
}

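// RCM.h provides a host-side (Reverse) Cuthill-McKee reordering for sparse
// matrices stored in CSR form. RCM_base keeps the bandwidth bookkeeping and
// the CSR <-> COO helpers; the RCM class runs a BFS-based reordering and
// keeps the permutation that gives the smallest half-bandwidth.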
namespace rcm {

class RCM_base
{
protected:
	int     m_half_bandwidth;
	int     m_half_bandwidth_original;

	size_t  m_n;
	size_t  m_nnz;

	typedef typename thrust::tuple<int, int, int, int>  IntTuple;

	template <typename IVector>
	static void offsets_to_indices(const IVector& offsets, IVector& indices)
	{
		// Convert compressed row offsets into uncompressed row indices:
		// scatter each row index to the position where that row starts,
		// then propagate row indices forward with an inclusive max-scan
		// (e.g. offsets [0, 2, 3, 5] become indices [0, 0, 1, 2, 2]).
		thrust::fill(indices.begin(), indices.end(), 0);
		thrust::scatter(thrust::counting_iterator<int>(0),
		                thrust::counting_iterator<int>(offsets.size() - 1),
		                offsets.begin(),
		                indices.begin());
		thrust::inclusive_scan(indices.begin(), indices.end(), indices.begin(), thrust::maximum<int>());
	}

	template <typename IVector>
	static void indices_to_offsets(const IVector& indices, IVector& offsets)
	{
		// convert uncompressed row indices into compressed row offsets
		thrust::lower_bound(indices.begin(),
		                    indices.end(),
		                    thrust::counting_iterator<int>(0),
		                    thrust::counting_iterator<int>(offsets.size()),
		                    offsets.begin());
	}

public:

	virtual ~RCM_base() {}

	// |i - j| for a nonzero in row i and column j: used with a max-reduction
	// to compute the half-bandwidth of the matrix.
	struct Difference: public thrust::binary_function<int, int, int>
	{
		inline
		__host__ __device__
		int operator() (const int &a, const int &b) const {
			return abs(a - b);
		}
	};

	// Same as Difference, but measured after applying the permutation m_perm.
	struct ExtendedDifference: public thrust::binary_function<int, int, int>
	{
		int *m_perm;

		ExtendedDifference(int *perm): m_perm(perm) {}
		inline
		__host__ __device__
		int operator() (const int &a, const int &b) const {
			return abs(m_perm[a] - m_perm[b]);
		}
	};

	// Orders tuples by BFS level, then by the node that updated them,
	// then by the third component as a tie-breaker.
	struct TupleCompare
	{
		inline
		__host__ __device__
		bool operator() (IntTuple a, IntTuple b) const
		{
			int a_level = thrust::get<0>(a), b_level = thrust::get<0>(b);
			if (a_level != b_level) return a_level < b_level;
			int a_updated_by = thrust::get<1>(a), b_updated_by = thrust::get<1>(b);
			if (a_updated_by != b_updated_by) return a_updated_by < b_updated_by;
			return thrust::get<2>(a) < thrust::get<2>(b);
		}
	};

	int getHalfBandwidth() const          {return m_half_bandwidth;}
	int getHalfBandwidthOriginal() const  {return m_half_bandwidth_original;}

	virtual void execute() = 0;
};

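// Host implementation of the reordering: the sparsity pattern is kept in
// thrust::host_vector CSR arrays and the Cuthill-McKee traversal is done
// with a std::queue / std::priority_queue BFS.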
class RCM: public RCM_base
{
private:
	typedef typename thrust::host_vector<int>     IntVectorH;
	typedef typename thrust::host_vector<double>  DoubleVectorH;
	typedef typename thrust::host_vector<bool>    BoolVectorH;

	typedef typename IntVectorH::iterator                      IntIterator;
	typedef typename thrust::tuple<IntIterator, IntIterator>   IntIteratorTuple;
	typedef typename thrust::zip_iterator<IntIteratorTuple>    EdgeIterator;

	typedef typename thrust::tuple<int, int>      NodeType;

	IntVectorH     m_row_offsets;
	IntVectorH     m_column_indices;
	DoubleVectorH  m_values;

	IntVectorH     m_perm;

	void buildTopology(EdgeIterator&  begin,
	                   EdgeIterator&  end,
	                   int            node_begin,
	                   int            node_end,
	                   IntVectorH&    row_offsets,
	                   IntVectorH&    column_indices);

	// A NodeType is (node index, degree). The inverted comparison makes the
	// priority_queue pop the neighbor with the smallest degree first, as
	// required by the Cuthill-McKee ordering.
	struct CompareValue
	{
		inline
		bool operator() (NodeType a, NodeType b) const {
			return thrust::get<1>(a) > thrust::get<1>(b);
		}
	};

public:
	RCM(const IntVectorH&    row_offsets,
	    const IntVectorH&    column_indices,
	    const DoubleVectorH& values)
	: m_row_offsets(row_offsets),
	  m_column_indices(column_indices),
	  m_values(values)
	{
		size_t n = row_offsets.size() - 1;
		m_perm.resize(n);
		m_n   = n;
		m_nnz = m_values.size();
	}

	~RCM() {}

	void execute();
};

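// A minimal usage sketch (illustrative only; the CSR arrays below are made
// up). The permutation is used internally for the bandwidth evaluation; the
// class exposes the resulting half-bandwidths through its getters:
//
//   thrust::host_vector<int>    row_offsets;     // size n+1, CSR offsets
//   thrust::host_vector<int>    column_indices;  // size nnz
//   thrust::host_vector<double> values;          // size nnz
//   // ... fill the CSR arrays ...
//   rcm::RCM reorder(row_offsets, column_indices, values);
//   reorder.execute();
//   int before = reorder.getHalfBandwidthOriginal();
//   int after  = reorder.getHalfBandwidth();

// execute() computes the original half-bandwidth, builds a symmetrized
// adjacency structure, and then runs several Cuthill-McKee BFS trials from
// different starting nodes, keeping the permutation with the smallest
// half-bandwidth.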
void
RCM::execute()
{
	IntVectorH tmp_reordering(m_n);
	IntVectorH tmp_perm(m_n);

	thrust::sequence(tmp_reordering.begin(), tmp_reordering.end());

	IntVectorH row_indices(m_nnz);
	IntVectorH tmp_row_indices(m_nnz << 1);
	IntVectorH tmp_column_indices(m_nnz << 1);
	IntVectorH tmp_row_offsets(m_n + 1);
	offsets_to_indices(m_row_offsets, row_indices);

	// Half-bandwidth of the original ordering: max |i - j| over all nonzeros.
	m_half_bandwidth_original = m_half_bandwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum<int>(), Difference());

	EdgeIterator begin = thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), m_column_indices.begin()));
	EdgeIterator end   = thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(),   m_column_indices.end()));
	buildTopology(begin, end, 0, m_n, tmp_row_offsets, tmp_column_indices);

	const int MAX_NUM_TRIAL = 5;

	BoolVectorH tried(m_n, false);
	tried[0] = true;

	int last_tried = 0;

	m_perm.resize(m_n);
	thrust::sequence(m_perm.begin(), m_perm.end());

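	// Each trial runs one Cuthill-McKee pass. The first trial starts from
	// node 0 (picked up by the empty-queue branch below); later trials
	// restart the BFS from a randomly chosen node that has not been tried
	// yet. The permutation of the best trial is kept.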
	for (int trial_num = 0; trial_num < MAX_NUM_TRIAL; trial_num++)
	{
		std::queue<int> q;
		std::priority_queue<NodeType, std::vector<NodeType>, CompareValue> pq;

		int tmp_node;
		BoolVectorH pushed(m_n, false);

		int left_cnt = m_n;
		int j = 0, last = 0;

		if (trial_num > 0) {

			if (trial_num < MAX_NUM_TRIAL) {
				tmp_node = rand() % m_n;

				while (tried[tmp_node])
					tmp_node = rand() % m_n;
			} else {
				if (last_tried >= m_n - 1) {
					fprintf(stderr, "All possible starting points have been tried in RCM\n");
					break;
				}
				for (tmp_node = last_tried + 1; tmp_node < m_n; tmp_node++)
					if (!tried[tmp_node]) {
						last_tried = tmp_node;
						break;
					}
			}

			pushed[tmp_node] = true;
			tried[tmp_node]  = true;
			q.push(tmp_node);
		}

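		// Standard Cuthill-McKee traversal: pop a node from the FIFO queue,
		// give it the next label, and enqueue its unvisited neighbors in
		// order of increasing degree (the priority queue sorts them before
		// they are appended to the FIFO queue). When the queue drains, the
		// next unvisited node starts a new connected component.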
		while (left_cnt--) {
			if (q.empty()) {
				left_cnt++;
				int i;

				for (i = last; i < m_n; i++) {
					if (!pushed[i]) {
						q.push(i);
						pushed[i] = true;
						last = i;
						break;
					}
				}
				if (i < m_n) continue;
				fprintf(stderr, "Can never get here!\n");
				return;
			}

			tmp_node = q.front();
			tmp_reordering[j] = tmp_node;
			j++;

			q.pop();

			int start_idx = tmp_row_offsets[tmp_node], end_idx = tmp_row_offsets[tmp_node + 1];

			for (int i = start_idx; i < end_idx; i++) {
				int target_node = tmp_column_indices[i];
				if (!pushed[target_node]) {
					pushed[target_node] = true;
					pq.push(thrust::make_tuple(target_node, tmp_row_offsets[target_node + 1] - tmp_row_offsets[target_node]));
				}
			}

			while (!pq.empty()) {
				q.push(thrust::get<0>(pq.top()));
				pq.pop();
			}
		}

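		// tmp_reordering[k] is the node visited k-th; invert it so that
		// tmp_perm[node] is the node's new position, then re-evaluate the
		// half-bandwidth under this permutation and keep it if it improves
		// on the best one found so far.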
		thrust::scatter(thrust::make_counting_iterator(0),
		                thrust::make_counting_iterator((int)(m_n)),
		                tmp_reordering.begin(),
		                tmp_perm.begin());

		{
			int *perm_array  = thrust::raw_pointer_cast(&tmp_perm[0]);
			int  tmp_bdwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum<int>(), ExtendedDifference(perm_array));

			if (m_half_bandwidth > tmp_bdwidth) {
				m_half_bandwidth = tmp_bdwidth;
				m_perm           = tmp_perm;
			}
		}
	}
}

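// buildTopology turns the COO edge list [begin, end) into a symmetric CSR
// adjacency structure: diagonal entries are dropped, every off-diagonal
// entry (i, j) is stored both as i->j and j->i, and the result is put into
// CSR form with a counting sort on the row indices.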
void
RCM::buildTopology(EdgeIterator&  begin,
                   EdgeIterator&  end,
                   int            node_begin,
                   int            node_end,
                   IntVectorH&    row_offsets,
                   IntVectorH&    column_indices)
{
	if (row_offsets.size() != m_n + 1)
		row_offsets.resize(m_n + 1, 0);
	else
		thrust::fill(row_offsets.begin(), row_offsets.end(), 0);

	IntVectorH row_indices((end - begin) << 1);
	column_indices.resize((end - begin) << 1);
	int actual_cnt = 0;

	// Symmetrize the pattern: store each off-diagonal entry in both directions.
	for (EdgeIterator edgeIt = begin; edgeIt != end; edgeIt++) {
		int from = thrust::get<0>(*edgeIt), to = thrust::get<1>(*edgeIt);
		if (from != to) {
			row_indices[actual_cnt]        = from;
			column_indices[actual_cnt]     = to;
			row_indices[actual_cnt + 1]    = to;
			column_indices[actual_cnt + 1] = from;
			actual_cnt += 2;
		}
	}
	row_indices.resize(actual_cnt);
	column_indices.resize(actual_cnt);
	// thrust::sort_by_key(row_indices.begin(), row_indices.end(), column_indices.begin());
	{
		// Counting sort on the row indices: accumulate per-row counts,
		// scan them into offsets, then place each column index into its row.
		int& nnz = actual_cnt;
		IntVectorH tmp_column_indices(nnz);
		for (int i = 0; i < nnz; i++)
			row_offsets[row_indices[i]]++;

		thrust::inclusive_scan(row_offsets.begin() + node_begin, row_offsets.begin() + (node_end + 1), row_offsets.begin() + node_begin);

		for (int i = nnz - 1; i >= 0; i--) {
			int idx = (--row_offsets[row_indices[i]]);
			tmp_column_indices[idx] = column_indices[i];
		}
		column_indices = tmp_column_indices;
	}
}

} // end namespace rcm

#endif