Skip to content

Commit

Permalink
Implemented a basic version of column swapping. Still experiencing is…
Browse files Browse the repository at this point in the history
…sues,
  • Loading branch information
TeachRaccooon committed Jun 18, 2024
1 parent 0ca1035 commit a1385e3
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 189 deletions.
11 changes: 11 additions & 0 deletions RandLAPACK/comps/rl_qb.hh
Original file line number Diff line number Diff line change
Expand Up @@ -173,18 +173,25 @@ int QB<T, RNG>::call(
return 1;
}

printf("Size Q %ld\n", Q.size());

tol = std::max(tol, 100 * std::numeric_limits<T>::epsilon());
// If the space allocated for col in Q and row in B is insufficient for any iterations ...
if(std::max( Q.size() / m, B.size() / n) < (uint64_t)k) {
// ... allocate more!
this->curr_lim = std::min(this->dim_growth_factor * block_sz, k);
printf("%d\n", this->dim_growth_factor);
printf("Current lim %ld\n", this->curr_lim);
printf("%ld\n", block_sz);
// No need for data movement in this case
util::upsize(m * this->curr_lim, Q);
util::upsize(this->curr_lim * n, B);
} else {
this->curr_lim = k;
}

printf("Size Q %ld\n", Q.size());

// Copy the initial data to avoid unwanted modification TODO #1
std::vector<T> A_cpy (m * n, 0.0);
T* A_cpy_dat = A_cpy.data();
Expand Down Expand Up @@ -221,6 +228,10 @@ int QB<T, RNG>::call(
util::upsize(this->curr_lim * this->curr_lim, Q_gram);
}

printf("Size Q %ld\n", Q.size());
printf("Size B %ld\n", B.size());
printf("Size QtQi %ld\n", QtQi.size());

// Calling RangeFinder
if(this->RF_Obj.call(m, n, A_cpy, block_sz, this->Q_i, state))
return 6; // RF failed
Expand Down
81 changes: 34 additions & 47 deletions RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -50,28 +50,23 @@ inline
__device__
bool valid_index(size_t q, size_t m, size_t n, size_t lda)
{
return ((q < m*lda) && ((q % lda) < n));
return ((q < m * lda) && ((q % lda) < n));
}

inline
__device__
bool valid_index(size_t q, size_t m)
{
return (q < m);
__global__ void __launch_bounds__(128) copy(int64_t* src, int64_t* dest, int64_t n) {
int64_t const id = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
if (id < n) {
dest[id] = src[id];
}
}


template <typename T>
inline
__device__
void find(T* array, int64_t size, T target, T* result) {

size_t q = array_index();
if (!valid_index(q, size))
return;

if (q < size && array[q] == target) {
*result = q;
__device__ void swap(T* a, T* b, int64_t n) {
int64_t const id = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
if (id < n) {
T const v{a[id]};
a[id] = b[id];
b[id] = v;
}
}

Expand All @@ -86,35 +81,20 @@ void col_swap_gpu(
int64_t k,
T* A,
int64_t lda,
T* idx)
int64_t* idx
)
{
//if (!valid_index(q, m))
//return;

int64_t i, j, l;
T buf;
for (i = 0, j = 0; i < k; ++i) {
j = idx[i] - 1;
//blas::swap(m, &A[i * lda], 1, &A[j * lda], 1);
//buf = A[i * lda + q];
//A[q + i * lda] = A[q + j * lda];
//A[q + j * lda] = buf;

for (int s = 0; s < m; ++s) {
buf = A[i * lda + s];
A[s + i * lda] = A[s + j * lda];
A[s + j * lda] = buf;
A -= lda;
int64_t* end = idx + k;
for (int64_t i = 1; i <= k; ++i, ++idx) {
// swap columns only if the pivot index differs from the current position
if (int64_t const j = *idx; i != j) {
// swap columns
swap(A + i * lda, A + j * lda, m);
__syncthreads();
// swap indices
std::iter_swap(idx, std::find(idx, end, i));
}

// swap idx array elements
// Find idx element with value i and assign it to j
for(l = i; l < k; ++l) {
if(idx[l] == i + 1) {
idx[l] = j + 1;
break;
}
}
idx[i] = i + 1;
}
}

Expand Down Expand Up @@ -142,11 +122,18 @@ void col_swap_gpu(
int64_t k,
T* A,
int64_t lda,
T* idx, cudaStream_t strm)
int64_t* idx,
int64_t* temp_buf,
cudaStream_t strm)
{
#ifdef USE_CUDA
auto [tg, bg] = partition_1d(n, m, lda);
col_swap_gpu<<<tg, bg, 0, strm>>>(m, n, k, A, lda, idx);
blas::Queue blas_queue(0);
// threads per block
int tpb = 128;
// num blcoks to spawn
int nb = (m + tpb - 1) / tpb;
copy<<<nb, tpb, 0, strm>>>(idx, temp_buf, n);
col_swap_gpu<<<nb, tpb, 0, strm>>>(m, n, k, A, lda, idx);
#endif

cudaError_t ierr = cudaGetLastError();
Expand Down
86 changes: 1 addition & 85 deletions RandLAPACK/misc/rl_util.hh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ void get_U(

/// Positions columns of A in accordance with idx vector of length k.
/// idx array modified ONLY within the scope of this function.
/*template <typename T>
template <typename T>
void col_swap(
int64_t m,
int64_t n,
Expand All @@ -107,90 +107,6 @@ void col_swap(
idx[it - (idx.begin())] = j + 1;
}
}
*/

template <typename T>
void col_swap(
int64_t m,
int64_t n,
int64_t k,
T* A,
int64_t lda,
std::vector<int64_t> idx
) {
char name [] = "A pivoted";
if(k > n)
throw std::runtime_error("Invalid rank parameter.");
T buf;
int64_t i, j, l;
for (i = 0, j = 0; i < k; ++i) {
j = idx[i] - 1;
//blas::swap(m, &A[i * lda], 1, &A[j * lda], 1);
for (int s = 0; s < m; ++s) {
buf = A[i * lda + s];
A[s + i * lda] = A[s + j * lda];
A[s + j * lda] = buf;
}


// swap idx array elements
// Find idx element with value i and assign it to j
for(l = i; l < k; ++l) {
if(idx[l] == i + 1) {
idx[l] = j + 1;
break;
}
}
idx[i] = i + 1;

if(n <= 20) {
for(int f = 0; f < n; ++f)
printf("%ld, ", idx[f]);
printf("\n");
RandBLAS::util::print_colmaj(m, n, A, name);
}
}
}

/*
template <typename T>
void col_swap(
int64_t m,
int64_t n,
int64_t k,
T* A,
int64_t lda,
std::vector<int64_t> idx
) {
char name [] = "A pivoted";
if(k > n)
throw std::runtime_error("Invalid rank parameter.");
T buf;
int64_t i, j, l;
for (i = 0, j = 0; i < k; ++i) {
j = idx[i] - 1;
//blas::swap(m, &A[i * lda], 1, &A[j * lda], 1);
if (j != i) {
for (int s = 0; s < m; ++s) {
buf = A[i * lda + s];
A[s + i * lda] = A[s + j * lda];
A[s + j * lda] = buf;
}
buf = idx[i];
idx[i] = idx[j];
idx[j] = buf;
}
if(n <= 20) {
for(int f = 0; f < n; ++f)
printf("%ld, ", idx[f]);
printf("\n");
RandBLAS::util::print_colmaj(m, n, A, name);
}
}
}
*/

/// A version of the above function to be used on a vector of integers
template <typename T>
Expand Down
6 changes: 3 additions & 3 deletions test/comps/test_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class TestUtil : public ::testing::Test
all_data.A_cpy[i] -= all_data.Ident[i];

T norm = lapack::lange(Norm::Fro, m, n, all_data.A_cpy.data(), m);
printf("||A_piv - QB||_F: %e\n", norm);
printf("||A_piv - QR||_F: %e\n", norm);
ASSERT_NEAR(norm, 0.0, std::pow(std::numeric_limits<T>::epsilon(), 0.625));
}
};
Expand Down Expand Up @@ -276,8 +276,8 @@ TEST_F(Test_Inplace_Square_Transpose, random_matrix_rowmajor) {

TEST_F(TestUtil, test_col_swp) {

int64_t m = 5;
int64_t n = 5;
int64_t m = 1000;
int64_t n = 1000;
auto state = RandBLAS::RNGState();
ColSwpTestData<double> all_data(m, n);

Expand Down
Loading

0 comments on commit a1385e3

Please sign in to comment.