-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcuda_dot.cu
67 lines (58 loc) · 1.97 KB
/
cuda_dot.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
//
// cuda_dot.cu
// Cuda GMRES
//
// Created by Tim Ioannidis on 2/18/12.
// Copyright 2012 Chemeng NTUA. All rights reserved.
//
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "cuda_config.h"
#include "cuda_methods.h"
//dot product dot_res=a<dot>b me diastasi dim
__global__ void cuda_dot_kernel(int n,double *a, double *b, double *dot_res)
{
__shared__ double cache[threadsPerBlock]; //thread shared memory
int global_tid=0,cacheIndex=0;
double temp = 0;
//orismos indexing
global_tid = threadIdx.x + blockIdx.x * blockDim.x;
cacheIndex = threadIdx.x;
while (global_tid < n) {
temp += a[global_tid] * b[global_tid];
global_tid += blockDim.x * gridDim.x;
}
// set the cache values
cache[cacheIndex] = temp;
// synchronize threads in this block
__syncthreads();
if (blockDim.x >= 1024 && threadIdx.x < 512) {
cache[threadIdx.x] += cache[threadIdx.x + 512];
__syncthreads();
}
if (blockDim.x >= 512 && threadIdx.x < 256) {
cache[threadIdx.x] += cache[threadIdx.x + 256];
__syncthreads();
}
if (blockDim.x >= 256 && threadIdx.x < 128) {
cache[threadIdx.x] += cache[threadIdx.x + 128];
__syncthreads();
}
if (blockDim.x >= 128 && threadIdx.x < 64) {
cache[threadIdx.x] += cache[threadIdx.x + 64];
__syncthreads();
}
//unroll last warp no sync needed
if (threadIdx.x <32 ) {
if (blockDim.x >= 64) cache[threadIdx.x] += cache[threadIdx.x +32];
if (blockDim.x >= 32) cache[threadIdx.x] += cache[threadIdx.x +16];
if (blockDim.x >= 16) cache[threadIdx.x] += cache[threadIdx.x +8];
if (blockDim.x >= 8) cache[threadIdx.x] += cache[threadIdx.x +4];
if (blockDim.x >= 4) cache[threadIdx.x] += cache[threadIdx.x +2];
if (blockDim.x >= 2) cache[threadIdx.x] += cache[threadIdx.x +1];
}
if (cacheIndex==0) {
dot_res[blockIdx.x]=cache[0];
}
}