Skip to content

Latest commit

 

History

History
273 lines (205 loc) · 6.34 KB

OpenCL.MD

File metadata and controls

273 lines (205 loc) · 6.34 KB

#define PROGRAM_FILE "my_test_kernel.cl" #define KERNEL_FUNC "my_test_kernel" #define MAX_SOURCE_SIZE (0x100000)

#include <stdio.h> #include <stdlib.h> #include <math.h>

#ifdef MAC #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif #include #include using namespace std;

#pragma warning(disable: 4996)

int main(int argc, char* argv[]) {

int loops = 10000;
int mask0  = 0xFF;
int mask1 = 0;
int mask2 = 0;


// Length of vectors
unsigned int n = 256 * 10000;

// Host input vectors
int *h_a;
int *h_b;
// Host output vector
int *h_c;

// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;

cl_platform_id cpPlatform;        // OpenCL platform
cl_device_id device_id;           // device ID
cl_context context;               // context
cl_command_queue cmdqueue;           // command queue
cl_program program;               // program
cl_kernel kernel;                 // kernel

// Size, in bytes, of each vector
size_t bytes = n * sizeof(int);

// Allocate memory for each vector on host
h_a = (int*)malloc(bytes);
h_b = (int*)malloc(bytes);
h_c = (int*)malloc(bytes);

// Initialize vectors on host
int i;
for (i = 0; i < n; i++)
{

#define PI 3.14159265 float fval1 = sin(PI * i / 180); float fval2 = cos(PI * i / 180);

	h_a[i] = *(int*)(&fval1);
	h_b[i] = *(int*)(&fval1);
}

size_t globalSize, localSize;
cl_int err;

// Number of work items in each local work group
localSize = 256;

// Number of total work items - localSize must be devisor
globalSize = ceil(n / (float)localSize)*localSize;

// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);

// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

// Create a context  
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

// Create a command queue 
cmdqueue = clCreateCommandQueue(context, device_id, 0, &err);

// Create the compute program from the source buffer
char *kernelSource;
size_t kernelSize;
{
	FILE *kernelFile;
	
	

	kernelFile = fopen(PROGRAM_FILE, "r");

	if (!kernelFile) {

		fprintf(stderr, "No file named my_test_kernel.cl was found\n");

		exit(-1);

	}
	kernelSource = (char*)malloc(MAX_SOURCE_SIZE);
	kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile);
	fclose(kernelFile);
}

//printf("%s\n", kernelSource);
//printf("%d\n", strlen(kernelSource));
program = clCreateProgramWithSource(context, 1,
	(const char **)& kernelSource, &kernelSize, &err);

// Build the program executable 
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "my_test_kernel", &err);

// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);

// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(cmdqueue, d_a, CL_TRUE, 0,
	bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmdqueue, d_b, CL_TRUE, 0,
	bytes, h_b, 0, NULL, NULL);

// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(int), &loops);
err |= clSetKernelArg(kernel, 4, sizeof(int), &mask0);
err |= clSetKernelArg(kernel, 5, sizeof(int), &mask1);
err |= clSetKernelArg(kernel, 6, sizeof(int), &mask2);

for (int i=0; i < 100; i++)
{
	// Execute the kernel over the entire range of the data set  
	err = clEnqueueNDRangeKernel(cmdqueue, kernel, 1, NULL, &globalSize, &localSize,
		0, NULL, NULL);

	// Wait for the command queue to get serviced before reading back results
	clFinish(cmdqueue);
}

// Read the results from the device
clEnqueueReadBuffer(cmdqueue, d_c, CL_TRUE, 0,
	bytes, h_c, 0, NULL, NULL);

//Sum up vector c and print result divided by n, this should equal 1 within error
int sum = 0;
//for (i = 0; i < n; i++)
//	sum += h_c[i];
//printf("final result: %f"n", sum / n);

// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmdqueue);
clReleaseContext(context);

//release host memory
free(h_a);
free(h_b);
free(h_c);
free(kernelSource);

return 0;

}

//Mask0 ==FF; //Mask1 == 00; //mask2 == 00 __kernel void test_clock_tree(__global int * A, __global int *B, __global int *C,int loop, int mask0, int mask1, int mask2) { __local int ldata[1024]; int local_id = get_local_id(0); int gid0 =get_global_id(0); int gid = gid0 & mask1;

	for(int i=0; i < 1024; i+=256){
			ldata[local_id+i] = 0;
	}
	
	for(int i= 0; i < loop; i++){		
		 if(( i&mask0) == local_id) {
			 ldata[local_id] += A[gid + (i&mask1)];
		 }					 
	}	
	
	if(mask2 & local_id){
		B[gid0] = ldata[local_id];	
	}

}

// mask == 0; //~90+VGPR+27INST __kernel void test_load_vgpr(__global int * A, __global int *B, __global int *C,int loop, int mask0, int mask1, int mask2) { uint gid = get_global_id(0) << 4; uint localid = get_global_id(0) ; uint localid2 = get_global_id(0) >> 10;

  int16 data2 = 0;		
	__global int *point = A + localid2;
	for(int i=0; i < 10000; i++)
	{		
		int16 data = vload16(0, point); 			
		data2 = max(data,data2);	
		point += i & mask1;
		point+=1;
	}


	// uint bstore = data.s0 & data.s4 & data.s8 & data.b & data2.b ;				
	if(localid & mask2 ) {		
		vstore16(data2, 0, B + gid );
	}

}

__kernel void my_test_kernel(__global float * A, __global float *B, __global float *C,int loop, int mask0, int mask1, int mask2) { uint gid = get_global_id(0) ; uint localid = get_global_id(0) ; uint localid2 = get_global_id(0) >> 10;

  float16 data = 0;		


{
		float16 a = vload16( 0 ,  A + (gid&mask1));
		float16 b = vload16( 0 ,  B + (gid&mask1));
		float16 a1 = vload16( 0,  A  + (gid&mask1)+1);

		data = a;			
		for(int i=0; i < loop/8; i++)
		{		
			for(int j=0; j< 8; j++)
			{
				data = a * data + b;		
				//data = max(max(a, data),b);	
			}
		}
		
		if(localid & mask2 ) {		
			vstore16(data, 0, C + gid );
		}			
}

}