Skip to content
Snippets Groups Projects
Commit b3ba5bbc authored by dansa828's avatar dansa828
Browse files

update lab6

parent 4c5b52d8
Branches
No related tags found
No related merge requests found
all: all:
nvcc filter.cu -c -arch=sm_30 -o filter.o nvcc filteroptimized.cu -c -arch=sm_30 -o filter.o
g++ filter.o milli.c readppm.c -lGL -lm -lcuda -lcudart -L/usr/local/cuda/lib -lglut -o filter g++ filter.o milli.c readppm.c -lGL -lm -lcuda -lcudart -L/usr/local/cuda/lib -lglut -o filter
...@@ -339,7 +339,7 @@ void computeImages(int kernelsizex, int kernelsizey) ...@@ -339,7 +339,7 @@ void computeImages(int kernelsizex, int kernelsizey)
//dim3 grid((imagesizex + blockSize.x - 1)/blockSize.x, (imagesizey + blockSize.y - 1)/blockSize.y); //dim3 grid((imagesizex + blockSize.x - 1)/blockSize.x, (imagesizey + blockSize.y - 1)/blockSize.y);
//Task 1 //Task 1
//filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
//Task 2 //Task 2
//filter<<<grid,blockSize>>>(dev_input, dev_sep, imagesizex, imagesizey, kernelsizex, kernelsizey); //filter<<<grid,blockSize>>>(dev_input, dev_sep, imagesizex, imagesizey, kernelsizex, kernelsizey);
...@@ -347,11 +347,11 @@ void computeImages(int kernelsizex, int kernelsizey) ...@@ -347,11 +347,11 @@ void computeImages(int kernelsizex, int kernelsizey)
//filter<<<grid,blockSize>>>(dev_sep, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex); //filter<<<grid,blockSize>>>(dev_sep, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex);
//Task 3 Gaussian //Task 3 Gaussian
cudaMalloc( (void**)&stencilcuda, 5*sizeof(unsigned int)); /*cudaMalloc( (void**)&stencilcuda, 5*sizeof(unsigned int));
cudaMemcpy( stencilcuda, stencil, 5*sizeof(unsigned int), cudaMemcpyHostToDevice); cudaMemcpy( stencilcuda, stencil, 5*sizeof(unsigned int), cudaMemcpyHostToDevice);
gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, stencilcuda); gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, stencilcuda);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex, stencilcuda); gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex, stencilcuda);*/
//Task 4 Median //Task 4 Median
//median_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); //median_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
cudaThreadSynchronize(); cudaThreadSynchronize();
...@@ -415,7 +415,7 @@ int main( int argc, char** argv) ...@@ -415,7 +415,7 @@ int main( int argc, char** argv)
//computeImages(10, 0); //computeImages(10, 0);
//computeImages(0, 10); //computeImages(0, 10);
//Task 3 //Task 3
computeImages(2, 2); computeImages(3, 3);
printf("%s%f\n","Measured time (ms): ", (float)GetMicroseconds()*0.001); printf("%s%f\n","Measured time (ms): ", (float)GetMicroseconds()*0.001);
// You can save the result to a file like this: // You can save the result to a file like this:
......
all: all:
gcc hello_world_cl.c CLutilities.c -lOpenCL -I/usr/local/cuda/include -o hello_world_cl #gcc hello_world_cl.c CLutilities.c -lOpenCL -I/usr/local/cuda/include -o hello_world_cl
#gcc -std=c99 find_max.c -o find_max milli.c CLutilities.c -lOpenCL -I/usr/local/cuda/include/
gcc -std=c99 bitonic.c -o bitonic milli.c CLutilities.c -lOpenCL -I/usr/local/cuda/include/
/*
 * Bitonic sort kernel: sorts data[0 .. length-1] in place, ascending.
 *
 * Parameters:
 *   data   - buffer of unsigned ints to sort (read and written in place)
 *   length - number of elements in data; the bitonic network only yields a
 *            fully sorted result when this is a power of two
 *
 * The loop nest below is the sequential form of the network: every stage
 * (k, j) depends on the previous stage being complete. The kernel receives
 * no stage parameters and work-groups cannot synchronize with each other,
 * so a single work-item runs the whole network. That keeps the result
 * correct for any launch configuration; all other work-items return at once.
 * NOTE(review): to parallelize, pass k and j as kernel arguments, enqueue one
 * launch per (k, j) stage from the host, and let each work-item handle one i.
 */
__kernel void bitonic(__global unsigned int *data, const unsigned int length)
{
  // Only one work-item may run the sequential network; concurrent copies
  // would race on the same elements.
  if (get_global_id(0) != 0)
    return;

  const unsigned int N = length;
  unsigned int i, j, k;

  for (k = 2; k <= N; k = 2 * k) // Outer loop, double size for each step
  {
    for (j = k / 2; j > 0; j = j / 2) // Inner loop, half size for each step
    {
      for (i = 0; i < N; i++) // Loop over data
      {
        // The partner differs from i in exactly the bit selected by j.
        const unsigned int ixj = i ^ j;

        // ixj > i handles each pair once; ixj < N keeps the access inside
        // the buffer when length is not a power of two.
        if (ixj > i && ixj < N)
        {
          const unsigned int a = data[i];
          const unsigned int b = data[ixj];

          // Bit k of i selects the direction of the subsequence being built.
          const bool ascending = ((i & k) == 0);
          const bool outOfOrder = ascending ? (a > b) : (a < b);

          if (outOfOrder)
          {
            data[i] = b;
            data[ixj] = a;
          }
        }
      }
    }
  }
}
...@@ -57,26 +57,28 @@ unsigned int *generateRandomData(unsigned int length) ...@@ -57,26 +57,28 @@ unsigned int *generateRandomData(unsigned int length)
// Kernel run conveniently packed. Edit as needed, i.e. with more parameters. // Kernel run conveniently packed. Edit as needed, i.e. with more parameters.
// Only ONE array of data. // Only ONE array of data.
// __kernel void sort(__global unsigned int *data, const unsigned int length) // __kernel void sort(__global unsigned int *data, const unsigned int length)
void runKernel(cl_kernel kernel, int threads, cl_mem data, unsigned int length) void runKernel(cl_kernel kernel, int threads, cl_mem data, unsigned int length, unsigned int i)
{ {
size_t localWorkSize, globalWorkSize; size_t localWorkSize, globalWorkSize;
cl_int ciErrNum = CL_SUCCESS; cl_int ciErrNum = CL_SUCCESS;
// Some reasonable number of blocks based on # of threads // Some reasonable number of blocks based on # of threads
if (threads<512) localWorkSize = threads; if (threads<512) localWorkSize = threads;
else localWorkSize = 512; else localWorkSize = 512;
globalWorkSize = threads;
globalWorkSize = threads;
// set the args values // set the args values
ciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &data); ciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &data);
ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &length); ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &length);
ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_uint), (void *) &i);
printCLError(ciErrNum,8); printCLError(ciErrNum,8);
// Run kernel // Run kernel
cl_event event; cl_event event;
ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event); ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event);
printCLError(ciErrNum,9); printCLError(ciErrNum,9);
// Synch // Synch
clWaitForEvents(1, &event); clWaitForEvents(1, &event);
printCLError(ciErrNum,10); printCLError(ciErrNum,10);
...@@ -96,8 +98,12 @@ int find_max_gpu(unsigned int *data, unsigned int length) ...@@ -96,8 +98,12 @@ int find_max_gpu(unsigned int *data, unsigned int length)
printCLError(ciErrNum,7); printCLError(ciErrNum,7);
// ********** RUN THE KERNEL ************ // ********** RUN THE KERNEL ************
runKernel(gpgpuReduction, length, io_data, length); ResetMilli();
for(unsigned int i = 1; i < length; i*=2)
{
runKernel(gpgpuReduction, length, io_data, length, i);
}
printf("GPU %f\n", GetSeconds());
// Get data // Get data
cl_event event; cl_event event;
ciErrNum = clEnqueueReadBuffer(commandQueue, io_data, CL_TRUE, 0, length * sizeof(unsigned int), data, 0, NULL, &event); ciErrNum = clEnqueueReadBuffer(commandQueue, io_data, CL_TRUE, 0, length * sizeof(unsigned int), data, 0, NULL, &event);
...@@ -105,7 +111,7 @@ int find_max_gpu(unsigned int *data, unsigned int length) ...@@ -105,7 +111,7 @@ int find_max_gpu(unsigned int *data, unsigned int length)
// Synch // Synch
clWaitForEvents(1, &event); clWaitForEvents(1, &event);
printCLError(ciErrNum,10); printCLError(ciErrNum,10);
clReleaseMemObject(io_data); clReleaseMemObject(io_data);
return ciErrNum; return ciErrNum;
} }
...@@ -113,26 +119,30 @@ int find_max_gpu(unsigned int *data, unsigned int length) ...@@ -113,26 +119,30 @@ int find_max_gpu(unsigned int *data, unsigned int length)
// CPU max finder (sequential) // CPU max finder (sequential)
void find_max_cpu(unsigned int *data, int N) void find_max_cpu(unsigned int *data, int N)
{ {
unsigned int i, m; unsigned int i, m, idx;
m = data[0]; m = data[0];
for (i=0;i<N;i++) // Loop over data for (i=0;i<N;i++) // Loop over data
{ {
if (data[i] > m) if (data[i] > m){
m = data[i]; m = data[i];
idx = i;
}
} }
printf("%s%i\n", "Index: ", idx);
data[0] = m; data[0] = m;
} }
// ------------ main ------------ // ------------ main ------------
int main( int argc, char** argv) int main( int argc, char** argv)
{ {
int length = kDataLength; // SIZE OF DATA int length = kDataLength; // SIZE OF DATA
unsigned short int header[2]; unsigned short int header[2];
// Computed data // Computed data
unsigned int *data_cpu, *data_gpu; unsigned int *data_cpu, *data_gpu;
// Find a platform and device // Find a platform and device
if (initOpenCL()<0) if (initOpenCL()<0)
{ {
...@@ -150,18 +160,18 @@ int main( int argc, char** argv) ...@@ -150,18 +160,18 @@ int main( int argc, char** argv)
printf("\nError allocating data.\n\n"); printf("\nError allocating data.\n\n");
return 1; return 1;
} }
// Copy to gpu data. // Copy to gpu data.
for(int i=0;i<length;i++) for(int i=0;i<length;i++)
data_gpu[i]=data_cpu[i]; data_gpu[i]=data_cpu[i];
ResetMilli(); ResetMilli();
find_max_cpu(data_cpu,length); find_max_cpu(data_cpu,length);
printf("CPU %f\n", GetSeconds()); printf("CPU %f\n", GetSeconds());
ResetMilli(); // You may consider moving this inside find_max_gpu(), to skip timing of data allocation. //ResetMilli(); // You may consider moving this inside find_max_gpu(), to skip timing of data allocation.
find_max_gpu(data_gpu,length); find_max_gpu(data_gpu,length);
printf("GPU %f\n", GetSeconds()); //printf("GPU %f\n", GetSeconds());
// Print part of result // Print part of result
for (int i=0;i<MAXPRINTSIZE;i++) for (int i=0;i<MAXPRINTSIZE;i++)
...@@ -170,7 +180,7 @@ int main( int argc, char** argv) ...@@ -170,7 +180,7 @@ int main( int argc, char** argv)
if (data_cpu[0] != data_gpu[0]) if (data_cpu[0] != data_gpu[0])
{ {
printf("Wrong value at position 0.\n"); printf("%s%i", "Wrong value at position 0. Should be", data_cpu[0]);
closeOpenCL(); closeOpenCL();
return(1); return(1);
} }
......
/*
 * One step of an in-place pairwise max reduction.
 *
 * Parameters:
 *   data   - buffer of unsigned ints, modified in place
 *   length - number of elements in data
 *   i      - distance between the two elements of each pair in this step
 *
 * Work-item g compares data[2*i*g] with data[2*i*g + i] and stores the larger
 * value in the left slot. When the host launches the kernel once per step
 * with i = 1, 2, 4, ... while i < length, the maximum ends up in data[0].
 * Work-items whose pair does not lie entirely inside the buffer do nothing.
 */
__kernel void find_max(__global unsigned int *data, const unsigned int length, const unsigned int i)
{
  // Index in 64 bits: get_global_id(0) * i * 2 exceeds the 32-bit range for
  // large buffers, and a wrapped index would slip past the bounds check.
  const ulong left = (ulong)get_global_id(0) * 2UL * (ulong)i;
  const ulong right = left + (ulong)i;

  // Both elements must be inside the buffer; checking only the left index
  // (or using <=) lets the right element fall past the end.
  if (right < (ulong)length)
  {
    if (data[left] < data[right])
    {
      data[left] = data[right];
    }
  }
}
...@@ -161,4 +161,18 @@ First we set our context on the GPU. Then we allocate/copy memory to GPU. Then t ...@@ -161,4 +161,18 @@ First we set our context on the GPU. Then we allocate/copy memory to GPU. Then t
### Question: What function executes your kernel? ### Question: What function executes your kernel?
clEnqueueNDRangeKernel() clEnqueueNDRangeKernel()
### Question: How does the kernel know what element to work on? ### Question: How does the kernel know what element to work on?
get_global_id() in function __kernel void hello() get_global_id() in function kernel void hello()
### QUESTION: What timing did you get for your GPU reduction? Compare it to the CPU version.
1024 elements
CPU 0.000013
GPU 0.000403
### QUESTION: Try larger data size. On what size does the GPU version get faster, or at least comparable, to the CPU?
At around 100 000 elements the two are comparable; for larger sizes the GPU version is faster.
262 144 elements
CPU 0.001173
GPU 0.000492
### QUESTION: How can you optimize this further? You should know at least one way.
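Use local memory (the OpenCL counterpart of CUDA shared memory) so that each work-group reduces its part of the data on-chip. Use a better memory access pattern for the kernel calls that compare blocks.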
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment