Skip to content
Snippets Groups Projects
Commit 6c215228 authored by ylvse560's avatar ylvse560
Browse files

Lab 6 done!

parent b3ba5bbc
No related branches found
No related tags found
No related merge requests found
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "milli.h" #include "milli.h"
// Size of data! // Size of data!
#define kDataLength 1024 #define kDataLength 2048
#define MAXPRINTSIZE 16 #define MAXPRINTSIZE 16
unsigned int *generateRandomData(unsigned int length) unsigned int *generateRandomData(unsigned int length)
...@@ -56,26 +56,28 @@ unsigned int *generateRandomData(unsigned int length) ...@@ -56,26 +56,28 @@ unsigned int *generateRandomData(unsigned int length)
// Kernel run conveniently packed. Edit as needed, i.e. with more parameters. // Kernel run conveniently packed. Edit as needed, i.e. with more parameters.
// Only ONE array of data. // Only ONE array of data.
// __kernel void sort(__global unsigned int *data, const unsigned int length) // __kernel void sort(__global unsigned int *data, const unsigned int length)
void runKernel(cl_kernel kernel, int threads, cl_mem data, unsigned int length) void runKernel(cl_kernel kernel, int threads, cl_mem data, unsigned int length, unsigned int k , unsigned int j)
{ {
size_t localWorkSize, globalWorkSize; size_t localWorkSize, globalWorkSize;
cl_int ciErrNum = CL_SUCCESS; cl_int ciErrNum = CL_SUCCESS;
// Some reasonable number of blocks based on # of threads // Some reasonable number of blocks based on # of threads
if (threads<512) localWorkSize = threads; if (threads<512) localWorkSize = threads;
else localWorkSize = 512; else localWorkSize = 512;
globalWorkSize = threads; globalWorkSize = threads;
// set the args values // set the args values
ciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &data); ciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &data);
ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &length); ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &length);
ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_uint), (void *) &k);
ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *) &j);
printCLError(ciErrNum,8); printCLError(ciErrNum,8);
// Run kernel // Run kernel
cl_event event; cl_event event;
ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event); ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event);
printCLError(ciErrNum,9); printCLError(ciErrNum,9);
// Synch // Synch
clWaitForEvents(1, &event); clWaitForEvents(1, &event);
printCLError(ciErrNum,10); printCLError(ciErrNum,10);
...@@ -95,8 +97,16 @@ int bitonic_gpu(unsigned int *data, unsigned int length) ...@@ -95,8 +97,16 @@ int bitonic_gpu(unsigned int *data, unsigned int length)
printCLError(ciErrNum,7); printCLError(ciErrNum,7);
// ********** RUN THE KERNEL ************ // ********** RUN THE KERNEL ************
runKernel(gpgpuSort, length, io_data, length); ResetMilli(); // You may consider moving this inside bitonic_gpu(), to skip timing of data allocation.
for (unsigned int k=2;k<=length;k=2*k) // Outer loop, double size for each step
{
for (unsigned int j=k>>1;j>0;j=j>>1) // Inner loop, half size for each step
{
printf("j %i \n", j);
runKernel(gpgpuSort, length, io_data, length,k, j);
}
}
printf("GPU %f\n", GetSeconds());
// Get data // Get data
cl_event event; cl_event event;
ciErrNum = clEnqueueReadBuffer(commandQueue, io_data, CL_TRUE, 0, length * sizeof(unsigned int), data, 0, NULL, &event); ciErrNum = clEnqueueReadBuffer(commandQueue, io_data, CL_TRUE, 0, length * sizeof(unsigned int), data, 0, NULL, &event);
...@@ -104,7 +114,7 @@ int bitonic_gpu(unsigned int *data, unsigned int length) ...@@ -104,7 +114,7 @@ int bitonic_gpu(unsigned int *data, unsigned int length)
// Synch // Synch
clWaitForEvents(1, &event); clWaitForEvents(1, &event);
printCLError(ciErrNum,10); printCLError(ciErrNum,10);
clReleaseMemObject(io_data); clReleaseMemObject(io_data);
return ciErrNum; return ciErrNum;
} }
...@@ -121,6 +131,7 @@ static void exchange(unsigned int *i, unsigned int *j) ...@@ -121,6 +131,7 @@ static void exchange(unsigned int *i, unsigned int *j)
void bitonic_cpu(unsigned int *data, int N) void bitonic_cpu(unsigned int *data, int N)
{ {
unsigned int i,j,k; unsigned int i,j,k;
printf("CPU sorting.\n"); printf("CPU sorting.\n");
...@@ -144,14 +155,14 @@ void bitonic_cpu(unsigned int *data, int N) ...@@ -144,14 +155,14 @@ void bitonic_cpu(unsigned int *data, int N)
// ------------ main ------------ // ------------ main ------------
int main( int argc, char** argv) int main( int argc, char** argv)
{ {
int length = kDataLength; // SIZE OF DATA int length = kDataLength; // SIZE OF DATA
unsigned short int header[2]; unsigned short int header[2];
// Computed data // Computed data
unsigned int *data_cpu, *data_gpu; unsigned int *data_cpu, *data_gpu;
// Find a platform and device // Find a platform and device
if (initOpenCL()<0) if (initOpenCL()<0)
{ {
...@@ -169,22 +180,22 @@ int main( int argc, char** argv) ...@@ -169,22 +180,22 @@ int main( int argc, char** argv)
printf("\nError allocating data.\n\n"); printf("\nError allocating data.\n\n");
return 1; return 1;
} }
// Copy to gpu data. // Copy to gpu data.
for(int i=0;i<length;i++) for(int i=0;i<length;i++)
data_gpu[i]=data_cpu[i]; data_gpu[i]=data_cpu[i];
ResetMilli(); ResetMilli();
bitonic_cpu(data_cpu,length); bitonic_cpu(data_cpu,length);
printf("CPU %f\n", GetSeconds()); printf("CPU %f\n", GetSeconds());
ResetMilli(); // You may consider moving this inside bitonic_gpu(), to skip timing of data allocation.
bitonic_gpu(data_gpu,length); bitonic_gpu(data_gpu,length);
printf("GPU %f\n", GetSeconds());
// Print part of result // Print part of result
for (int i=0;i<MAXPRINTSIZE;i++) for (int i=0;i<MAXPRINTSIZE;i++) {
printf("%d ", data_gpu[i]); printf("%d ", data_gpu[i]);
}
printf("\n"); printf("\n");
for (int i=0;i<length;i++) for (int i=0;i<length;i++)
......
/* /*
* Placeholder OpenCL kernel * Bitonic sort OpenCL
*/ */
// Exchange the elements at indices a and b of the global data array.
// Used by the bitonic kernel to reorder out-of-order pairs in place.
inline void swap(__global unsigned int *data, unsigned int a, unsigned int b) {
  unsigned int held = data[a]; // hold one value while the slots are swapped
  data[a] = data[b];
  data[b] = held;
}
__kernel void bitonic(__global unsigned int *data, const unsigned int length) __kernel void bitonic(__global unsigned int *data, const unsigned int length, unsigned int k, unsigned int j)
{ {
unsigned int pos = 0; unsigned int i = get_global_id(0);
unsigned int val; int ixj=i^j; // Calculate indexing!
if(i == 1)
//Something should happen here
unsigned int i,j,k;
printf("CPU sorting.\n");
unsigned int N = length;
for (k=2;k<=N;k=2*k) // Outer loop, double size for each step
{ {
for (j=k/2;j>0;j=j/2) // Inner loop, half size for each step printf("ixj %i \n", ixj);
{ }
for (i=0;i<N;i++) // Loop over data if ((ixj)>i)
{ {
int ixj=i+j; // Calculate indexing! if ((i&k)==0 && data[i]>data[ixj]) swap(data,i,ixj);
if ((ixj)>i) if ((i&k)!=0 && data[i]<data[ixj]) swap(data,i,ixj);
{
if ((i&k)==0 && data[i]>data[ixj])
exchange(&data[i],&data[ixj]);
if ((i&k)!=0 && data[i]<data[ixj])
exchange(&data[i],&data[ixj]);
}
}
}
} }
data[get_global_id(0)]=get_global_id(0);
} }
...@@ -4,11 +4,6 @@ ...@@ -4,11 +4,6 @@
__kernel void find_max(__global unsigned int *data, const unsigned int length, const unsigned int i) __kernel void find_max(__global unsigned int *data, const unsigned int length, const unsigned int i)
{ {
unsigned int pos = 0;
unsigned int val;
//Something should happen here
int index = get_global_id(0)*i*2; int index = get_global_id(0)*i*2;
if(index <= length) if(index <= length)
{ {
......
...@@ -176,3 +176,22 @@ GPU 0.000492 ...@@ -176,3 +176,22 @@ GPU 0.000492
### QUESTION: How can you optimize this further? You should know at least one way. ### QUESTION: How can you optimize this further? You should know at least one way.
Use shared memory. Use better memory access pattern for kernel calls that compare blocks. Use shared memory. Use better memory access pattern for kernel calls that compare blocks.
### QUESTION: Should each thread produce one output or two? Why?
Each thread compares one pair of elements and, when they are out of order, swaps them — so each thread produces two outputs.
### QUESTION: How many items can you handle in one workgroup?
512 items, since the local work size (threads per workgroup) is capped at 512.
### QUESTION: What problem must be solved when you use more than one workgroup? How did you solve it?
Synchronization between workgroups is required, because a barrier only synchronizes threads within one workgroup. We solved it by launching the kernel once per (k, j) step from the host; each kernel launch completing acts as a global synchronization point.
### QUESTION: What time do you get? Difference to the CPU? What is the break even size? What can you expect for a parallel CPU version? (Your conclusions here may vary between the labs.)
For 131072 elements:
CPU sorting.
CPU 0.082142
GPU sorting.
GPU 0.001693
The CPU is faster than the GPU only up until 1024 elements; beyond that the GPU is always faster. A parallelized CPU version would run faster than the current one, but the GPU will still beat the CPU for large inputs, since bitonic sort exploits massive parallelism that the CPU cannot match.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment