update lab6

b3ba5bbc · dansa828 · 4c5b52d8 · b3ba5bbc · b3ba5bbc · b3ba5bbc
Commit b3ba5bbc authored 4 years ago by dansa828
--- a/Lab5/Makefile
+++ b/Lab5/Makefile
 all:
-	nvcc filter.cu -c -arch=sm_30 -o filter.o
+	nvcc filteroptimized.cu -c -arch=sm_30 -o filter.o
 	g++ filter.o milli.c readppm.c -lGL -lm -lcuda -lcudart -L/usr/local/cuda/lib -lglut -o filter
--- a/Lab5/filteroptimized.cu
+++ b/Lab5/filteroptimized.cu
@@ -339,7 +339,7 @@ void computeImages(int kernelsizex, int kernelsizey)
  //dim3 grid((imagesizex + blockSize.x - 1)/blockSize.x, (imagesizey + blockSize.y - 1)/blockSize.y);
  //Task 1
-	//filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
+	filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
  //Task 2
  //filter<<<grid,blockSize>>>(dev_input, dev_sep, imagesizex, imagesizey, kernelsizex, kernelsizey);
@@ -347,11 +347,11 @@ void computeImages(int kernelsizex, int kernelsizey)
  //filter<<<grid,blockSize>>>(dev_sep, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex);
  //Task 3 Gaussian
-  cudaMalloc( (void**)&stencilcuda, 5*sizeof(unsigned int));
+  /*cudaMalloc( (void**)&stencilcuda, 5*sizeof(unsigned int));
  cudaMemcpy( stencilcuda, stencil, 5*sizeof(unsigned int), cudaMemcpyHostToDevice);
  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, stencilcuda);
  cudaDeviceSynchronize();
-  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex, stencilcuda);
+  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex, stencilcuda);*/
  //Task 4 Median
  //median_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
 	cudaThreadSynchronize();
@@ -415,7 +415,7 @@ int main( int argc, char** argv)
  //computeImages(10, 0);
  //computeImages(0, 10);
  //Task 3
-  computeImages(2, 2);
+  computeImages(3, 3);
  printf("%s%f\n","Measured time (ms): ", (float)GetMicroseconds()*0.001);
 // You can save the result to a file like this:

--- a/Lab6/Makefile
+++ b/Lab6/Makefile
 all:
-	gcc hello_world_cl.c CLutilities.c -lOpenCL -I/usr/local/cuda/include -o hello_world_cl
+	#gcc hello_world_cl.c CLutilities.c -lOpenCL -I/usr/local/cuda/include -o hello_world_cl
+	#gcc -std=c99 find_max.c -o find_max milli.c CLutilities.c -lOpenCL  -I/usr/local/cuda/include/
+	gcc -std=c99 bitonic.c -o bitonic milli.c CLutilities.c -lOpenCL -I/usr/local/cuda/include/
--- a/Lab6/bitonic.cl
+++ b/Lab6/bitonic.cl
+/*
+ * Bitonic sort: sorts data[0..length-1] in place, ascending.
+ *
+ *   data   - buffer to sort in place
+ *   length - number of elements in data; the bitonic network only yields a
+ *            fully sorted result when this is a power of two
+ *
+ * The kernel signature carries no stage parameters, so the whole network is
+ * executed by work-item 0. The steps depend on each other and there is no
+ * barrier that spans work-groups, so letting every work-item run the loops
+ * would race on data. A parallel version needs the host to pass k and j as
+ * kernel arguments and launch once per (k, j) step.
+ */
+__kernel void bitonic(__global unsigned int *data, const unsigned int length)
+{
+  if (get_global_id(0) != 0)
+    return;
+
+  for (unsigned int k = 2; k <= length; k *= 2) // Outer loop, double size for each step
+  {
+    for (unsigned int j = k / 2; j > 0; j /= 2) // Inner loop, half size for each step
+    {
+      for (unsigned int i = 0; i < length; i++) // Loop over data
+      {
+        // The partner differs from i in exactly the bit selected by j.
+        const unsigned int ixj = i ^ j;
+
+        // ixj > i handles each pair once; ixj < length keeps the access inside
+        // the buffer when length is not a power of two.
+        if (ixj > i && ixj < length)
+        {
+          const unsigned int a = data[i];
+          const unsigned int b = data[ixj];
+          // Bit k of i selects the direction of this sub-sequence.
+          const bool ascending = (i & k) == 0;
+
+          if (ascending ? (a > b) : (a < b))
+          {
+            data[i] = b;
+            data[ixj] = a;
+          }
+        }
+      }
+    }
+  }
+}
--- a/Lab6/find_max.c
+++ b/Lab6/find_max.c
@@ -57,26 +57,28 @@ unsigned int *generateRandomData(unsigned int length)
 // Kernel run conveniently packed. Edit as needed, i.e. with more parameters.
 // Only ONE array of data.
 // __kernel void sort(__global unsigned int *data, const unsigned int length)
-void runKernel(cl_kernel kernel, int threads, cl_mem data, unsigned int length)
+void runKernel(cl_kernel kernel, int threads, cl_mem data, unsigned int length, unsigned int i)
 {
 	size_t localWorkSize, globalWorkSize;
 	cl_int ciErrNum = CL_SUCCESS;
 	// Some reasonable number of blocks based on # of threads
 	if (threads<512) localWorkSize  = threads;
 	else            localWorkSize  = 512;
-		globalWorkSize = threads;
+	globalWorkSize = threads;
 	// set the args values
 	ciErrNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem),  (void *) &data);
 	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &length);
+  ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_uint), (void *) &i);
 	printCLError(ciErrNum,8);
 	// Run kernel
 	cl_event event;
 	ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &event);
 	printCLError(ciErrNum,9);
 	// Synch
 	clWaitForEvents(1, &event);
 	printCLError(ciErrNum,10);
@@ -96,8 +98,12 @@ int find_max_gpu(unsigned int *data, unsigned int length)
 	printCLError(ciErrNum,7);
 	// ********** RUN THE KERNEL ************
-	runKernel(gpgpuReduction, length, io_data, length);
+  ResetMilli();
+  for(unsigned int i = 1; i < length; i*=2)
+  {
+	   runKernel(gpgpuReduction, length, io_data, length, i);
+  }
+  printf("GPU %f\n", GetSeconds());
 	// Get data
 	cl_event event;
 	ciErrNum = clEnqueueReadBuffer(commandQueue, io_data, CL_TRUE, 0, length * sizeof(unsigned int), data, 0, NULL, &event);
@@ -105,7 +111,7 @@ int find_max_gpu(unsigned int *data, unsigned int length)
 	// Synch
 	clWaitForEvents(1, &event);
 	printCLError(ciErrNum,10);
 	clReleaseMemObject(io_data);
 	return ciErrNum;
 }
@@ -113,26 +119,30 @@ int find_max_gpu(unsigned int *data, unsigned int length)
 // CPU max finder (sequential)
 void find_max_cpu(unsigned int *data, int N)
 {
-  unsigned int i, m;
+  // Scans data[0..N-1], prints the index of the first occurrence of the
+  // maximum and stores the maximum in data[0].
+  // idx starts at 0 to match m = data[0]; otherwise it is printed
+  // uninitialized whenever the first element is already the maximum.
+  unsigned int i, m, idx = 0;
 	m = data[0];
 	for (i=0;i<N;i++) // Loop over data
 	{
-		if (data[i] > m)
+		if (data[i] > m){
-			m = data[i];
+      m = data[i];
+      idx = i;
+    }
 	}
+  // idx is unsigned, so it is printed with %u.
+  printf("%s%u\n", "Index: ", idx);
 	data[0] = m;
 }
 // ------------ main ------------
-int main( int argc, char** argv) 
+int main( int argc, char** argv)
 {
  int length = kDataLength; // SIZE OF DATA
  unsigned short int header[2];
  // Computed data
  unsigned int *data_cpu, *data_gpu;
  // Find a platform and device
  if (initOpenCL()<0)
  {
@@ -150,18 +160,18 @@ int main( int argc, char** argv)
    printf("\nError allocating data.\n\n");
    return 1;
  }
  // Copy to gpu data.
  for(int i=0;i<length;i++)
    data_gpu[i]=data_cpu[i];
  ResetMilli();
  find_max_cpu(data_cpu,length);
  printf("CPU %f\n", GetSeconds());
-  ResetMilli(); // You may consider moving this inside find_max_gpu(), to skip timing of data allocation.
+  //ResetMilli(); // You may consider moving this inside find_max_gpu(), to skip timing of data allocation.
  find_max_gpu(data_gpu,length);
-  printf("GPU %f\n", GetSeconds());
+  //printf("GPU %f\n", GetSeconds());
  // Print part of result
  for (int i=0;i<MAXPRINTSIZE;i++)
@@ -170,7 +180,7 @@ int main( int argc, char** argv)
  if (data_cpu[0] != data_gpu[0])
    {
-      printf("Wrong value at position 0.\n");
+      printf("%s%i", "Wrong value at position 0. Should be", data_cpu[0]);
      closeOpenCL();
      return(1);
    }

--- a/Lab6/find_max.cl
+++ b/Lab6/find_max.cl
+/*
+ * One pass of an in-place tree reduction that moves the maximum towards data[0].
+ *
+ * The host launches this kernel once per pass with i = 1, 2, 4, ... (i < length).
+ * In each pass work-item g owns element g*2*i and folds in the element i
+ * positions to its right; work-items whose pair falls outside the buffer do
+ * nothing. After the last pass data[0] holds the maximum.
+ *
+ *   data   - buffer reduced in place
+ *   length - number of elements in data
+ *   i      - distance between the two elements compared in this pass
+ */
+__kernel void find_max(__global unsigned int *data, const unsigned int length, const unsigned int i)
+{
+  // Index arithmetic is done in 64 bits: global_id * 2 * i does not fit in
+  // 32 bits once the input grows past a few tens of thousands of elements.
+  const ulong left  = (ulong)get_global_id(0) * (ulong)2 * (ulong)i;
+  const ulong right = left + (ulong)i;
+
+  // right > left, so this single test keeps both accesses inside the buffer.
+  if (right < (ulong)length)
+  {
+    if (data[left] < data[right])
+    {
+      data[left] = data[right];
+    }
+  }
+}
--- a/README.md
+++ b/README.md
@@ -161,4 +161,18 @@ First we set our context on the GPU. Then we allocate/copy memory to GPU. Then t
 ### Question: What function executes your kernel?
 clEnqueueNDRangeKernel()
 ### Question: How does the kernel know what element to work on?
-get_global_id() in function __kernel void hello()
+By calling get_global_id(0) inside the kernel function (kernel void hello()), which returns the global index of the current work-item.
+### QUESTION: What timing did you get for your GPU reduction? Compare it to the CPU version.
+1024 elements
+CPU 0.000013
+GPU 0.000403
+### QUESTION: Try larger data size. On what size does the GPU version get faster, or at least comparable, to the CPU?
+Around 100 000 elements the two are comparable. With larger sizes the GPU gets faster.
+262 144 elements
+CPU 0.001173
+GPU 0.000492
+### QUESTION: How can you optimize this further? You should know at least one way.
+Use local (shared) memory. Use a better memory access pattern for the kernel calls that compare blocks.