update

9cff78f9 · dansa828 · 5ef16c69 · 9cff78f9
Commit 9cff78f9 authored 4 years ago by dansa828
--- a/Lab5/filteroptimized.cu
+++ b/Lab5/filteroptimized.cu
@@ -32,7 +32,6 @@
 // Use these for setting shared memory size.
 #define maxKernelSizeX 10
 #define maxKernelSizeY 10
-#define tileSize 8

 // unsigned char median_kernel(skepu::Region2D<unsigned char> image, size_t elemPerPx)
 /*__global__ void median_filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
@@ -118,17 +117,9 @@
 }*/
 __global__ void gaussian_filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey, unsigned int *stencil)
 {
-
-  stencil[0] = 1;
-  stencil[1] = 4;
-  stencil[2] = 6;
-  stencil[3] = 4;
-  stencil[4] = 1;
  //__shared__ unsigned char patch[(32*3+(2*maxKernelSizeX*3))*(32+2*maxKernelSizeY)];
-  const int block_w = 32*3+(2*maxKernelSizeX);
-  const int block_h = 32+(2*maxKernelSizeY);
-  const int tile = 32 - 2*maxKernelSizeX;
-  __shared__ unsigned char patch[block_w*block_h];
+  const int tile = blockDim.x - 2*maxKernelSizeX;
+  __shared__ unsigned char patch[32*3*32];
  // map from blockIdx to pixel position
 	int g_x = blockIdx.x * tile + threadIdx.x-kernelsizex;
  int g_y = blockIdx.y * tile + threadIdx.y-kernelsizey;
@@ -156,32 +147,30 @@ __global__ void gaussian_filter(unsigned char *image, unsigned char *out, const
     unsigned int sumx, sumy, sumz;
     int dy, dx;
     sumx=0;sumy=0;sumz=0;
-
+     int j = 0;
   	 for(dy=-kernelsizey;dy<=kernelsizey;dy++)
     {
     		for(dx=-kernelsizex;dx<=kernelsizex;dx++)
     		{
-           sumx += patch[(l_i+(dy*blockDim.x)+dx)*3+0]*stencil[dy+kernelsizey];
-           sumy += patch[(l_i+(dy*blockDim.x)+dx)*3+1]*stencil[dy+kernelsizey];
-           sumz += patch[(l_i+(dy*blockDim.x)+dx)*3+2]*stencil[dy+kernelsizey];

+           sumx += patch[(l_i+(dy*blockDim.x)+dx)*3+0]*stencil[j];
+           sumy += patch[(l_i+(dy*blockDim.x)+dx)*3+1]*stencil[j];
+           sumz += patch[(l_i+(dy*blockDim.x)+dx)*3+2]*stencil[j];
+
+           j++;
     		}
    }

-    int divby = (2*kernelsizex+1)*(2*kernelsizey+1); // Works for box filters only!
-
-    out[g_i*3+0] = sumx/divby;
-    out[g_i*3+1] = sumy/divby;
-    out[g_i*3+2] = sumz/divby;
+    out[g_i*3+0] = sumx/16;
+    out[g_i*3+1] = sumy/16;
+    out[g_i*3+2] = sumz/16;

  }
 }
  __global__ void median_filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
  {
-    const int block_w = 32*3+(2*maxKernelSizeX);
-    const int block_h = 32+(2*maxKernelSizeY);
-    const int tile = 32 - 2*maxKernelSizeX;
-    __shared__ unsigned char patch[block_w*block_h];
+    const int tile = blockDim.x - 2*maxKernelSizeX;
+    __shared__ unsigned char patch[32*3*32];
    // map from blockIdx to pixel position
    int g_x = blockIdx.x * tile + threadIdx.x-kernelsizex;
    int g_y = blockIdx.y * tile + threadIdx.y-kernelsizey;
@@ -257,11 +246,8 @@ __global__ void gaussian_filter(unsigned char *image, unsigned char *out, const
 }
 __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
 {
-  //__shared__ unsigned char patch[(32*3+(2*maxKernelSizeX*3))*(32+2*maxKernelSizeY)];
-  const int block_w = 32*3+(2*maxKernelSizeX);
-  const int block_h = 32+(2*maxKernelSizeY);
-  const int tile = 32 - 2*maxKernelSizeX;
-  __shared__ unsigned char patch[block_w*block_h];
+  const int tile = blockDim.x - 2*maxKernelSizeX;
+  __shared__ unsigned char patch[32*3*32];
  // map from blockIdx to pixel position
 	int g_x = blockIdx.x * tile + threadIdx.x-kernelsizex;
  int g_y = blockIdx.y * tile + threadIdx.y-kernelsizey;
@@ -326,7 +312,8 @@ __global__ void filter(unsigned char *image, unsigned char *out, const unsigned

 unsigned char *image, *pixels, *dev_bitmap, *dev_input;
 unsigned int imagesizey, imagesizex; // Image size
-unsigned int *stencil;
+unsigned int stencil[5] = {1,4,6,4,1};
+unsigned int *stencilcuda;

 ////////////////////////////////////////////////////////////////////////////////
 // main computation function
@@ -351,26 +338,20 @@ void computeImages(int kernelsizex, int kernelsizey)
  //dim3 grid((imagesizex + blockSize.x - 1)/blockSize.x, (imagesizey + blockSize.y - 1)/blockSize.y);

  //Task 1
-	//filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
+	filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);

  //Task 2
  //filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
  //cudaDeviceSynchronize();
  //filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex);

-/*
-  // Gaussian
-  filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
-  filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
-  for(int = 0; i < 5; i++) {
-    filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
-    filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
-  } */
-  //Gaussian
-  cudaMalloc( (void**)&stencil, (kernelsizex*2)+1);
-  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, stencil);
-  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex, stencil);
-  //Median
+  //Task 3 Gaussian
+  /*cudaMalloc( (void**)&stencilcuda, 5*sizeof(unsigned int));
+  cudaMemcpy( stencilcuda, stencil, 5*sizeof(unsigned int), cudaMemcpyHostToDevice);
+  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, stencilcuda);
+  cudaDeviceSynchronize();
+  gaussian_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizey, kernelsizex, stencilcuda);*/
+  //Task 4 Median
  //median_filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey);
 	cudaThreadSynchronize();
 //	Check for errors!
@@ -380,8 +361,7 @@ void computeImages(int kernelsizex, int kernelsizey)
 	cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost );
 	cudaFree( dev_bitmap );
 	cudaFree( dev_input );
-  
-  cudaFree(stencil);
+  cudaFree(stencilcuda);
 }

 // Display images
@@ -429,7 +409,7 @@ int main( int argc, char** argv)
 	ResetMilli();

  //Task 1,4
-	//computeImages(2, 2);
+	//computeImages(7, 7);
  //Task 2
  computeImages(2, 1);
  computeImages(1, 2);