diff --git a/Lab5/filter.cu b/Lab5/filter.cu index 4bddc805aae49c2ef4d14e1a1acb47a44c413911..63a01e96ff25f8c99050d7eb8a58534ee0db8d07 100755 --- a/Lab5/filter.cu +++ b/Lab5/filter.cu @@ -39,18 +39,13 @@ #define maxKernelSizeX BLOCK_W #define maxKernelSizeY BLOCK_H -// #define BLOCK_SIZE 16 // total number of threads per block = 256 -// #define GRID_SIZE 8 // total number of blocks per grid = 256 +#define LOCAL_MEM_SIZE (FILTER_W * FILTER_H) + __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) { - __shared__ unsigned char local_memory[FILTER_W * FILTER_H * 3]; - // __shared__ unsigned char local_memory[(BLOCK_W + RADIUS) * 3 * (BLOCK_H + RADIUS)]; - // map from blockIdx to pixel position - unsigned blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; - unsigned threadNumInBlock = threadIdx.x + blockDim.x *threadIdx.y; - unsigned threadsPerBlock = blockDim.x * blockDim.y; + __shared__ unsigned char local_memory[LOCAL_MEM_SIZE * 3]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -64,7 +59,7 @@ __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int blockOffsetY = blockIdx.y * blockDim.y; /* Changed while loop here also, might need to include some stuff with - RADIUS here also */ - while((block_y * FILTER_W + block_x) < (FILTER_W *FILTER_H)) + while((block_y * FILTER_W + block_x) < LOCAL_MEM_SIZE) { int global_y = max(blockOffsetY + block_y - RADIUS, 0); int global_x = max(blockOffsetX + block_x - RADIUS, 0);