diff --git a/Lab5/filter.cu b/Lab5/filter.cu index b98381166338d9e18655dd24b056ce65c19d852d..5c008c2465a9cb3e37e3fc7fb1640d486e3ff206 100755 --- a/Lab5/filter.cu +++ b/Lab5/filter.cu @@ -101,7 +101,7 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const { __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3]; // map from blockIdx to pixel position - unsigned threadsPerBlock = blockDim.x * blockDim.y; + unsigned threadsPerBlock = blockDim.x; int blockOffsetX = blockIdx.x * blockDim.x; int blockOffsetY = blockIdx.y * blockDim.y; @@ -109,10 +109,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const int local_index = threadIdx.x; int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize); + while(local_index < FILTER_SIZE) { - int local_corner_px = blockOffsetY * imagesizex + blockOffsetX; - int global_index = max(0, local_corner_px - kernelsize * stride + local_index); + int global_x = max(0, blockOffsetX - kernelsize + local_index); + int global_y = max(0, blockOffsetY); + unsigned global_index = global_y * imagesizex + global_x; local_memory[3*local_index+0] = image[3*global_index+0]; local_memory[3*local_index+1] = image[3*global_index+1]; @@ -214,13 +216,14 @@ void computeImages(int kernelsizex, int kernelsizey) dim3 gridsize_sep(imagesizex/(kernelsizex*2+1), imagesizey); dim3 blocksize_sep(kernelsizex*2+1); - unsigned char *tmp = dev_input; + // unsigned char *tmp = dev_input; ResetMilli(); /* X-direction */ - filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, imagesizex); + filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1); cudaThreadSynchronize(); /* Y-direction */ - // filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, imagesizex); + // dim3 gridsize_sep(imagesizex, imagesizey/(kernelsizex*2+1)); + // filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1); cudaThreadSynchronize(); printf("[Separable] %f\n", GetSeconds());