diff --git a/Lab5/filter.cu b/Lab5/filter.cu
index b98381166338d9e18655dd24b056ce65c19d852d..5c008c2465a9cb3e37e3fc7fb1640d486e3ff206 100755
--- a/Lab5/filter.cu
+++ b/Lab5/filter.cu
@@ -101,7 +101,7 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
 {
   __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3];
   // map from blockIdx to pixel position
-  unsigned threadsPerBlock  = blockDim.x * blockDim.y;
+  unsigned threadsPerBlock  = blockDim.x;
 
   int blockOffsetX = blockIdx.x * blockDim.x;
   int blockOffsetY = blockIdx.y * blockDim.y;
@@ -109,10 +109,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
   int local_index = threadIdx.x;
 
   int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize);
+
   while(local_index < FILTER_SIZE)
   {
-    int local_corner_px = blockOffsetY * imagesizex + blockOffsetX;
-    int global_index = max(0, local_corner_px - kernelsize * stride + local_index);
+    int global_x = max(0, blockOffsetX - kernelsize + local_index);
+    int global_y = max(0, blockOffsetY);
+    unsigned global_index = global_y * imagesizex + global_x;
 
     local_memory[3*local_index+0] = image[3*global_index+0];
     local_memory[3*local_index+1] = image[3*global_index+1];
@@ -214,13 +216,14 @@ void computeImages(int kernelsizex, int kernelsizey)
 
   dim3 gridsize_sep(imagesizex/(kernelsizex*2+1), imagesizey);
   dim3 blocksize_sep(kernelsizex*2+1);
-  unsigned char *tmp = dev_input;
+  // unsigned char *tmp = dev_input;
   ResetMilli();
   /* X-direction */
-  filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, imagesizex);
+  filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1);
   cudaThreadSynchronize();
   /* Y-direction */
-  // filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, imagesizex);
+  // dim3 gridsize_sep(imagesizex, imagesizey/(kernelsizex*2+1));
+  // filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1);
   cudaThreadSynchronize();
   printf("[Separable]    %f\n", GetSeconds());