diff --git a/Lab5/filter.cu b/Lab5/filter.cu
index 315145de762fb4a221c2e2be4d37836be2561024..115dfb28e3b10fd6869007a37d8bbfa7c4e49310 100755
--- a/Lab5/filter.cu
+++ b/Lab5/filter.cu
@@ -20,7 +20,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-
 #ifdef __APPLE__
   #include <GLUT/glut.h>
   #include <OpenGL/gl.h>
@@ -31,89 +30,47 @@
 #include "milli.h"
 
 // Use these for setting shared memory size.
-#define RADIUS 3
-
-#define maxKernelSizeX (RADIUS*2+1)
-#define maxKernelSizeY (RADIUS*2+1)
+#define maxKernelSizeX 10
+#define maxKernelSizeY 10
 
-#define FILTER_W (maxKernelSizeX+RADIUS*2)
-#define FILTER_H (maxKernelSizeY+RADIUS*2)
-
-#define LOCAL_MEM_SIZE_PX (FILTER_W * FILTER_H)
 
 __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
 {
-  __shared__ unsigned char local_memory[LOCAL_MEM_SIZE_PX * 3];
-
   // map from blockIdx to pixel position
-  unsigned blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y;
-  unsigned threadNumInBlock = threadIdx.x + blockDim.x *threadIdx.y;
-  unsigned threadsPerBlock  = blockDim.x * blockDim.y;
-
-
-
-  int blockOffsetX = blockIdx.x * blockDim.x;
-  int blockOffsetY = blockIdx.y * blockDim.y;
-
-  int local_index = threadNumInBlock;
-  while(local_index < LOCAL_MEM_SIZE_PX)
-  {
-    /* Repeat three times for colored pixel */
-    int global_x = max(0, blockOffsetX - RADIUS + local_index % FILTER_W);
-	  int global_y = max(0, blockOffsetY - RADIUS + local_index / FILTER_W);
-    int global_index = global_y * imagesizex + global_x;
-
-    local_memory[3*local_index+0] = image[3*global_index+0];
-    local_memory[3*local_index+1] = image[3*global_index+1];
-    local_memory[3*local_index+2] = image[3*global_index+2];
-
-    local_index += threadsPerBlock;
-  }
+	int x = blockIdx.x * blockDim.x + threadIdx.x;
+	int y = blockIdx.y * blockDim.y + threadIdx.y;
 
   int dy, dx;
   unsigned int sumx, sumy, sumz;
-  /* Old factor */
-  int divby = (2*kernelsizex*3+1)*(2*kernelsizey+1); // Works for box filters only!
-  int skip_pixels =  FILTER_W * RADIUS + RADIUS;
-
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-  int local_x = threadIdx.x;
-  int local_y = threadIdx.y;
-
-  int local_mem_address = skip_pixels + (local_y * FILTER_W)  + local_x;
-
- 	// if (x < imagesizex && y < imagesizey) // If inside image
- 	// {
- // Filter kernel (simple box filter)
- 	sumx=0;sumy=0;sumz=0;
- 	for(dy=-kernelsizey;dy<=kernelsizey;dy++)
- 		for(dx=-kernelsizex;dx<=kernelsizex;dx++)
- 		{
-       int mem_access = skip_pixels + (local_y + dy) * FILTER_W + (local_x + dx);
-
-       sumx += local_memory[mem_access*3+0];
-       sumy += local_memory[mem_access*3+1];
-       sumz += local_memory[mem_access*3+2];
- 		}
-
-  __syncthreads();
-  // out[3*(y*imagesizex+x)+0] = local_memory[3*local_mem_address+0];
-  // out[3*(y*imagesizex+x)+1] = local_memory[3*local_mem_address+1];
-  // out[3*(y*imagesizex+x)+2] = local_memory[3*local_mem_address+2];
-
-  out[(y*imagesizex+x)*3+0] = sumx/divby;
-	out[(y*imagesizex+x)*3+1] = sumy/divby;
-	out[(y*imagesizex+x)*3+2] = sumz/divby;
 
+  int divby = (2*kernelsizex+1)*(2*kernelsizey+1); // Works for box filters only!
 
+	if (x < imagesizex && y < imagesizey) // If inside image
+	{
+// Filter kernel (simple box filter)
+	sumx=0;sumy=0;sumz=0;
+	for(dy=-kernelsizey;dy<=kernelsizey;dy++)
+		for(dx=-kernelsizex;dx<=kernelsizex;dx++)
+		{
+			// Use max and min to avoid branching!
+			int yy = min(max(y+dy, 0), imagesizey-1);
+			int xx = min(max(x+dx, 0), imagesizex-1);
+
+			sumx += image[((yy)*imagesizex+(xx))*3+0];
+			sumy += image[((yy)*imagesizex+(xx))*3+1];
+			sumz += image[((yy)*imagesizex+(xx))*3+2];
+		}
+	out[(y*imagesizex+x)*3+0] = sumx/divby;
+	out[(y*imagesizex+x)*3+1] = sumy/divby;
+	out[(y*imagesizex+x)*3+2] = sumz/divby;
+	}
 }
 
 // Global variables for image data
 
 unsigned char *image, *pixels, *dev_bitmap, *dev_input;
 unsigned int imagesizey, imagesizex; // Image size
+
 ////////////////////////////////////////////////////////////////////////////////
 // main computation function
 ////////////////////////////////////////////////////////////////////////////////
@@ -125,52 +82,25 @@ void computeImages(int kernelsizex, int kernelsizey)
 		return;
 	}
 
-
-
-
-  //dim3 gridsize(1, 1);
-
-  dim3 gridsize(imagesizex/maxKernelSizeX, imagesizey/maxKernelSizeY);
-	dim3 blocksize(maxKernelSizeX, maxKernelSizeY); // 256
-
-	/* whole pic, no need to declare several times */
 	pixels = (unsigned char *) malloc(imagesizex*imagesizey*3);
 	cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
-	cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice);
+	cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice );
 	cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
-	filter<<<gridsize,blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
-
-	cudaDeviceSynchronize();
+	dim3 grid(imagesizex,imagesizey);
+	filter<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
+	cudaThreadSynchronize();
 //	Check for errors!
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess)
-	printf("Error: %s\n", cudaGetErrorString(err));
-
-
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("Error: %s\n", cudaGetErrorString(err));
 	cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost );
 	cudaFree( dev_bitmap );
 	cudaFree( dev_input );
-
 }
 
-// unsigned image_size = imagesizex * imagesizey;
-// unsigned num_kernels = floor(image_size / (GRID_SIZE * BLOCK_SIZE));
-// unsigned rows_per_kernel = floor(imagesizey / num_kernels);
-// unsigned remainder_y = imagesizey % num_kernels; // 10
-
-// pixels = (unsigned char *) malloc(imagesizex*imagesizey*3);
-// for(int i = 1; i <= num_kernels; i++)
-// {
-//   cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
-//   cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice);
-//   cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
-//   filter<<<grid,blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, i); // Awful load balance
-// }
-
 // Display images
 void Draw()
 {
-
 // Dump the whole picture onto the screen.
 	glClearColor( 0.0, 0.0, 0.0, 1.0 );
 	glClear( GL_COLOR_BUFFER_BIT );
@@ -195,7 +125,6 @@ void Draw()
 // Main program, inits
 int main( int argc, char** argv)
 {
-
 	glutInit(&argc, argv);
 	glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
 
@@ -213,9 +142,10 @@ int main( int argc, char** argv)
 
 	ResetMilli();
 
-	computeImages(RADIUS, RADIUS);
+	computeImages(2, 2);
+
 // You can save the result to a file like this:
-	writeppm("out.ppm", imagesizey, imagesizex, pixels);
+//	writeppm("out.ppm", imagesizey, imagesizex, pixels);
 
 	glutMainLoop();
 	return 0;