diff --git a/Lab5/filter.cu b/Lab5/filter.cu
index 5c008c2465a9cb3e37e3fc7fb1640d486e3ff206..cbc2fb66f6c6a592149982278008c6a37e11fe76 100755
--- a/Lab5/filter.cu
+++ b/Lab5/filter.cu
@@ -55,8 +55,8 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
 
   while(local_index < LOCAL_MEM_SIZE)
   {
-    int global_x = max(0, blockOffsetX - kernelsizex + local_index % FILTER_W);
-	  int global_y = max(0, blockOffsetY - kernelsizey + local_index / FILTER_W);
+    int global_x = min(max(0, blockOffsetX - kernelsizex + local_index % FILTER_W), imagesizex - 1);
+	  int global_y = min(max(0, blockOffsetY - kernelsizey + local_index / FILTER_W), imagesizey - 1);
     unsigned global_index = global_y * imagesizex + global_x;
 
     local_memory[3*local_index+0] = image[3*global_index+0];
@@ -65,6 +65,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
 
     local_index += threadsPerBlock;
   }
+  __syncthreads();
 
   unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
   unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -97,7 +98,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
   }
 }
 
-__global__ void filter_separable(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const int stride)
+__global__ void filter_separable(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const bool horizontal)
 {
   __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3];
   // map from blockIdx to pixel position
@@ -106,14 +107,28 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
   int blockOffsetX = blockIdx.x * blockDim.x;
   int blockOffsetY = blockIdx.y * blockDim.y;
 
-  int local_index = threadIdx.x;
+  /* Threadidx.x or threadIdx.y will always be zero depending on direction */
+  int local_index = threadIdx.x + threadIdx.y;
+
 
   int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize);
 
   while(local_index < FILTER_SIZE)
   {
-    int global_x = max(0, blockOffsetX - kernelsize + local_index);
-    int global_y = max(0, blockOffsetY);
+    int global_x = 0;
+    int global_y = 0;
+
+    if(horizontal)
+    {
+      global_x = min(max(0, blockOffsetX - kernelsize + local_index), imagesizex - 1);
+      global_y = blockOffsetY;
+    }
+    else
+    {
+      global_x = blockOffsetX;
+      global_y = min(max(0, blockOffsetY - kernelsize + local_index), imagesizey - 1);//max(0, blockOffsetY);
+    }
+
     unsigned global_index = global_y * imagesizex + global_x;
 
     local_memory[3*local_index+0] = image[3*global_index+0];
@@ -122,10 +137,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
 
     local_index += threadsPerBlock;
   }
-
+  __syncthreads();
   unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
   unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
 
+  /* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */
+  local_index = threadIdx.x + threadIdx.y;
   if (x < imagesizex && y < imagesizey) // If inside image
   {
     int d;
@@ -135,21 +152,90 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
    	sumx=0;sumy=0;sumz=0;
    	for(d=-kernelsize;d<=kernelsize;d++)
     {
-			unsigned dx = kernelsize + threadIdx.x + d;
+			unsigned dx = kernelsize + local_index + d;
 
       sumx += local_memory[dx*3+0];
       sumy += local_memory[dx*3+1];
       sumz += local_memory[dx*3+2];
  		}
-    out[(y*imagesizex+x)*3+0] = local_memory[kernelsize + threadIdx.x + 0];
-    out[(y*imagesizex+x)*3+1] = local_memory[kernelsize + threadIdx.x + 1];
-    out[(y*imagesizex+x)*3+2] = local_memory[kernelsize + threadIdx.x + 2];
-    // out[(y*imagesizex+x)*3+0] = sumx/divby;
-  	// out[(y*imagesizex+x)*3+1] = sumy/divby;
-  	// out[(y*imagesizex+x)*3+2] = sumz/divby;
+
+
+    out[(y*imagesizex+x)*3+0] = sumx/divby;
+  	out[(y*imagesizex+x)*3+1] = sumy/divby;
+  	out[(y*imagesizex+x)*3+2] = sumz/divby;
   }
 }
 
+__global__ void filter_gaussian(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const bool horizontal)
+{
+  __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3];
+  // map from blockIdx to pixel position
+  unsigned threadsPerBlock  = blockDim.x;
+
+  int blockOffsetX = blockIdx.x * blockDim.x;
+  int blockOffsetY = blockIdx.y * blockDim.y;
+
+  /* Threadidx.x or threadIdx.y will always be zero depending on direction */
+  int local_index = threadIdx.x + threadIdx.y;
+
+
+  int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize);
+
+  while(local_index < FILTER_SIZE)
+  {
+    int global_x = 0;
+    int global_y = 0;
+
+    if(horizontal)
+    {
+      global_x = min(max(0, blockOffsetX - kernelsize + local_index), imagesizex - 1);
+      global_y = blockOffsetY;
+    }
+    else
+    {
+      global_x = blockOffsetX;
+      global_y = min(max(0, blockOffsetY - kernelsize + local_index), imagesizey - 1);//max(0, blockOffsetY);
+    }
+
+    unsigned global_index = global_y * imagesizex + global_x;
+
+    local_memory[3*local_index+0] = image[3*global_index+0];
+    local_memory[3*local_index+1] = image[3*global_index+1];
+    local_memory[3*local_index+2] = image[3*global_index+2];
+
+    local_index += threadsPerBlock;
+  }
+  __syncthreads();
+
+  unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  /* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */
+  local_index = threadIdx.x + threadIdx.y;
+  if (x < imagesizex && y < imagesizey) // If inside image
+  {
+    int d;
+    unsigned int sumx, sumy, sumz;
+    float divby[5] = {1.0/16.0, 4.0/16.0, 6.0/16.0, 4.0/16.0, 1.0/16.0};
+   	sumx=0;sumy=0;sumz=0;
+
+   	for(d=-kernelsize;d<=kernelsize;d++)
+    {
+			unsigned dx = kernelsize + local_index + d;
+
+      sumx += local_memory[dx*3+0] * divby[kernelsize+d];
+      sumy += local_memory[dx*3+1] * divby[kernelsize+d];
+      sumz += local_memory[dx*3+2] * divby[kernelsize+d];
+    //  printf("Divby factor %f kernelsize+d %d\n", divby[kernelsize+d], kernelsize+d);
+ 		}
+
+    out[(y*imagesizex+x)*3+0] = sumx;// / old_divby;
+  	out[(y*imagesizex+x)*3+1] = sumy;// / old_divby;
+  	out[(y*imagesizex+x)*3+2] = sumz;/// old_divby;
+  }
+
+}
+
 __global__ void filter_old(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
 {
   int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -180,9 +266,98 @@ __global__ void filter_old(unsigned char *image, unsigned char *out, const unsig
   out[(y*imagesizex+x)*3+2] = sumz/divby;
   }
 }
+
+__global__ void filter_median(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
+{
+
+  __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_PX * 3];
+  // map from blockIdx to pixel position
+  unsigned threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y;
+  unsigned threadsPerBlock  = blockDim.x * blockDim.y;
+
+  int blockOffsetX = blockIdx.x * blockDim.x;
+  int blockOffsetY = blockIdx.y * blockDim.y;
+
+  int local_index = threadNumInBlock;
+
+  int FILTER_W = (kernelsizex * 2 + 1) + (2 * kernelsizex);
+  int FILTER_H = (kernelsizey * 2 + 1) + (2 * kernelsizey);
+
+  unsigned LOCAL_MEM_SIZE = FILTER_W * FILTER_H;
+
+  while(local_index < LOCAL_MEM_SIZE)
+  {
+    int global_x = min(max(0, blockOffsetX - kernelsizex + local_index % FILTER_W), imagesizex - 1);
+	  int global_y = min(max(0, blockOffsetY - kernelsizey + local_index / FILTER_W), imagesizey - 1);
+    unsigned global_index = global_y * imagesizex + global_x;
+
+    local_memory[3*local_index+0] = image[3*global_index+0];
+    local_memory[3*local_index+1] = image[3*global_index+1];
+    local_memory[3*local_index+2] = image[3*global_index+2];
+
+    local_index += threadsPerBlock;
+  }
+  __syncthreads();
+  unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (x < imagesizex && y < imagesizey) // If inside image
+  {
+    int dy, dx;
+    unsigned int sumx[256] = {0}, sumy[256] = {0}, sumz[256] = {0};
+
+    unsigned local_x = threadIdx.x;
+    unsigned local_y = threadIdx.y;
+
+   	for(dy=-kernelsizey;dy<=kernelsizey;dy++)
+   		for(dx=-kernelsizex;dx<=kernelsizex;dx++)
+   		{
+        unsigned yy = kernelsizey + local_y + dy;
+  			unsigned xx = kernelsizex + local_x + dx;
+        unsigned mem_access = yy * FILTER_W + xx;
+
+        ++sumx[local_memory[mem_access*3+0]];
+        ++sumy[local_memory[mem_access*3+1]];
+        ++sumz[local_memory[mem_access*3+2]];
+   		}
+
+    unsigned num_x = 0;
+    unsigned num_y = 0;
+    unsigned num_z = 0;
+    int index_x = 0;
+    int index_y = 0;
+    int index_z = 0;
+
+    unsigned half_filter_size = ((kernelsizex*2+1) * (kernelsizey*2+1))/2;
+    while(num_x < half_filter_size)
+    {
+      // printf("Thread: %d Num_x %d index_x %d\n",y* blockDim.x + x, num_x, index_x);
+      num_x += sumx[index_x++];
+    }
+    while(num_y < half_filter_size)
+    {
+      num_y += sumy[index_y++];
+    }
+    while(num_z < half_filter_size)
+    {
+      num_z += sumz[index_z++];
+    }
+
+    // while(num_x < half_filter_size && num_y < half_filter_size && num_z < half_filter_size)
+    // {
+    //   num_x += sumx[index_x++];
+    //   num_y += sumy[index_y++];
+    //   num_z += sumz[index_z++];
+    // }
+
+    out[(y*imagesizex+x)*3+0] = index_x;
+  	out[(y*imagesizex+x)*3+1] = index_y;
+  	out[(y*imagesizex+x)*3+2] = index_z;
+  }
+}
 // Global variables for image data
 
-unsigned char *image, *pixels, *dev_bitmap, *dev_input;
+unsigned char *image, *pixels, *dev_bitmap, *dev_input, *dev_bitmap_tmp;
 unsigned int imagesizey, imagesizex; // Image size
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -199,41 +374,75 @@ void computeImages(int kernelsizex, int kernelsizey)
 	pixels = (unsigned char *) malloc(imagesizex*imagesizey*3);
 	cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
 	cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice );
+
+  cudaMalloc( (void**)&dev_bitmap_tmp, imagesizex*imagesizey*3);
 	cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
 
-	dim3 grid(imagesizex,imagesizey);
-  ResetMilli();
-	filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
-  cudaThreadSynchronize();
-  printf("[Un-Optimized] %f\n", GetSeconds());
 
-  dim3 gridsize(imagesizex/(kernelsizex*2+1), imagesizey/(kernelsizey*2+1));
+  /* Given filter */
+	// dim3 grid(imagesizex,imagesizey);
+  // ResetMilli();
+	// filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
+  // cudaThreadSynchronize();
+  // printf("[Un-Optimized] %f\n", GetSeconds());
+
+  /* Our first block filter */
+  dim3 gridsize(imagesizex/(kernelsizex*2+1)+1, imagesizey/(kernelsizey*2+1)+1);
   dim3 blocksize(kernelsizex*2+1, kernelsizey*2+1);
   ResetMilli();
   filter_1<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance
   cudaThreadSynchronize();
   printf("[Optimized]    %f\n", GetSeconds());
 
-  dim3 gridsize_sep(imagesizex/(kernelsizex*2+1), imagesizey);
-  dim3 blocksize_sep(kernelsizex*2+1);
-  // unsigned char *tmp = dev_input;
+  /* Separable filter */
+  dim3 gridsize_sep_horizontal(imagesizex/(kernelsizex*2+1) + 1, imagesizey);
+  dim3 gridsize_sep_vertical(imagesizex, imagesizey/(kernelsizex*2+1) + 1);
+
+  dim3 threads_horizontal(kernelsizex*2+1);
+  dim3 threads_vertical(1, kernelsizex*2+1);
+  // // unsigned char *tmp = dev_input;
   ResetMilli();
-  /* X-direction */
-  filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1);
+  // /* X-direction */
+  filter_separable<<<gridsize_sep_horizontal, threads_horizontal>>>(dev_input, dev_bitmap_tmp, imagesizex, imagesizey, kernelsizex, true);
   cudaThreadSynchronize();
   /* Y-direction */
-  // dim3 gridsize_sep(imagesizex, imagesizey/(kernelsizex*2+1));
-  // filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1);
+  filter_separable<<<gridsize_sep_vertical,   threads_vertical>>>(dev_bitmap_tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, false);
   cudaThreadSynchronize();
   printf("[Separable]    %f\n", GetSeconds());
 
+
+  /* Gaussian filter i.e. 2 x 5*1 filters run horizontal and vertical with gaussian weights */
+  if(kernelsizex == 2)
+  {
+    ResetMilli();
+    /* X-direction */
+    filter_gaussian<<<gridsize_sep_horizontal, threads_horizontal>>>(dev_input, dev_bitmap_tmp, imagesizex, imagesizey, kernelsizex, true);
+    cudaThreadSynchronize();
+    /* Y-direction */
+    filter_gaussian<<<gridsize_sep_vertical,  threads_vertical>>>(dev_bitmap_tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, false);
+    cudaThreadSynchronize();
+
+    printf("[Gaussian]    %f\n", GetSeconds());
+  }
+
+
+
+  /* Median filter */
+  ResetMilli();
+  filter_median<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance
+  cudaThreadSynchronize();
+  printf("[Median]    %f\n", GetSeconds());
+
 //	Check for errors!
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess)
       printf("Error: %s\n", cudaGetErrorString(err));
 	cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost );
+
+  cudaFree( dev_bitmap_tmp );
 	cudaFree( dev_bitmap );
 	cudaFree( dev_input );
+
 }
 
 // Display images
@@ -279,7 +488,7 @@ int main( int argc, char** argv)
 	glutDisplayFunc(Draw);
 
 
-  for(int i = 2; i < 10; i++)
+  for(int i = 2; i < 5; i++)
   {
      printf("Frame is %i x %i\n", i*2+1, i*2+1);
 	   computeImages(i, i);