diff --git a/Lab5/filter.cu b/Lab5/filter.cu index 5c008c2465a9cb3e37e3fc7fb1640d486e3ff206..cbc2fb66f6c6a592149982278008c6a37e11fe76 100755 --- a/Lab5/filter.cu +++ b/Lab5/filter.cu @@ -55,8 +55,8 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne while(local_index < LOCAL_MEM_SIZE) { - int global_x = max(0, blockOffsetX - kernelsizex + local_index % FILTER_W); - int global_y = max(0, blockOffsetY - kernelsizey + local_index / FILTER_W); + int global_x = min(max(0, blockOffsetX - kernelsizex + local_index % FILTER_W), imagesizex - 1); + int global_y = min(max(0, blockOffsetY - kernelsizey + local_index / FILTER_W), imagesizey - 1); unsigned global_index = global_y * imagesizex + global_x; local_memory[3*local_index+0] = image[3*global_index+0]; @@ -65,6 +65,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne local_index += threadsPerBlock; } + __syncthreads(); unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; @@ -97,7 +98,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne } } -__global__ void filter_separable(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const int stride) +__global__ void filter_separable(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const bool horizontal) { __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3]; // map from blockIdx to pixel position @@ -106,14 +107,28 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const int blockOffsetX = blockIdx.x * blockDim.x; int blockOffsetY = blockIdx.y * blockDim.y; - int local_index = threadIdx.x; + /* Threadidx.x or threadIdx.y will always be zero depending on direction */ + int local_index = threadIdx.x + threadIdx.y; + int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize); while(local_index < FILTER_SIZE) { - int global_x = max(0, blockOffsetX - kernelsize + local_index); - int global_y = max(0, blockOffsetY); + int global_x = 0; + int global_y = 0; + + if(horizontal) + { + global_x = min(max(0, blockOffsetX - kernelsize + local_index), imagesizex - 1); + global_y = blockOffsetY; + } + else + { + global_x = blockOffsetX; + global_y = min(max(0, blockOffsetY - kernelsize + local_index), imagesizey - 1);//max(0, blockOffsetY); + } + unsigned global_index = global_y * imagesizex + global_x; local_memory[3*local_index+0] = image[3*global_index+0]; @@ -122,10 +137,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const local_index += threadsPerBlock; } - + __syncthreads(); unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned y = blockIdx.y * blockDim.y + threadIdx.y; + /* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */ + local_index = threadIdx.x + threadIdx.y; if (x < imagesizex && y < imagesizey) // If inside image { int d; @@ -135,21 +152,90 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const sumx=0;sumy=0;sumz=0; for(d=-kernelsize;d<=kernelsize;d++) { - unsigned dx = kernelsize + threadIdx.x + d; + unsigned dx = kernelsize + local_index + d; sumx += local_memory[dx*3+0]; sumy += local_memory[dx*3+1]; sumz += local_memory[dx*3+2]; } - out[(y*imagesizex+x)*3+0] = local_memory[kernelsize + threadIdx.x + 0]; - out[(y*imagesizex+x)*3+1] = local_memory[kernelsize + threadIdx.x + 1]; - out[(y*imagesizex+x)*3+2] = local_memory[kernelsize + threadIdx.x + 2]; - // out[(y*imagesizex+x)*3+0] = sumx/divby; - // out[(y*imagesizex+x)*3+1] = sumy/divby; - // out[(y*imagesizex+x)*3+2] = sumz/divby; + + + out[(y*imagesizex+x)*3+0] = sumx/divby; + out[(y*imagesizex+x)*3+1] = sumy/divby; + out[(y*imagesizex+x)*3+2] = sumz/divby; } } +__global__ void filter_gaussian(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const bool horizontal) +{ + __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3]; + // map from blockIdx to pixel position + unsigned threadsPerBlock = blockDim.x; + + int blockOffsetX = blockIdx.x * blockDim.x; + int blockOffsetY = blockIdx.y * blockDim.y; + + /* Threadidx.x or threadIdx.y will always be zero depending on direction */ + int local_index = threadIdx.x + threadIdx.y; + + + int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize); + + while(local_index < FILTER_SIZE) + { + int global_x = 0; + int global_y = 0; + + if(horizontal) + { + global_x = min(max(0, blockOffsetX - kernelsize + local_index), imagesizex - 1); + global_y = blockOffsetY; + } + else + { + global_x = blockOffsetX; + global_y = min(max(0, blockOffsetY - kernelsize + local_index), imagesizey - 1);//max(0, blockOffsetY); + } + + unsigned global_index = global_y * imagesizex + global_x; + + local_memory[3*local_index+0] = image[3*global_index+0]; + local_memory[3*local_index+1] = image[3*global_index+1]; + local_memory[3*local_index+2] = image[3*global_index+2]; + + local_index += threadsPerBlock; + } + __syncthreads(); + + unsigned x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned y = blockIdx.y * blockDim.y + threadIdx.y; + + /* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */ + local_index = threadIdx.x + threadIdx.y; + if (x < imagesizex && y < imagesizey) // If inside image + { + int d; + unsigned int sumx, sumy, sumz; + float divby[5] = {1.0/16.0, 4.0/16.0, 6.0/16.0, 4.0/16.0, 1.0/16.0}; + sumx=0;sumy=0;sumz=0; + + for(d=-kernelsize;d<=kernelsize;d++) + { + unsigned dx = kernelsize + local_index + d; + + sumx += local_memory[dx*3+0] * divby[kernelsize+d]; + sumy += local_memory[dx*3+1] * divby[kernelsize+d]; + sumz += local_memory[dx*3+2] * divby[kernelsize+d]; + // printf("Divby factor %f kernelsize+d %d\n", divby[kernelsize+d], kernelsize+d); + } + + out[(y*imagesizex+x)*3+0] = sumx;// / old_divby; + out[(y*imagesizex+x)*3+1] = sumy;// / old_divby; + out[(y*imagesizex+x)*3+2] = sumz;/// old_divby; + } + +} + __global__ void filter_old(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) { int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -180,9 +266,98 @@ __global__ void filter_old(unsigned char *image, unsigned char *out, const unsig out[(y*imagesizex+x)*3+2] = sumz/divby; } } + +__global__ void filter_median(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) +{ + + __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_PX * 3]; + // map from blockIdx to pixel position + unsigned threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y; + unsigned threadsPerBlock = blockDim.x * blockDim.y; + + int blockOffsetX = blockIdx.x * blockDim.x; + int blockOffsetY = blockIdx.y * blockDim.y; + + int local_index = threadNumInBlock; + + int FILTER_W = (kernelsizex * 2 + 1) + (2 * kernelsizex); + int FILTER_H = (kernelsizey * 2 + 1) + (2 * kernelsizey); + + unsigned LOCAL_MEM_SIZE = FILTER_W * FILTER_H; + + while(local_index < LOCAL_MEM_SIZE) + { + int global_x = min(max(0, blockOffsetX - kernelsizex + local_index % FILTER_W), imagesizex - 1); + int global_y = min(max(0, blockOffsetY - kernelsizey + local_index / FILTER_W), imagesizey - 1); + unsigned global_index = global_y * imagesizex + global_x; + + local_memory[3*local_index+0] = image[3*global_index+0]; + local_memory[3*local_index+1] = image[3*global_index+1]; + local_memory[3*local_index+2] = image[3*global_index+2]; + + local_index += threadsPerBlock; + } + __syncthreads(); + unsigned x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x < imagesizex && y < imagesizey) // If inside image + { + int dy, dx; + unsigned int sumx[256] = {0}, sumy[256] = {0}, sumz[256] = {0}; + + unsigned local_x = threadIdx.x; + unsigned local_y = threadIdx.y; + + for(dy=-kernelsizey;dy<=kernelsizey;dy++) + for(dx=-kernelsizex;dx<=kernelsizex;dx++) + { + unsigned yy = kernelsizey + local_y + dy; + unsigned xx = kernelsizex + local_x + dx; + unsigned mem_access = yy * FILTER_W + xx; + + ++sumx[local_memory[mem_access*3+0]]; + ++sumy[local_memory[mem_access*3+1]]; + ++sumz[local_memory[mem_access*3+2]]; + } + + unsigned num_x = 0; + unsigned num_y = 0; + unsigned num_z = 0; + int index_x = 0; + int index_y = 0; + int index_z = 0; + + unsigned half_filter_size = ((kernelsizex*2+1) * (kernelsizey*2+1))/2; + while(num_x < half_filter_size) + { + // printf("Thread: %d Num_x %d index_x %d\n",y* blockDim.x + x, num_x, index_x); + num_x += sumx[index_x++]; + } + while(num_y < half_filter_size) + { + num_y += sumy[index_y++]; + } + while(num_z < half_filter_size) + { + num_z += sumz[index_z++]; + } + + // while(num_x < half_filter_size && num_y < half_filter_size && num_z < half_filter_size) + // { + // num_x += sumx[index_x++]; + // num_y += sumy[index_y++]; + // num_z += sumz[index_z++]; + // } + + out[(y*imagesizex+x)*3+0] = index_x; + out[(y*imagesizex+x)*3+1] = index_y; + out[(y*imagesizex+x)*3+2] = index_z; + } +} // Global variables for image data -unsigned char *image, *pixels, *dev_bitmap, *dev_input; +unsigned char *image, *pixels, *dev_bitmap, *dev_input, *dev_bitmap_tmp; unsigned int imagesizey, imagesizex; // Image size //////////////////////////////////////////////////////////////////////////////// @@ -199,41 +374,75 @@ void computeImages(int kernelsizex, int kernelsizey) pixels = (unsigned char *) malloc(imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3); cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice ); + + cudaMalloc( (void**)&dev_bitmap_tmp, imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3); - dim3 grid(imagesizex,imagesizey); - ResetMilli(); - filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance - cudaThreadSynchronize(); - printf("[Un-Optimized] %f\n", GetSeconds()); - dim3 gridsize(imagesizex/(kernelsizex*2+1), imagesizey/(kernelsizey*2+1)); + /* Given filter */ + // dim3 grid(imagesizex,imagesizey); + // ResetMilli(); + // filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance + // cudaThreadSynchronize(); + // printf("[Un-Optimized] %f\n", GetSeconds()); + + /* Our first block filter */ + dim3 gridsize(imagesizex/(kernelsizex*2+1)+1, imagesizey/(kernelsizey*2+1)+1); dim3 blocksize(kernelsizex*2+1, kernelsizey*2+1); ResetMilli(); filter_1<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance cudaThreadSynchronize(); printf("[Optimized] %f\n", GetSeconds()); - dim3 gridsize_sep(imagesizex/(kernelsizex*2+1), imagesizey); - dim3 blocksize_sep(kernelsizex*2+1); - // unsigned char *tmp = dev_input; + /* Separable filter */ + dim3 gridsize_sep_horizontal(imagesizex/(kernelsizex*2+1) + 1, imagesizey); + dim3 gridsize_sep_vertical(imagesizex, imagesizey/(kernelsizex*2+1) + 1); + + dim3 threads_horizontal(kernelsizex*2+1); + dim3 threads_vertical(1, kernelsizex*2+1); + // // unsigned char *tmp = dev_input; ResetMilli(); - /* X-direction */ - filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1); + // /* X-direction */ + filter_separable<<<gridsize_sep_horizontal, threads_horizontal>>>(dev_input, dev_bitmap_tmp, imagesizex, imagesizey, kernelsizex, true); cudaThreadSynchronize(); /* Y-direction */ - // dim3 gridsize_sep(imagesizex, imagesizey/(kernelsizex*2+1)); - // filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1); + filter_separable<<<gridsize_sep_vertical, threads_vertical>>>(dev_bitmap_tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, false); cudaThreadSynchronize(); printf("[Separable] %f\n", GetSeconds()); + + /* Gaussian filter i.e. 2 x 5*1 filters run horizontal and vertical with gaussian weights */ + if(kernelsizex == 2) + { + ResetMilli(); + /* X-direction */ + filter_gaussian<<<gridsize_sep_horizontal, threads_horizontal>>>(dev_input, dev_bitmap_tmp, imagesizex, imagesizey, kernelsizex, true); + cudaThreadSynchronize(); + /* Y-direction */ + filter_gaussian<<<gridsize_sep_vertical, threads_vertical>>>(dev_bitmap_tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, false); + cudaThreadSynchronize(); + + printf("[Gaussian] %f\n", GetSeconds()); + } + + + + /* Median filter */ + ResetMilli(); + filter_median<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance + cudaThreadSynchronize(); + printf("[Median] %f\n", GetSeconds()); + // Check for errors! cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(err)); cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost ); + + cudaFree( dev_bitmap_tmp ); cudaFree( dev_bitmap ); cudaFree( dev_input ); + } // Display images @@ -279,7 +488,7 @@ int main( int argc, char** argv) glutDisplayFunc(Draw); - for(int i = 2; i < 10; i++) + for(int i = 2; i < 5; i++) { printf("Frame is %i x %i\n", i*2+1, i*2+1); computeImages(i, i);