diff --git a/Lab5/filter.cu b/Lab5/filter.cu index 315145de762fb4a221c2e2be4d37836be2561024..115dfb28e3b10fd6869007a37d8bbfa7c4e49310 100755 --- a/Lab5/filter.cu +++ b/Lab5/filter.cu @@ -20,7 +20,6 @@ #include <stdlib.h> #include <string.h> #include <sys/time.h> - #ifdef __APPLE__ #include <GLUT/glut.h> #include <OpenGL/gl.h> @@ -31,89 +30,47 @@ #include "milli.h" // Use these for setting shared memory size. -#define RADIUS 3 - -#define maxKernelSizeX (RADIUS*2+1) -#define maxKernelSizeY (RADIUS*2+1) +#define maxKernelSizeX 10 +#define maxKernelSizeY 10 -#define FILTER_W (maxKernelSizeX+RADIUS*2) -#define FILTER_H (maxKernelSizeY+RADIUS*2) - -#define LOCAL_MEM_SIZE_PX (FILTER_W * FILTER_H) __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) { - __shared__ unsigned char local_memory[LOCAL_MEM_SIZE_PX * 3]; - // map from blockIdx to pixel position - unsigned blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y; - unsigned threadNumInBlock = threadIdx.x + blockDim.x *threadIdx.y; - unsigned threadsPerBlock = blockDim.x * blockDim.y; - - - - int blockOffsetX = blockIdx.x * blockDim.x; - int blockOffsetY = blockIdx.y * blockDim.y; - - int local_index = threadNumInBlock; - while(local_index < LOCAL_MEM_SIZE_PX) - { - /* Repeat three times for colored pixel */ - int global_x = max(0, blockOffsetX - RADIUS + local_index % FILTER_W); - int global_y = max(0, blockOffsetY - RADIUS + local_index / FILTER_W); - int global_index = global_y * imagesizex + global_x; - - local_memory[3*local_index+0] = image[3*global_index+0]; - local_memory[3*local_index+1] = image[3*global_index+1]; - local_memory[3*local_index+2] = image[3*global_index+2]; - - local_index += threadsPerBlock; - } + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; int dy, dx; unsigned int sumx, sumy, sumz; - /* Old factor */ - int divby = (2*kernelsizex*3+1)*(2*kernelsizey+1); // Works for box filters only! - int skip_pixels = FILTER_W * RADIUS + RADIUS; - - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - - int local_x = threadIdx.x; - int local_y = threadIdx.y; - - int local_mem_address = skip_pixels + (local_y * FILTER_W) + local_x; - - // if (x < imagesizex && y < imagesizey) // If inside image - // { - // Filter kernel (simple box filter) - sumx=0;sumy=0;sumz=0; - for(dy=-kernelsizey;dy<=kernelsizey;dy++) - for(dx=-kernelsizex;dx<=kernelsizex;dx++) - { - int mem_access = skip_pixels + (local_y + dy) * FILTER_W + (local_x + dx); - - sumx += local_memory[mem_access*3+0]; - sumy += local_memory[mem_access*3+1]; - sumz += local_memory[mem_access*3+2]; - } - - __syncthreads(); - // out[3*(y*imagesizex+x)+0] = local_memory[3*local_mem_address+0]; - // out[3*(y*imagesizex+x)+1] = local_memory[3*local_mem_address+1]; - // out[3*(y*imagesizex+x)+2] = local_memory[3*local_mem_address+2]; - - out[(y*imagesizex+x)*3+0] = sumx/divby; - out[(y*imagesizex+x)*3+1] = sumy/divby; - out[(y*imagesizex+x)*3+2] = sumz/divby; + int divby = (2*kernelsizex+1)*(2*kernelsizey+1); // Works for box filters only! + if (x < imagesizex && y < imagesizey) // If inside image + { +// Filter kernel (simple box filter) + sumx=0;sumy=0;sumz=0; + for(dy=-kernelsizey;dy<=kernelsizey;dy++) + for(dx=-kernelsizex;dx<=kernelsizex;dx++) + { + // Use max and min to avoid branching! + int yy = min(max(y+dy, 0), imagesizey-1); + int xx = min(max(x+dx, 0), imagesizex-1); + + sumx += image[((yy)*imagesizex+(xx))*3+0]; + sumy += image[((yy)*imagesizex+(xx))*3+1]; + sumz += image[((yy)*imagesizex+(xx))*3+2]; + } + out[(y*imagesizex+x)*3+0] = sumx/divby; + out[(y*imagesizex+x)*3+1] = sumy/divby; + out[(y*imagesizex+x)*3+2] = sumz/divby; + } } // Global variables for image data unsigned char *image, *pixels, *dev_bitmap, *dev_input; unsigned int imagesizey, imagesizex; // Image size + //////////////////////////////////////////////////////////////////////////////// // main computation function //////////////////////////////////////////////////////////////////////////////// @@ -125,52 +82,25 @@ void computeImages(int kernelsizex, int kernelsizey) return; } - - - - //dim3 gridsize(1, 1); - - dim3 gridsize(imagesizex/maxKernelSizeX, imagesizey/maxKernelSizeY); - dim3 blocksize(maxKernelSizeX, maxKernelSizeY); // 256 - - /* whole pic, no need to declare several times */ pixels = (unsigned char *) malloc(imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3); - cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice); + cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice ); cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3); - filter<<<gridsize,blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance - - cudaDeviceSynchronize(); + dim3 grid(imagesizex,imagesizey); + filter<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance + cudaThreadSynchronize(); // Check for errors! - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - printf("Error: %s\n", cudaGetErrorString(err)); - - + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + printf("Error: %s\n", cudaGetErrorString(err)); cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost ); cudaFree( dev_bitmap ); cudaFree( dev_input ); - } -// unsigned image_size = imagesizex * imagesizey; -// unsigned num_kernels = floor(image_size / (GRID_SIZE * BLOCK_SIZE)); -// unsigned rows_per_kernel = floor(imagesizey / num_kernels); -// unsigned remainder_y = imagesizey % num_kernels; // 10 - -// pixels = (unsigned char *) malloc(imagesizex*imagesizey*3); -// for(int i = 1; i <= num_kernels; i++) -// { -// cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3); -// cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice); -// cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3); -// filter<<<grid,blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, i); // Awful load balance -// } - // Display images void Draw() { - // Dump the whole picture onto the screen. glClearColor( 0.0, 0.0, 0.0, 1.0 ); glClear( GL_COLOR_BUFFER_BIT ); @@ -195,7 +125,6 @@ void Draw() // Main program, inits int main( int argc, char** argv) { - glutInit(&argc, argv); glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA ); @@ -213,9 +142,10 @@ int main( int argc, char** argv) ResetMilli(); - computeImages(RADIUS, RADIUS); + computeImages(2, 2); + // You can save the result to a file like this: - writeppm("out.ppm", imagesizey, imagesizex, pixels); +// writeppm("out.ppm", imagesizey, imagesizex, pixels); glutMainLoop(); return 0;