diff --git a/Lab5/filter.cu b/Lab5/filter.cu index cdc962b24660195ccf620ec43ed350cede11c748..315145de762fb4a221c2e2be4d37836be2561024 100755 --- a/Lab5/filter.cu +++ b/Lab5/filter.cu @@ -31,19 +31,15 @@ #include "milli.h" // Use these for setting shared memory size. -#define RADIUS 12 -#define BLOCK_H 32 -#define BLOCK_W 32 -#define FILTER_W (RADIUS*2+BLOCK_W) // 3 pixels each -#define FILTER_H (RADIUS*2+BLOCK_H) -#define maxKernelSizeX BLOCK_W -#define maxKernelSizeY BLOCK_H - -// Number of pixels in local memory -#define LOCAL_MEM_SIZE_PX (FILTER_W * FILTER_H) +#define RADIUS 3 + +#define maxKernelSizeX (RADIUS*2+1) +#define maxKernelSizeY (RADIUS*2+1) -// #define BLOCK_SIZE 16 // total number of threads per block = 256 -// #define GRID_SIZE 8 // total number of blocks per grid = 256 +#define FILTER_W (maxKernelSizeX+RADIUS*2) +#define FILTER_H (maxKernelSizeY+RADIUS*2) + +#define LOCAL_MEM_SIZE_PX (FILTER_W * FILTER_H) __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) { @@ -54,8 +50,7 @@ __global__ void filter(unsigned char *image, unsigned char *out, const unsigned unsigned threadNumInBlock = threadIdx.x + blockDim.x *threadIdx.y; unsigned threadsPerBlock = blockDim.x * blockDim.y; - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; + int blockOffsetX = blockIdx.x * blockDim.x; int blockOffsetY = blockIdx.y * blockDim.y; @@ -75,65 +70,43 @@ __global__ void filter(unsigned char *image, unsigned char *out, const unsigned local_index += threadsPerBlock; } - int dy, dx; + int dy, dx; unsigned int sumx, sumy, sumz; /* Old factor */ -// int divby = (2*kernelsizex*3+1)*(2*kernelsizey+1); // Works for box filters only! -// if (x < imagesizex && y < imagesizey) // If inside image -// { -// // Filter kernel (simple box filter) -// sumx=0;sumy=0;sumz=0; -// for(dy=-kernelsizey;dy<=kernelsizey;dy++) -// for(dx=-kernelsizex;dx<=kernelsizex;dx++) -// { -// // Use max and min to avoid branching! -// int yy = min(max(y+dy, 0), BLOCK_H - 1); -// int xx = min(max(x+dx, 0), BLOCK_W - 1); -// -// // printf("This is the index value: %i for thread %i\n", ((yy)*imagesizex+(xx))*3+0, threadid); -// sumx += local_memory[((yy)*(BLOCK_W+RADIUS)+(xx))*3+0]; -// sumy += local_memory[((yy)*(BLOCK_W+RADIUS)+(xx))*3+1]; -// sumz += local_memory[((yy)*(BLOCK_W+RADIUS)+(xx))*3+2]; -// // printf("This is the sumx: %i sumy: %i sumz: %i\n", sumx, sumy, sumz); -// } + int divby = (2*kernelsizex*3+1)*(2*kernelsizey+1); // Works for box filters only! + int skip_pixels = FILTER_W * RADIUS + RADIUS; + + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + int local_x = threadIdx.x; + int local_y = threadIdx.y; + + int local_mem_address = skip_pixels + (local_y * FILTER_W) + local_x; + + // if (x < imagesizex && y < imagesizey) // If inside image + // { + // Filter kernel (simple box filter) + sumx=0;sumy=0;sumz=0; + for(dy=-kernelsizey;dy<=kernelsizey;dy++) + for(dx=-kernelsizex;dx<=kernelsizex;dx++) + { + int mem_access = skip_pixels + (local_y + dy) * FILTER_W + (local_x + dx); + + sumx += local_memory[mem_access*3+0]; + sumy += local_memory[mem_access*3+1]; + sumz += local_memory[mem_access*3+2]; + } __syncthreads(); + // out[3*(y*imagesizex+x)+0] = local_memory[3*local_mem_address+0]; + // out[3*(y*imagesizex+x)+1] = local_memory[3*local_mem_address+1]; + // out[3*(y*imagesizex+x)+2] = local_memory[3*local_mem_address+2]; - for(dy=-kernelsizey;dy<=kernelsizey;dy++) - for(dx=-kernelsizex;dx<=kernelsizex;dx++) - { - int yy = min(max(y+dy, 0), BLOCK_H - 1); - int xx = min(max(x+dx, 0), BLOCK_W - 1); - - // out[y*imagesizex+(x*3+0)] = local_memory[yy*(BLOCK_W+RADIUS)+(xx*3+0)]; - // out[y*imagesizex+(x*3+1)] = local_memory[yy*(BLOCK_W+RADIUS)+(xx*3+1)]; - // out[y*imagesizex+(x*3+2)] = local_memory[yy*(BLOCK_W+RADIUS)+(xx*3+2)]; - } - - int local_x = threadIdx.x; - int local_y = threadIdx.y; - // if((local_x + local_y) == 0) - // printf("This is the adress we're trying to reach in local memory: %i\n", FILTER_W*RADIUS+local_y*FILTER_W+3*(local_x+RADIUS)+0); - /* Escape top + escape Y + escape start x */ - int local_mem_address = FILTER_W * RADIUS + (local_y * FILTER_W) + RADIUS + local_x; - // int local_mem_address = (BLOCK_W+RADIUS*2)*RADIUS*3 + (local_y*2*RADIUS+RADIUS) + (3*(local_y*BLOCK_W + RADIUS * 2) + local_x); - if((blockIdx.x + blockIdx.y == 0) && (threadIdx.y == 5) && (threadIdx.x == 5)) - printf("In Outprint:\nWe are thread %i. Local mem [%i]\n", threadIdx.y * BLOCK_H + threadIdx.x,local_mem_address); - out[3*(y*imagesizex+x)+0] = local_memory[3*local_mem_address+0]; - out[3*(y*imagesizex+x)+1] = local_memory[3*local_mem_address+1]; - out[3*(y*imagesizex+x)+2] = local_memory[3*local_mem_address+2]; - - // printf("Global id: %i Local Mem: %i Size of shared memory is: %i FILTER_W: %i FILTER_H: %i\n", threadid, local_mem_address, FILTER_W*FILTER_H, FILTER_W,FILTER_H); - // out[3*(y*imagesizex+x)+0] = image[3*(y*imagesizex+x)+0]; - // out[3*(y*imagesizex+x)+1] = image[3*(y*imagesizex+x)+1]; - // out[3*(y*imagesizex+x)+2] = image[3*(y*imagesizex+x)+2]; - - // printf("Filter_W: %i Filter_H: %i ", ) -/* out[(y*imagesizex+x)*3+0] = sumx/divby; out[(y*imagesizex+x)*3+1] = sumy/divby; out[(y*imagesizex+x)*3+2] = sumz/divby; -*/ + } @@ -240,7 +213,7 @@ int main( int argc, char** argv) ResetMilli(); - computeImages(2, 2); + computeImages(RADIUS, RADIUS); // You can save the result to a file like this: writeppm("out.ppm", imagesizey, imagesizex, pixels);