Skip to content
Snippets Groups Projects
Commit 2e6d1189 authored by dansa828's avatar dansa828
Browse files

lab5 first task done

parent 934e244b
No related branches found
No related tags found
No related merge requests found
...@@ -32,51 +32,65 @@ ...@@ -32,51 +32,65 @@
// Use these for setting shared memory size. // Use these for setting shared memory size.
#define maxKernelSizeX 10 #define maxKernelSizeX 10
#define maxKernelSizeY 10 #define maxKernelSizeY 10
#define tileSize 8
__global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) __global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
{ {
__shared__ unsigned char patch[maxKernelSizeX*maxKernelSizeY]; __shared__ unsigned char patch[(tileSize+2*maxKernelSizeX*3)*(tileSize+2*maxKernelSizeY)];
// map from blockIdx to pixel position // map from blockIdx to pixel position
int g_x = blockIdx.x * blockDim.x*3 + threadIdx.x - kernelsizex*3; int g_x = blockIdx.x * tileSize + threadIdx.x*3 - kernelsizex*3;
int g_y = blockIdx.y * blockDim.y + threadIdx.y - kernelsizey; int g_y = blockIdx.y * tileSize + threadIdx.y - kernelsizey;
// Use max and min to avoid branching! // Use max and min to avoid branching!
g_x = min(max(g_x, 0), imagesizex*3-1); g_x = min(max(g_x, 0), imagesizex*3-1);
g_y = min(max(g_y, 0), imagesizey-1); g_y = min(max(g_y, 0), imagesizey-1);
int g_i = imagesizex*g_y*3 + g_x; int g_i = imagesizex*g_y*3 + g_x;
int l_i = threadIdx.x*3 + blockDim.x * threadIdx.y; int l_i = threadIdx.x*3 + blockDim.x*3*threadIdx.y;
int l_x = threadIdx.x*3;
int l_y = threadIdx.y;
patch[l_i+0] = image[g_i+0]; patch[l_i+0] = image[g_i+0];
patch[l_i+1] = image[g_i+1]; patch[l_i+1] = image[g_i+1];
patch[l_i+2] = image[g_i+2]; patch[l_i+2] = image[g_i+2];
__syncthreads(); __syncthreads();
unsigned int sumx, sumy, sumz;
if(threadIdx.x > kernelsizex*3 && threadIdx.x < maxKernelSizeX-kernelsizex*3 &&
threadIdx.y > kernelsizey && threadIdx.y < maxKernelSizeY-kernelsizey) {
int dy, dx;
if((l_x >= (kernelsizex*3)) && (l_x < ((blockDim.x-(kernelsizex*3)))) &&
(l_y >= kernelsizey) && (l_y < (blockDim.y-kernelsizey))) {
unsigned int sumx, sumy, sumz;
int dy, dx;
sumx=0;sumy=0;sumz=0; sumx=0;sumy=0;sumz=0;
for(dy=-kernelsizey;dy<=kernelsizey;dy++) for(dy=-kernelsizey;dy<=kernelsizey;dy++)
for(dx=-kernelsizex*3;dx<=kernelsizex*3;dx +=3) {
for(dx=-kernelsizex;dx<=kernelsizex;dx++)
{ {
sumx += patch[l_i+(dy*blockDim.x)+(dx)+0]; sumx += patch[(l_i+(dy*blockDim.x*3)+dx*3)+0];
sumy += patch[l_i+(dy*blockDim.x)+(dx)+1]; sumy += patch[(l_i+(dy*blockDim.x*3)+dx*3)+1];
sumz += patch[l_i+(dy*blockDim.x)+(dx)+2]; sumz += patch[(l_i+(dy*blockDim.x*3)+dx*3)+2];
//patch[l_i+(dy*blockDim.x*3)+(dx*3)+0] = 255;
//patch[l_i+(dy*blockDim.x*3)+(dx*3)+1] = 255;
//patch[l_i+(dy*blockDim.x*3)+(dx*3)+2] = 255;
} }
}
int divby = (2*kernelsizex+1)*(2*kernelsizey+1); // Works for box filters only!
out[g_i+0] = sumx/divby;
out[g_i+1] = sumy/divby;
out[g_i+2] = sumz/divby;
//out[g_i+0] = patch[l_i+0];
//out[g_i+1] = patch[l_i+1];
//out[g_i+2] = patch[l_i+2];
//out[g_i+0] = image[g_i+0];
//out[g_i+1] = image[g_i+1];
//out[g_i+2] = image[g_i+2];
} }
int divby = (2*kernelsizex+1)*(2*kernelsizey+1); // Works for box filters only!
if(threadIdx.x == 0 && threadIdx.y == 0)
printf("%i %i %i\n", sumx/divby, sumy/divby, sumz/divby);
out[g_i+0] = sumx/divby;
out[g_i+1] = sumy/divby;
out[g_i+2] = sumz/divby; /*
out[g_i+0] = patch[l_i+0];
out[g_i+1] = patch[l_i+1];
out[g_i+2] = patch[l_i+2]; */
} }
// Global variables for image data // Global variables for image data
...@@ -99,8 +113,11 @@ void computeImages(int kernelsizex, int kernelsizey) ...@@ -99,8 +113,11 @@ void computeImages(int kernelsizex, int kernelsizey)
cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice ); cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice );
cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
dim3 grid(imagesizex,imagesizey);
filter<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance dim3 grid(imagesizex*3/tileSize,imagesizey/tileSize);
dim3 blockSize(tileSize*3+(2*kernelsizex*3), tileSize+(2*kernelsizey));
filter<<<grid,blockSize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
cudaThreadSynchronize(); cudaThreadSynchronize();
// Check for errors! // Check for errors!
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment