Skip to content
Snippets Groups Projects
Commit 9f445063 authored by matbe320's avatar matbe320
Browse files

Lab5: Finished hopefully

parent 4d95f649
No related branches found
No related tags found
No related merge requests found
...@@ -55,8 +55,8 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne ...@@ -55,8 +55,8 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
while(local_index < LOCAL_MEM_SIZE) while(local_index < LOCAL_MEM_SIZE)
{ {
int global_x = max(0, blockOffsetX - kernelsizex + local_index % FILTER_W); int global_x = min(max(0, blockOffsetX - kernelsizex + local_index % FILTER_W), imagesizex - 1);
int global_y = max(0, blockOffsetY - kernelsizey + local_index / FILTER_W); int global_y = min(max(0, blockOffsetY - kernelsizey + local_index / FILTER_W), imagesizey - 1);
unsigned global_index = global_y * imagesizex + global_x; unsigned global_index = global_y * imagesizex + global_x;
local_memory[3*local_index+0] = image[3*global_index+0]; local_memory[3*local_index+0] = image[3*global_index+0];
...@@ -65,6 +65,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne ...@@ -65,6 +65,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
local_index += threadsPerBlock; local_index += threadsPerBlock;
} }
__syncthreads();
unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned y = blockIdx.y * blockDim.y + threadIdx.y; unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
...@@ -97,7 +98,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne ...@@ -97,7 +98,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
} }
} }
__global__ void filter_separable(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const int stride) __global__ void filter_separable(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const bool horizontal)
{ {
__shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3]; __shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3];
// map from blockIdx to pixel position // map from blockIdx to pixel position
...@@ -106,14 +107,28 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const ...@@ -106,14 +107,28 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
int blockOffsetX = blockIdx.x * blockDim.x; int blockOffsetX = blockIdx.x * blockDim.x;
int blockOffsetY = blockIdx.y * blockDim.y; int blockOffsetY = blockIdx.y * blockDim.y;
int local_index = threadIdx.x; /* Threadidx.x or threadIdx.y will always be zero depending on direction */
int local_index = threadIdx.x + threadIdx.y;
int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize); int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize);
while(local_index < FILTER_SIZE) while(local_index < FILTER_SIZE)
{ {
int global_x = max(0, blockOffsetX - kernelsize + local_index); int global_x = 0;
int global_y = max(0, blockOffsetY); int global_y = 0;
if(horizontal)
{
global_x = min(max(0, blockOffsetX - kernelsize + local_index), imagesizex - 1);
global_y = blockOffsetY;
}
else
{
global_x = blockOffsetX;
global_y = min(max(0, blockOffsetY - kernelsize + local_index), imagesizey - 1);//max(0, blockOffsetY);
}
unsigned global_index = global_y * imagesizex + global_x; unsigned global_index = global_y * imagesizex + global_x;
local_memory[3*local_index+0] = image[3*global_index+0]; local_memory[3*local_index+0] = image[3*global_index+0];
...@@ -122,10 +137,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const ...@@ -122,10 +137,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
local_index += threadsPerBlock; local_index += threadsPerBlock;
} }
__syncthreads();
unsigned x = blockIdx.x * blockDim.x + threadIdx.x; unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned y = blockIdx.y * blockDim.y + threadIdx.y; unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
/* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */
local_index = threadIdx.x + threadIdx.y;
if (x < imagesizex && y < imagesizey) // If inside image if (x < imagesizex && y < imagesizey) // If inside image
{ {
int d; int d;
...@@ -135,21 +152,90 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const ...@@ -135,21 +152,90 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
sumx=0;sumy=0;sumz=0; sumx=0;sumy=0;sumz=0;
for(d=-kernelsize;d<=kernelsize;d++) for(d=-kernelsize;d<=kernelsize;d++)
{ {
unsigned dx = kernelsize + threadIdx.x + d; unsigned dx = kernelsize + local_index + d;
sumx += local_memory[dx*3+0]; sumx += local_memory[dx*3+0];
sumy += local_memory[dx*3+1]; sumy += local_memory[dx*3+1];
sumz += local_memory[dx*3+2]; sumz += local_memory[dx*3+2];
} }
out[(y*imagesizex+x)*3+0] = local_memory[kernelsize + threadIdx.x + 0];
out[(y*imagesizex+x)*3+1] = local_memory[kernelsize + threadIdx.x + 1];
out[(y*imagesizex+x)*3+2] = local_memory[kernelsize + threadIdx.x + 2]; out[(y*imagesizex+x)*3+0] = sumx/divby;
// out[(y*imagesizex+x)*3+0] = sumx/divby; out[(y*imagesizex+x)*3+1] = sumy/divby;
// out[(y*imagesizex+x)*3+1] = sumy/divby; out[(y*imagesizex+x)*3+2] = sumz/divby;
// out[(y*imagesizex+x)*3+2] = sumz/divby;
} }
} }
__global__ void filter_gaussian(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsize, const bool horizontal)
{
__shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_SEP * 3];
// map from blockIdx to pixel position
unsigned threadsPerBlock = blockDim.x;
int blockOffsetX = blockIdx.x * blockDim.x;
int blockOffsetY = blockIdx.y * blockDim.y;
/* Threadidx.x or threadIdx.y will always be zero depending on direction */
int local_index = threadIdx.x + threadIdx.y;
int FILTER_SIZE = (kernelsize * 2 + 1) + (2 * kernelsize);
while(local_index < FILTER_SIZE)
{
int global_x = 0;
int global_y = 0;
if(horizontal)
{
global_x = min(max(0, blockOffsetX - kernelsize + local_index), imagesizex - 1);
global_y = blockOffsetY;
}
else
{
global_x = blockOffsetX;
global_y = min(max(0, blockOffsetY - kernelsize + local_index), imagesizey - 1);//max(0, blockOffsetY);
}
unsigned global_index = global_y * imagesizex + global_x;
local_memory[3*local_index+0] = image[3*global_index+0];
local_memory[3*local_index+1] = image[3*global_index+1];
local_memory[3*local_index+2] = image[3*global_index+2];
local_index += threadsPerBlock;
}
__syncthreads();
unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
/* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */
local_index = threadIdx.x + threadIdx.y;
if (x < imagesizex && y < imagesizey) // If inside image
{
int d;
unsigned int sumx, sumy, sumz;
float divby[5] = {1.0/16.0, 4.0/16.0, 6.0/16.0, 4.0/16.0, 1.0/16.0};
sumx=0;sumy=0;sumz=0;
for(d=-kernelsize;d<=kernelsize;d++)
{
unsigned dx = kernelsize + local_index + d;
sumx += local_memory[dx*3+0] * divby[kernelsize+d];
sumy += local_memory[dx*3+1] * divby[kernelsize+d];
sumz += local_memory[dx*3+2] * divby[kernelsize+d];
// printf("Divby factor %f kernelsize+d %d\n", divby[kernelsize+d], kernelsize+d);
}
out[(y*imagesizex+x)*3+0] = sumx;// / old_divby;
out[(y*imagesizex+x)*3+1] = sumy;// / old_divby;
out[(y*imagesizex+x)*3+2] = sumz;/// old_divby;
}
}
__global__ void filter_old(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey) __global__ void filter_old(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -180,9 +266,98 @@ __global__ void filter_old(unsigned char *image, unsigned char *out, const unsig ...@@ -180,9 +266,98 @@ __global__ void filter_old(unsigned char *image, unsigned char *out, const unsig
out[(y*imagesizex+x)*3+2] = sumz/divby; out[(y*imagesizex+x)*3+2] = sumz/divby;
} }
} }
__global__ void filter_median(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
{
__shared__ unsigned char local_memory [STATIC_SHARED_MEM_SIZE_PX * 3];
// map from blockIdx to pixel position
unsigned threadNumInBlock = threadIdx.x + blockDim.x * threadIdx.y;
unsigned threadsPerBlock = blockDim.x * blockDim.y;
int blockOffsetX = blockIdx.x * blockDim.x;
int blockOffsetY = blockIdx.y * blockDim.y;
int local_index = threadNumInBlock;
int FILTER_W = (kernelsizex * 2 + 1) + (2 * kernelsizex);
int FILTER_H = (kernelsizey * 2 + 1) + (2 * kernelsizey);
unsigned LOCAL_MEM_SIZE = FILTER_W * FILTER_H;
while(local_index < LOCAL_MEM_SIZE)
{
int global_x = min(max(0, blockOffsetX - kernelsizex + local_index % FILTER_W), imagesizex - 1);
int global_y = min(max(0, blockOffsetY - kernelsizey + local_index / FILTER_W), imagesizey - 1);
unsigned global_index = global_y * imagesizex + global_x;
local_memory[3*local_index+0] = image[3*global_index+0];
local_memory[3*local_index+1] = image[3*global_index+1];
local_memory[3*local_index+2] = image[3*global_index+2];
local_index += threadsPerBlock;
}
__syncthreads();
unsigned x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < imagesizex && y < imagesizey) // If inside image
{
int dy, dx;
unsigned int sumx[256] = {0}, sumy[256] = {0}, sumz[256] = {0};
unsigned local_x = threadIdx.x;
unsigned local_y = threadIdx.y;
for(dy=-kernelsizey;dy<=kernelsizey;dy++)
for(dx=-kernelsizex;dx<=kernelsizex;dx++)
{
unsigned yy = kernelsizey + local_y + dy;
unsigned xx = kernelsizex + local_x + dx;
unsigned mem_access = yy * FILTER_W + xx;
++sumx[local_memory[mem_access*3+0]];
++sumy[local_memory[mem_access*3+1]];
++sumz[local_memory[mem_access*3+2]];
}
unsigned num_x = 0;
unsigned num_y = 0;
unsigned num_z = 0;
int index_x = 0;
int index_y = 0;
int index_z = 0;
unsigned half_filter_size = ((kernelsizex*2+1) * (kernelsizey*2+1))/2;
while(num_x < half_filter_size)
{
// printf("Thread: %d Num_x %d index_x %d\n",y* blockDim.x + x, num_x, index_x);
num_x += sumx[index_x++];
}
while(num_y < half_filter_size)
{
num_y += sumy[index_y++];
}
while(num_z < half_filter_size)
{
num_z += sumz[index_z++];
}
// while(num_x < half_filter_size && num_y < half_filter_size && num_z < half_filter_size)
// {
// num_x += sumx[index_x++];
// num_y += sumy[index_y++];
// num_z += sumz[index_z++];
// }
out[(y*imagesizex+x)*3+0] = index_x;
out[(y*imagesizex+x)*3+1] = index_y;
out[(y*imagesizex+x)*3+2] = index_z;
}
}
// Global variables for image data // Global variables for image data
unsigned char *image, *pixels, *dev_bitmap, *dev_input; unsigned char *image, *pixels, *dev_bitmap, *dev_input, *dev_bitmap_tmp;
unsigned int imagesizey, imagesizex; // Image size unsigned int imagesizey, imagesizex; // Image size
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
...@@ -199,41 +374,75 @@ void computeImages(int kernelsizex, int kernelsizey) ...@@ -199,41 +374,75 @@ void computeImages(int kernelsizex, int kernelsizey)
pixels = (unsigned char *) malloc(imagesizex*imagesizey*3); pixels = (unsigned char *) malloc(imagesizex*imagesizey*3);
cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice ); cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice );
cudaMalloc( (void**)&dev_bitmap_tmp, imagesizex*imagesizey*3);
cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3); cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
dim3 grid(imagesizex,imagesizey);
ResetMilli();
filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
cudaThreadSynchronize();
printf("[Un-Optimized] %f\n", GetSeconds());
dim3 gridsize(imagesizex/(kernelsizex*2+1), imagesizey/(kernelsizey*2+1)); /* Given filter */
// dim3 grid(imagesizex,imagesizey);
// ResetMilli();
// filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
// cudaThreadSynchronize();
// printf("[Un-Optimized] %f\n", GetSeconds());
/* Our first block filter */
dim3 gridsize(imagesizex/(kernelsizex*2+1)+1, imagesizey/(kernelsizey*2+1)+1);
dim3 blocksize(kernelsizex*2+1, kernelsizey*2+1); dim3 blocksize(kernelsizex*2+1, kernelsizey*2+1);
ResetMilli(); ResetMilli();
filter_1<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance filter_1<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance
cudaThreadSynchronize(); cudaThreadSynchronize();
printf("[Optimized] %f\n", GetSeconds()); printf("[Optimized] %f\n", GetSeconds());
dim3 gridsize_sep(imagesizex/(kernelsizex*2+1), imagesizey); /* Separable filter */
dim3 blocksize_sep(kernelsizex*2+1); dim3 gridsize_sep_horizontal(imagesizex/(kernelsizex*2+1) + 1, imagesizey);
// unsigned char *tmp = dev_input; dim3 gridsize_sep_vertical(imagesizex, imagesizey/(kernelsizex*2+1) + 1);
dim3 threads_horizontal(kernelsizex*2+1);
dim3 threads_vertical(1, kernelsizex*2+1);
// // unsigned char *tmp = dev_input;
ResetMilli(); ResetMilli();
/* X-direction */ // /* X-direction */
filter_separable<<<gridsize_sep, blocksize_sep>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1); filter_separable<<<gridsize_sep_horizontal, threads_horizontal>>>(dev_input, dev_bitmap_tmp, imagesizex, imagesizey, kernelsizex, true);
cudaThreadSynchronize(); cudaThreadSynchronize();
/* Y-direction */ /* Y-direction */
// dim3 gridsize_sep(imagesizex, imagesizey/(kernelsizex*2+1)); filter_separable<<<gridsize_sep_vertical, threads_vertical>>>(dev_bitmap_tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, false);
// filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1);
cudaThreadSynchronize(); cudaThreadSynchronize();
printf("[Separable] %f\n", GetSeconds()); printf("[Separable] %f\n", GetSeconds());
/* Gaussian filter i.e. 2 x 5*1 filters run horizontal and vertical with gaussian weights */
if(kernelsizex == 2)
{
ResetMilli();
/* X-direction */
filter_gaussian<<<gridsize_sep_horizontal, threads_horizontal>>>(dev_input, dev_bitmap_tmp, imagesizex, imagesizey, kernelsizex, true);
cudaThreadSynchronize();
/* Y-direction */
filter_gaussian<<<gridsize_sep_vertical, threads_vertical>>>(dev_bitmap_tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, false);
cudaThreadSynchronize();
printf("[Gaussian] %f\n", GetSeconds());
}
/* Median filter */
ResetMilli();
filter_median<<<gridsize, blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Nice load balance
cudaThreadSynchronize();
printf("[Median] %f\n", GetSeconds());
// Check for errors! // Check for errors!
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err)); printf("Error: %s\n", cudaGetErrorString(err));
cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost ); cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost );
cudaFree( dev_bitmap_tmp );
cudaFree( dev_bitmap ); cudaFree( dev_bitmap );
cudaFree( dev_input ); cudaFree( dev_input );
} }
// Display images // Display images
...@@ -279,7 +488,7 @@ int main( int argc, char** argv) ...@@ -279,7 +488,7 @@ int main( int argc, char** argv)
glutDisplayFunc(Draw); glutDisplayFunc(Draw);
for(int i = 2; i < 10; i++) for(int i = 2; i < 5; i++)
{ {
printf("Frame is %i x %i\n", i*2+1, i*2+1); printf("Frame is %i x %i\n", i*2+1, i*2+1);
computeImages(i, i); computeImages(i, i);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment