Skip to content
Snippets Groups Projects
Commit a6a25193 authored by matbe320's avatar matbe320
Browse files

Lab5: filter.cu restored to base

parent 71679182
No related branches found
No related tags found
No related merge requests found
......@@ -20,7 +20,6 @@
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#ifdef __APPLE__
#include <GLUT/glut.h>
#include <OpenGL/gl.h>
......@@ -31,89 +30,47 @@
#include "milli.h"
// Use these for setting shared memory size.
#define RADIUS 3
#define maxKernelSizeX (RADIUS*2+1)
#define maxKernelSizeY (RADIUS*2+1)
#define maxKernelSizeX 10
#define maxKernelSizeY 10
#define FILTER_W (maxKernelSizeX+RADIUS*2)
#define FILTER_H (maxKernelSizeY+RADIUS*2)
#define LOCAL_MEM_SIZE_PX (FILTER_W * FILTER_H)
__global__ void filter(unsigned char *image, unsigned char *out, const unsigned int imagesizex, const unsigned int imagesizey, const int kernelsizex, const int kernelsizey)
{
__shared__ unsigned char local_memory[LOCAL_MEM_SIZE_PX * 3];
// map from blockIdx to pixel position
unsigned blockNumInGrid = blockIdx.x + gridDim.x * blockIdx.y;
unsigned threadNumInBlock = threadIdx.x + blockDim.x *threadIdx.y;
unsigned threadsPerBlock = blockDim.x * blockDim.y;
int blockOffsetX = blockIdx.x * blockDim.x;
int blockOffsetY = blockIdx.y * blockDim.y;
int local_index = threadNumInBlock;
while(local_index < LOCAL_MEM_SIZE_PX)
{
/* Repeat three times for colored pixel */
int global_x = max(0, blockOffsetX - RADIUS + local_index % FILTER_W);
int global_y = max(0, blockOffsetY - RADIUS + local_index / FILTER_W);
int global_index = global_y * imagesizex + global_x;
local_memory[3*local_index+0] = image[3*global_index+0];
local_memory[3*local_index+1] = image[3*global_index+1];
local_memory[3*local_index+2] = image[3*global_index+2];
local_index += threadsPerBlock;
}
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dy, dx;
unsigned int sumx, sumy, sumz;
/* Old factor */
int divby = (2*kernelsizex*3+1)*(2*kernelsizey+1); // Works for box filters only!
int skip_pixels = FILTER_W * RADIUS + RADIUS;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int local_x = threadIdx.x;
int local_y = threadIdx.y;
int local_mem_address = skip_pixels + (local_y * FILTER_W) + local_x;
// if (x < imagesizex && y < imagesizey) // If inside image
// {
// Filter kernel (simple box filter)
sumx=0;sumy=0;sumz=0;
for(dy=-kernelsizey;dy<=kernelsizey;dy++)
for(dx=-kernelsizex;dx<=kernelsizex;dx++)
{
int mem_access = skip_pixels + (local_y + dy) * FILTER_W + (local_x + dx);
sumx += local_memory[mem_access*3+0];
sumy += local_memory[mem_access*3+1];
sumz += local_memory[mem_access*3+2];
}
__syncthreads();
// out[3*(y*imagesizex+x)+0] = local_memory[3*local_mem_address+0];
// out[3*(y*imagesizex+x)+1] = local_memory[3*local_mem_address+1];
// out[3*(y*imagesizex+x)+2] = local_memory[3*local_mem_address+2];
out[(y*imagesizex+x)*3+0] = sumx/divby;
out[(y*imagesizex+x)*3+1] = sumy/divby;
out[(y*imagesizex+x)*3+2] = sumz/divby;
int divby = (2*kernelsizex+1)*(2*kernelsizey+1); // Works for box filters only!
if (x < imagesizex && y < imagesizey) // If inside image
{
// Filter kernel (simple box filter)
sumx=0;sumy=0;sumz=0;
for(dy=-kernelsizey;dy<=kernelsizey;dy++)
for(dx=-kernelsizex;dx<=kernelsizex;dx++)
{
// Use max and min to avoid branching!
int yy = min(max(y+dy, 0), imagesizey-1);
int xx = min(max(x+dx, 0), imagesizex-1);
sumx += image[((yy)*imagesizex+(xx))*3+0];
sumy += image[((yy)*imagesizex+(xx))*3+1];
sumz += image[((yy)*imagesizex+(xx))*3+2];
}
out[(y*imagesizex+x)*3+0] = sumx/divby;
out[(y*imagesizex+x)*3+1] = sumy/divby;
out[(y*imagesizex+x)*3+2] = sumz/divby;
}
}
// Global variables for image data
unsigned char *image, *pixels, *dev_bitmap, *dev_input;
unsigned int imagesizey, imagesizex; // Image size
////////////////////////////////////////////////////////////////////////////////
// main computation function
////////////////////////////////////////////////////////////////////////////////
......@@ -125,52 +82,25 @@ void computeImages(int kernelsizex, int kernelsizey)
return;
}
//dim3 gridsize(1, 1);
dim3 gridsize(imagesizex/maxKernelSizeX, imagesizey/maxKernelSizeY);
dim3 blocksize(maxKernelSizeX, maxKernelSizeY); // 256
/* whole pic, no need to declare several times */
pixels = (unsigned char *) malloc(imagesizex*imagesizey*3);
cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice);
cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice );
cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
filter<<<gridsize,blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
cudaDeviceSynchronize();
dim3 grid(imagesizex,imagesizey);
filter<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
cudaThreadSynchronize();
// Check for errors!
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaMemcpy( pixels, dev_bitmap, imagesizey*imagesizex*3, cudaMemcpyDeviceToHost );
cudaFree( dev_bitmap );
cudaFree( dev_input );
}
// unsigned image_size = imagesizex * imagesizey;
// unsigned num_kernels = floor(image_size / (GRID_SIZE * BLOCK_SIZE));
// unsigned rows_per_kernel = floor(imagesizey / num_kernels);
// unsigned remainder_y = imagesizey % num_kernels; // 10
// pixels = (unsigned char *) malloc(imagesizex*imagesizey*3);
// for(int i = 1; i <= num_kernels; i++)
// {
// cudaMalloc( (void**)&dev_input, imagesizex*imagesizey*3);
// cudaMemcpy( dev_input, image, imagesizey*imagesizex*3, cudaMemcpyHostToDevice);
// cudaMalloc( (void**)&dev_bitmap, imagesizex*imagesizey*3);
// filter<<<grid,blocksize>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey, i); // Awful load balance
// }
// Display images
void Draw()
{
// Dump the whole picture onto the screen.
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
......@@ -195,7 +125,6 @@ void Draw()
// Main program, inits
int main( int argc, char** argv)
{
glutInit(&argc, argv);
glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
......@@ -213,9 +142,10 @@ int main( int argc, char** argv)
ResetMilli();
computeImages(RADIUS, RADIUS);
computeImages(2, 2);
// You can save the result to a file like this:
writeppm("out.ppm", imagesizey, imagesizex, pixels);
// writeppm("out.ppm", imagesizey, imagesizex, pixels);
glutMainLoop();
return 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment