Skip to content
Snippets Groups Projects
Commit e88b0a1c authored by ylvse560's avatar ylvse560
Browse files

Lab 4 almost done

parent d22e26e6
No related branches found
No related tags found
No related merge requests found
This diff is collapsed.
// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
// Compile with:
// gcc interactiveMandelbrot.cpp -shared-libgcc -lstdc++-static -o interactiveMandelbrot -lglut -lGL
// or
// g++ interactiveMandelbrot.cpp -o interactiveMandelbrot -lglut -lGL
// Your CUDA version should compile with something like
// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
// Preliminary version 2014-11-30
// Cleaned a bit more 2014-12-01
// Corrected the missing glRasterPos2i 2014-12-03
#ifdef __APPLE__
#include <OpenGL/gl.h>
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#include <GL/gl.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Image data
unsigned char *pixels = NULL;
int gImageWidth, gImageHeight;
// Init image data
void initBitmap(int width, int height)
{
if (pixels) free(pixels);
pixels = (unsigned char *)malloc(width * height * 4);
gImageWidth = width;
gImageHeight = height;
}
#define DIM 512
// Select precision here! float or double!
#define MYFLOAT float
// User controlled parameters
int maxiter = 20;
MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
MYFLOAT scale = 1.5;
// Complex number class
struct cuComplex
{
MYFLOAT r;
MYFLOAT i;
cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b) {}
float magnitude2( void )
{
return r * r + i * i;
}
cuComplex operator*(const cuComplex& a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
cuComplex operator+(const cuComplex& a)
{
return cuComplex(r+a.r, i+a.i);
}
};
int mandelbrot( int x, int y)
{
MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
cuComplex c(jx, jy);
cuComplex a(jx, jy);
int i = 0;
for (i=0; i<maxiter; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return i;
}
return i;
}
void computeFractal( unsigned char *ptr)
{
// map from x, y to pixel position
for (int x = 0; x < gImageWidth; x++)
for (int y = 0; y < gImageHeight; y++)
{
int offset = x + y * gImageWidth;
// now calculate the value at that position
int fractalValue = mandelbrot( x, y);
// Colorize it
int red = 255 * fractalValue/maxiter;
if (red > 255) red = 255 - red;
int green = 255 * fractalValue*4/maxiter;
if (green > 255) green = 255 - green;
int blue = 255 * fractalValue*20/maxiter;
if (blue > 255) blue = 255 - blue;
ptr[offset*4 + 0] = red;
ptr[offset*4 + 1] = green;
ptr[offset*4 + 2] = blue;
ptr[offset*4 + 3] = 255;
}
}
char print_help = 0;
// Yuck, GLUT text is old junk that should be avoided... but it will have to do
static void print_str(void *font, const char *string)
{
int i;
for (i = 0; string[i]; i++)
glutBitmapCharacter(font, string[i]);
}
void PrintHelp()
{
if (print_help)
{
glPushMatrix();
glLoadIdentity();
glOrtho(-0.5, 639.5, -0.5, 479.5, -1.0, 1.0);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
glColor4f(0.f, 0.f, 0.5f, 0.5f);
glRecti(40, 40, 600, 440);
glColor3f(1.f, 1.f, 1.f);
glRasterPos2i(300, 420);
print_str(GLUT_BITMAP_HELVETICA_18, "Help");
glRasterPos2i(60, 390);
print_str(GLUT_BITMAP_HELVETICA_18, "h - Toggle Help");
glRasterPos2i(60, 300);
print_str(GLUT_BITMAP_HELVETICA_18, "Left click + drag - move picture");
glRasterPos2i(60, 270);
print_str(GLUT_BITMAP_HELVETICA_18,
"Right click + drag up/down - unzoom/zoom");
glRasterPos2i(60, 240);
print_str(GLUT_BITMAP_HELVETICA_18, "+ - Increase max. iterations by 32");
glRasterPos2i(60, 210);
print_str(GLUT_BITMAP_HELVETICA_18, "- - Decrease max. iterations by 32");
glRasterPos2i(0, 0);
glDisable(GL_BLEND);
glPopMatrix();
}
}
// Compute fractal and display image
void Draw()
{
computeFractal(pixels);
// Dump the whole picture onto the screen. (Old-style OpenGL but without lots of geometry that doesn't matter so much.)
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( gImageWidth, gImageHeight, GL_RGBA, GL_UNSIGNED_BYTE, pixels );
if (print_help)
PrintHelp();
glutSwapBuffers();
}
char explore = 1;
static void Reshape(int width, int height)
{
glViewport(0, 0, width, height);
glLoadIdentity();
glOrtho(-0.5f, width - 0.5f, -0.5f, height - 0.5f, -1.f, 1.f);
initBitmap(width, height);
glutPostRedisplay();
}
int mouse_x, mouse_y, mouse_btn;
// Mouse down
static void mouse_button(int button, int state, int x, int y)
{
if (state == GLUT_DOWN)
{
// Record start position
mouse_x = x;
mouse_y = y;
mouse_btn = button;
}
}
// Drag mouse
static void mouse_motion(int x, int y)
{
if (mouse_btn == 0)
// Ordinary mouse button - move
{
offsetx += (x - mouse_x)*scale;
mouse_x = x;
offsety += (mouse_y - y)*scale;
mouse_y = y;
glutPostRedisplay();
}
else
// Alt mouse button - scale
{
scale *= pow(1.1, y - mouse_y);
mouse_y = y;
glutPostRedisplay();
}
}
void KeyboardProc(unsigned char key, int x, int y)
{
switch (key)
{
case 27: /* Escape key */
case 'q':
case 'Q':
exit(0);
break;
case '+':
maxiter += maxiter < 1024 - 32 ? 32 : 0;
break;
case '-':
maxiter -= maxiter > 0 + 32 ? 32 : 0;
break;
case 'h':
print_help = !print_help;
break;
}
glutPostRedisplay();
}
// Main program, inits
int main( int argc, char** argv)
{
glutInit(&argc, argv);
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( DIM, DIM );
glutCreateWindow("Mandelbrot explorer (CPU)");
glutDisplayFunc(Draw);
glutMouseFunc(mouse_button);
glutMotionFunc(mouse_motion);
glutKeyboardFunc(KeyboardProc);
glutReshapeFunc(Reshape);
initBitmap(DIM, DIM);
glutMainLoop();
}
// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
// Compile with:
// gcc interactiveMandelbrot.cpp -shared-libgcc -lstdc++-static -o interactiveMandelbrot -lglut -lGL
// or
// g++ interactiveMandelbrot.cpp -o interactiveMandelbrot -lglut -lGL
// Your CUDA version should compile with something like
// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
// Preliminary version 2014-11-30
// Cleaned a bit more 2014-12-01
// Corrected the missing glRasterPos2i 2014-12-03
#ifdef __APPLE__
#include <OpenGL/gl.h>
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#include <GL/gl.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Image data
unsigned char *pixels = NULL;
unsigned char *pixelsCUDA = NULL;
int gImageWidth, gImageHeight;
int size = gImageWidth * gImageHeight * 4;
#define DIM 512
// Select precision here! float or double!
#define MYFLOAT float
// User controlled parameters
int maxiter = 20;
MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
MYFLOAT scale = 1.5;
const int blocksize = 32;
const int gridsize = 32;
// Init image data
void initBitmap(int width, int height)
{
if (pixels) free(pixels);
pixels = (unsigned char *)malloc(size);
cudaMalloc( (void**)&pixelsCUDA, size );
cudaMemcpy( pixelsCUDA, pixels, size, cudaMemcpyHostToDevice );
gImageWidth = width;
gImageHeight = height;
}
// Complex number class
struct cuComplex
{
MYFLOAT r;
MYFLOAT i;
__device__
cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b) {}
__device__
float magnitude2( void )
{
return r * r + i * i;
}
__device__
cuComplex operator*(const cuComplex& a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__
cuComplex operator+(const cuComplex& a)
{
return cuComplex(r+a.r, i+a.i);
}
};
__device__
int mandelbrot( int x, int y, int gImageWidth, int gImageHeight, int maxiter,
MYFLOAT offsetx = -200, MYFLOAT offsety = 0, MYFLOAT scale = 1.5)
{
MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
cuComplex c(jx, jy);
cuComplex a(jx, jy);
int i = 0;
for (i=0; i<maxiter; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return i;
}
return i;
}
__global__
void computeFractal(unsigned char *ptr, int gImageWidth, int gImageHeight, int maxiter,
MYFLOAT offsetx = -200, MYFLOAT offsety = 0, MYFLOAT zoom = 0, MYFLOAT scale = 1.5)
{
// map from x, y to pixel position
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < gImageWidth && y < gImageWidth) {
int offset = x + y * gImageWidth;
int fractalValue = mandelbrot(x,y,gImageWidth, gImageHeight, maxiter);
// Colorize it
int red = 255 * fractalValue/maxiter;
if (red > 255) red = 255 - red;
int green = 255 * fractalValue*4/maxiter;
if (green > 255) green = 255 - green;
int blue = 255 * fractalValue*20/maxiter;
if (blue > 255) blue = 255 - blue;
ptr[offset*4 + 0] = red;
ptr[offset*4 + 1] = green;
ptr[offset*4 + 2] = blue;
ptr[offset*4 + 3] = 255;
}
}
char print_help = 0;
// Yuck, GLUT text is old junk that should be avoided... but it will have to do
static void print_str(void *font, const char *string)
{
int i;
for (i = 0; string[i]; i++)
glutBitmapCharacter(font, string[i]);
}
void PrintHelp()
{
if (print_help)
{
glPushMatrix();
glLoadIdentity();
glOrtho(-0.5, 639.5, -0.5, 479.5, -1.0, 1.0);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
glColor4f(0.f, 0.f, 0.5f, 0.5f);
glRecti(40, 40, 600, 440);
glColor3f(1.f, 1.f, 1.f);
glRasterPos2i(300, 420);
print_str(GLUT_BITMAP_HELVETICA_18, "Help");
glRasterPos2i(60, 390);
print_str(GLUT_BITMAP_HELVETICA_18, "h - Toggle Help");
glRasterPos2i(60, 300);
print_str(GLUT_BITMAP_HELVETICA_18, "Left click + drag - move picture");
glRasterPos2i(60, 270);
print_str(GLUT_BITMAP_HELVETICA_18,
"Right click + drag up/down - unzoom/zoom");
glRasterPos2i(60, 240);
print_str(GLUT_BITMAP_HELVETICA_18, "+ - Increase max. iterations by 32");
glRasterPos2i(60, 210);
print_str(GLUT_BITMAP_HELVETICA_18, "- - Decrease max. iterations by 32");
glRasterPos2i(0, 0);
glDisable(GL_BLEND);
glPopMatrix();
}
}
// Compute fractal and display image
void Draw()
{
// Dump the whole picture onto the screen. (Old-style OpenGL but without lots of geometry that doesn't matter so much.)
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( gImageWidth, gImageHeight, GL_RGBA, GL_UNSIGNED_BYTE, pixels );
if (print_help)
PrintHelp();
glutSwapBuffers();
}
char explore = 1;
static void Reshape(int width, int height)
{
glViewport(0, 0, width, height);
glLoadIdentity();
glOrtho(-0.5f, width - 0.5f, -0.5f, height - 0.5f, -1.f, 1.f);
initBitmap(width, height);
glutPostRedisplay();
}
int mouse_x, mouse_y, mouse_btn;
// Mouse down
static void mouse_button(int button, int state, int x, int y)
{
if (state == GLUT_DOWN)
{
// Record start position
mouse_x = x;
mouse_y = y;
mouse_btn = button;
}
}
// Drag mouse
static void mouse_motion(int x, int y)
{
if (mouse_btn == 0)
// Ordinary mouse button - move
{
offsetx += (x - mouse_x)*scale;
mouse_x = x;
offsety += (mouse_y - y)*scale;
mouse_y = y;
glutPostRedisplay();
}
else
// Alt mouse button - scale
{
scale *= pow(1.1, y - mouse_y);
mouse_y = y;
glutPostRedisplay();
}
}
void KeyboardProc(unsigned char key, int x, int y)
{
switch (key)
{
case 27: /* Escape key */
case 'q':
case 'Q':
exit(0);
break;
case '+':
maxiter += maxiter < 1024 - 32 ? 32 : 0;
break;
case '-':
maxiter -= maxiter > 0 + 32 ? 32 : 0;
break;
case 'h':
print_help = !print_help;
break;
}
glutPostRedisplay();
}
// Main program, inits
int main( int argc, char** argv)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Device Number: %d\n", 0);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %f\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
printf(" Max threads per block: %i\n",
prop.maxThreadsPerBlock);
printf(" Max threads dim: %i x %i x %i\n",
prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
printf(" Max grid size: %i x %i x %i\n",
prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );
/* glutInit(&argc, argv);
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( DIM, DIM );
glutCreateWindow("Mandelbrot explorer (CPU)");
glutDisplayFunc(Draw);
glutMouseFunc(mouse_button);
glutMotionFunc(mouse_motion);
glutKeyboardFunc(KeyboardProc);
glutReshapeFunc(Reshape);
glutMainLoop();
*/
initBitmap(DIM, DIM);
dim3 dimBlock( blocksize, blocksize );
dim3 dimGrid( gridsize, gridsize );
computeFractal<<<dimGrid, dimBlock>>>(pixels, gImageWidth, gImageHeight, maxiter);
cudaMemcpy( pixels, pixelsCUDA, size, cudaMemcpyDeviceToHost );
cudaFree( pixelsCUDA );
}
......@@ -2,6 +2,7 @@
// gcc matrix_cpu.c -o matrix_cpu -std=c99
#include <stdio.h>
#include <stdlib.h>
#include "milli.h"
void add_matrix(float *a, float *b, float *c, int N)
......@@ -16,13 +17,15 @@ void add_matrix(float *a, float *b, float *c, int N)
}
}
int main()
int main(int argc, char *argv[])
{
const int N = 16;
char *argument = argv[argc-1];
int N = atoi(argument);
printf("Running on CPU with no of elements: %i \n",N);
float a[N*N];
float b[N*N];
float c[N*N];
float *a = malloc(N*N*sizeof(float));
float *b = malloc(N*N*sizeof(float));
float *c = malloc(N*N*sizeof(float));
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
......@@ -32,8 +35,8 @@ int main()
}
ResetMilli();
add_matrix(a, b, c, N);
printf("%s%i\n","Measured time (us): ", GetMicroseconds());
printf("%s%f\n","Measured time (ms): ", (float)GetMicroseconds()*0.001);
/*
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
......@@ -42,4 +45,8 @@ int main()
}
printf("\n");
}
*/
free(a);
free(b);
free(c);
}
......@@ -6,25 +6,26 @@
#include <stdio.h>
#include <cmath>
const int N = 1024;
const int blocksize = 32; //threads
const int gridsize = 32; //blocks
__global__
void add(float* result, float *c, float *d, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int index = idx + idy * N;
if (idx < N && idy < N) {
result[index] = c[index] + d[index];
}
}
int main()
int main(int argc, char *argv[])
{
char *argument = argv[argc-1];
int N = atoi(argument);
int blocksize = 32; //threads
int gridsize = 32; //blocks
printf("Running on GPU with no of elements: %i \n",N);
float *a = new float[N*N];
float *b = new float[N*N];
float *result = new float[N*N];
......@@ -97,8 +98,8 @@ int main()
prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
printf(" Max grid size: %i x %i x %i\n",
prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );
delete []a;
delete []b;
delete[] a;
delete[] b;
printf("done\n");
return EXIT_SUCCESS;
}
// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
// Emergency version for use without graphics output. I recommend the interactive version if you can.
// Preliminary! I will add timing using milli.c.
// Compile with:
// g++ noninteractiveMandelbrot.cpp readppm.cpp -o noninteractiveMandelbrot
// Your CUDA version should compile with something like
// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
// See lecture notes on how to compile and link with the other code.
// Note: A Simple trick is to #include the other files.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "readppm.h"
// uses framework Cocoa
// Image data
unsigned char *pixels = NULL;
int gImageWidth, gImageHeight;
// Init image data
void initBitmap(int width, int height)
{
if (pixels) free(pixels);
pixels = (unsigned char *)malloc(width * height * 4);
gImageWidth = width;
gImageHeight = height;
}
#define DIM 512
// Select precision here! float or double!
#define MYFLOAT float
// User controlled parameters
int maxiter = 20;
MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
MYFLOAT scale = 1.5;
// Complex number class
struct cuComplex
{
MYFLOAT r;
MYFLOAT i;
cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b) {}
float magnitude2( void )
{
return r * r + i * i;
}
cuComplex operator*(const cuComplex& a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
cuComplex operator+(const cuComplex& a)
{
return cuComplex(r+a.r, i+a.i);
}
};
int mandelbrot( int x, int y)
{
MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
cuComplex c(jx, jy);
cuComplex a(jx, jy);
int i = 0;
for (i=0; i<maxiter; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return i;
}
return i;
}
void computeFractal( unsigned char *ptr)
{
// map from x, y to pixel position
for (int x = 0; x < gImageWidth; x++)
for (int y = 0; y < gImageHeight; y++)
{
int offset = x + y * gImageWidth;
// now calculate the value at that position
int fractalValue = mandelbrot( x, y);
// Colorize it
int red = 255 * fractalValue/maxiter;
if (red > 255) red = 255 - red;
int green = 255 * fractalValue*4/maxiter;
if (green > 255) green = 255 - green;
int blue = 255 * fractalValue*20/maxiter;
if (blue > 255) blue = 255 - blue;
ptr[offset*4 + 0] = red;
ptr[offset*4 + 1] = green;
ptr[offset*4 + 2] = blue;
ptr[offset*4 + 3] = 255;
}
}
// Main program, inits
int main( int argc, char** argv)
{
initBitmap(DIM, DIM);
computeFractal(pixels);
// Dump to PPM
writeppm("fractalout.ppm", DIM, DIM, pixels);
}
// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
// Emergency version for use without graphics output. I recommend the interactive version if you can.
// Preliminary! I will add timing using milli.c.
// Compile with:
// g++ noninteractiveMandelbrot.cpp readppm.cpp -o noninteractiveMandelbrot
// Your CUDA version should compile with something like
// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
// See lecture notes on how to compile and link with the other code.
// Note: A Simple trick is to #include the other files.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "readppm.h"
// uses framework Cocoa
// Image data
unsigned char *pixels = NULL;
unsigned char *pixelsCUDA = NULL;
#define DIM 512
int size = DIM * DIM * 4;
// Select precision here! float or double!
#define MYFLOAT float
// User controlled parameters
int maxiter = 20;
MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
MYFLOAT scale = 1.5;
const int blocksize = 32;
const int gridsize = 16;
// Init image data
void initBitmap(int width, int height)
{
pixels = (unsigned char *)malloc(width*height*4);
for(int i = 0; i < size; i++)
pixels[i] = 255;
cudaMalloc( (void**)&pixelsCUDA, size );
cudaMemcpy( pixelsCUDA, pixels, size, cudaMemcpyHostToDevice );
}
// Complex number class
struct cuComplex
{
MYFLOAT r;
MYFLOAT i;
__device__
cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b) {}
__device__
float magnitude2( void )
{
return r * r + i * i;
}
__device__
cuComplex operator*(const cuComplex& a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__
cuComplex operator+(const cuComplex& a)
{
return cuComplex(r+a.r, i+a.i);
}
};
__device__
int mandelbrot( int x, int y, int gImageWidth, int gImageHeight, int maxiter,
MYFLOAT offsetx = -200, MYFLOAT offsety = 0, MYFLOAT scale = 1.5)
{
MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
cuComplex c(jx, jy);
cuComplex a(jx, jy);
int i = 0;
for (i=0; i<maxiter; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return i;
}
return i;
}
__global__
void computeFractal(unsigned char *data, int gImageWidth, int gImageHeight, int maxiter,
MYFLOAT offsetx = -200, MYFLOAT offsety = 0, MYFLOAT zoom = 0, MYFLOAT scale = 1.5)
{
// map from x, y to pixel position
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int index = x + y * gImageWidth;
int fractalValue = mandelbrot(x,y,gImageWidth, gImageHeight, maxiter);
// Colorize it
int red = 255 * fractalValue/maxiter;
if (red > 255) red = 255 - red;
int green = 255 * fractalValue*4/maxiter;
if (green > 255) green = 255 - green;
int blue = 255 * fractalValue*20/maxiter;
if (blue > 255) blue = 255 - blue;
data[index*4 + 0] = red;
data[index*4 + 1] = green;
data[index*4 + 2] = blue;
data[index*4 + 3] = 255;
}
// Main program, inits
int main( int argc, char** argv)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("Device Number: %d\n", 0);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %f\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
printf(" Max threads per block: %i\n",
prop.maxThreadsPerBlock);
printf(" Max threads dim: %i x %i x %i\n",
prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
printf(" Max grid size: %i x %i x %i\n",
prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );
initBitmap(DIM, DIM);
dim3 dimBlock( blocksize, blocksize );
dim3 dimGrid( gridsize, gridsize );
computeFractal<<<dimGrid, dimBlock>>>(pixelsCUDA, DIM, DIM, maxiter);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaMemcpy( pixels, pixelsCUDA, size, cudaMemcpyDeviceToHost );
err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaFree( pixelsCUDA );
err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
// Dump to PPM
writeppm("fractalout.ppm", DIM, DIM, pixels);
delete[] pixels;
}
// Cut-down version without texture loading
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "readppm.h"
// By Ingemar Ragnemalm
// based on a ppm reader by Edward Angel,
// Added support for binary PPM (2002?)
// 2006: Fixed a bug that caused some files to end prematurely
// and added a PPM writer
// for making screen shots and other output (i.e. GPGPU uses)
// Added writeppm. (Writes in ASCII format only.)
// 2009: Added readppmtexture. (A writer for a texture would also be possible when needed.)
// Portable #include.
int writeppm(char *filename, int height, int width, unsigned char *data)
{
FILE *fp;
int error = 1;
int i, h, v;
if (filename != NULL)
{
fp = fopen(filename,"w");
if (fp != NULL)
{
// Write PPM file
// Header
fprintf(fp, "P3\n");
fprintf(fp, "# written by Ingemars PPM writer\n");
fprintf(fp, "%d %d\n", height, width);
fprintf(fp, "%d\n", 255); // range
// Data
for (v = height-1; v >=0; v--)
{
for (h = 0; h < width; h++)
{
i = (width*v + h)*4; // assumes rgba, not rgba
fprintf(fp, "%d %d %d ", data[i], data[i+1], data[i+2]);
}
fprintf(fp, "\n"); // range
}
if (fwrite("\n",sizeof(char),1,fp) == 1)
error = 0; // Probable success
fclose(fp);
}
}
return(error);
}
unsigned char *readppm(char *filename, int *height, int *width)
{
FILE *fd;
int k, nm;
char c;
int i,j;
char b[100];
float s;
int red, green, blue;
long numbytes, howmuch;
int n;
int m;
unsigned char *image;
fd = fopen(filename, "rb");
if (fd == NULL)
{
printf("Could not open %s\n", filename);
return NULL;
}
c = getc(fd);
if (c=='P' || c=='p')
c = getc(fd);
if (c == '3')
{
printf("%s is a PPM file (plain text version)\n", filename);
// NOTE: This is not very good PPM code! Comments are not allowed
// except immediately after the magic number.
c = getc(fd);
if (c == '\n' || c == '\r') // Skip any line break and comments
{
c = getc(fd);
while(c == '#')
{
fscanf(fd, "%[^\n\r] ", b);
printf("%s\n",b);
c = getc(fd);
}
ungetc(c,fd);
}
fscanf(fd, "%d %d %d", &n, &m, &k);
printf("%d rows %d columns max value= %d\n",n,m,k);
numbytes = n * m * 3;
image = (unsigned char *) malloc(numbytes);
if (image == NULL)
{
printf("Memory allocation failed!\n");
return NULL;
}
for(i=n-1;i>=0;i--) for(j=0;j<m;j++)
{
fscanf(fd,"%d %d %d",&red, &green, &blue );
image[(i*n+j)*3]=red * 255 / k;
image[(i*n+j)*3+1]=green * 255 / k;
image[(i*n+j)*3+2]=blue * 255 / k;
}
}
else
if (c == '6')
{
printf("%s is a PPM file (raw version)!\n", filename);
c = getc(fd);
if (c == '\n' || c == '\r') // Skip any line break and comments
{
c = getc(fd);
while(c == '#')
{
fscanf(fd, "%[^\n\r] ", b);
printf("%s\n",b);
c = getc(fd);
}
ungetc(c,fd);
}
fscanf(fd, "%d %d %d", &n, &m, &k);
printf("%d rows %d columns max value= %d\n",n,m,k);
c = getc(fd); // Skip the last whitespace
numbytes = n * m * 3;
image = (unsigned char *) malloc(numbytes);
if (image == NULL)
{
printf("Memory allocation failed!\n");
return NULL;
}
// Read and re-order as necessary
for(i=n-1;i>=0;i--) for(j=0;j<m;j++)
{
image[(i*n+j)*3+0]=getc(fd);
image[(i*n+j)*3+1]=getc(fd);
image[(i*n+j)*3+2]=getc(fd);
}
}
else
{
printf("%s is not a PPM file!\n", filename);
return NULL;
}
printf("read image\n");
*height = n;
*width = m;
return image;
}
#ifndef READ_PPM
#define READ_PPM
int writeppm(char *filename, int height, int width, unsigned char *data);
unsigned char *readppm(char *filename, int *height, int *width);
#endif
......@@ -70,6 +70,28 @@ int index = idy * blockDim.x + idx;
result[index] = c[index] + d[index];
### QUESTION: What happens if you use too many threads per block?
There is an error in CUDA and the kernel is not run properly - undefined behavior.
### QUESTION: At what data size is the GPU faster than the CPU?
At N = 32, the CPU is faster. At N = 64 the GPU is faster.
### QUESTION: What block size seems like a good choice? Compared to what?
We have set the block size to 32. The maximum number of threads per block in Olympen GPU:s is 1024 threads. We would like to have the block quadratic so we take the square root of 1024 = 32.
### QUESTION: Write down your data size, block size and timing data for the best GPU performance you can get.
#### 1. Blocksize = 32, gridsize = 32, N = 1024. Time (ms): 0.058720
#### 2. Blocksize = 16, gridsize = 64, N = 1024. Time (ms): 0.069376
#### 3. Blocksize = 8, gridsize = 128, N = 1024. Time (ms): 0.069440
### QUESTION: How much performance did you lose by making data accesses non-coalesced?
For coalesced data: 0.058720 ms
For non coalesced data: 0.178624 ms
Took about 3x the time.
## Lab 5
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment