Lab 4 almost done

e88b0a1c · ylvse560 · d22e26e6 · e88b0a1c · e88b0a1c · e88b0a1c
Commit e88b0a1c authored 4 years ago by ylvse560
--- a/Lab4/fractalout.ppm
+++ b/Lab4/fractalout.ppm
--- a/Lab4/interactiveMandelbrot.cpp
+++ b/Lab4/interactiveMandelbrot.cpp
+// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
+// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
+
+// Compile with:
+// gcc interactiveMandelbrot.cpp -shared-libgcc -lstdc++-static  -o interactiveMandelbrot -lglut -lGL
+// or
+// g++ interactiveMandelbrot.cpp -o interactiveMandelbrot -lglut -lGL
+
+// Your CUDA version should compile with something like
+// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
+
+// Preliminary version 2014-11-30
+// Cleaned a bit more 2014-12-01
+// Corrected the missing glRasterPos2i 2014-12-03
+
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#include <GLUT/glut.h>
+#else
+#include <GL/glut.h>
+#include <GL/gl.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+// Image data
+	unsigned char	*pixels = NULL;
+	int	 gImageWidth, gImageHeight;
+
+// Init image data
+void initBitmap(int width, int height)
+{
+	if (pixels) free(pixels);
+	pixels = (unsigned char *)malloc(width * height * 4);
+	gImageWidth = width;
+	gImageHeight = height;
+}
+
+#define DIM 512
+
+// Select precision here! float or double!
+#define MYFLOAT float
+
+// User controlled parameters
+int maxiter = 20;
+MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
+MYFLOAT scale = 1.5;
+
+// Complex number class
+struct cuComplex
+{
+    MYFLOAT   r;
+    MYFLOAT   i;
+    
+    cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b)  {}
+    
+    float magnitude2( void )
+    {
+        return r * r + i * i;
+    }
+    
+    cuComplex operator*(const cuComplex& a)
+    {
+        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
+    }
+    
+    cuComplex operator+(const cuComplex& a)
+    {
+        return cuComplex(r+a.r, i+a.i);
+    }
+};
+
+int mandelbrot( int x, int y)
+{
+    MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
+    MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
+
+    cuComplex c(jx, jy);
+    cuComplex a(jx, jy);
+
+    int i = 0;
+    for (i=0; i<maxiter; i++)
+    {
+        a = a * a + c;
+        if (a.magnitude2() > 1000)
+            return i;
+    }
+
+    return i;
+}
+
+void computeFractal( unsigned char *ptr)
+{
+    // map from x, y to pixel position
+    for (int x = 0; x < gImageWidth; x++)
+	    for (int y = 0; y < gImageHeight; y++)
+	    {
+		    int offset = x + y * gImageWidth;
+
+		    // now calculate the value at that position
+		    int fractalValue = mandelbrot( x, y);
+		    
+		    // Colorize it
+		    int red = 255 * fractalValue/maxiter;
+		    if (red > 255) red = 255 - red;
+		    int green = 255 * fractalValue*4/maxiter;
+		    if (green > 255) green = 255 - green;
+		    int blue = 255 * fractalValue*20/maxiter;
+		    if (blue > 255) blue = 255 - blue;
+		    
+		    ptr[offset*4 + 0] = red;
+		    ptr[offset*4 + 1] = green;
+		    ptr[offset*4 + 2] = blue;
+		    
+		    ptr[offset*4 + 3] = 255;
+    	}
+}
+
+char print_help = 0;
+
+// Yuck, GLUT text is old junk that should be avoided... but it will have to do
+static void print_str(void *font, const char *string)
+{
+	int i;
+
+	for (i = 0; string[i]; i++)
+		glutBitmapCharacter(font, string[i]);
+}
+
+void PrintHelp()
+{
+	if (print_help)
+	{
+		glPushMatrix();
+		glLoadIdentity();
+		glOrtho(-0.5, 639.5, -0.5, 479.5, -1.0, 1.0);
+
+		glEnable(GL_BLEND);
+		glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+		glColor4f(0.f, 0.f, 0.5f, 0.5f);
+		glRecti(40, 40, 600, 440);
+
+		glColor3f(1.f, 1.f, 1.f);
+		glRasterPos2i(300, 420);
+		print_str(GLUT_BITMAP_HELVETICA_18, "Help");
+
+		glRasterPos2i(60, 390);
+		print_str(GLUT_BITMAP_HELVETICA_18, "h - Toggle Help");
+		glRasterPos2i(60, 300);
+		print_str(GLUT_BITMAP_HELVETICA_18, "Left click + drag - move picture");
+		glRasterPos2i(60, 270);
+		print_str(GLUT_BITMAP_HELVETICA_18,
+		    "Right click + drag up/down - unzoom/zoom");
+		glRasterPos2i(60, 240);
+		print_str(GLUT_BITMAP_HELVETICA_18, "+ - Increase max. iterations by 32");
+		glRasterPos2i(60, 210);
+		print_str(GLUT_BITMAP_HELVETICA_18, "- - Decrease max. iterations by 32");
+		glRasterPos2i(0, 0);
+
+		glDisable(GL_BLEND);
+		
+		glPopMatrix();
+	}
+}
+
+// Compute fractal and display image
+void Draw()
+{
+	computeFractal(pixels);
+	
+// Dump the whole picture onto the screen. (Old-style OpenGL but without lots of geometry that doesn't matter so much.)
+	glClearColor( 0.0, 0.0, 0.0, 1.0 );
+	glClear( GL_COLOR_BUFFER_BIT );
+	glDrawPixels( gImageWidth, gImageHeight, GL_RGBA, GL_UNSIGNED_BYTE, pixels );
+	
+	if (print_help)
+		PrintHelp();
+	
+	glutSwapBuffers();
+}
+
+char explore = 1;
+
+static void Reshape(int width, int height)
+{
+	glViewport(0, 0, width, height);
+	glLoadIdentity();
+	glOrtho(-0.5f, width - 0.5f, -0.5f, height - 0.5f, -1.f, 1.f);
+	initBitmap(width, height);
+
+	glutPostRedisplay();
+}
+
+int mouse_x, mouse_y, mouse_btn;
+
+// Mouse down
+static void mouse_button(int button, int state, int x, int y)
+{
+	if (state == GLUT_DOWN)
+	{
+		// Record start position
+		mouse_x = x;
+		mouse_y = y;
+		mouse_btn = button;
+	}
+}
+
+// Drag mouse
+static void mouse_motion(int x, int y)
+{
+	if (mouse_btn == 0)
+	// Ordinary mouse button - move
+	{
+		offsetx += (x - mouse_x)*scale;
+		mouse_x = x;
+		offsety += (mouse_y - y)*scale;
+		mouse_y = y;
+		
+		glutPostRedisplay();
+	}
+	else
+	// Alt mouse button - scale
+	{
+		scale *= pow(1.1, y - mouse_y);
+		mouse_y = y;
+		glutPostRedisplay();
+	}
+}
+
+void KeyboardProc(unsigned char key, int x, int y)
+{
+	switch (key)
+	{
+	case 27: /* Escape key */
+	case 'q':
+	case 'Q':
+		exit(0);
+		break;
+	case '+':
+		maxiter += maxiter < 1024 - 32 ? 32 : 0;
+		break;
+	case '-':
+		maxiter -= maxiter > 0 + 32 ? 32 : 0;
+		break;
+	case 'h':
+		print_help = !print_help;
+		break;
+	}
+	glutPostRedisplay();
+}
+
+// Main program, inits
+int main( int argc, char** argv) 
+{
+	glutInit(&argc, argv);
+	glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
+	glutInitWindowSize( DIM, DIM );
+	glutCreateWindow("Mandelbrot explorer (CPU)");
+	glutDisplayFunc(Draw);
+	glutMouseFunc(mouse_button);
+	glutMotionFunc(mouse_motion);
+	glutKeyboardFunc(KeyboardProc);
+	glutReshapeFunc(Reshape);
+	
+	initBitmap(DIM, DIM);
+	
+	glutMainLoop();
+}
--- a/Lab4/mandelbrot.cu
+++ b/Lab4/mandelbrot.cu
+// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
+// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
+
+// Compile with:
+// gcc interactiveMandelbrot.cpp -shared-libgcc -lstdc++-static  -o interactiveMandelbrot -lglut -lGL
+// or
+// g++ interactiveMandelbrot.cpp -o interactiveMandelbrot -lglut -lGL
+
+// Your CUDA version should compile with something like
+// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
+
+// Preliminary version 2014-11-30
+// Cleaned a bit more 2014-12-01
+// Corrected the missing glRasterPos2i 2014-12-03
+
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#include <GLUT/glut.h>
+#else
+#include <GL/glut.h>
+#include <GL/gl.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+// Image data
+	unsigned char	*pixels = NULL;
+	unsigned char	*pixelsCUDA = NULL;
+	int	 gImageWidth, gImageHeight;
+	int size = gImageWidth * gImageHeight * 4;
+
+	#define DIM 512
+
+	// Select precision here! float or double!
+	#define MYFLOAT float
+
+	// User controlled parameters
+	int maxiter = 20;
+	MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
+	MYFLOAT scale = 1.5;
+
+	const int blocksize = 32;
+	const int gridsize = 32;
+
+// Init image data
+void initBitmap(int width, int height)
+{
+	if (pixels) free(pixels);
+	pixels = (unsigned char *)malloc(size);
+	cudaMalloc( (void**)&pixelsCUDA, size );
+	cudaMemcpy( pixelsCUDA, pixels, size, cudaMemcpyHostToDevice );
+	gImageWidth = width;
+	gImageHeight = height;
+}
+
+
+
+// Complex number class
+
+struct cuComplex
+{
+    MYFLOAT   r;
+    MYFLOAT   i;
+		__device__
+    cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b)  {}
+		__device__
+    float magnitude2( void )
+    {
+        return r * r + i * i;
+    }
+		__device__
+    cuComplex operator*(const cuComplex& a)
+    {
+        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
+    }
+		__device__
+    cuComplex operator+(const cuComplex& a)
+    {
+        return cuComplex(r+a.r, i+a.i);
+    }
+};
+
+__device__
+int mandelbrot( int x, int y, int gImageWidth, int gImageHeight, int maxiter,
+								MYFLOAT offsetx = -200, MYFLOAT offsety = 0, MYFLOAT scale = 1.5)
+{
+    MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
+    MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
+
+    cuComplex c(jx, jy);
+    cuComplex a(jx, jy);
+
+    int i = 0;
+    for (i=0; i<maxiter; i++)
+    {
+        a = a * a + c;
+        if (a.magnitude2() > 1000)
+            return i;
+    }
+
+    return i;
+}
+
+__global__
+void computeFractal(unsigned char *ptr, int gImageWidth, int gImageHeight, int maxiter,
+										MYFLOAT offsetx = -200, MYFLOAT offsety = 0,  MYFLOAT zoom = 0, MYFLOAT scale = 1.5)
+{
+    // map from x, y to pixel position
+		int x = blockIdx.x * blockDim.x + threadIdx.x;
+		int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (x < gImageWidth && y < gImageWidth) {
+	    int offset = x + y * gImageWidth;
+			int fractalValue = mandelbrot(x,y,gImageWidth, gImageHeight, maxiter);
+
+	    // Colorize it
+	    int red = 255 * fractalValue/maxiter;
+	    if (red > 255) red = 255 - red;
+	    int green = 255 * fractalValue*4/maxiter;
+	    if (green > 255) green = 255 - green;
+	    int blue = 255 * fractalValue*20/maxiter;
+	    if (blue > 255) blue = 255 - blue;
+
+	    ptr[offset*4 + 0] = red;
+	    ptr[offset*4 + 1] = green;
+	    ptr[offset*4 + 2] = blue;
+
+	    ptr[offset*4 + 3] = 255;
+		}
+
+}
+
+char print_help = 0;
+
+// Yuck, GLUT text is old junk that should be avoided... but it will have to do
+static void print_str(void *font, const char *string)
+{
+	int i;
+
+	for (i = 0; string[i]; i++)
+		glutBitmapCharacter(font, string[i]);
+}
+
+void PrintHelp()
+{
+	if (print_help)
+	{
+		glPushMatrix();
+		glLoadIdentity();
+		glOrtho(-0.5, 639.5, -0.5, 479.5, -1.0, 1.0);
+
+		glEnable(GL_BLEND);
+		glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+		glColor4f(0.f, 0.f, 0.5f, 0.5f);
+		glRecti(40, 40, 600, 440);
+
+		glColor3f(1.f, 1.f, 1.f);
+		glRasterPos2i(300, 420);
+		print_str(GLUT_BITMAP_HELVETICA_18, "Help");
+
+		glRasterPos2i(60, 390);
+		print_str(GLUT_BITMAP_HELVETICA_18, "h - Toggle Help");
+		glRasterPos2i(60, 300);
+		print_str(GLUT_BITMAP_HELVETICA_18, "Left click + drag - move picture");
+		glRasterPos2i(60, 270);
+		print_str(GLUT_BITMAP_HELVETICA_18,
+		    "Right click + drag up/down - unzoom/zoom");
+		glRasterPos2i(60, 240);
+		print_str(GLUT_BITMAP_HELVETICA_18, "+ - Increase max. iterations by 32");
+		glRasterPos2i(60, 210);
+		print_str(GLUT_BITMAP_HELVETICA_18, "- - Decrease max. iterations by 32");
+		glRasterPos2i(0, 0);
+
+		glDisable(GL_BLEND);
+
+		glPopMatrix();
+	}
+}
+
+// Compute fractal and display image
+void Draw()
+{
+
+// Dump the whole picture onto the screen. (Old-style OpenGL but without lots of geometry that doesn't matter so much.)
+	glClearColor( 0.0, 0.0, 0.0, 1.0 );
+	glClear( GL_COLOR_BUFFER_BIT );
+	glDrawPixels( gImageWidth, gImageHeight, GL_RGBA, GL_UNSIGNED_BYTE, pixels );
+
+	if (print_help)
+		PrintHelp();
+
+	glutSwapBuffers();
+}
+
+char explore = 1;
+
+static void Reshape(int width, int height)
+{
+	glViewport(0, 0, width, height);
+	glLoadIdentity();
+	glOrtho(-0.5f, width - 0.5f, -0.5f, height - 0.5f, -1.f, 1.f);
+	initBitmap(width, height);
+
+	glutPostRedisplay();
+}
+
+int mouse_x, mouse_y, mouse_btn;
+
+// Mouse down
+static void mouse_button(int button, int state, int x, int y)
+{
+	if (state == GLUT_DOWN)
+	{
+		// Record start position
+		mouse_x = x;
+		mouse_y = y;
+		mouse_btn = button;
+	}
+}
+
+// Drag mouse
+static void mouse_motion(int x, int y)
+{
+	if (mouse_btn == 0)
+	// Ordinary mouse button - move
+	{
+		offsetx += (x - mouse_x)*scale;
+		mouse_x = x;
+		offsety += (mouse_y - y)*scale;
+		mouse_y = y;
+
+		glutPostRedisplay();
+	}
+	else
+	// Alt mouse button - scale
+	{
+		scale *= pow(1.1, y - mouse_y);
+		mouse_y = y;
+		glutPostRedisplay();
+	}
+}
+
+void KeyboardProc(unsigned char key, int x, int y)
+{
+	switch (key)
+	{
+	case 27: /* Escape key */
+	case 'q':
+	case 'Q':
+		exit(0);
+		break;
+	case '+':
+		maxiter += maxiter < 1024 - 32 ? 32 : 0;
+		break;
+	case '-':
+		maxiter -= maxiter > 0 + 32 ? 32 : 0;
+		break;
+	case 'h':
+		print_help = !print_help;
+		break;
+	}
+	glutPostRedisplay();
+}
+
+// Main program, inits
+int main( int argc, char** argv)
+{
+	cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  printf("Device Number: %d\n", 0);
+  printf("  Device name: %s\n", prop.name);
+  printf("  Memory Clock Rate (KHz): %d\n",
+         prop.memoryClockRate);
+  printf("  Memory Bus Width (bits): %d\n",
+         prop.memoryBusWidth);
+  printf("  Peak Memory Bandwidth (GB/s): %f\n",
+         2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+  printf("  Max threads per block: %i\n",
+         prop.maxThreadsPerBlock);
+	printf("  Max threads dim: %i x %i x %i\n",
+         prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
+  printf("  Max grid size: %i x %i x %i\n",
+         prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );
+/*	glutInit(&argc, argv);
+	glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
+	glutInitWindowSize( DIM, DIM );
+	glutCreateWindow("Mandelbrot explorer (CPU)");
+	glutDisplayFunc(Draw);
+	glutMouseFunc(mouse_button);
+	glutMotionFunc(mouse_motion);
+	glutKeyboardFunc(KeyboardProc);
+	glutReshapeFunc(Reshape);
+	glutMainLoop();
+*/
+	initBitmap(DIM, DIM);
+	dim3 dimBlock( blocksize, blocksize );
+	dim3 dimGrid( gridsize, gridsize );
+	computeFractal<<<dimGrid, dimBlock>>>(pixels, gImageWidth, gImageHeight, maxiter);
+	cudaMemcpy( pixels, pixelsCUDA, size, cudaMemcpyDeviceToHost );
+
+
+	cudaFree( pixelsCUDA );
+}
--- a/Lab4/matrix_cpu.c
+++ b/Lab4/matrix_cpu.c
@@ -2,6 +2,7 @@
 // gcc matrix_cpu.c -o matrix_cpu -std=c99

 #include <stdio.h>
+#include <stdlib.h>
 #include "milli.h"

 void add_matrix(float *a, float *b, float *c, int N)
@@ -16,13 +17,15 @@ void add_matrix(float *a, float *b, float *c, int N)
 		}
 }

-int main()
+int main(int argc, char *argv[])
 {
-	const int N = 16;
+	char *argument = argv[argc-1];
+  int N = atoi(argument);
+  printf("Running on CPU with no of elements: %i \n",N);

-	float a[N*N];
-	float b[N*N];
-	float c[N*N];
+	float *a = malloc(N*N*sizeof(float));
+	float *b = malloc(N*N*sizeof(float));
+	float *c = malloc(N*N*sizeof(float));

 	for (int i = 0; i < N; i++)
 		for (int j = 0; j < N; j++)
@@ -32,8 +35,8 @@ int main()
 		}
 	ResetMilli();
 	add_matrix(a, b, c, N);
-	printf("%s%i\n","Measured time (us): ", GetMicroseconds());
-
+	printf("%s%f\n","Measured time (ms): ", (float)GetMicroseconds()*0.001);
+/*
 	for (int i = 0; i < N; i++)
 	{
 		for (int j = 0; j < N; j++)
@@ -42,4 +45,8 @@ int main()
 		}
 		printf("\n");
 	}
+	*/
+	free(a);
+	free(b);
+	free(c);
 }
--- a/Lab4/matrix_gpu.cu
+++ b/Lab4/matrix_gpu.cu
@@ -6,25 +6,26 @@
 #include <stdio.h>
 #include <cmath>

-const int N = 1024;
-const int blocksize = 32; //threads
-const int gridsize = 32; //blocks
-
 __global__
 void add(float* result, float *c, float *d, int N)
 {
 	int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	int idy = blockIdx.y * blockDim.y + threadIdx.y;
 	int index = idx + idy * N;
-	
+
 	if (idx < N && idy < N) {
 			result[index] = c[index] + d[index];
 	}

 }

-int main()
+int main(int argc, char *argv[])
 {
+	char *argument = argv[argc-1];
+  int N = atoi(argument);
+	int blocksize = 32; //threads
+	int gridsize = 32; //blocks
+  printf("Running on GPU with no of elements: %i \n",N);
 	float *a = new float[N*N];
 	float *b = new float[N*N];
 	float *result = new float[N*N];
@@ -97,8 +98,8 @@ int main()
         prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
  printf("  Max grid size: %i x %i x %i\n",
         prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );
-	delete []a;
-	delete []b;
+	delete[] a;
+	delete[] b;
 	printf("done\n");
 	return EXIT_SUCCESS;
 }
--- a/Lab4/noninteractiveMandelbrot.cpp
+++ b/Lab4/noninteractiveMandelbrot.cpp
+// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
+// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
+
+// Emergency version for use without graphics output. I recommend the interactive version if you can.
+// Preliminary! I will add timing using milli.c.
+
+// Compile with:
+// g++ noninteractiveMandelbrot.cpp readppm.cpp -o noninteractiveMandelbrot
+
+// Your CUDA version should compile with something like
+// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
+// See lecture notes on how to compile and link with the other code.
+// Note: A Simple trick is to #include the other files.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "readppm.h"
+// uses framework Cocoa
+
+// Image data
+	unsigned char	*pixels = NULL;
+	int	 gImageWidth, gImageHeight;
+
+// Init image data
+void initBitmap(int width, int height)
+{
+	if (pixels) free(pixels);
+	pixels = (unsigned char *)malloc(width * height * 4);
+	gImageWidth = width;
+	gImageHeight = height;
+}
+
+#define DIM 512
+
+// Select precision here! float or double!
+#define MYFLOAT float
+
+// User controlled parameters
+int maxiter = 20;
+MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
+MYFLOAT scale = 1.5;
+
+// Complex number class
+struct cuComplex
+{
+    MYFLOAT   r;
+    MYFLOAT   i;
+    
+    cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b)  {}
+    
+    float magnitude2( void )
+    {
+        return r * r + i * i;
+    }
+    
+    cuComplex operator*(const cuComplex& a)
+    {
+        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
+    }
+    
+    cuComplex operator+(const cuComplex& a)
+    {
+        return cuComplex(r+a.r, i+a.i);
+    }
+};
+
+int mandelbrot( int x, int y)
+{
+    MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
+    MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
+
+    cuComplex c(jx, jy);
+    cuComplex a(jx, jy);
+
+    int i = 0;
+    for (i=0; i<maxiter; i++)
+    {
+        a = a * a + c;
+        if (a.magnitude2() > 1000)
+            return i;
+    }
+
+    return i;
+}
+
+void computeFractal( unsigned char *ptr)
+{
+    // map from x, y to pixel position
+    for (int x = 0; x < gImageWidth; x++)
+	    for (int y = 0; y < gImageHeight; y++)
+	    {
+		    int offset = x + y * gImageWidth;
+
+		    // now calculate the value at that position
+		    int fractalValue = mandelbrot( x, y);
+		    
+		    // Colorize it
+		    int red = 255 * fractalValue/maxiter;
+		    if (red > 255) red = 255 - red;
+		    int green = 255 * fractalValue*4/maxiter;
+		    if (green > 255) green = 255 - green;
+		    int blue = 255 * fractalValue*20/maxiter;
+		    if (blue > 255) blue = 255 - blue;
+		    
+		    ptr[offset*4 + 0] = red;
+		    ptr[offset*4 + 1] = green;
+		    ptr[offset*4 + 2] = blue;
+		    
+		    ptr[offset*4 + 3] = 255;
+    	}
+}
+
+// Main program, inits
+int main( int argc, char** argv) 
+{
+	initBitmap(DIM, DIM);
+	computeFractal(pixels);
+// Dump to PPM
+	writeppm("fractalout.ppm", DIM, DIM, pixels);
+}
--- a/Lab4/noninteractiveMandelbrot.cu
+++ b/Lab4/noninteractiveMandelbrot.cu
+// Mandelbrot explorer, based on my old Julia demo plus parts of Nicolas Melot's Lab 1 code.
+// CPU only! Your task: Rewrite for CUDA! Test and evaluate performance.
+
+// Emergency version for use without graphics output. I recommend the interactive version if you can.
+// Preliminary! I will add timing using milli.c.
+
+// Compile with:
+// g++ noninteractiveMandelbrot.cpp readppm.cpp -o noninteractiveMandelbrot
+
+// Your CUDA version should compile with something like
+// nvcc -lglut -lGL interactiveMandelbrotCUDA.cu -o interactiveMandelbrotCUDA
+// See lecture notes on how to compile and link with the other code.
+// Note: A Simple trick is to #include the other files.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "readppm.h"
+// uses framework Cocoa
+
+// Image data
+	unsigned char	*pixels = NULL;
+	unsigned char	*pixelsCUDA = NULL;
+
+	#define DIM 512
+
+	int size = DIM * DIM * 4;
+
+	// Select precision here! float or double!
+	#define MYFLOAT float
+
+	// User controlled parameters
+	int maxiter = 20;
+	MYFLOAT offsetx = -200, offsety = 0, zoom = 0;
+	MYFLOAT scale = 1.5;
+
+	const int blocksize = 32;
+	const int gridsize = 16;
+
+// Init image data
+void initBitmap(int width, int height)
+{
+	pixels = (unsigned char *)malloc(width*height*4);
+	for(int i = 0; i < size; i++)
+		pixels[i] = 255;
+	cudaMalloc( (void**)&pixelsCUDA, size );
+	cudaMemcpy( pixelsCUDA, pixels, size, cudaMemcpyHostToDevice );
+}
+
+// Complex number class
+
+struct cuComplex
+{
+    MYFLOAT   r;
+    MYFLOAT   i;
+		__device__
+    cuComplex( MYFLOAT a, MYFLOAT b ) : r(a), i(b)  {}
+		__device__
+    float magnitude2( void )
+    {
+        return r * r + i * i;
+    }
+		__device__
+    cuComplex operator*(const cuComplex& a)
+    {
+        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
+    }
+		__device__
+    cuComplex operator+(const cuComplex& a)
+    {
+        return cuComplex(r+a.r, i+a.i);
+    }
+};
+
+__device__
+int mandelbrot( int x, int y, int gImageWidth, int gImageHeight, int maxiter,
+								MYFLOAT offsetx = -200, MYFLOAT offsety = 0, MYFLOAT scale = 1.5)
+{
+    MYFLOAT jx = scale * (MYFLOAT)(gImageWidth/2 - x + offsetx/scale)/(gImageWidth/2);
+    MYFLOAT jy = scale * (MYFLOAT)(gImageHeight/2 - y + offsety/scale)/(gImageWidth/2);
+
+    cuComplex c(jx, jy);
+    cuComplex a(jx, jy);
+
+    int i = 0;
+    for (i=0; i<maxiter; i++)
+    {
+        a = a * a + c;
+        if (a.magnitude2() > 1000)
+            return i;
+    }
+
+    return i;
+}
+
+__global__
+void computeFractal(unsigned char *data, int gImageWidth, int gImageHeight, int maxiter,
+										MYFLOAT offsetx = -200, MYFLOAT offsety = 0,  MYFLOAT zoom = 0, MYFLOAT scale = 1.5)
+{
+    // map from x, y to pixel position
+		int x = blockIdx.x * blockDim.x + threadIdx.x;
+		int y = blockIdx.y * blockDim.y + threadIdx.y;
+		int index = x + y * gImageWidth;
+
+
+
+			int fractalValue = mandelbrot(x,y,gImageWidth, gImageHeight, maxiter);
+
+	    // Colorize it
+	    int red = 255 * fractalValue/maxiter;
+	    if (red > 255) red = 255 - red;
+	    	int green = 255 * fractalValue*4/maxiter;
+	    if (green > 255) green = 255 - green;
+	    	int blue = 255 * fractalValue*20/maxiter;
+	    if (blue > 255) blue = 255 - blue;
+
+			data[index*4 + 0] = red;
+    	data[index*4 + 1] = green;
+    	data[index*4 + 2] = blue;
+ 	    data[index*4 + 3] = 255;
+}
+
+
+// Main program, inits
+int main( int argc, char** argv)
+{
+	cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  printf("Device Number: %d\n", 0);
+  printf("  Device name: %s\n", prop.name);
+  printf("  Memory Clock Rate (KHz): %d\n",
+         prop.memoryClockRate);
+  printf("  Memory Bus Width (bits): %d\n",
+         prop.memoryBusWidth);
+  printf("  Peak Memory Bandwidth (GB/s): %f\n",
+         2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+  printf("  Max threads per block: %i\n",
+         prop.maxThreadsPerBlock);
+	printf("  Max threads dim: %i x %i x %i\n",
+         prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
+  printf("  Max grid size: %i x %i x %i\n",
+         prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );
+	initBitmap(DIM, DIM);
+	dim3 dimBlock( blocksize, blocksize );
+	dim3 dimGrid( gridsize, gridsize );
+	computeFractal<<<dimGrid, dimBlock>>>(pixelsCUDA, DIM, DIM, maxiter);
+	cudaError_t err = cudaGetLastError();
+	 if (err != cudaSuccess)
+			 printf("Error: %s\n", cudaGetErrorString(err));
+	cudaMemcpy( pixels, pixelsCUDA, size, cudaMemcpyDeviceToHost );
+	err = cudaGetLastError();
+	 if (err != cudaSuccess)
+			 printf("Error: %s\n", cudaGetErrorString(err));
+	cudaFree( pixelsCUDA );
+	err = cudaGetLastError();
+	 if (err != cudaSuccess)
+			 printf("Error: %s\n", cudaGetErrorString(err));
+
+// Dump to PPM
+	writeppm("fractalout.ppm", DIM, DIM, pixels);
+	delete[] pixels;
+}
--- a/Lab4/readppm.cpp
+++ b/Lab4/readppm.cpp
+// Cut-down version without texture loading
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "readppm.h"
+
+// By Ingemar Ragnemalm
+// based on a ppm reader by Edward Angel,
+// Added support for binary PPM (2002?)
+// 2006: Fixed a bug that caused some files to end prematurely
+// and added a PPM writer
+// for making screen shots and other output (i.e. GPGPU uses)
+// Added writeppm. (Writes in ASCII format only.)
+// 2009: Added readppmtexture. (A writer for a texture would also be possible when needed.)
+// Portable #include.
+
+int writeppm(char *filename, int height, int width, unsigned char *data)
+{
+	FILE *fp;
+	int error = 1;
+	int i, h, v;
+
+	if (filename != NULL)
+	{
+		fp = fopen(filename,"w");
+		
+		if (fp != NULL)
+		{
+			// Write PPM file
+			// Header	
+			fprintf(fp, "P3\n");
+			fprintf(fp, "# written by Ingemars PPM writer\n");
+			fprintf(fp, "%d %d\n", height, width);
+			fprintf(fp, "%d\n", 255); // range
+			
+			// Data
+			for (v = height-1; v >=0; v--)
+			{
+				for (h = 0; h < width; h++)
+				{
+					i = (width*v + h)*4; // assumes rgba, not rgba
+					fprintf(fp, "%d %d %d ", data[i], data[i+1], data[i+2]);
+				}
+				fprintf(fp, "\n"); // range
+			}
+			
+			if (fwrite("\n",sizeof(char),1,fp) == 1)
+				error = 0; // Probable success
+			fclose(fp);
+		}
+	}
+	return(error);
+}
+
+
+
+unsigned char *readppm(char *filename, int *height, int *width)
+{
+	FILE *fd;
+	int  k, nm;
+	char c;
+	int i,j;
+	char b[100];
+	float s;
+	int red, green, blue;
+	long numbytes, howmuch;
+	int n;
+	int m;
+	unsigned char *image;
+	
+	fd = fopen(filename, "rb");
+	if (fd == NULL)
+	{
+		printf("Could not open %s\n", filename);
+		return NULL;
+	}
+	c = getc(fd);
+	if (c=='P' || c=='p')
+		c = getc(fd);
+	
+	if (c == '3')
+	{
+		printf("%s is a PPM file (plain text version)\n", filename);
+		
+		// NOTE: This is not very good PPM code! Comments are not allowed
+		// except immediately after the magic number.
+		c = getc(fd);
+		if (c == '\n' || c == '\r') // Skip any line break and comments
+		{
+			c = getc(fd);
+			while(c == '#') 
+			{
+				fscanf(fd, "%[^\n\r] ", b);
+				printf("%s\n",b);
+				c = getc(fd);
+			}
+			ungetc(c,fd); 
+		}
+		fscanf(fd, "%d %d %d", &n, &m, &k);
+		
+		printf("%d rows  %d columns  max value= %d\n",n,m,k);
+		
+		numbytes = n * m * 3;
+		image = (unsigned char *) malloc(numbytes);
+		if (image == NULL)
+		{
+			printf("Memory allocation failed!\n"); 
+			return NULL;
+		}
+		for(i=n-1;i>=0;i--) for(j=0;j<m;j++)
+		{
+			fscanf(fd,"%d %d %d",&red, &green, &blue );
+			image[(i*n+j)*3]=red * 255 / k;
+			image[(i*n+j)*3+1]=green * 255 / k;
+			image[(i*n+j)*3+2]=blue * 255 / k;
+		}
+	}
+	else
+	if (c == '6')
+	{
+		printf("%s is a PPM file (raw version)!\n", filename); 
+		
+		c = getc(fd);
+		if (c == '\n' || c == '\r') // Skip any line break and comments
+		{
+			c = getc(fd);
+			while(c == '#') 
+			{
+				fscanf(fd, "%[^\n\r] ", b);
+				printf("%s\n",b);
+				c = getc(fd);
+			}
+			ungetc(c,fd); 
+		}
+		fscanf(fd, "%d %d %d", &n, &m, &k);
+		printf("%d rows  %d columns  max value= %d\n",n,m,k);
+		c = getc(fd); // Skip the last whitespace
+		
+		numbytes = n * m * 3;
+		image = (unsigned char *) malloc(numbytes);
+		if (image == NULL)
+		{
+			printf("Memory allocation failed!\n"); 
+			return NULL;
+		}
+		// Read and re-order as necessary
+		for(i=n-1;i>=0;i--) for(j=0;j<m;j++)
+		{
+			image[(i*n+j)*3+0]=getc(fd);
+			image[(i*n+j)*3+1]=getc(fd);
+			image[(i*n+j)*3+2]=getc(fd);
+		}
+	}
+	else
+	{
+		printf("%s is not a PPM file!\n", filename); 
+		return NULL;
+	}
+	
+	printf("read image\n");
+	
+	*height = n;
+	*width = m;
+	return image;
+}
--- a/Lab4/readppm.h
+++ b/Lab4/readppm.h
+#ifndef READ_PPM
+#define READ_PPM
+
+int writeppm(char *filename, int height, int width, unsigned char *data);
+unsigned char *readppm(char *filename, int *height, int *width);
+
+#endif
--- a/README.md
+++ b/README.md
@@ -70,6 +70,28 @@ int index = idy * blockDim.x + idx;
 result[index] = c[index] + d[index];

 ### QUESTION: What happens if you use too many threads per block?
+There is an error in CUDA and the kernel is not run properly - undefined behavior.
+
+### QUESTION: At what data size is the GPU faster than the CPU?
+At N = 32, the CPU is faster. At N = 64 the GPU is faster.
+
+### QUESTION: What block size seems like a good choice? Compared to what?
+We have set the block size to 32. The maximum number of threads per block in Olympen GPU:s is 1024 threads. We would like to have the block quadratic so we take the square root of 1024 = 32.
+
+### QUESTION: Write down your data size, block size and timing data for the best GPU performance you can get.
+#### 1. Blocksize = 32, gridsize = 32, N = 1024. Time (ms): 0.058720
+#### 2. Blocksize = 16, gridsize = 64, N = 1024. Time (ms): 0.069376
+#### 3. Blocksize = 8, gridsize = 128, N = 1024. Time (ms): 0.069440
+
+### QUESTION: How much performance did you lose by making data accesses non-coalesced?
+For coalesced data: 0.058720 ms
+For non coalesced data: 0.178624 ms
+Took about 3x the time.
+
+
+
+
+

 ## Lab 5