Skip to content
Snippets Groups Projects
Commit a7cbd9d1 authored by ylvse560's avatar ylvse560
Browse files

Lab 4 init

parent 19e864d6
No related branches found
No related tags found
No related merge requests found
// Matrix addition, CPU version
// gcc matrix_cpu.c -o matrix_cpu -std=c99
#include <stdio.h>
#include "milli.h"
// Element-wise sum of two N-by-N matrices: c = a + b.
// All three matrices are stored as flat arrays of N*N floats.
void add_matrix(float *a, float *b, float *c, int N)
{
	const int total = N * N;
	// Every (i, j) pair in the original nested loops touches each flat
	// index exactly once, so a single linear pass is equivalent.
	for (int k = 0; k < total; k++)
		c[k] = a[k] + b[k];
}
// Driver: builds two N x N test matrices, times add_matrix with the
// milli unit, and prints the result matrix.
int main()
{
	const int N = 16;
	float a[N*N];
	float b[N*N];
	float c[N*N];

	// Fill the inputs: a varies with the first index, b with the
	// (normalized) second index. Swapped loop order — same final state.
	for (int j = 0; j < N; j++)
		for (int i = 0; i < N; i++)
		{
			a[i+j*N] = 10 + i;
			b[i+j*N] = (float)j / N;
		}

	ResetMilli();
	add_matrix(a, b, c, N);
	printf("%s%i\n","Measured time (us): ", GetMicroseconds());

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
			printf("%0.2f ", c[i+j*N]);
		printf("\n");
	}
}
// Simple CUDA example by Ingemar Ragnemalm 2009. Simplest possible?
// Assigns every element in an array with its index.
// nvcc simple.cu -L /usr/local/cuda/lib -lcudart -o simple
#include <stdio.h>
#include <cmath>
const int N = 32;
// The launch uses a gridsize x gridsize grid of blocksize x blocksize
// blocks, so blocksize * gridsize must equal N to cover the N x N
// matrix exactly. The previous value (64) gave 64*64 = 4096 threads
// per block — above the hardware limit of 1024 — so the kernel launch
// failed, and 64*4 = 256 threads per axis would have indexed far
// outside the N x N arrays.
const int blocksize = 8;
const int gridsize = 4;
// Element-wise matrix addition: result = c + d.
// Expects a 2D launch whose total thread grid exactly covers the
// matrix (gridDim.x*blockDim.x columns by gridDim.y*blockDim.y rows).
__global__
void add(float* result, float *c, float *d)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int idy = blockIdx.y * blockDim.y + threadIdx.y;
	// The row pitch is the full grid width in threads, not blockDim.x:
	// with more than one block in x, "idy * blockDim.x + idx" makes
	// threads from different blocks collide on the same element.
	int width = gridDim.x * blockDim.x;
	int index = idy * width + idx;
	result[index] = c[index] + d[index];
}
// Driver: builds two N x N matrices on the host, adds them on the GPU
// with a 2D launch, times the kernel with CUDA events, prints the
// result and some properties of device 0.
int main()
{
	// Host-side matrices.
	float *a = new float[N*N];
	float *b = new float[N*N];
	float *result = new float[N*N];
	float theTime = 0.0;
	cudaEvent_t startEvent, finEvent;

	for (int i = 0; i < N; i++)
		for (int j = 0; j < N; j++)
		{
			a[i+j*N] = 10 + i;
			b[i+j*N] = (float)j / N;
			result[i+j*N] = 0.0;
		}

	// Device buffers.
	float *c;
	float *d;
	float *result_cuda;
	const int size = N*N*sizeof(float);
	cudaMalloc( (void**)&c, size );
	cudaMalloc( (void**)&d, size );
	cudaMalloc( (void**)&result_cuda, size );

	dim3 dimBlock( blocksize, blocksize );
	dim3 dimGrid( gridsize, gridsize );

	cudaMemcpy( c, a, size, cudaMemcpyHostToDevice );
	cudaMemcpy( d, b, size, cudaMemcpyHostToDevice );

	// Time the kernel with GPU-side event timestamps.
	cudaEventCreate(&startEvent);
	cudaEventCreate(&finEvent);
	cudaEventRecord(startEvent, 0);
	add<<<dimGrid, dimBlock>>>(result_cuda, c, d);
	// cudaThreadSynchronize() is deprecated (removed in CUDA 12);
	// cudaDeviceSynchronize() is the supported replacement and also
	// surfaces asynchronous kernel errors here.
	cudaDeviceSynchronize();
	cudaEventRecord(finEvent, 0);
	cudaEventSynchronize(finEvent);
	cudaEventElapsedTime(&theTime, startEvent, finEvent);

	cudaMemcpy( result, result_cuda, size, cudaMemcpyDeviceToHost );
	cudaFree( c );
	cudaFree( d );
	cudaFree( result_cuda );
	cudaEventDestroy(startEvent);  // events were previously leaked
	cudaEventDestroy(finEvent);

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			printf("%0.2f ", result[i+j*N]);
		}
		printf("\n");
	}

	// Launch failures are asynchronous and only show up via the
	// error-status API, so check it explicitly.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("Error: %s\n", cudaGetErrorString(err));

	printf("%s%f\n", "Time elapsed (ms): ", theTime);

	// Report some capabilities of device 0.
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	printf("Device Number: %d\n", 0);
	printf(" Device name: %s\n", prop.name);
	printf(" Memory Clock Rate (KHz): %d\n",
	prop.memoryClockRate);
	printf(" Memory Bus Width (bits): %d\n",
	prop.memoryBusWidth);
	printf(" Peak Memory Bandwidth (GB/s): %f\n",
	2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
	printf(" Max threads per block: %i\n",
	prop.maxThreadsPerBlock);
	printf(" Max threads dim: %i x %i x %i\n",
	prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
	printf(" Max grid size: %i x %i x %i\n",
	prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );

	delete []a;
	delete []b;
	delete []result;  // was leaked before
	printf("done\n");
	return EXIT_SUCCESS;
}
// Simple little unit for timing using the gettimeofday() call.
// By Ingemar 2009
#include <stdlib.h>
#include <sys/time.h>
#include "milli.h"
static struct timeval timeStart;
static char hasStart = 0;
// Whole milliseconds elapsed since the start time. If no start time
// has been set yet, this call establishes it and returns 0.
int GetMilliseconds()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	if (!hasStart)
	{
		hasStart = 1;
		timeStart = now;
	}
	long secs = now.tv_sec - timeStart.tv_sec;
	long usecs = now.tv_usec - timeStart.tv_usec;
	return (int)(secs * 1000 + usecs / 1000);
}
// Whole microseconds elapsed since the start time. If no start time
// has been set yet, this call establishes it and returns 0.
// NOTE(review): the int return value overflows after roughly 35
// minutes of elapsed time.
int GetMicroseconds()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	if (!hasStart)
	{
		hasStart = 1;
		timeStart = now;
	}
	long secs = now.tv_sec - timeStart.tv_sec;
	long usecs = now.tv_usec - timeStart.tv_usec;
	return (int)(secs * 1000000 + usecs);
}
// Seconds (with fractional part) elapsed since the start time. If no
// start time has been set yet, this call establishes it and returns 0.
double GetSeconds()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	if (!hasStart)
	{
		hasStart = 1;
		timeStart = now;
	}
	double usecs = (double)(now.tv_usec - timeStart.tv_usec);
	double secs = (double)(now.tv_sec - timeStart.tv_sec);
	return usecs / 1000000.0 + secs;
}
// If you want to start from right now.
// Restart the clock at the current moment.
void ResetMilli()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	timeStart = now;
	hasStart = 1;
}
// If you want to start from a specific time.
// Set the start time to an explicit (seconds, microseconds) instant.
void SetMilli(int seconds, int microseconds)
{
	struct timeval t;
	t.tv_sec = seconds;
	t.tv_usec = microseconds;
	timeStart = t;
	hasStart = 1;
}
// Public interface of the milli timing unit (gettimeofday-based).
#ifndef _MILLI_
#define _MILLI_
#ifdef __cplusplus
extern "C" {
#endif
// Elapsed time since the start time, as whole milliseconds.
int GetMilliseconds();
// Elapsed time since the start time, as whole microseconds.
int GetMicroseconds();
// Elapsed time since the start time, in (fractional) seconds.
double GetSeconds();
// Optional setting of the start time. If these are not used,
// the first call to the above functions will be the start time.
void ResetMilli();
void SetMilli(int seconds, int microseconds);
#ifdef __cplusplus
}
#endif
#endif
// Simple CUDA example by Ingemar Ragnemalm 2009. Simplest possible?
// Assigns every element in an array with its index.
// nvcc simple.cu -L /usr/local/cuda/lib -lcudart -o simple
#include <stdio.h>
#include <cmath>
const int N = 16;
const int blocksize = 16;
// Writes each element's own thread index into the array.
// Expects a 1D launch with a single block of (at least) N threads.
__global__
void simple(float *c)
{
	const int tid = threadIdx.x;
	c[tid] = tid;
}
// Replaces each element with its square root, one element per thread.
// Expects a 1D launch with one thread per array element.
__global__
void sqrt(float *c)
{
	// sqrtf is the explicit single-precision square root. The plain
	// sqrt overload can promote to double, which is slower on most
	// GPUs and may round differently from the float result.
	c[threadIdx.x] = sqrtf(c[threadIdx.x]);
}
// Driver: fills c[i] = i, prints the CPU square roots as a reference,
// runs the sqrt kernel on the GPU, and prints the GPU result for
// comparison.
int main()
{
	float *c = new float[N];
	for(int i = 0; i < N; i++) {
		c[i] = (float)i;
		printf("%f ", sqrt(c[i]));
	}
	printf("\n\n");

	float *cd;
	const int size = N*sizeof(float);
	cudaMalloc( (void**)&cd, size );
	dim3 dimBlock( blocksize, 1 );
	dim3 dimGrid( 1, 1 );
	cudaMemcpy( cd, c, size, cudaMemcpyHostToDevice );
	sqrt<<<dimGrid, dimBlock>>>(cd);
	// cudaThreadSynchronize() is deprecated (removed in CUDA 12);
	// cudaDeviceSynchronize() is the supported replacement.
	cudaDeviceSynchronize();
	cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost );
	cudaFree( cd );

	for (int i = 0; i < N; i++)
		printf("%f ", c[i]);
	printf("\n");
	delete[] c;
	printf("done\n");
	return EXIT_SUCCESS;

	// GIVEN VERSION
	/*
	float *c = new float[N];
	float *cd;
	const int size = N*sizeof(float);
	cudaMalloc( (void**)&cd, size );
	dim3 dimBlock( blocksize, 1 );
	dim3 dimGrid( 1, 1 );
	simple<<<dimGrid, dimBlock>>>(cd);
	cudaThreadSynchronize();
	cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost );
	cudaFree( cd );
	for (int i = 0; i < N; i++)
		printf("%f ", c[i]);
	printf("\n");
	delete[] c;
	printf("done\n");
	return EXIT_SUCCESS;
	*/
}
...@@ -39,6 +39,7 @@ GPU: Big problem sizes will be faster because there are many more cores in the G ...@@ -39,6 +39,7 @@ GPU: Big problem sizes will be faster because there are many more cores in the G
Especially for OpenCL, the bottleneck is loading the data from the CPU to the GPU. Especially for OpenCL, the bottleneck is loading the data from the CPU to the GPU.
#### Question 2.1: Which version of the averaging filter (unified, separable) is the most efficient? Why?
The separable version is the most efficient. Splitting the 2D filter into two 1D passes costs 2N operations per pixel, i.e. O(N), instead of the N² operations (O(N²)) of the unified filter. The work also stays fully data-parallel, since each pass can be run in parallel over all pixels.
#### Question 3.1: In data-parallel skeletons like MapOverlap, all elements are processed independently of each other. Is this a good fit for the median filter? Why/why not?
...@@ -48,6 +49,22 @@ Especially for OpenCL, the bottleneck is loading the data from the CPU to the GP ...@@ -48,6 +49,22 @@ Especially for OpenCL, the bottleneck is loading the data from the CPU to the GP
## Lab 4
### QUESTION: How many cores will simple.cu use, max, as written? How many SMs?
In simple.cu the grid dimension is 1 x 1, so there is only one block, giving a total of 16 threads. One block runs on one SM, and on this architecture one SM has 8 cores, so at most 8 cores (on 1 SM) are used.
### QUESTION: Is the calculated square root identical to what the CPU calculates? Should we assume that this is always the case?
Yes, it is in Olympen. But we should not assume that this is always the case on every GPU, since there are single-precision-only GPUs out there.
### QUESTION: How do you calculate the index in the array, using 2-dimensional blocks?
By adding index for the y-dimension:
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int index = idy * blockDim.x + idx;
result[index] = c[index] + d[index];
### QUESTION: What happens if you use too many threads per block?
The kernel launch fails with an "invalid configuration argument" error and the kernel never executes, because the hardware limit on threads per block (1024 on current GPUs, reported in maxThreadsPerBlock) is exceeded.
## Lab 5
## Lab 6
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment