Skip to content
Snippets Groups Projects
Commit a7cbd9d1 authored by ylvse560's avatar ylvse560
Browse files

Lab 4 init

parent 19e864d6
No related branches found
No related tags found
No related merge requests found
// Matrix addition, CPU version
// gcc matrix_cpu.c -o matrix_cpu -std=c99
#include <stdio.h>
#include "milli.h"
// Element-wise sum of two N-by-N matrices: c = a + b.
// All three matrices are stored as flat arrays of N*N floats.
void add_matrix(float *a, float *b, float *c, int N)
{
	const int total = N * N;
	// Every (i, j) pair in the original nested loops touches each flat
	// index exactly once, so a single linear pass is equivalent.
	for (int k = 0; k < total; k++)
		c[k] = a[k] + b[k];
}
// Driver: builds two N x N test matrices, times add_matrix with the
// milli unit, and prints the result matrix.
int main()
{
	const int N = 16;
	float a[N*N];
	float b[N*N];
	float c[N*N];

	// Fill the inputs: a varies with the first index, b with the
	// (normalized) second index. Swapped loop order — same final state.
	for (int j = 0; j < N; j++)
		for (int i = 0; i < N; i++)
		{
			a[i+j*N] = 10 + i;
			b[i+j*N] = (float)j / N;
		}

	ResetMilli();
	add_matrix(a, b, c, N);
	printf("%s%i\n","Measured time (us): ", GetMicroseconds());

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
			printf("%0.2f ", c[i+j*N]);
		printf("\n");
	}
}
// Simple CUDA example by Ingemar Ragnemalm 2009. Simplest possible?
// Assigns every element in an array with its index.
// nvcc simple.cu -L /usr/local/cuda/lib -lcudart -o simple
#include <stdio.h>
#include <cmath>
const int N = 32;
// The launch uses a gridsize x gridsize grid of blocksize x blocksize
// blocks, so blocksize * gridsize must equal N to cover the N x N
// matrix exactly. The previous value (64) gave 64*64 = 4096 threads
// per block — above the hardware limit of 1024 — so the kernel launch
// failed, and 64*4 = 256 threads per axis would have indexed far
// outside the N x N arrays.
const int blocksize = 8;
const int gridsize = 4;
// Element-wise matrix addition: result = c + d.
// Expects a 2D launch whose total thread grid exactly covers the
// matrix (gridDim.x*blockDim.x columns by gridDim.y*blockDim.y rows).
__global__
void add(float* result, float *c, float *d)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int idy = blockIdx.y * blockDim.y + threadIdx.y;
	// The row pitch is the full grid width in threads, not blockDim.x:
	// with more than one block in x, "idy * blockDim.x + idx" makes
	// threads from different blocks collide on the same element.
	int width = gridDim.x * blockDim.x;
	int index = idy * width + idx;
	result[index] = c[index] + d[index];
}
// Driver: builds two N x N matrices on the host, adds them on the GPU
// with a 2D launch, times the kernel with CUDA events, prints the
// result and some properties of device 0.
int main()
{
	// Host-side matrices.
	float *a = new float[N*N];
	float *b = new float[N*N];
	float *result = new float[N*N];
	float theTime = 0.0;
	cudaEvent_t startEvent, finEvent;

	for (int i = 0; i < N; i++)
		for (int j = 0; j < N; j++)
		{
			a[i+j*N] = 10 + i;
			b[i+j*N] = (float)j / N;
			result[i+j*N] = 0.0;
		}

	// Device buffers.
	float *c;
	float *d;
	float *result_cuda;
	const int size = N*N*sizeof(float);
	cudaMalloc( (void**)&c, size );
	cudaMalloc( (void**)&d, size );
	cudaMalloc( (void**)&result_cuda, size );

	dim3 dimBlock( blocksize, blocksize );
	dim3 dimGrid( gridsize, gridsize );

	cudaMemcpy( c, a, size, cudaMemcpyHostToDevice );
	cudaMemcpy( d, b, size, cudaMemcpyHostToDevice );

	// Time the kernel with GPU-side event timestamps.
	cudaEventCreate(&startEvent);
	cudaEventCreate(&finEvent);
	cudaEventRecord(startEvent, 0);
	add<<<dimGrid, dimBlock>>>(result_cuda, c, d);
	// cudaThreadSynchronize() is deprecated (removed in CUDA 12);
	// cudaDeviceSynchronize() is the supported replacement and also
	// surfaces asynchronous kernel errors here.
	cudaDeviceSynchronize();
	cudaEventRecord(finEvent, 0);
	cudaEventSynchronize(finEvent);
	cudaEventElapsedTime(&theTime, startEvent, finEvent);

	cudaMemcpy( result, result_cuda, size, cudaMemcpyDeviceToHost );
	cudaFree( c );
	cudaFree( d );
	cudaFree( result_cuda );
	cudaEventDestroy(startEvent);  // events were previously leaked
	cudaEventDestroy(finEvent);

	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			printf("%0.2f ", result[i+j*N]);
		}
		printf("\n");
	}

	// Launch failures are asynchronous and only show up via the
	// error-status API, so check it explicitly.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("Error: %s\n", cudaGetErrorString(err));

	printf("%s%f\n", "Time elapsed (ms): ", theTime);

	// Report some capabilities of device 0.
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	printf("Device Number: %d\n", 0);
	printf(" Device name: %s\n", prop.name);
	printf(" Memory Clock Rate (KHz): %d\n",
	prop.memoryClockRate);
	printf(" Memory Bus Width (bits): %d\n",
	prop.memoryBusWidth);
	printf(" Peak Memory Bandwidth (GB/s): %f\n",
	2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
	printf(" Max threads per block: %i\n",
	prop.maxThreadsPerBlock);
	printf(" Max threads dim: %i x %i x %i\n",
	prop.maxThreadsDim[0],prop.maxThreadsDim[1],prop.maxThreadsDim[2] );
	printf(" Max grid size: %i x %i x %i\n",
	prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2] );

	delete []a;
	delete []b;
	delete []result;  // was leaked before
	printf("done\n");
	return EXIT_SUCCESS;
}
// Simple little unit for timing using the gettimeofday() call.
// By Ingemar 2009
#include <stdlib.h>
#include <sys/time.h>
#include "milli.h"
static struct timeval timeStart;
static char hasStart = 0;
// Whole milliseconds elapsed since the start time. If no start time
// has been set yet, this call establishes it and returns 0.
int GetMilliseconds()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	if (!hasStart)
	{
		hasStart = 1;
		timeStart = now;
	}
	long secs = now.tv_sec - timeStart.tv_sec;
	long usecs = now.tv_usec - timeStart.tv_usec;
	return (int)(secs * 1000 + usecs / 1000);
}
// Whole microseconds elapsed since the start time. If no start time
// has been set yet, this call establishes it and returns 0.
// NOTE(review): the int return value overflows after roughly 35
// minutes of elapsed time.
int GetMicroseconds()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	if (!hasStart)
	{
		hasStart = 1;
		timeStart = now;
	}
	long secs = now.tv_sec - timeStart.tv_sec;
	long usecs = now.tv_usec - timeStart.tv_usec;
	return (int)(secs * 1000000 + usecs);
}
// Seconds (with fractional part) elapsed since the start time. If no
// start time has been set yet, this call establishes it and returns 0.
double GetSeconds()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	if (!hasStart)
	{
		hasStart = 1;
		timeStart = now;
	}
	double usecs = (double)(now.tv_usec - timeStart.tv_usec);
	double secs = (double)(now.tv_sec - timeStart.tv_sec);
	return usecs / 1000000.0 + secs;
}
// If you want to start from right now.
// Restart the clock at the current moment.
void ResetMilli()
{
	struct timeval now;
	gettimeofday(&now, NULL);
	timeStart = now;
	hasStart = 1;
}
// If you want to start from a specific time.
// Set the start time to an explicit (seconds, microseconds) instant.
void SetMilli(int seconds, int microseconds)
{
	struct timeval t;
	t.tv_sec = seconds;
	t.tv_usec = microseconds;
	timeStart = t;
	hasStart = 1;
}
// Public interface of the milli timing unit (gettimeofday-based).
#ifndef _MILLI_
#define _MILLI_
#ifdef __cplusplus
extern "C" {
#endif
// Elapsed time since the start time, as whole milliseconds.
int GetMilliseconds();
// Elapsed time since the start time, as whole microseconds.
int GetMicroseconds();
// Elapsed time since the start time, in (fractional) seconds.
double GetSeconds();
// Optional setting of the start time. If these are not used,
// the first call to the above functions will be the start time.
void ResetMilli();
void SetMilli(int seconds, int microseconds);
#ifdef __cplusplus
}
#endif
#endif
// Simple CUDA example by Ingemar Ragnemalm 2009. Simplest possible?
// Assigns every element in an array with its index.
// nvcc simple.cu -L /usr/local/cuda/lib -lcudart -o simple
#include <stdio.h>
#include <cmath>
const int N = 16;
const int blocksize = 16;
// Writes each element's own thread index into the array.
// Expects a 1D launch with a single block of (at least) N threads.
__global__
void simple(float *c)
{
	const int tid = threadIdx.x;
	c[tid] = tid;
}
// Replaces each element with its square root, one element per thread.
// Expects a 1D launch with one thread per array element.
__global__
void sqrt(float *c)
{
	// sqrtf is the explicit single-precision square root. The plain
	// sqrt overload can promote to double, which is slower on most
	// GPUs and may round differently from the float result.
	c[threadIdx.x] = sqrtf(c[threadIdx.x]);
}
// Driver: fills c[i] = i, prints the CPU square roots as a reference,
// runs the sqrt kernel on the GPU, and prints the GPU result for
// comparison.
int main()
{
	float *c = new float[N];
	for(int i = 0; i < N; i++) {
		c[i] = (float)i;
		printf("%f ", sqrt(c[i]));
	}
	printf("\n\n");

	float *cd;
	const int size = N*sizeof(float);
	cudaMalloc( (void**)&cd, size );
	dim3 dimBlock( blocksize, 1 );
	dim3 dimGrid( 1, 1 );
	cudaMemcpy( cd, c, size, cudaMemcpyHostToDevice );
	sqrt<<<dimGrid, dimBlock>>>(cd);
	// cudaThreadSynchronize() is deprecated (removed in CUDA 12);
	// cudaDeviceSynchronize() is the supported replacement.
	cudaDeviceSynchronize();
	cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost );
	cudaFree( cd );

	for (int i = 0; i < N; i++)
		printf("%f ", c[i]);
	printf("\n");
	delete[] c;
	printf("done\n");
	return EXIT_SUCCESS;

	// GIVEN VERSION
	/*
	float *c = new float[N];
	float *cd;
	const int size = N*sizeof(float);
	cudaMalloc( (void**)&cd, size );
	dim3 dimBlock( blocksize, 1 );
	dim3 dimGrid( 1, 1 );
	simple<<<dimGrid, dimBlock>>>(cd);
	cudaThreadSynchronize();
	cudaMemcpy( c, cd, size, cudaMemcpyDeviceToHost );
	cudaFree( cd );
	for (int i = 0; i < N; i++)
		printf("%f ", c[i]);
	printf("\n");
	delete[] c;
	printf("done\n");
	return EXIT_SUCCESS;
	*/
}
...@@ -39,6 +39,7 @@ GPU: Big problem sizes will be faster because there are many more cores in the G ...@@ -39,6 +39,7 @@ GPU: Big problem sizes will be faster because there are many more cores in the G
Especially for OpenCL, the bottleneck is loading the data from the CPU to the GPU. Especially for OpenCL, the bottleneck is loading the data from the CPU to the GPU.
#### Question 2.1: Which version of the averaging filter (unified, separable) is the most efficient? Why?
The separable version is the most efficient. Splitting the 2D filter into two 1D passes costs 2N operations per pixel, i.e. O(N), instead of the N² operations (O(N²)) of the unified filter. The work also stays fully data-parallel, since each pass can be run in parallel over all pixels.
#### Question 3.1: In data-parallel skeletons like MapOverlap, all elements are processed independently of each other. Is this a good fit for the median filter? Why/why not?
...@@ -48,6 +49,22 @@ Especially for OpenCL, the bottleneck is loading the data from the CPU to the GP ...@@ -48,6 +49,22 @@ Especially for OpenCL, the bottleneck is loading the data from the CPU to the GP
## Lab 4
### QUESTION: How many cores will simple.cu use, max, as written? How many SMs?
In simple.cu the grid dimension is 1 x 1, so there is only one block, giving a total of 16 threads. One block runs on one SM, and on this architecture one SM has 8 cores, so at most 8 cores (on 1 SM) are used.
### QUESTION: Is the calculated square root identical to what the CPU calculates? Should we assume that this is always the case?
Yes, it is in Olympen. But we should not assume that this is always the case on every GPU, since there are single-precision-only GPUs out there.
### QUESTION: How do you calculate the index in the array, using 2-dimensional blocks?
By adding index for the y-dimension:
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int index = idy * blockDim.x + idx;
result[index] = c[index] + d[index];
### QUESTION: What happens if you use too many threads per block?
The kernel launch fails with an "invalid configuration argument" error and the kernel never executes, because the hardware limit on threads per block (1024 on current GPUs, reported in maxThreadsPerBlock) is exceeded.
## Lab 5
## Lab 6
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment