Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
Multicore GPU programming
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Ylva Selling
Multicore GPU programming
Commits
1a435719
Commit
1a435719
authored
4 years ago
by
dansa828
Browse files
Options
Downloads
Patches
Plain Diff
update
parent
1d8bcdab
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Lab5/filter.cu
+152
-0
152 additions, 0 deletions
Lab5/filter.cu
Lab5/filteroptimized.cu
+167
-0
167 additions, 0 deletions
Lab5/filteroptimized.cu
with
319 additions
and
0 deletions
Lab5/filter.cu
0 → 100644
+
152
−
0
View file @
1a435719
// Lab 5, image filters with CUDA.
// Compile with a command-line similar to Lab 4:
// nvcc filter.cu -c -arch=sm_30 -o filter.o
// g++ filter.o milli.c readppm.c -lGL -lm -lcuda -lcudart -L/usr/local/cuda/lib -lglut -o filter
// or (multicore lab)
// nvcc filter.cu -c -arch=sm_20 -o filter.o
// g++ filter.o milli.c readppm.c -lGL -lm -lcuda -L/usr/local/cuda/lib64 -lcudart -lglut -o filter
// 2017-11-27: Early pre-release, dubbed "beta".
// 2017-12-03: First official version! Brand new lab 5 based on the old lab 6.
// Better variable names, better prepared for some lab tasks. More changes may come
// but I call this version 1.0b2.
// 2017-12-04: Two fixes: Added command-lines (above), fixed a bug in computeImages
// that allocated too much memory. b3
// 2017-12-04: More fixes: Tightened up the kernel with edge clamping.
// Less code, nicer result (no borders). Cleaned up some messed up X and Y. b4
#include
<stdio.h>
#include
<stdlib.h>
#include
<string.h>
#include
<sys/time.h>
#ifdef __APPLE__
#include
<GLUT/glut.h>
#include
<OpenGL/gl.h>
#else
#include
<GL/glut.h>
#endif
#include
"readppm.h"
#include
"milli.h"
// Use these for setting shared memory size.
#define maxKernelSizeX 10
#define maxKernelSizeY 10
__global__
void
filter
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsizex
,
const
int
kernelsizey
)
{
// map from blockIdx to pixel position
int
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
int
dy
,
dx
;
unsigned
int
sumx
,
sumy
,
sumz
;
int
divby
=
(
2
*
kernelsizex
+
1
)
*
(
2
*
kernelsizey
+
1
);
// Works for box filters only!
if
(
x
<
imagesizex
&&
y
<
imagesizey
)
// If inside image
{
// Filter kernel (simple box filter)
sumx
=
0
;
sumy
=
0
;
sumz
=
0
;
for
(
dy
=-
kernelsizey
;
dy
<=
kernelsizey
;
dy
++
)
for
(
dx
=-
kernelsizex
;
dx
<=
kernelsizex
;
dx
++
)
{
// Use max and min to avoid branching!
int
yy
=
min
(
max
(
y
+
dy
,
0
),
imagesizey
-
1
);
int
xx
=
min
(
max
(
x
+
dx
,
0
),
imagesizex
-
1
);
sumx
+=
image
[((
yy
)
*
imagesizex
+
(
xx
))
*
3
+
0
];
sumy
+=
image
[((
yy
)
*
imagesizex
+
(
xx
))
*
3
+
1
];
sumz
+=
image
[((
yy
)
*
imagesizex
+
(
xx
))
*
3
+
2
];
}
out
[(
y
*
imagesizex
+
x
)
*
3
+
0
]
=
sumx
/
divby
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
1
]
=
sumy
/
divby
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
sumz
/
divby
;
}
}
// Global variables for image data
unsigned
char
*
image
,
*
pixels
,
*
dev_bitmap
,
*
dev_input
;
unsigned
int
imagesizey
,
imagesizex
;
// Image size
////////////////////////////////////////////////////////////////////////////////
// main computation function
////////////////////////////////////////////////////////////////////////////////
void
computeImages
(
int
kernelsizex
,
int
kernelsizey
)
{
if
(
kernelsizex
>
maxKernelSizeX
||
kernelsizey
>
maxKernelSizeY
)
{
printf
(
"Kernel size out of bounds!
\n
"
);
return
;
}
pixels
=
(
unsigned
char
*
)
malloc
(
imagesizex
*
imagesizey
*
3
);
cudaMalloc
(
(
void
**
)
&
dev_input
,
imagesizex
*
imagesizey
*
3
);
cudaMemcpy
(
dev_input
,
image
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyHostToDevice
);
cudaMalloc
(
(
void
**
)
&
dev_bitmap
,
imagesizex
*
imagesizey
*
3
);
dim3
grid
(
imagesizex
,
imagesizey
);
filter
<<<
grid
,
1
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
kernelsizey
);
// Awful load balance
cudaThreadSynchronize
();
// Check for errors!
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
printf
(
"Error: %s
\n
"
,
cudaGetErrorString
(
err
));
cudaMemcpy
(
pixels
,
dev_bitmap
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyDeviceToHost
);
cudaFree
(
dev_bitmap
);
cudaFree
(
dev_input
);
}
// Display images
void
Draw
()
{
// Dump the whole picture onto the screen.
glClearColor
(
0.0
,
0.0
,
0.0
,
1.0
);
glClear
(
GL_COLOR_BUFFER_BIT
);
if
(
imagesizey
>=
imagesizex
)
{
// Not wide - probably square. Original left, result right.
glRasterPos2f
(
-
1
,
-
1
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
image
);
glRasterPos2i
(
0
,
-
1
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
pixels
);
}
else
{
// Wide image! Original on top, result below.
glRasterPos2f
(
-
1
,
-
1
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
pixels
);
glRasterPos2i
(
-
1
,
0
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
image
);
}
glFlush
();
}
// Main program, inits
int
main
(
int
argc
,
char
**
argv
)
{
glutInit
(
&
argc
,
argv
);
glutInitDisplayMode
(
GLUT_SINGLE
|
GLUT_RGBA
);
if
(
argc
>
1
)
image
=
readppm
(
argv
[
1
],
(
int
*
)
&
imagesizex
,
(
int
*
)
&
imagesizey
);
else
image
=
readppm
((
char
*
)
"maskros512.ppm"
,
(
int
*
)
&
imagesizex
,
(
int
*
)
&
imagesizey
);
if
(
imagesizey
>=
imagesizex
)
glutInitWindowSize
(
imagesizex
*
2
,
imagesizey
);
else
glutInitWindowSize
(
imagesizex
,
imagesizey
*
2
);
glutCreateWindow
(
"Lab 5"
);
glutDisplayFunc
(
Draw
);
ResetMilli
();
computeImages
(
2
,
2
);
// You can save the result to a file like this:
// writeppm("out.ppm", imagesizey, imagesizex, pixels);
glutMainLoop
();
return
0
;
}
This diff is collapsed.
Click to expand it.
Lab5/filteroptimized.cu
0 → 100644
+
167
−
0
View file @
1a435719
// Lab 5, image filters with CUDA.
// Compile with a command-line similar to Lab 4:
// nvcc filter.cu -c -arch=sm_30 -o filter.o
// g++ filter.o milli.c readppm.c -lGL -lm -lcuda -lcudart -L/usr/local/cuda/lib -lglut -o filter
// or (multicore lab)
// nvcc filter.cu -c -arch=sm_20 -o filter.o
// g++ filter.o milli.c readppm.c -lGL -lm -lcuda -L/usr/local/cuda/lib64 -lcudart -lglut -o filter
// 2017-11-27: Early pre-release, dubbed "beta".
// 2017-12-03: First official version! Brand new lab 5 based on the old lab 6.
// Better variable names, better prepared for some lab tasks. More changes may come
// but I call this version 1.0b2.
// 2017-12-04: Two fixes: Added command-lines (above), fixed a bug in computeImages
// that allocated too much memory. b3
// 2017-12-04: More fixes: Tightened up the kernel with edge clamping.
// Less code, nicer result (no borders). Cleaned up some messed up X and Y. b4
#include
<stdio.h>
#include
<stdlib.h>
#include
<string.h>
#include
<sys/time.h>
#ifdef __APPLE__
#include
<GLUT/glut.h>
#include
<OpenGL/gl.h>
#else
#include
<GL/glut.h>
#endif
#include
"readppm.h"
#include
"milli.h"
// Use these for setting shared memory size.
#define maxKernelSizeX 10
#define maxKernelSizeY 10
__global__
void
filter
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsizex
,
const
int
kernelsizey
)
{
// map from blockIdx to pixel position
int
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
int
dy
,
dx
;
unsigned
int
sumx
,
sumy
,
sumz
;
int
divby
=
(
2
*
kernelsizex
+
1
)
*
(
2
*
kernelsizey
+
1
);
// Works for box filters only!
__shared__
float
patch
[
maxKernelSizeX
*
maxKernelSizeY
];
ii
=
blockDim
.
x
+
threadIdx
.
x
;
jj
=
blockDim
.
y
+
threadIdx
.
y
;
patch
[
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
]
=
image
[
ii
*
imagesizex
*
imagesizey
+
y
];
int
ii
=
blockDim
.
x
+
threadIdx
.
x
;
//int jj = blockDim.y+threadIdx.y;
patch
[
ii
+
0
]
=
image
[((
yy
)
*
imagesizex
+
(
xx
))
*
3
+
0
];
patch
[
ii
+
1
]
=
image
[((
yy
)
*
imagesizex
+
(
xx
))
*
3
+
1
];
patch
[
ii
+
2
]
=
image
[((
yy
)
*
imagesizex
+
(
xx
))
*
3
+
2
];
//index locked to patch
if
(
x
<
imagesizex
&&
y
<
imagesizey
)
// If inside image
{
// Filter kernel (simple box filter)
sumx
=
0
;
sumy
=
0
;
sumz
=
0
;
for
(
dy
=-
kernelsizey
;
dy
<=
kernelsizey
;
dy
++
)
for
(
dx
=-
kernelsizex
;
dx
<=
kernelsizex
;
dx
++
)
{
sumx
+=
patch
[
ii
+
dy
+
0
];
sumy
+=
patch
[
ii
+
dy
+
1
];
sumz
+=
patch
[
ii
+
dy
+
2
];
}
out
[(
y
*
imagesizex
+
x
)
*
3
+
0
]
=
sumx
/
divby
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
1
]
=
sumy
/
divby
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
sumz
/
divby
;
}
}
// Global variables for image data
unsigned
char
*
image
,
*
pixels
,
*
dev_bitmap
,
*
dev_input
;
unsigned
int
imagesizey
,
imagesizex
;
// Image size
////////////////////////////////////////////////////////////////////////////////
// main computation function
////////////////////////////////////////////////////////////////////////////////
void
computeImages
(
int
kernelsizex
,
int
kernelsizey
)
{
if
(
kernelsizex
>
maxKernelSizeX
||
kernelsizey
>
maxKernelSizeY
)
{
printf
(
"Kernel size out of bounds!
\n
"
);
return
;
}
pixels
=
(
unsigned
char
*
)
malloc
(
imagesizex
*
imagesizey
*
3
);
cudaMalloc
(
(
void
**
)
&
dev_input
,
imagesizex
*
imagesizey
*
3
);
cudaMemcpy
(
dev_input
,
image
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyHostToDevice
);
cudaMalloc
(
(
void
**
)
&
dev_bitmap
,
imagesizex
*
imagesizey
*
3
);
dim3
grid
(
imagesizex
,
imagesizey
);
filter
<<<
grid
,
1
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
kernelsizey
);
// Awful load balance
cudaThreadSynchronize
();
// Check for errors!
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
printf
(
"Error: %s
\n
"
,
cudaGetErrorString
(
err
));
cudaMemcpy
(
pixels
,
dev_bitmap
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyDeviceToHost
);
cudaFree
(
dev_bitmap
);
cudaFree
(
dev_input
);
}
// Display images
void
Draw
()
{
// Dump the whole picture onto the screen.
glClearColor
(
0.0
,
0.0
,
0.0
,
1.0
);
glClear
(
GL_COLOR_BUFFER_BIT
);
if
(
imagesizey
>=
imagesizex
)
{
// Not wide - probably square. Original left, result right.
glRasterPos2f
(
-
1
,
-
1
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
image
);
glRasterPos2i
(
0
,
-
1
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
pixels
);
}
else
{
// Wide image! Original on top, result below.
glRasterPos2f
(
-
1
,
-
1
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
pixels
);
glRasterPos2i
(
-
1
,
0
);
glDrawPixels
(
imagesizex
,
imagesizey
,
GL_RGB
,
GL_UNSIGNED_BYTE
,
image
);
}
glFlush
();
}
// Main program, inits
int
main
(
int
argc
,
char
**
argv
)
{
glutInit
(
&
argc
,
argv
);
glutInitDisplayMode
(
GLUT_SINGLE
|
GLUT_RGBA
);
if
(
argc
>
1
)
image
=
readppm
(
argv
[
1
],
(
int
*
)
&
imagesizex
,
(
int
*
)
&
imagesizey
);
else
image
=
readppm
((
char
*
)
"maskros512.ppm"
,
(
int
*
)
&
imagesizex
,
(
int
*
)
&
imagesizey
);
if
(
imagesizey
>=
imagesizex
)
glutInitWindowSize
(
imagesizex
*
2
,
imagesizey
);
else
glutInitWindowSize
(
imagesizex
,
imagesizey
*
2
);
glutCreateWindow
(
"Lab 5"
);
glutDisplayFunc
(
Draw
);
ResetMilli
();
computeImages
(
2
,
2
);
// You can save the result to a file like this:
// writeppm("out.ppm", imagesizey, imagesizex, pixels);
glutMainLoop
();
return
0
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment