Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
TDDD56
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Mathias Berggren
TDDD56
Commits
9f445063
Commit
9f445063
authored
5 years ago
by
matbe320
Browse files
Options
Downloads
Patches
Plain Diff
Lab5: Finished hopefully
parent
4d95f649
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Lab5/filter.cu
+238
-29
238 additions, 29 deletions
Lab5/filter.cu
with
238 additions
and
29 deletions
Lab5/filter.cu
+
238
−
29
View file @
9f445063
...
@@ -55,8 +55,8 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
...
@@ -55,8 +55,8 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
while
(
local_index
<
LOCAL_MEM_SIZE
)
while
(
local_index
<
LOCAL_MEM_SIZE
)
{
{
int
global_x
=
max
(
0
,
blockOffsetX
-
kernelsizex
+
local_index
%
FILTER_W
);
int
global_x
=
min
(
max
(
0
,
blockOffsetX
-
kernelsizex
+
local_index
%
FILTER_W
)
,
imagesizex
-
1
)
;
int
global_y
=
max
(
0
,
blockOffsetY
-
kernelsizey
+
local_index
/
FILTER_W
);
int
global_y
=
min
(
max
(
0
,
blockOffsetY
-
kernelsizey
+
local_index
/
FILTER_W
)
,
imagesizey
-
1
)
;
unsigned
global_index
=
global_y
*
imagesizex
+
global_x
;
unsigned
global_index
=
global_y
*
imagesizex
+
global_x
;
local_memory
[
3
*
local_index
+
0
]
=
image
[
3
*
global_index
+
0
];
local_memory
[
3
*
local_index
+
0
]
=
image
[
3
*
global_index
+
0
];
...
@@ -65,6 +65,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
...
@@ -65,6 +65,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
local_index
+=
threadsPerBlock
;
local_index
+=
threadsPerBlock
;
}
}
__syncthreads
();
unsigned
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
unsigned
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
...
@@ -97,7 +98,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
...
@@ -97,7 +98,7 @@ __global__ void filter_1(unsigned char *image, unsigned char *out, const unsigne
}
}
}
}
__global__
void
filter_separable
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsize
,
const
int
stride
)
__global__
void
filter_separable
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsize
,
const
bool
horizontal
)
{
{
__shared__
unsigned
char
local_memory
[
STATIC_SHARED_MEM_SIZE_SEP
*
3
];
__shared__
unsigned
char
local_memory
[
STATIC_SHARED_MEM_SIZE_SEP
*
3
];
// map from blockIdx to pixel position
// map from blockIdx to pixel position
...
@@ -106,14 +107,28 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
...
@@ -106,14 +107,28 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
int
blockOffsetX
=
blockIdx
.
x
*
blockDim
.
x
;
int
blockOffsetX
=
blockIdx
.
x
*
blockDim
.
x
;
int
blockOffsetY
=
blockIdx
.
y
*
blockDim
.
y
;
int
blockOffsetY
=
blockIdx
.
y
*
blockDim
.
y
;
int
local_index
=
threadIdx
.
x
;
/* Threadidx.x or threadIdx.y will always be zero depending on direction */
int
local_index
=
threadIdx
.
x
+
threadIdx
.
y
;
int
FILTER_SIZE
=
(
kernelsize
*
2
+
1
)
+
(
2
*
kernelsize
);
int
FILTER_SIZE
=
(
kernelsize
*
2
+
1
)
+
(
2
*
kernelsize
);
while
(
local_index
<
FILTER_SIZE
)
while
(
local_index
<
FILTER_SIZE
)
{
{
int
global_x
=
max
(
0
,
blockOffsetX
-
kernelsize
+
local_index
);
int
global_x
=
0
;
int
global_y
=
max
(
0
,
blockOffsetY
);
int
global_y
=
0
;
if
(
horizontal
)
{
global_x
=
min
(
max
(
0
,
blockOffsetX
-
kernelsize
+
local_index
),
imagesizex
-
1
);
global_y
=
blockOffsetY
;
}
else
{
global_x
=
blockOffsetX
;
global_y
=
min
(
max
(
0
,
blockOffsetY
-
kernelsize
+
local_index
),
imagesizey
-
1
);
//max(0, blockOffsetY);
}
unsigned
global_index
=
global_y
*
imagesizex
+
global_x
;
unsigned
global_index
=
global_y
*
imagesizex
+
global_x
;
local_memory
[
3
*
local_index
+
0
]
=
image
[
3
*
global_index
+
0
];
local_memory
[
3
*
local_index
+
0
]
=
image
[
3
*
global_index
+
0
];
...
@@ -122,10 +137,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
...
@@ -122,10 +137,12 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
local_index
+=
threadsPerBlock
;
local_index
+=
threadsPerBlock
;
}
}
__syncthreads
();
unsigned
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
unsigned
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
/* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */
local_index
=
threadIdx
.
x
+
threadIdx
.
y
;
if
(
x
<
imagesizex
&&
y
<
imagesizey
)
// If inside image
if
(
x
<
imagesizex
&&
y
<
imagesizey
)
// If inside image
{
{
int
d
;
int
d
;
...
@@ -135,21 +152,90 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
...
@@ -135,21 +152,90 @@ __global__ void filter_separable(unsigned char *image, unsigned char *out, const
sumx
=
0
;
sumy
=
0
;
sumz
=
0
;
sumx
=
0
;
sumy
=
0
;
sumz
=
0
;
for
(
d
=-
kernelsize
;
d
<=
kernelsize
;
d
++
)
for
(
d
=-
kernelsize
;
d
<=
kernelsize
;
d
++
)
{
{
unsigned
dx
=
kernelsize
+
threadIdx
.
x
+
d
;
unsigned
dx
=
kernelsize
+
local_inde
x
+
d
;
sumx
+=
local_memory
[
dx
*
3
+
0
];
sumx
+=
local_memory
[
dx
*
3
+
0
];
sumy
+=
local_memory
[
dx
*
3
+
1
];
sumy
+=
local_memory
[
dx
*
3
+
1
];
sumz
+=
local_memory
[
dx
*
3
+
2
];
sumz
+=
local_memory
[
dx
*
3
+
2
];
}
}
out
[(
y
*
imagesizex
+
x
)
*
3
+
0
]
=
local_memory
[
kernelsize
+
threadIdx
.
x
+
0
];
out
[(
y
*
imagesizex
+
x
)
*
3
+
1
]
=
local_memory
[
kernelsize
+
threadIdx
.
x
+
1
];
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
local_memory
[
kernelsize
+
threadIdx
.
x
+
2
];
out
[(
y
*
imagesizex
+
x
)
*
3
+
0
]
=
sumx
/
divby
;
// out[(y*imagesizex+x)*3+0] = sumx/divby;
out
[(
y
*
imagesizex
+
x
)
*
3
+
1
]
=
sumy
/
divby
;
// out[(y*imagesizex+x)*3+1] = sumy/divby;
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
sumz
/
divby
;
// out[(y*imagesizex+x)*3+2] = sumz/divby;
}
}
}
}
__global__
void
filter_gaussian
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsize
,
const
bool
horizontal
)
{
__shared__
unsigned
char
local_memory
[
STATIC_SHARED_MEM_SIZE_SEP
*
3
];
// map from blockIdx to pixel position
unsigned
threadsPerBlock
=
blockDim
.
x
;
int
blockOffsetX
=
blockIdx
.
x
*
blockDim
.
x
;
int
blockOffsetY
=
blockIdx
.
y
*
blockDim
.
y
;
/* Threadidx.x or threadIdx.y will always be zero depending on direction */
int
local_index
=
threadIdx
.
x
+
threadIdx
.
y
;
int
FILTER_SIZE
=
(
kernelsize
*
2
+
1
)
+
(
2
*
kernelsize
);
while
(
local_index
<
FILTER_SIZE
)
{
int
global_x
=
0
;
int
global_y
=
0
;
if
(
horizontal
)
{
global_x
=
min
(
max
(
0
,
blockOffsetX
-
kernelsize
+
local_index
),
imagesizex
-
1
);
global_y
=
blockOffsetY
;
}
else
{
global_x
=
blockOffsetX
;
global_y
=
min
(
max
(
0
,
blockOffsetY
-
kernelsize
+
local_index
),
imagesizey
-
1
);
//max(0, blockOffsetY);
}
unsigned
global_index
=
global_y
*
imagesizex
+
global_x
;
local_memory
[
3
*
local_index
+
0
]
=
image
[
3
*
global_index
+
0
];
local_memory
[
3
*
local_index
+
1
]
=
image
[
3
*
global_index
+
1
];
local_memory
[
3
*
local_index
+
2
]
=
image
[
3
*
global_index
+
2
];
local_index
+=
threadsPerBlock
;
}
__syncthreads
();
unsigned
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
/* Since one of threadIdx.x or threadIdx.y is 0 we can just add them */
local_index
=
threadIdx
.
x
+
threadIdx
.
y
;
if
(
x
<
imagesizex
&&
y
<
imagesizey
)
// If inside image
{
int
d
;
unsigned
int
sumx
,
sumy
,
sumz
;
float
divby
[
5
]
=
{
1.0
/
16.0
,
4.0
/
16.0
,
6.0
/
16.0
,
4.0
/
16.0
,
1.0
/
16.0
};
sumx
=
0
;
sumy
=
0
;
sumz
=
0
;
for
(
d
=-
kernelsize
;
d
<=
kernelsize
;
d
++
)
{
unsigned
dx
=
kernelsize
+
local_index
+
d
;
sumx
+=
local_memory
[
dx
*
3
+
0
]
*
divby
[
kernelsize
+
d
];
sumy
+=
local_memory
[
dx
*
3
+
1
]
*
divby
[
kernelsize
+
d
];
sumz
+=
local_memory
[
dx
*
3
+
2
]
*
divby
[
kernelsize
+
d
];
// printf("Divby factor %f kernelsize+d %d\n", divby[kernelsize+d], kernelsize+d);
}
out
[(
y
*
imagesizex
+
x
)
*
3
+
0
]
=
sumx
;
// / old_divby;
out
[(
y
*
imagesizex
+
x
)
*
3
+
1
]
=
sumy
;
// / old_divby;
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
sumz
;
/// old_divby;
}
}
__global__
void
filter_old
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsizex
,
const
int
kernelsizey
)
__global__
void
filter_old
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsizex
,
const
int
kernelsizey
)
{
{
int
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
@@ -180,9 +266,98 @@ __global__ void filter_old(unsigned char *image, unsigned char *out, const unsig
...
@@ -180,9 +266,98 @@ __global__ void filter_old(unsigned char *image, unsigned char *out, const unsig
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
sumz
/
divby
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
sumz
/
divby
;
}
}
}
}
__global__
void
filter_median
(
unsigned
char
*
image
,
unsigned
char
*
out
,
const
unsigned
int
imagesizex
,
const
unsigned
int
imagesizey
,
const
int
kernelsizex
,
const
int
kernelsizey
)
{
__shared__
unsigned
char
local_memory
[
STATIC_SHARED_MEM_SIZE_PX
*
3
];
// map from blockIdx to pixel position
unsigned
threadNumInBlock
=
threadIdx
.
x
+
blockDim
.
x
*
threadIdx
.
y
;
unsigned
threadsPerBlock
=
blockDim
.
x
*
blockDim
.
y
;
int
blockOffsetX
=
blockIdx
.
x
*
blockDim
.
x
;
int
blockOffsetY
=
blockIdx
.
y
*
blockDim
.
y
;
int
local_index
=
threadNumInBlock
;
int
FILTER_W
=
(
kernelsizex
*
2
+
1
)
+
(
2
*
kernelsizex
);
int
FILTER_H
=
(
kernelsizey
*
2
+
1
)
+
(
2
*
kernelsizey
);
unsigned
LOCAL_MEM_SIZE
=
FILTER_W
*
FILTER_H
;
while
(
local_index
<
LOCAL_MEM_SIZE
)
{
int
global_x
=
min
(
max
(
0
,
blockOffsetX
-
kernelsizex
+
local_index
%
FILTER_W
),
imagesizex
-
1
);
int
global_y
=
min
(
max
(
0
,
blockOffsetY
-
kernelsizey
+
local_index
/
FILTER_W
),
imagesizey
-
1
);
unsigned
global_index
=
global_y
*
imagesizex
+
global_x
;
local_memory
[
3
*
local_index
+
0
]
=
image
[
3
*
global_index
+
0
];
local_memory
[
3
*
local_index
+
1
]
=
image
[
3
*
global_index
+
1
];
local_memory
[
3
*
local_index
+
2
]
=
image
[
3
*
global_index
+
2
];
local_index
+=
threadsPerBlock
;
}
__syncthreads
();
unsigned
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
unsigned
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
x
<
imagesizex
&&
y
<
imagesizey
)
// If inside image
{
int
dy
,
dx
;
unsigned
int
sumx
[
256
]
=
{
0
},
sumy
[
256
]
=
{
0
},
sumz
[
256
]
=
{
0
};
unsigned
local_x
=
threadIdx
.
x
;
unsigned
local_y
=
threadIdx
.
y
;
for
(
dy
=-
kernelsizey
;
dy
<=
kernelsizey
;
dy
++
)
for
(
dx
=-
kernelsizex
;
dx
<=
kernelsizex
;
dx
++
)
{
unsigned
yy
=
kernelsizey
+
local_y
+
dy
;
unsigned
xx
=
kernelsizex
+
local_x
+
dx
;
unsigned
mem_access
=
yy
*
FILTER_W
+
xx
;
++
sumx
[
local_memory
[
mem_access
*
3
+
0
]];
++
sumy
[
local_memory
[
mem_access
*
3
+
1
]];
++
sumz
[
local_memory
[
mem_access
*
3
+
2
]];
}
unsigned
num_x
=
0
;
unsigned
num_y
=
0
;
unsigned
num_z
=
0
;
int
index_x
=
0
;
int
index_y
=
0
;
int
index_z
=
0
;
unsigned
half_filter_size
=
((
kernelsizex
*
2
+
1
)
*
(
kernelsizey
*
2
+
1
))
/
2
;
while
(
num_x
<
half_filter_size
)
{
// printf("Thread: %d Num_x %d index_x %d\n",y* blockDim.x + x, num_x, index_x);
num_x
+=
sumx
[
index_x
++
];
}
while
(
num_y
<
half_filter_size
)
{
num_y
+=
sumy
[
index_y
++
];
}
while
(
num_z
<
half_filter_size
)
{
num_z
+=
sumz
[
index_z
++
];
}
// while(num_x < half_filter_size && num_y < half_filter_size && num_z < half_filter_size)
// {
// num_x += sumx[index_x++];
// num_y += sumy[index_y++];
// num_z += sumz[index_z++];
// }
out
[(
y
*
imagesizex
+
x
)
*
3
+
0
]
=
index_x
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
1
]
=
index_y
;
out
[(
y
*
imagesizex
+
x
)
*
3
+
2
]
=
index_z
;
}
}
// Global variables for image data
// Global variables for image data
unsigned
char
*
image
,
*
pixels
,
*
dev_bitmap
,
*
dev_input
;
unsigned
char
*
image
,
*
pixels
,
*
dev_bitmap
,
*
dev_input
,
*
dev_bitmap_tmp
;
unsigned
int
imagesizey
,
imagesizex
;
// Image size
unsigned
int
imagesizey
,
imagesizex
;
// Image size
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
...
@@ -199,41 +374,75 @@ void computeImages(int kernelsizex, int kernelsizey)
...
@@ -199,41 +374,75 @@ void computeImages(int kernelsizex, int kernelsizey)
pixels
=
(
unsigned
char
*
)
malloc
(
imagesizex
*
imagesizey
*
3
);
pixels
=
(
unsigned
char
*
)
malloc
(
imagesizex
*
imagesizey
*
3
);
cudaMalloc
(
(
void
**
)
&
dev_input
,
imagesizex
*
imagesizey
*
3
);
cudaMalloc
(
(
void
**
)
&
dev_input
,
imagesizex
*
imagesizey
*
3
);
cudaMemcpy
(
dev_input
,
image
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
dev_input
,
image
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyHostToDevice
);
cudaMalloc
(
(
void
**
)
&
dev_bitmap_tmp
,
imagesizex
*
imagesizey
*
3
);
cudaMalloc
(
(
void
**
)
&
dev_bitmap
,
imagesizex
*
imagesizey
*
3
);
cudaMalloc
(
(
void
**
)
&
dev_bitmap
,
imagesizex
*
imagesizey
*
3
);
dim3
grid
(
imagesizex
,
imagesizey
);
ResetMilli
();
filter_old
<<<
grid
,
1
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
kernelsizey
);
// Awful load balance
cudaThreadSynchronize
();
printf
(
"[Un-Optimized] %f
\n
"
,
GetSeconds
());
dim3
gridsize
(
imagesizex
/
(
kernelsizex
*
2
+
1
),
imagesizey
/
(
kernelsizey
*
2
+
1
));
/* Given filter */
// dim3 grid(imagesizex,imagesizey);
// ResetMilli();
// filter_old<<<grid,1>>>(dev_input, dev_bitmap, imagesizex, imagesizey, kernelsizex, kernelsizey); // Awful load balance
// cudaThreadSynchronize();
// printf("[Un-Optimized] %f\n", GetSeconds());
/* Our first block filter */
dim3
gridsize
(
imagesizex
/
(
kernelsizex
*
2
+
1
)
+
1
,
imagesizey
/
(
kernelsizey
*
2
+
1
)
+
1
);
dim3
blocksize
(
kernelsizex
*
2
+
1
,
kernelsizey
*
2
+
1
);
dim3
blocksize
(
kernelsizex
*
2
+
1
,
kernelsizey
*
2
+
1
);
ResetMilli
();
ResetMilli
();
filter_1
<<<
gridsize
,
blocksize
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
kernelsizey
);
// Nice load balance
filter_1
<<<
gridsize
,
blocksize
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
kernelsizey
);
// Nice load balance
cudaThreadSynchronize
();
cudaThreadSynchronize
();
printf
(
"[Optimized] %f
\n
"
,
GetSeconds
());
printf
(
"[Optimized] %f
\n
"
,
GetSeconds
());
dim3
gridsize_sep
(
imagesizex
/
(
kernelsizex
*
2
+
1
),
imagesizey
);
/* Separable filter */
dim3
blocksize_sep
(
kernelsizex
*
2
+
1
);
dim3
gridsize_sep_horizontal
(
imagesizex
/
(
kernelsizex
*
2
+
1
)
+
1
,
imagesizey
);
// unsigned char *tmp = dev_input;
dim3
gridsize_sep_vertical
(
imagesizex
,
imagesizey
/
(
kernelsizex
*
2
+
1
)
+
1
);
dim3
threads_horizontal
(
kernelsizex
*
2
+
1
);
dim3
threads_vertical
(
1
,
kernelsizex
*
2
+
1
);
// // unsigned char *tmp = dev_input;
ResetMilli
();
ResetMilli
();
/* X-direction */
//
/* X-direction */
filter_separable
<<<
gridsize_sep
,
blocksize_sep
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
1
);
filter_separable
<<<
gridsize_sep
_horizontal
,
threads_horizontal
>>>
(
dev_input
,
dev_bitmap
_tmp
,
imagesizex
,
imagesizey
,
kernelsizex
,
true
);
cudaThreadSynchronize
();
cudaThreadSynchronize
();
/* Y-direction */
/* Y-direction */
// dim3 gridsize_sep(imagesizex, imagesizey/(kernelsizex*2+1));
filter_separable
<<<
gridsize_sep_vertical
,
threads_vertical
>>>
(
dev_bitmap_tmp
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
false
);
// filter_separable<<<gridsize_sep, blocksize_sep>>>(tmp, dev_bitmap, imagesizex, imagesizey, kernelsizex, 1);
cudaThreadSynchronize
();
cudaThreadSynchronize
();
printf
(
"[Separable] %f
\n
"
,
GetSeconds
());
printf
(
"[Separable] %f
\n
"
,
GetSeconds
());
/* Gaussian filter i.e. 2 x 5*1 filters run horizontal and vertical with gaussian weights */
if
(
kernelsizex
==
2
)
{
ResetMilli
();
/* X-direction */
filter_gaussian
<<<
gridsize_sep_horizontal
,
threads_horizontal
>>>
(
dev_input
,
dev_bitmap_tmp
,
imagesizex
,
imagesizey
,
kernelsizex
,
true
);
cudaThreadSynchronize
();
/* Y-direction */
filter_gaussian
<<<
gridsize_sep_vertical
,
threads_vertical
>>>
(
dev_bitmap_tmp
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
false
);
cudaThreadSynchronize
();
printf
(
"[Gaussian] %f
\n
"
,
GetSeconds
());
}
/* Median filter */
ResetMilli
();
filter_median
<<<
gridsize
,
blocksize
>>>
(
dev_input
,
dev_bitmap
,
imagesizex
,
imagesizey
,
kernelsizex
,
kernelsizey
);
// Nice load balance
cudaThreadSynchronize
();
printf
(
"[Median] %f
\n
"
,
GetSeconds
());
// Check for errors!
// Check for errors!
cudaError_t
err
=
cudaGetLastError
();
cudaError_t
err
=
cudaGetLastError
();
if
(
err
!=
cudaSuccess
)
if
(
err
!=
cudaSuccess
)
printf
(
"Error: %s
\n
"
,
cudaGetErrorString
(
err
));
printf
(
"Error: %s
\n
"
,
cudaGetErrorString
(
err
));
cudaMemcpy
(
pixels
,
dev_bitmap
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyDeviceToHost
);
cudaMemcpy
(
pixels
,
dev_bitmap
,
imagesizey
*
imagesizex
*
3
,
cudaMemcpyDeviceToHost
);
cudaFree
(
dev_bitmap_tmp
);
cudaFree
(
dev_bitmap
);
cudaFree
(
dev_bitmap
);
cudaFree
(
dev_input
);
cudaFree
(
dev_input
);
}
}
// Display images
// Display images
...
@@ -279,7 +488,7 @@ int main( int argc, char** argv)
...
@@ -279,7 +488,7 @@ int main( int argc, char** argv)
glutDisplayFunc
(
Draw
);
glutDisplayFunc
(
Draw
);
for
(
int
i
=
2
;
i
<
10
;
i
++
)
for
(
int
i
=
2
;
i
<
5
;
i
++
)
{
{
printf
(
"Frame is %i x %i
\n
"
,
i
*
2
+
1
,
i
*
2
+
1
);
printf
(
"Frame is %i x %i
\n
"
,
i
*
2
+
1
,
i
*
2
+
1
);
computeImages
(
i
,
i
);
computeImages
(
i
,
i
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment