Cuda Mean Filter C + 3*3 sliding window
up vote
-2
down vote
favorite
I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
Please advise.
#include
#include
#include
#include
#define MAXR 16
#define MAXC 16
__global__ void imagefilter(float **intermediates_d, int **result_d) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//result_d[2][2]= 5;
//if ((idx < 15) & (idy < 15)){
//result_d[2][2]= 5;
//if((idx>0) & (idy>0)){
__syncthreads();
result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) );
// result_d[2][2]= 5;
}
int main(void)
{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
float **intermediates_d;
//int **matrix_d;
int **result_d;
int datasize_f = MAXR*MAXC*sizeof(float);
int datasize_i = MAXR*MAXC*sizeof(int);
//Allocate memory on the host.
cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);
FILE *fp;
fp =fopen("arrays16.txt","r"); // reads in matrix
//clock_t start =clock();
for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%dt",&matrix[i][j]);
}
}
for(i=0;i<MAXR;i++)
{
printf("n");
for(j=0;j<MAXC;j++) {
printf("%dt",matrix[i][j]);
}
}
//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {
for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
for (i = 0; i < 16; i++) { // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.2f", intermediates[i][j]);
}
}
// copying the data from the host array to the device array
//cudaMemcpy(matrix_d, matrix, datasize,
//cudaMemcpyHostToDevice);
cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);
/*-----------------------------------------------------*/
/* applies mean filter to the original inputed matrix
* uses floor function to truncate the mean value for
* a 3 x 3 sliding window
* */
/*-------------------------------------------------------*/
// how many blocks we will allocate
dim3 blocks(1,1);
//how many threads per block we will allocate
dim3 threadsPerBlock(16,16);
//Launch Kernel
imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);
//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));
FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < 16 - 1; j++) {
printf("%3dt", result[i][j]);
fprintf(file, "%3dt", result[i][j]);
}
printf("n");
fprintf(file, "n");
}
fclose(file);
}
filter cuda gpu mean
add a comment |
up vote
-2
down vote
favorite
I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
Please advise.
#include
#include
#include
#include
#define MAXR 16
#define MAXC 16
__global__ void imagefilter(float **intermediates_d, int **result_d) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//result_d[2][2]= 5;
//if ((idx < 15) & (idy < 15)){
//result_d[2][2]= 5;
//if((idx>0) & (idy>0)){
__syncthreads();
result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) );
// result_d[2][2]= 5;
}
int main(void)
{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
float **intermediates_d;
//int **matrix_d;
int **result_d;
int datasize_f = MAXR*MAXC*sizeof(float);
int datasize_i = MAXR*MAXC*sizeof(int);
//Allocate memory on the host.
cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);
FILE *fp;
fp =fopen("arrays16.txt","r"); // reads in matrix
//clock_t start =clock();
for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%dt",&matrix[i][j]);
}
}
for(i=0;i<MAXR;i++)
{
printf("n");
for(j=0;j<MAXC;j++) {
printf("%dt",matrix[i][j]);
}
}
//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {
for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
for (i = 0; i < 16; i++) { // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.2f", intermediates[i][j]);
}
}
// copying the data from the host array to the device array
//cudaMemcpy(matrix_d, matrix, datasize,
//cudaMemcpyHostToDevice);
cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);
/*-----------------------------------------------------*/
/* applies mean filter to the original inputed matrix
* uses floor function to truncate the mean value for
* a 3 x 3 sliding window
* */
/*-------------------------------------------------------*/
// how many blocks we will allocate
dim3 blocks(1,1);
//how many threads per block we will allocate
dim3 threadsPerBlock(16,16);
//Launch Kernel
imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);
//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));
FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < 16 - 1; j++) {
printf("%3dt", result[i][j]);
fprintf(file, "%3dt", result[i][j]);
}
printf("n");
fprintf(file, "n");
}
fclose(file);
}
filter cuda gpu mean
add a comment |
up vote
-2
down vote
favorite
up vote
-2
down vote
favorite
I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
Please advise.
#include
#include
#include
#include
#define MAXR 16
#define MAXC 16
__global__ void imagefilter(float **intermediates_d, int **result_d) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//result_d[2][2]= 5;
//if ((idx < 15) & (idy < 15)){
//result_d[2][2]= 5;
//if((idx>0) & (idy>0)){
__syncthreads();
result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) );
// result_d[2][2]= 5;
}
int main(void)
{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
float **intermediates_d;
//int **matrix_d;
int **result_d;
int datasize_f = MAXR*MAXC*sizeof(float);
int datasize_i = MAXR*MAXC*sizeof(int);
//Allocate memory on the host.
cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);
FILE *fp;
fp =fopen("arrays16.txt","r"); // reads in matrix
//clock_t start =clock();
for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%dt",&matrix[i][j]);
}
}
for(i=0;i<MAXR;i++)
{
printf("n");
for(j=0;j<MAXC;j++) {
printf("%dt",matrix[i][j]);
}
}
//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {
for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
for (i = 0; i < 16; i++) { // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.2f", intermediates[i][j]);
}
}
// copying the data from the host array to the device array
//cudaMemcpy(matrix_d, matrix, datasize,
//cudaMemcpyHostToDevice);
cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);
/*-----------------------------------------------------*/
/* applies mean filter to the original inputed matrix
* uses floor function to truncate the mean value for
* a 3 x 3 sliding window
* */
/*-------------------------------------------------------*/
// how many blocks we will allocate
dim3 blocks(1,1);
//how many threads per block we will allocate
dim3 threadsPerBlock(16,16);
//Launch Kernel
imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);
//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));
FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < 16 - 1; j++) {
printf("%3dt", result[i][j]);
fprintf(file, "%3dt", result[i][j]);
}
printf("n");
fprintf(file, "n");
}
fclose(file);
}
filter cuda gpu mean
I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
Please advise.
#include
#include
#include
#include
#define MAXR 16
#define MAXC 16
__global__ void imagefilter(float **intermediates_d, int **result_d) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//result_d[2][2]= 5;
//if ((idx < 15) & (idy < 15)){
//result_d[2][2]= 5;
//if((idx>0) & (idy>0)){
__syncthreads();
result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) );
// result_d[2][2]= 5;
}
int main(void)
{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
float **intermediates_d;
//int **matrix_d;
int **result_d;
int datasize_f = MAXR*MAXC*sizeof(float);
int datasize_i = MAXR*MAXC*sizeof(int);
//Allocate memory on the host.
cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);
FILE *fp;
fp =fopen("arrays16.txt","r"); // reads in matrix
//clock_t start =clock();
for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%dt",&matrix[i][j]);
}
}
for(i=0;i<MAXR;i++)
{
printf("n");
for(j=0;j<MAXC;j++) {
printf("%dt",matrix[i][j]);
}
}
//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {
for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
for (i = 0; i < 16; i++) { // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.2f", intermediates[i][j]);
}
}
// copying the data from the host array to the device array
//cudaMemcpy(matrix_d, matrix, datasize,
//cudaMemcpyHostToDevice);
cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);
/*-----------------------------------------------------*/
/* applies mean filter to the original inputed matrix
* uses floor function to truncate the mean value for
* a 3 x 3 sliding window
* */
/*-------------------------------------------------------*/
// how many blocks we will allocate
dim3 blocks(1,1);
//how many threads per block we will allocate
dim3 threadsPerBlock(16,16);
//Launch Kernel
imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);
//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));
FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < 16 - 1; j++) {
printf("%3dt", result[i][j]);
fprintf(file, "%3dt", result[i][j]);
}
printf("n");
fprintf(file, "n");
}
fclose(file);
}
filter cuda gpu mean
filter cuda gpu mean
edited Nov 11 at 21:10
talonmies
58.8k17126192
58.8k17126192
asked Nov 10 at 18:24
Eden Shuster
11
11
add a comment |
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53242085%2fcuda-mean-filter-c-33-sliding-window%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown