Cuda Mean Filter C + 3*3 sliding window

up vote
-2
down vote

favorite

I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
Please advise.
#include
#include
#include
#include

#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float **intermediates_d, int **result_d) {

    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    int idy = blockDim.y * blockIdx.y + threadIdx.y;

    //result_d[2][2]= 5;

    //if ((idx < 15) & (idy < 15)){

    //result_d[2][2]= 5;

    //if((idx>0) & (idy>0)){

    __syncthreads();

    result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]

                                                                   + intermediates_d[idx - 1][idy]

                                                                                              + intermediates_d[idx - 1][idy + 1]

                                                                                                                         + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

                                                                                                                                                                                + intermediates_d[idx][idy + 1]

                                                                                                                                                                                                       + intermediates_d[idx + 1][idy - 1]

                                                                                                                                                                                                                                  + intermediates_d[idx + 1][idy]

                                                                                                                                                                                                                                                             + intermediates_d[idx + 1][idy + 1]) );



    //  result_d[2][2]= 5;









}



int main(void)

{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros

    float **intermediates_d;

    //int **matrix_d;

    int **result_d;



    int datasize_f = MAXR*MAXC*sizeof(float);

    int datasize_i = MAXR*MAXC*sizeof(int);

    //Allocate memory on the host.

    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp =fopen("arrays16.txt","r"); // reads in matrix

    //clock_t start =clock();

    for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix

    {

        for(j=0;j<MAXC;j++)

        {

            fscanf(fp,"%dt",&matrix[i][j]);

        }

    }

    for(i=0;i<MAXR;i++)

    {

        printf("n");

        for(j=0;j<MAXC;j++) {

            printf("%dt",matrix[i][j]);

        }

    }





    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {

        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }



    for (i = 0; i < 16; i++) { // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.2f", intermediates[i][j]);



        }

    }



    // copying the data from the host array to the device array

    //cudaMemcpy(matrix_d, matrix, datasize,

    //cudaMemcpyHostToDevice);

    cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);



    /*-----------------------------------------------------*/

    /*  applies mean filter to the original inputed matrix

     * uses floor function to truncate the mean value for

     *          a 3 x 3 sliding window

     *                                                          */

    /*-------------------------------------------------------*/



    // how many blocks we will allocate

    dim3 blocks(1,1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16,16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);





    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();

    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file



    for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < 16 - 1; j++) {

            printf("%3dt", result[i][j]);

            fprintf(file, "%3dt", result[i][j]);

        }



        printf("n");

        fprintf(file, "n");

    }



    fclose(file);



}

edited Nov 11 at 21:10

talonmies

58.8k17126192

asked Nov 10 at 18:24

Eden Shuster

add a comment |

up vote
-2
down vote

favorite

#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float **intermediates_d, int **result_d) {

    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    int idy = blockDim.y * blockIdx.y + threadIdx.y;

    //result_d[2][2]= 5;

    //if ((idx < 15) & (idy < 15)){

    //result_d[2][2]= 5;

    //if((idx>0) & (idy>0)){

    __syncthreads();

    result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]

                                                                   + intermediates_d[idx - 1][idy]

                                                                                              + intermediates_d[idx - 1][idy + 1]

                                                                                                                         + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

                                                                                                                                                                                + intermediates_d[idx][idy + 1]

                                                                                                                                                                                                       + intermediates_d[idx + 1][idy - 1]

                                                                                                                                                                                                                                  + intermediates_d[idx + 1][idy]

                                                                                                                                                                                                                                                             + intermediates_d[idx + 1][idy + 1]) );



    //  result_d[2][2]= 5;









}



int main(void)

{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros

    float **intermediates_d;

    //int **matrix_d;

    int **result_d;



    int datasize_f = MAXR*MAXC*sizeof(float);

    int datasize_i = MAXR*MAXC*sizeof(int);

    //Allocate memory on the host.

    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp =fopen("arrays16.txt","r"); // reads in matrix

    //clock_t start =clock();

    for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix

    {

        for(j=0;j<MAXC;j++)

        {

            fscanf(fp,"%dt",&matrix[i][j]);

        }

    }

    for(i=0;i<MAXR;i++)

    {

        printf("n");

        for(j=0;j<MAXC;j++) {

            printf("%dt",matrix[i][j]);

        }

    }





    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {

        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }



    for (i = 0; i < 16; i++) { // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.2f", intermediates[i][j]);



        }

    }



    // copying the data from the host array to the device array

    //cudaMemcpy(matrix_d, matrix, datasize,

    //cudaMemcpyHostToDevice);

    cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);



    /*-----------------------------------------------------*/

    /*  applies mean filter to the original inputed matrix

     * uses floor function to truncate the mean value for

     *          a 3 x 3 sliding window

     *                                                          */

    /*-------------------------------------------------------*/



    // how many blocks we will allocate

    dim3 blocks(1,1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16,16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);





    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();

    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file



    for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < 16 - 1; j++) {

            printf("%3dt", result[i][j]);

            fprintf(file, "%3dt", result[i][j]);

        }



        printf("n");

        fprintf(file, "n");

    }



    fclose(file);



}

edited Nov 11 at 21:10

talonmies

58.8k17126192

asked Nov 10 at 18:24

Eden Shuster

add a comment |

up vote
-2
down vote

favorite

#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float **intermediates_d, int **result_d) {

    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    int idy = blockDim.y * blockIdx.y + threadIdx.y;

    //result_d[2][2]= 5;

    //if ((idx < 15) & (idy < 15)){

    //result_d[2][2]= 5;

    //if((idx>0) & (idy>0)){

    __syncthreads();

    result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]

                                                                   + intermediates_d[idx - 1][idy]

                                                                                              + intermediates_d[idx - 1][idy + 1]

                                                                                                                         + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

                                                                                                                                                                                + intermediates_d[idx][idy + 1]

                                                                                                                                                                                                       + intermediates_d[idx + 1][idy - 1]

                                                                                                                                                                                                                                  + intermediates_d[idx + 1][idy]

                                                                                                                                                                                                                                                             + intermediates_d[idx + 1][idy + 1]) );



    //  result_d[2][2]= 5;









}



int main(void)

{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros

    float **intermediates_d;

    //int **matrix_d;

    int **result_d;



    int datasize_f = MAXR*MAXC*sizeof(float);

    int datasize_i = MAXR*MAXC*sizeof(int);

    //Allocate memory on the host.

    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp =fopen("arrays16.txt","r"); // reads in matrix

    //clock_t start =clock();

    for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix

    {

        for(j=0;j<MAXC;j++)

        {

            fscanf(fp,"%dt",&matrix[i][j]);

        }

    }

    for(i=0;i<MAXR;i++)

    {

        printf("n");

        for(j=0;j<MAXC;j++) {

            printf("%dt",matrix[i][j]);

        }

    }





    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {

        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }



    for (i = 0; i < 16; i++) { // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.2f", intermediates[i][j]);



        }

    }



    // copying the data from the host array to the device array

    //cudaMemcpy(matrix_d, matrix, datasize,

    //cudaMemcpyHostToDevice);

    cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);



    /*-----------------------------------------------------*/

    /*  applies mean filter to the original inputed matrix

     * uses floor function to truncate the mean value for

     *          a 3 x 3 sliding window

     *                                                          */

    /*-------------------------------------------------------*/



    // how many blocks we will allocate

    dim3 blocks(1,1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16,16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);





    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();

    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file



    for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < 16 - 1; j++) {

            printf("%3dt", result[i][j]);

            fprintf(file, "%3dt", result[i][j]);

        }



        printf("n");

        fprintf(file, "n");

    }



    fclose(file);



}

edited Nov 11 at 21:10

talonmies

58.8k17126192

asked Nov 10 at 18:24

Eden Shuster

#define MAXR 16

#define MAXC 16



__global__ void imagefilter(float **intermediates_d, int **result_d) {

    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    int idy = blockDim.y * blockIdx.y + threadIdx.y;

    //result_d[2][2]= 5;

    //if ((idx < 15) & (idy < 15)){

    //result_d[2][2]= 5;

    //if((idx>0) & (idy>0)){

    __syncthreads();

    result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]

                                                                   + intermediates_d[idx - 1][idy]

                                                                                              + intermediates_d[idx - 1][idy + 1]

                                                                                                                         + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]

                                                                                                                                                                                + intermediates_d[idx][idy + 1]

                                                                                                                                                                                                       + intermediates_d[idx + 1][idy - 1]

                                                                                                                                                                                                                                  + intermediates_d[idx + 1][idy]

                                                                                                                                                                                                                                                             + intermediates_d[idx + 1][idy + 1]) );



    //  result_d[2][2]= 5;









}



int main(void)

{

    int i, j;

    //double cpu_time_used;

    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating

    int matrix[MAXR][MAXC]; // This is the input matrix from file

    int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros

    float **intermediates_d;

    //int **matrix_d;

    int **result_d;



    int datasize_f = MAXR*MAXC*sizeof(float);

    int datasize_i = MAXR*MAXC*sizeof(int);

    //Allocate memory on the host.

    cudaMalloc((void**) &intermediates_d, datasize_f);

    //cudaMalloc((void**) &matrix_d, datasize);

    cudaMalloc((void**) &result_d, datasize_i);



    FILE *fp;

    fp =fopen("arrays16.txt","r"); // reads in matrix

    //clock_t start =clock();

    for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix

    {

        for(j=0;j<MAXC;j++)

        {

            fscanf(fp,"%dt",&matrix[i][j]);

        }

    }

    for(i=0;i<MAXR;i++)

    {

        printf("n");

        for(j=0;j<MAXC;j++) {

            printf("%dt",matrix[i][j]);

        }

    }





    //This is where we convert the input matrix into floating point in intermediate matrix

    for (int y = 0; y < MAXR; y++) {

        for (int x = 0; x < MAXC; x++) {

            intermediates[y][x] = (float) matrix[y][x];

        }

    }



    for (i = 0; i < 16; i++) { // prints out the results array to .txt file

        for (j = 0; j < 16; j++) {

            printf("%.2f", intermediates[i][j]);



        }

    }



    // copying the data from the host array to the device array

    //cudaMemcpy(matrix_d, matrix, datasize,

    //cudaMemcpyHostToDevice);

    cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);

    cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);



    /*-----------------------------------------------------*/

    /*  applies mean filter to the original inputed matrix

     * uses floor function to truncate the mean value for

     *          a 3 x 3 sliding window

     *                                                          */

    /*-------------------------------------------------------*/



    // how many blocks we will allocate

    dim3 blocks(1,1);



    //how many threads per block we will allocate

    dim3 threadsPerBlock(16,16);



    //Launch Kernel

    imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);



    //Copy back Results Matrix.

    cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);





    cudaError_t errSync = cudaGetLastError();

    cudaError_t errAsync = cudaDeviceSynchronize();

    if (errSync != cudaSuccess)

        printf("Sync kernel error: %sn", cudaGetErrorString(errSync));

    if (errAsync != cudaSuccess)

        printf("Async kernel error: %sn", cudaGetErrorString(errAsync));



    FILE *file;

    file = fopen("results.txt", "w+"); // writes  matrix to file



    for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file

        for (j = 1; j < 16 - 1; j++) {

            printf("%3dt", result[i][j]);

            fprintf(file, "%3dt", result[i][j]);

        }



        printf("n");

        fprintf(file, "n");

    }



    fclose(file);



}

filter cuda gpu mean

edited Nov 11 at 21:10

talonmies

58.8k17126192

asked Nov 10 at 18:24

Eden Shuster

edited Nov 11 at 21:10

talonmies

58.8k17126192

asked Nov 10 at 18:24

Eden Shuster

edited Nov 11 at 21:10

talonmies

58.8k17126192

edited Nov 11 at 21:10

talonmies

58.8k17126192

edited Nov 11 at 21:10

talonmies

58.8k17126192

asked Nov 10 at 18:24

Eden Shuster

asked Nov 10 at 18:24

Eden Shuster

asked Nov 10 at 18:24

Eden Shuster

add a comment |

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53242085%2fcuda-mean-filter-c-33-sliding-window%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Ndtyjky