Cuda Mean Filter C + 3*3 sliding window











up vote
-2
down vote

favorite












I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
Please advise.
#include
#include
#include
#include



#define MAXR 16
#define MAXC 16

__global__ void imagefilter(float **intermediates_d, int **result_d) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
//result_d[2][2]= 5;
//if ((idx < 15) & (idy < 15)){
//result_d[2][2]= 5;
//if((idx>0) & (idy>0)){
__syncthreads();
result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) );

// result_d[2][2]= 5;




}

int main(void)
{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
float **intermediates_d;
//int **matrix_d;
int **result_d;

int datasize_f = MAXR*MAXC*sizeof(float);
int datasize_i = MAXR*MAXC*sizeof(int);
//Allocate memory on the host.
cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);

FILE *fp;
fp =fopen("arrays16.txt","r"); // reads in matrix
//clock_t start =clock();
for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for(j=0;j<MAXC;j++)
{
fscanf(fp,"%dt",&matrix[i][j]);
}
}
for(i=0;i<MAXR;i++)
{
printf("n");
for(j=0;j<MAXC;j++) {
printf("%dt",matrix[i][j]);
}
}


//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {
for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}

for (i = 0; i < 16; i++) { // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.2f", intermediates[i][j]);

}
}

// copying the data from the host array to the device array
//cudaMemcpy(matrix_d, matrix, datasize,
//cudaMemcpyHostToDevice);
cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);

/*-----------------------------------------------------*/
/* applies mean filter to the original inputed matrix
* uses floor function to truncate the mean value for
* a 3 x 3 sliding window
* */
/*-------------------------------------------------------*/

// how many blocks we will allocate
dim3 blocks(1,1);

//how many threads per block we will allocate
dim3 threadsPerBlock(16,16);

//Launch Kernel
imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);

//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);


cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file

for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < 16 - 1; j++) {
printf("%3dt", result[i][j]);
fprintf(file, "%3dt", result[i][j]);
}

printf("n");
fprintf(file, "n");
}

fclose(file);

}









share|improve this question




























    up vote
    -2
    down vote

    favorite












    I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
    Please advise.
    #include
    #include
    #include
    #include



    #define MAXR 16
    #define MAXC 16

    __global__ void imagefilter(float **intermediates_d, int **result_d) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    //result_d[2][2]= 5;
    //if ((idx < 15) & (idy < 15)){
    //result_d[2][2]= 5;
    //if((idx>0) & (idy>0)){
    __syncthreads();
    result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
    + intermediates_d[idx - 1][idy]
    + intermediates_d[idx - 1][idy + 1]
    + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
    + intermediates_d[idx][idy + 1]
    + intermediates_d[idx + 1][idy - 1]
    + intermediates_d[idx + 1][idy]
    + intermediates_d[idx + 1][idy + 1]) );

    // result_d[2][2]= 5;




    }

    int main(void)
    {
    int i, j;
    //double cpu_time_used;
    float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
    int matrix[MAXR][MAXC]; // This is the input matrix from file
    int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
    float **intermediates_d;
    //int **matrix_d;
    int **result_d;

    int datasize_f = MAXR*MAXC*sizeof(float);
    int datasize_i = MAXR*MAXC*sizeof(int);
    //Allocate memory on the host.
    cudaMalloc((void**) &intermediates_d, datasize_f);
    //cudaMalloc((void**) &matrix_d, datasize);
    cudaMalloc((void**) &result_d, datasize_i);

    FILE *fp;
    fp =fopen("arrays16.txt","r"); // reads in matrix
    //clock_t start =clock();
    for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
    {
    for(j=0;j<MAXC;j++)
    {
    fscanf(fp,"%dt",&matrix[i][j]);
    }
    }
    for(i=0;i<MAXR;i++)
    {
    printf("n");
    for(j=0;j<MAXC;j++) {
    printf("%dt",matrix[i][j]);
    }
    }


    //This is where we convert the input matrix into floating point in intermediate matrix
    for (int y = 0; y < MAXR; y++) {
    for (int x = 0; x < MAXC; x++) {
    intermediates[y][x] = (float) matrix[y][x];
    }
    }

    for (i = 0; i < 16; i++) { // prints out the results array to .txt file
    for (j = 0; j < 16; j++) {
    printf("%.2f", intermediates[i][j]);

    }
    }

    // copying the data from the host array to the device array
    //cudaMemcpy(matrix_d, matrix, datasize,
    //cudaMemcpyHostToDevice);
    cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
    cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);

    /*-----------------------------------------------------*/
    /* applies mean filter to the original inputed matrix
    * uses floor function to truncate the mean value for
    * a 3 x 3 sliding window
    * */
    /*-------------------------------------------------------*/

    // how many blocks we will allocate
    dim3 blocks(1,1);

    //how many threads per block we will allocate
    dim3 threadsPerBlock(16,16);

    //Launch Kernel
    imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);

    //Copy back Results Matrix.
    cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);


    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
    printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
    printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

    FILE *file;
    file = fopen("results.txt", "w+"); // writes matrix to file

    for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
    for (j = 1; j < 16 - 1; j++) {
    printf("%3dt", result[i][j]);
    fprintf(file, "%3dt", result[i][j]);
    }

    printf("n");
    fprintf(file, "n");
    }

    fclose(file);

    }









    share|improve this question


























      up vote
      -2
      down vote

      favorite









      up vote
      -2
      down vote

      favorite











      I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
      Please advise.
      #include
      #include
      #include
      #include



      #define MAXR 16
      #define MAXC 16

      __global__ void imagefilter(float **intermediates_d, int **result_d) {
      int idx = blockDim.x * blockIdx.x + threadIdx.x;
      int idy = blockDim.y * blockIdx.y + threadIdx.y;
      //result_d[2][2]= 5;
      //if ((idx < 15) & (idy < 15)){
      //result_d[2][2]= 5;
      //if((idx>0) & (idy>0)){
      __syncthreads();
      result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
      + intermediates_d[idx - 1][idy]
      + intermediates_d[idx - 1][idy + 1]
      + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
      + intermediates_d[idx][idy + 1]
      + intermediates_d[idx + 1][idy - 1]
      + intermediates_d[idx + 1][idy]
      + intermediates_d[idx + 1][idy + 1]) );

      // result_d[2][2]= 5;




      }

      int main(void)
      {
      int i, j;
      //double cpu_time_used;
      float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
      int matrix[MAXR][MAXC]; // This is the input matrix from file
      int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
      float **intermediates_d;
      //int **matrix_d;
      int **result_d;

      int datasize_f = MAXR*MAXC*sizeof(float);
      int datasize_i = MAXR*MAXC*sizeof(int);
      //Allocate memory on the host.
      cudaMalloc((void**) &intermediates_d, datasize_f);
      //cudaMalloc((void**) &matrix_d, datasize);
      cudaMalloc((void**) &result_d, datasize_i);

      FILE *fp;
      fp =fopen("arrays16.txt","r"); // reads in matrix
      //clock_t start =clock();
      for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
      {
      for(j=0;j<MAXC;j++)
      {
      fscanf(fp,"%dt",&matrix[i][j]);
      }
      }
      for(i=0;i<MAXR;i++)
      {
      printf("n");
      for(j=0;j<MAXC;j++) {
      printf("%dt",matrix[i][j]);
      }
      }


      //This is where we convert the input matrix into floating point in intermediate matrix
      for (int y = 0; y < MAXR; y++) {
      for (int x = 0; x < MAXC; x++) {
      intermediates[y][x] = (float) matrix[y][x];
      }
      }

      for (i = 0; i < 16; i++) { // prints out the results array to .txt file
      for (j = 0; j < 16; j++) {
      printf("%.2f", intermediates[i][j]);

      }
      }

      // copying the data from the host array to the device array
      //cudaMemcpy(matrix_d, matrix, datasize,
      //cudaMemcpyHostToDevice);
      cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
      cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);

      /*-----------------------------------------------------*/
      /* applies mean filter to the original inputed matrix
      * uses floor function to truncate the mean value for
      * a 3 x 3 sliding window
      * */
      /*-------------------------------------------------------*/

      // how many blocks we will allocate
      dim3 blocks(1,1);

      //how many threads per block we will allocate
      dim3 threadsPerBlock(16,16);

      //Launch Kernel
      imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);

      //Copy back Results Matrix.
      cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);


      cudaError_t errSync = cudaGetLastError();
      cudaError_t errAsync = cudaDeviceSynchronize();
      if (errSync != cudaSuccess)
      printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
      if (errAsync != cudaSuccess)
      printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

      FILE *file;
      file = fopen("results.txt", "w+"); // writes matrix to file

      for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
      for (j = 1; j < 16 - 1; j++) {
      printf("%3dt", result[i][j]);
      fprintf(file, "%3dt", result[i][j]);
      }

      printf("n");
      fprintf(file, "n");
      }

      fclose(file);

      }









      share|improve this question















      I am working on an assignment to create a 3 by 3 sliding window on a 256 by 256 matrix of random values between 0-31. Here im testing it on a 16 by 16 array. Im trying to solve it using 1 block with 16*16 threads, but am open to other suggestions. I am getting a bounds error for the kernel launch. I as you can see with the comments, I tried to implement a boundary to fix the problem, but this resulted in approximately 200 additional errors. I believe something is wrong with the way that I am indexing, but I cant figure out how to fix it. The errors that I am getting now, are that I am out of bounds for threads (0,x,0) block (0,0,0) for x between 1-16. I understand that this is because the kernel is trying to take idx-1 when idx is 0, or idy-1 when idy is 0, but for some reason I cant fix the issue.
      Please advise.
      #include
      #include
      #include
      #include



      #define MAXR 16
      #define MAXC 16

      __global__ void imagefilter(float **intermediates_d, int **result_d) {
      int idx = blockDim.x * blockIdx.x + threadIdx.x;
      int idy = blockDim.y * blockIdx.y + threadIdx.y;
      //result_d[2][2]= 5;
      //if ((idx < 15) & (idy < 15)){
      //result_d[2][2]= 5;
      //if((idx>0) & (idy>0)){
      __syncthreads();
      result_d[idx][idy] = (int) ( (float) (intermediates_d[idx - 1][idy - 1]
      + intermediates_d[idx - 1][idy]
      + intermediates_d[idx - 1][idy + 1]
      + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
      + intermediates_d[idx][idy + 1]
      + intermediates_d[idx + 1][idy - 1]
      + intermediates_d[idx + 1][idy]
      + intermediates_d[idx + 1][idy + 1]) );

      // result_d[2][2]= 5;




      }

      int main(void)
      {
      int i, j;
      //double cpu_time_used;
      float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
      int matrix[MAXR][MAXC]; // This is the input matrix from file
      int result[MAXR][MAXC];//={{0}}; //This is where we want to write the mean values. For now set to zeros
      float **intermediates_d;
      //int **matrix_d;
      int **result_d;

      int datasize_f = MAXR*MAXC*sizeof(float);
      int datasize_i = MAXR*MAXC*sizeof(int);
      //Allocate memory on the host.
      cudaMalloc((void**) &intermediates_d, datasize_f);
      //cudaMalloc((void**) &matrix_d, datasize);
      cudaMalloc((void**) &result_d, datasize_i);

      FILE *fp;
      fp =fopen("arrays16.txt","r"); // reads in matrix
      //clock_t start =clock();
      for(i=0;i< MAXR;i++) // this loop takes the information from .txt file and puts it into arr1 matrix
      {
      for(j=0;j<MAXC;j++)
      {
      fscanf(fp,"%dt",&matrix[i][j]);
      }
      }
      for(i=0;i<MAXR;i++)
      {
      printf("n");
      for(j=0;j<MAXC;j++) {
      printf("%dt",matrix[i][j]);
      }
      }


      //This is where we convert the input matrix into floating point in intermediate matrix
      for (int y = 0; y < MAXR; y++) {
      for (int x = 0; x < MAXC; x++) {
      intermediates[y][x] = (float) matrix[y][x];
      }
      }

      for (i = 0; i < 16; i++) { // prints out the results array to .txt file
      for (j = 0; j < 16; j++) {
      printf("%.2f", intermediates[i][j]);

      }
      }

      // copying the data from the host array to the device array
      //cudaMemcpy(matrix_d, matrix, datasize,
      //cudaMemcpyHostToDevice);
      cudaMemcpy(intermediates_d, intermediates, 4*datasize_f,cudaMemcpyHostToDevice);
      cudaMemcpy(result_d, result, datasize_i,cudaMemcpyHostToDevice);

      /*-----------------------------------------------------*/
      /* applies mean filter to the original inputed matrix
      * uses floor function to truncate the mean value for
      * a 3 x 3 sliding window
      * */
      /*-------------------------------------------------------*/

      // how many blocks we will allocate
      dim3 blocks(1,1);

      //how many threads per block we will allocate
      dim3 threadsPerBlock(16,16);

      //Launch Kernel
      imagefilter<<<blocks, threadsPerBlock>>>(intermediates_d,result_d);

      //Copy back Results Matrix.
      cudaMemcpy(result, result_d, datasize_f,cudaMemcpyDeviceToHost);


      cudaError_t errSync = cudaGetLastError();
      cudaError_t errAsync = cudaDeviceSynchronize();
      if (errSync != cudaSuccess)
      printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
      if (errAsync != cudaSuccess)
      printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

      FILE *file;
      file = fopen("results.txt", "w+"); // writes matrix to file

      for (i = 1; i < 16 - 1; i++) { // prints out the results array to .txt file
      for (j = 1; j < 16 - 1; j++) {
      printf("%3dt", result[i][j]);
      fprintf(file, "%3dt", result[i][j]);
      }

      printf("n");
      fprintf(file, "n");
      }

      fclose(file);

      }






      filter cuda gpu mean






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited Nov 11 at 21:10









      talonmies

      58.8k17126192




      58.8k17126192










      asked Nov 10 at 18:24









      Eden Shuster

      11




      11





























          active

          oldest

          votes











          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














           

          draft saved


          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53242085%2fcuda-mean-filter-c-33-sliding-window%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown






























          active

          oldest

          votes













          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















           

          draft saved


          draft discarded



















































           


          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53242085%2fcuda-mean-filter-c-33-sliding-window%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Florida Star v. B. J. F.

          Danny Elfman

          Lugert, Oklahoma