CUDA C + MEAN FILTER + SYNC/ASYNC ERRORS











up vote
-1
down vote

favorite












I'm trying to produce a mean filter of sliding window 3 x 3. I'm inputting a 16 x 16 array (arrays16.txt --> int matrix) and trying to allocate 16 x 16 threads for each block (1 block right now). Using cuda-memcheck i'm receiving a number of sync/async errors and I have been going around and around. Is there something obviously wrong? I understand it is some sort of segfault and probably an issue with my pointers but I can't seem to get past it.Using cuda-memcheck with the -lineinfo flag it seems to be happening in the loop in the kernel.



Here is my code, thank you:



#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define MAXR 16
#define MAXC 16

__global__ void imagefilter(float ** intermediates_d, int ** result_d) {

int idx = threadIdx.x;
int idy = threadIdx.y;

int x,y;
//result_d[2][2]= 5;
//if ((idx < 15) && (idy < 15)) {
result_d[x][y] = result_d[idx][idy];
for(x=1; x < MAXR; x++) {
for(y=1; y < MAXC; y++) {
result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) / 9.0F));
// result_d[2][2]= 5;
result_d[idy][idx]= result_d[y][x];
}
}
}



__syncthreads();



int main(void)

{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros
float ** intermediates_d;
//int **matrix_d;

int ** result_d;
int datasize_f = MAXR * MAXC * sizeof(float);
int datasize_i = MAXR * MAXC * sizeof(int);
//Allocate memory on the host.

cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);

FILE *fp;
fp = fopen("arrays16.txt", "r"); // reads in matrix
//clock_t start =clock();
for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for (j = 0; j < MAXC; j++)
{
fscanf(fp, "%dt", &matrix[i][j]);
}
}

printf("*****INPUT MATRIX*****n");
for (i = 0; i < MAXR; i++)
{
printf("n");
for (j = 0; j < MAXC; j++) {
printf("%d ", matrix[i][j]);
}
}
printf("nn");

//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {

for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
printf("*******INTERMEDIATE MATRIX*******n");
for (i = 0; i < 16; i++) {
printf("n"); // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.1f ", intermediates[i][j]);
}
}
printf("nn");
// copying the data from the host array to the device array

//cudaMemcpy(matrix_d, matrix, datasize,

//cudaMemcpyHostToDevice);

cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);

// how many blocks we will allocate
dim3 blocks(1, 1);

//how many threads per block we will allocate
dim3 threadsPerBlock(16, 16);

//Launch Kernel
imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);

//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);

cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();

if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
printf("*******RESULTS MATRIX******nn");
for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < MAXC - 1; j++) {
printf("%d ", result[i][j]);
fprintf(file, "%d ",result[i][j]);
}
printf("n");

fprintf(file, "n");
}

fclose(file);
}









share|improve this question
























  • You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
    – Florent DUGUET
    Nov 10 at 23:42










  • when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
    – Yeinberg
    Nov 11 at 0:24










  • any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
    – Yeinberg
    Nov 11 at 5:00






  • 1




    The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
    – Florent DUGUET
    Nov 11 at 5:41










  • Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
    – Yeinberg
    Nov 11 at 15:18















up vote
-1
down vote

favorite












I'm trying to produce a mean filter of sliding window 3 x 3. I'm inputting a 16 x 16 array (arrays16.txt --> int matrix) and trying to allocate 16 x 16 threads for each block (1 block right now). Using cuda-memcheck i'm receiving a number of sync/async errors and I have been going around and around. Is there something obviously wrong? I understand it is some sort of segfault and probably an issue with my pointers but I can't seem to get past it.Using cuda-memcheck with the -lineinfo flag it seems to be happening in the loop in the kernel.



Here is my code, thank you:



#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define MAXR 16
#define MAXC 16

__global__ void imagefilter(float ** intermediates_d, int ** result_d) {

int idx = threadIdx.x;
int idy = threadIdx.y;

int x,y;
//result_d[2][2]= 5;
//if ((idx < 15) && (idy < 15)) {
result_d[x][y] = result_d[idx][idy];
for(x=1; x < MAXR; x++) {
for(y=1; y < MAXC; y++) {
result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) / 9.0F));
// result_d[2][2]= 5;
result_d[idy][idx]= result_d[y][x];
}
}
}



__syncthreads();



int main(void)

{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros
float ** intermediates_d;
//int **matrix_d;

int ** result_d;
int datasize_f = MAXR * MAXC * sizeof(float);
int datasize_i = MAXR * MAXC * sizeof(int);
//Allocate memory on the host.

cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);

FILE *fp;
fp = fopen("arrays16.txt", "r"); // reads in matrix
//clock_t start =clock();
for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for (j = 0; j < MAXC; j++)
{
fscanf(fp, "%dt", &matrix[i][j]);
}
}

printf("*****INPUT MATRIX*****n");
for (i = 0; i < MAXR; i++)
{
printf("n");
for (j = 0; j < MAXC; j++) {
printf("%d ", matrix[i][j]);
}
}
printf("nn");

//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {

for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
printf("*******INTERMEDIATE MATRIX*******n");
for (i = 0; i < 16; i++) {
printf("n"); // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.1f ", intermediates[i][j]);
}
}
printf("nn");
// copying the data from the host array to the device array

//cudaMemcpy(matrix_d, matrix, datasize,

//cudaMemcpyHostToDevice);

cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);

// how many blocks we will allocate
dim3 blocks(1, 1);

//how many threads per block we will allocate
dim3 threadsPerBlock(16, 16);

//Launch Kernel
imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);

//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);

cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();

if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
printf("*******RESULTS MATRIX******nn");
for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < MAXC - 1; j++) {
printf("%d ", result[i][j]);
fprintf(file, "%d ",result[i][j]);
}
printf("n");

fprintf(file, "n");
}

fclose(file);
}









share|improve this question
























  • You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
    – Florent DUGUET
    Nov 10 at 23:42










  • when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
    – Yeinberg
    Nov 11 at 0:24










  • any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
    – Yeinberg
    Nov 11 at 5:00






  • 1




    The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
    – Florent DUGUET
    Nov 11 at 5:41










  • Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
    – Yeinberg
    Nov 11 at 15:18













up vote
-1
down vote

favorite









up vote
-1
down vote

favorite











I'm trying to produce a mean filter of sliding window 3 x 3. I'm inputting a 16 x 16 array (arrays16.txt --> int matrix) and trying to allocate 16 x 16 threads for each block (1 block right now). Using cuda-memcheck i'm receiving a number of sync/async errors and I have been going around and around. Is there something obviously wrong? I understand it is some sort of segfault and probably an issue with my pointers but I can't seem to get past it.Using cuda-memcheck with the -lineinfo flag it seems to be happening in the loop in the kernel.



Here is my code, thank you:



#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define MAXR 16
#define MAXC 16

__global__ void imagefilter(float ** intermediates_d, int ** result_d) {

int idx = threadIdx.x;
int idy = threadIdx.y;

int x,y;
//result_d[2][2]= 5;
//if ((idx < 15) && (idy < 15)) {
result_d[x][y] = result_d[idx][idy];
for(x=1; x < MAXR; x++) {
for(y=1; y < MAXC; y++) {
result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) / 9.0F));
// result_d[2][2]= 5;
result_d[idy][idx]= result_d[y][x];
}
}
}



__syncthreads();



int main(void)

{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros
float ** intermediates_d;
//int **matrix_d;

int ** result_d;
int datasize_f = MAXR * MAXC * sizeof(float);
int datasize_i = MAXR * MAXC * sizeof(int);
//Allocate memory on the host.

cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);

FILE *fp;
fp = fopen("arrays16.txt", "r"); // reads in matrix
//clock_t start =clock();
for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for (j = 0; j < MAXC; j++)
{
fscanf(fp, "%dt", &matrix[i][j]);
}
}

printf("*****INPUT MATRIX*****n");
for (i = 0; i < MAXR; i++)
{
printf("n");
for (j = 0; j < MAXC; j++) {
printf("%d ", matrix[i][j]);
}
}
printf("nn");

//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {

for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
printf("*******INTERMEDIATE MATRIX*******n");
for (i = 0; i < 16; i++) {
printf("n"); // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.1f ", intermediates[i][j]);
}
}
printf("nn");
// copying the data from the host array to the device array

//cudaMemcpy(matrix_d, matrix, datasize,

//cudaMemcpyHostToDevice);

cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);

// how many blocks we will allocate
dim3 blocks(1, 1);

//how many threads per block we will allocate
dim3 threadsPerBlock(16, 16);

//Launch Kernel
imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);

//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);

cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();

if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
printf("*******RESULTS MATRIX******nn");
for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < MAXC - 1; j++) {
printf("%d ", result[i][j]);
fprintf(file, "%d ",result[i][j]);
}
printf("n");

fprintf(file, "n");
}

fclose(file);
}









share|improve this question















I'm trying to produce a mean filter of sliding window 3 x 3. I'm inputting a 16 x 16 array (arrays16.txt --> int matrix) and trying to allocate 16 x 16 threads for each block (1 block right now). Using cuda-memcheck i'm receiving a number of sync/async errors and I have been going around and around. Is there something obviously wrong? I understand it is some sort of segfault and probably an issue with my pointers but I can't seem to get past it.Using cuda-memcheck with the -lineinfo flag it seems to be happening in the loop in the kernel.



Here is my code, thank you:



#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define MAXR 16
#define MAXC 16

__global__ void imagefilter(float ** intermediates_d, int ** result_d) {

int idx = threadIdx.x;
int idy = threadIdx.y;

int x,y;
//result_d[2][2]= 5;
//if ((idx < 15) && (idy < 15)) {
result_d[x][y] = result_d[idx][idy];
for(x=1; x < MAXR; x++) {
for(y=1; y < MAXC; y++) {
result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) / 9.0F));
// result_d[2][2]= 5;
result_d[idy][idx]= result_d[y][x];
}
}
}



__syncthreads();



int main(void)

{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros
float ** intermediates_d;
//int **matrix_d;

int ** result_d;
int datasize_f = MAXR * MAXC * sizeof(float);
int datasize_i = MAXR * MAXC * sizeof(int);
//Allocate memory on the host.

cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);

FILE *fp;
fp = fopen("arrays16.txt", "r"); // reads in matrix
//clock_t start =clock();
for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for (j = 0; j < MAXC; j++)
{
fscanf(fp, "%dt", &matrix[i][j]);
}
}

printf("*****INPUT MATRIX*****n");
for (i = 0; i < MAXR; i++)
{
printf("n");
for (j = 0; j < MAXC; j++) {
printf("%d ", matrix[i][j]);
}
}
printf("nn");

//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {

for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
printf("*******INTERMEDIATE MATRIX*******n");
for (i = 0; i < 16; i++) {
printf("n"); // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.1f ", intermediates[i][j]);
}
}
printf("nn");
// copying the data from the host array to the device array

//cudaMemcpy(matrix_d, matrix, datasize,

//cudaMemcpyHostToDevice);

cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);

// how many blocks we will allocate
dim3 blocks(1, 1);

//how many threads per block we will allocate
dim3 threadsPerBlock(16, 16);

//Launch Kernel
imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);

//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);

cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();

if (errSync != cudaSuccess)
printf("Sync kernel error: %sn", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %sn", cudaGetErrorString(errAsync));

FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
printf("*******RESULTS MATRIX******nn");
for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < MAXC - 1; j++) {
printf("%d ", result[i][j]);
fprintf(file, "%d ",result[i][j]);
}
printf("n");

fprintf(file, "n");
}

fclose(file);
}






filter parallel-processing cuda box






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited Nov 11 at 9:41









talonmies

58.9k17127194




58.9k17127194










asked Nov 10 at 22:20









Yeinberg

33




33












  • You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
    – Florent DUGUET
    Nov 10 at 23:42










  • when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
    – Yeinberg
    Nov 11 at 0:24










  • any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
    – Yeinberg
    Nov 11 at 5:00






  • 1




    The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
    – Florent DUGUET
    Nov 11 at 5:41










  • Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
    – Yeinberg
    Nov 11 at 15:18


















  • You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
    – Florent DUGUET
    Nov 10 at 23:42










  • when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
    – Yeinberg
    Nov 11 at 0:24










  • any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
    – Yeinberg
    Nov 11 at 5:00






  • 1




    The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
    – Florent DUGUET
    Nov 11 at 5:41










  • Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
    – Yeinberg
    Nov 11 at 15:18
















You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42




You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
– Florent DUGUET
Nov 10 at 23:42












when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24




when i try that, i receive a "error: expression must have pointer-to-object type". It seems im having kernel launch failures because when i print out the results_d after having copied back to the host as results (host) it prints out all zeros. Which is the way it was "filled up" before copying to the device.
– Yeinberg
Nov 11 at 0:24












any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00




any advice? im a beginner in CUDA and trying to fully understand why im receiving the errors im receiving and seemingly no communication between device and host given the results matrix prints out as all zeros
– Yeinberg
Nov 11 at 5:00




1




1




The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41




The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
– Florent DUGUET
Nov 11 at 5:41












Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18




Yes I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. can you please be a bit more specific?
– Yeinberg
Nov 11 at 15:18

















active

oldest

votes











Your Answer






StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});


}
});














draft saved

draft discarded


















StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53244001%2fcuda-c-mean-filter-sync-async-errors%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown






























active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes
















draft saved

draft discarded




















































Thanks for contributing an answer to Stack Overflow!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.





Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


Please pay close attention to the following guidance:


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53244001%2fcuda-c-mean-filter-sync-async-errors%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

Guess what letter conforming each word

Run scheduled task as local user group (not BUILTIN)

Port of Spain