CUDA C + MEAN FILTER + SYNC/ASYNC ERRORS

I'm trying to produce a mean filter with a 3 x 3 sliding window. I'm reading in a 16 x 16 array (arrays16.txt, loaded into an int matrix) and trying to allocate 16 x 16 threads per block (one block for now). Under cuda-memcheck I'm receiving a number of sync/async errors and I have been going around in circles. Is there something obviously wrong? I understand it is some sort of invalid memory access, probably an issue with my pointers, but I can't seem to get past it. Running cuda-memcheck on a build compiled with the -lineinfo flag, the errors seem to come from the loop in the kernel.



Here is my code, thank you:



#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>

#define MAXR 16
#define MAXC 16

__global__ void imagefilter(float ** intermediates_d, int ** result_d) {

    int idx = threadIdx.x;
    int idy = threadIdx.y;

    int x, y;
    //result_d[2][2] = 5;
    //if ((idx < 15) && (idy < 15)) {
    result_d[x][y] = result_d[idx][idy];
    for (x = 1; x < MAXR; x++) {
        for (y = 1; y < MAXC; y++) {
            result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy - 1]
                + intermediates_d[idx - 1][idy]
                + intermediates_d[idx - 1][idy + 1]
                + intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
                + intermediates_d[idx][idy + 1]
                + intermediates_d[idx + 1][idy - 1]
                + intermediates_d[idx + 1][idy]
                + intermediates_d[idx + 1][idy + 1]) / 9.0F));
            // result_d[2][2] = 5;
            result_d[idy][idx] = result_d[y][x];
        }
    }
    __syncthreads();
}

int main(void)
{
    int i, j;
    //double cpu_time_used;
    float intermediates[MAXR][MAXC]; // input matrix converted to floating point
    int matrix[MAXR][MAXC];          // input matrix read from file
    int result[MAXR][MAXC] = {{0}};  // mean values are written here; initialised to zeros
    float ** intermediates_d;
    //int **matrix_d;

    int ** result_d;
    int datasize_f = MAXR * MAXC * sizeof(float);
    int datasize_i = MAXR * MAXC * sizeof(int);

    // Allocate memory on the device.
    cudaMalloc((void**) &intermediates_d, datasize_f);
    //cudaMalloc((void**) &matrix_d, datasize);
    cudaMalloc((void**) &result_d, datasize_i);

    FILE *fp;
    fp = fopen("arrays16.txt", "r"); // reads in matrix
    //clock_t start = clock();
    for (i = 0; i < MAXR; i++) // this loop reads the .txt file into the matrix array
    {
        for (j = 0; j < MAXC; j++)
        {
            fscanf(fp, "%d\t", &matrix[i][j]);
        }
    }

    printf("*****INPUT MATRIX*****\n");
    for (i = 0; i < MAXR; i++)
    {
        printf("\n");
        for (j = 0; j < MAXC; j++) {
            printf("%d ", matrix[i][j]);
        }
    }
    printf("\n\n");

    // Convert the input matrix into floating point in the intermediate matrix.
    for (int y = 0; y < MAXR; y++) {
        for (int x = 0; x < MAXC; x++) {
            intermediates[y][x] = (float) matrix[y][x];
        }
    }
    printf("*******INTERMEDIATE MATRIX*******\n");
    for (i = 0; i < 16; i++) {
        printf("\n"); // prints out the intermediate matrix
        for (j = 0; j < 16; j++) {
            printf("%.1f ", intermediates[i][j]);
        }
    }
    printf("\n\n");

    // Copy the data from the host arrays to the device arrays.
    //cudaMemcpy(matrix_d, matrix, datasize, cudaMemcpyHostToDevice);
    cudaMemcpy(intermediates_d, intermediates, datasize_f, cudaMemcpyHostToDevice);
    cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);

    // how many blocks we will allocate
    dim3 blocks(1, 1);

    // how many threads per block we will allocate
    dim3 threadsPerBlock(16, 16);

    // Launch kernel
    imagefilter<<<blocks, threadsPerBlock, MAXR*MAXC*sizeof(float)>>>(intermediates_d, result_d);

    // Copy back the results matrix.
    cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);

    cudaError_t errSync = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();

    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    FILE *file;
    file = fopen("results.txt", "w+"); // writes matrix to file
    printf("*******RESULTS MATRIX******\n\n");
    for (i = 1; i < MAXR - 1; i++) { // prints out the results array to the .txt file
        for (j = 1; j < MAXC - 1; j++) {
            printf("%d ", result[i][j]);
            fprintf(file, "%d ", result[i][j]);
        }
        printf("\n");
        fprintf(file, "\n");
    }

    fclose(file);
}
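
A note on where the error checks sit (this is a rearrangement of the lines above, not code from the original question): cudaGetLastError() and cudaDeviceSynchronize() are only called after the device-to-host cudaMemcpy, so by the time they run it is hard to tell whether a reported error came from the kernel launch or from the copy. A common pattern, sketched here with the same variable names as the listing above, is to check immediately after the launch so launch errors and asynchronous kernel-execution errors are reported separately:

    // Sketch only: same variables as the listing above, reordered so the
    // launch is checked before any further API calls touch its results.
    imagefilter<<<blocks, threadsPerBlock, MAXR*MAXC*sizeof(float)>>>(intermediates_d, result_d);

    cudaError_t errSync  = cudaGetLastError();      // launch/configuration errors
    cudaError_t errAsync = cudaDeviceSynchronize(); // errors raised while the kernel ran

    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    // Copy back the results matrix only after the kernel has been checked.
    cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);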









filter parallel-processing cuda box

  • You may want to verify result_d. It is a pointer to pointer but allocated as a pointer. C arrays and pointers behave differently. You probably want to change result_d to be a single pointer and calculate 2D indexing in the kernel.
    – Florent DUGUET
    Nov 10 at 23:42










  • When I try that, I receive "error: expression must have pointer-to-object type". It seems I'm having kernel launch failures, because when I print out result_d after copying it back to the host as result, it prints all zeros, which is how it was filled before copying to the device.
    – Yeinberg
    Nov 11 at 0:24










  • Any advice? I'm a beginner in CUDA and trying to understand why I'm receiving these errors, and why there seems to be no communication between device and host, given that the results matrix prints out as all zeros.
    – Yeinberg
    Nov 11 at 5:00






  • The error you mention is a C++ error. You need to review the C++ syntax of your code as CUDA/C is based on C++.
    – Florent DUGUET
    Nov 11 at 5:41










  • Yes, I think you are right. I have tinkered with the pointers and the memory allocation but still haven't got it to work properly. Can you please be a bit more specific?
    – Yeinberg
    Nov 11 at 15:18
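
The first comment suggests making result_d (and intermediates_d) plain single pointers and computing the 2D index inside the kernel. A minimal, self-contained sketch of that approach follows. It is illustrative only, not the original poster's code: the kernel name, the synthetic test image (standing in for arrays16.txt), and the border guard are assumptions about the intended fix.

// mean3x3_sketch.cu -- illustrative sketch, compiled with nvcc.
// Flat (single-pointer) device buffers; the row-major 2D index is computed in the kernel.
#include <cstdio>
#include <cuda_runtime.h>

#define MAXR 16
#define MAXC 16

__global__ void meanfilter(const float *in, int *out)
{
    int col = threadIdx.x;   // x position inside the 16 x 16 block
    int row = threadIdx.y;   // y position inside the 16 x 16 block

    // Guard so the 3 x 3 window never reads outside the image.
    if (col < 1 || col >= MAXC - 1 || row < 1 || row >= MAXR - 1)
        return;

    float sum = 0.0f;
    for (int dy = -1; dy <= 1; dy++)
        for (int dx = -1; dx <= 1; dx++)
            sum += in[(row + dy) * MAXC + (col + dx)];   // row-major flat index

    out[row * MAXC + col] = (int)(sum / 9.0f);
}

int main(void)
{
    float image[MAXR][MAXC];                 // synthetic input instead of arrays16.txt
    int   result[MAXR][MAXC] = {{0}};
    for (int r = 0; r < MAXR; r++)
        for (int c = 0; c < MAXC; c++)
            image[r][c] = (float)(r + c);

    float *in_d;
    int   *out_d;
    cudaMalloc((void**)&in_d,  MAXR * MAXC * sizeof(float));
    cudaMalloc((void**)&out_d, MAXR * MAXC * sizeof(int));
    cudaMemcpy(in_d, image, MAXR * MAXC * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(out_d, 0, MAXR * MAXC * sizeof(int));

    meanfilter<<<dim3(1, 1), dim3(MAXC, MAXR)>>>(in_d, out_d);

    cudaError_t errSync  = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    cudaMemcpy(result, out_d, MAXR * MAXC * sizeof(int), cudaMemcpyDeviceToHost);

    for (int r = 0; r < MAXR; r++) {
        for (int c = 0; c < MAXC; c++)
            printf("%d ", result[r][c]);
        printf("\n");
    }

    cudaFree(in_d);
    cudaFree(out_d);
    return 0;
}

The key difference from the posted code is that each thread writes exactly one output pixel and all indexing is done arithmetically on flat buffers, so there are no device-side pointer tables for cuda-memcheck to flag.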














