I can't get the output value back from a CUDA function called through ctypes

Question:

cuda1.cu

#include <iostream>

using namespace std ;

# define DELLEXPORT extern "C" __declspec(dllexport)

// Kernel: each thread computes its flat 1-D global index
// (blockIdx.x * blockDim.x + threadIdx.x) and stores it into the single long
// that `answer` points to. Every thread writes to that same location.
// `answer` defaults to 0 (a null pointer) when no argument is supplied.
__global__ void kernel(long* answer = 0){
    const unsigned int globalIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
    *answer = globalIdx;
}

// Exported (extern "C", dllexport) entry point: allocates one long on the
// device, launches `kernel` on it, copies sizeof(long) bytes back to the host
// and frees the device allocation.
// h_answer: pointer supplied by the caller, intended to receive the result.
// None of the CUDA runtime return codes are checked here.
DELLEXPORT void resoult(long* h_answer){

    long* d_answer = 0;
    
    // Device storage for a single long.
    cudaMalloc(&d_answer, sizeof(long));

    // 10 blocks of 1000 threads, all handed the same d_answer pointer.
    kernel<<<10,1000>>>(d_answer);
    // NOTE(review): &h_answer is the address of the local pointer parameter,
    // not of the caller's long, so the copied bytes land in the local pointer
    // variable and the caller's value is never written. The destination
    // should be h_answer itself.
    cudaMemcpy(&h_answer, d_answer, sizeof(long), cudaMemcpyDeviceToHost);
        cudaFree(d_answer);
}

main.py

import ctypes
import numpy as np

# Raw string: in a normal literal "\a" is the BEL control character, not a
# backslash followed by "a", so the DLL path would be wrong.
add_lib = ctypes.CDLL(r".\a.dll")
resoult= add_lib.resoult
resoult.argtypes = [ctypes.POINTER(ctypes.c_long)]
# The C function is declared void; without this ctypes assumes it returns a
# C int and the printed "return value" would be meaningless.
resoult.restype = None

# Host-side storage the library is expected to fill in.
x = ctypes.c_long()

# Because argtypes declares POINTER(c_long), ctypes passes a c_long instance
# by reference automatically; the call itself now returns None.
print("R:",resoult(x))

# The actual result is read from the out-parameter, not from the return value.
print("RV: ",x.value)

# Equivalent call with an explicit byref().
print("RB: ",resoult(ctypes.byref(x)))

output in python:0

output in cuda: 2096

I implemented the same thing in plain C without any problems, but in the CUDA version I have a problem. How can I get the correct output value?

Thanks

Asked By: user5963087

||

Answers:

cudaMemcpy is expecting pointers for dst and src.
In your function resoult, h_answer is a pointer to a long allocated by the caller.
Since it’s already the pointer where the data should be copied to, you should use it as is, and not take its address by using &h_answer.

Therefore you need to change your cudaMemcpy from:

cudaMemcpy(&h_answer, d_answer, sizeof(long), cudaMemcpyDeviceToHost);

To:

cudaMemcpy(h_answer, d_answer, sizeof(long), cudaMemcpyDeviceToHost);
Answered By: wohlstad
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.