CUDA debug invalid kernel image error -


I wrote the following CUDA kernel and am trying to load it as a module:

#include <stdio.h>

// extern "C" disables C++ name mangling, so the kernel is exported under the
// plain name "vadd".
extern "C"
{
    // Element-wise vector addition: each thread computes c[i] = a[i] + b[i]
    // for its own flat global index i.
    //
    // a, b: input arrays; c: output array.
    // NOTE(review): there is no bounds check and no length parameter, so the
    // launch presumably has to supply exactly one thread per element - confirm
    // against the launching code.
    __global__ void vadd(const float *a, const float *b, float *c)
    {
        // Flat 1D global thread index.
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        // Device-side printf, used here for debugging only.
        printf("thread id %d\n", i);
        c[i] = a[i] + b[i];
    }
}

I compile it to PTX code using the following command:

nvcc -ptx -arch=sm_20 vadd.cu 

When trying to load the file as a module using cuModuleLoad, I get a CUDA 200 error (invalid kernel image). How can I find out what is wrong with the kernel image? I have tried ptxas, but according to that, the generated PTX code is fine.

Edit: this is the code I am using to load the module:

#include "cuda.h"
#include <cassert>
#include <dlfcn.h>
#include <stdio.h>

// Prints the numeric driver error code when err is not CUDA_SUCCESS, then
// asserts success so the program stops at the first failing call.
void check(CUresult err) {
  if (err != CUDA_SUCCESS) {
    printf("error %i\n", err);
  }
  assert(err == CUDA_SUCCESS);
}

// Loads libcuda.so at runtime, resolves the driver API entry points by name
// with dlsym, creates a context on device 0 and loads "vadd.ptx" as a module.
int main(int argc, char **argv) {
    void *cuda = dlopen("libcuda.so", RTLD_NOW | RTLD_DEEPBIND | RTLD_GLOBAL);
    assert(cuda != NULL);

    printf("cuinit\n");
    // cuInit takes an unsigned int flags argument, so the function pointer
    // must carry that parameter; 0 is passed as the flags value.
    CUresult (*init)(unsigned int) = (CUresult (*)(unsigned int)) dlsym(cuda, "cuInit");
    check(init(0));

    printf("cudeviceget\n");
    CUresult (*deviceget)(CUdevice *, int) = (CUresult (*)(CUdevice *, int)) dlsym(cuda, "cuDeviceGet");
    CUdevice device;
    check(deviceget(&device, 0));

    printf("cuctxcreate\n");
    // NOTE: the symbol is looked up by its literal, unversioned name. Macros
    // in cuda.h do not apply to string literals, so this does not resolve to
    // cuCtxCreate_v2.
    CUresult (*ctxcreate)(CUcontext *, unsigned int, CUdevice) = (CUresult (*)(CUcontext *, unsigned int, CUdevice)) dlsym(cuda, "cuCtxCreate");
    CUcontext context;
    check(ctxcreate(&context, 0, device));

    printf("cumoduleload\n");
    CUresult (*moduleload)(CUmodule *, const char *) = (CUresult (*)(CUmodule *, const char *)) dlsym(cuda, "cuModuleLoad");
    CUmodule mod;
    check(moduleload(&mod, "vadd.ptx"));

    return 0;
}

This is related to "Why does cuCtxCreate create an old context?": using cuCtxCreate directly gives you an old API context (v3.1), which is incompatible with the usage of printf. You can check the API version with cuCtxGetApiVersion. If you switch to cuCtxCreate_v2, which is what gets used through the #defines in cuda.h, you'll get a more recent API context.

In order to spot the discrepancy, I ran the sample with LD_DEBUG=symbols and compared it against a version using the CUDA API directly (since that runs the sample PTX). Comparing the symbol resolutions, the big difference was the call to cuCtxCreate:

cuCtxCreate(...)     symbol=cuCtxCreate_v2;  lookup in file=./test [0]     symbol=cuCtxCreate_v2;  lookup in file=/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0] 

... whereas in the original code, using dlsym(..., "cuCtxCreate") mapped directly to cuCtxCreate.


Comments

Popular posts from this blog

android - Get AccessToken using signpost OAuth without opening a browser (Two legged Oauth) -

org.mockito.exceptions.misusing.InvalidUseOfMatchersException: mockito -

google shop client API returns 400 bad request error while adding an item -