c++ - using a pointer to vector<T>::data() for cublasSgemm -
i trying use vector::data() pointer when using cudamalloc, cudamemcpy, , cublassgemm can't seem work. if not mistaken, vector::data() should return pointer actual array stored in memory vector should same having t* aarray pointer array of type t stored in memory. using latter work, not data() pointer.
here code working on:
matrix<t> matrix<t>::cudaprod(matrix<t>&a,matrix<t>&b, matrix<t>&c) { c = matrix<t>(a.height, b.width); //resizing of vector of elements matrix c //a[m][n]*b[n][k]=c[m][k] int m = a.height; int n = b.height; int k = b.width; float alpha = 1.0f; float beta = 0.0f; t* d_a = a.getpointer(); t* d_b = b.getpointer(); t* d_c = c.getpointer(); cudamalloc(&d_a,a.size); cudamalloc(&d_b,b.size); cudamalloc(&d_c,c.size); cudamemcpy(d_a,a.getpointer(),a.size,cudamemcpyhosttodevice); cudamemcpy(d_b,b.getpointer(),b.size,cudamemcpyhosttodevice); cublashandle_t handle; cublasstatus_t status = cublascreate(&handle); if (status != cublas_status_success) { std::cerr << "!!!! cublas initialization error\n"; } status = cublassgemm(handle,cublas_op_n,cublas_op_n,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k); if (status != cublas_status_success) { std::cerr << "!!!! kernel execution error.\n"; } status = cublasdestroy(handle); if (status != cublas_status_success) { std::cerr << "!!!! shutdown error (a)\n"; } cudamemcpy(c.getpointer(), d_c, c.size,cudamemcpydevicetohost); cudafree(d_a); cudafree(d_b); cudafree(d_c);
the getpointer() member function returns vector::data() of vector of elements matrix object. size vector element's size in memory.
the vector of matrix c returns zeros when using data() pointer, , returns product of matrix , b when using t* aarray pointers without vectors.
is possible use vectors store array of elements , data() pointer initialize device copy of array, or forced use c style array storage on host? also, have tried using thrust::device_vector , works stay away creating raw_pointer_casts.
thanks help!
edit: having trouble copy , pasting, here complete example:
#include <cuda.h> #include <cuda_runtime.h> #include <cuda_device_runtime_api.h> #include <cublas_v2.h> #include <vector> #include <iostream> using namespace std; template<typename t> class matrix { public: ~matrix(); matrix(); matrix(int rows, int columns); int width; int height; int stride; size_t size; t &getelement(int row, int column); void setelement(int row, int column, t value); void setelements(vector<t> value); vector<t>& getelements(); t* getpointer(); matrix<t> cudaprod(matrix<t>&a,matrix<t>&b, matrix<t>&c); private: vector<t> elements; t* firstelement; }; template<typename t> matrix<t>::~matrix() { } template<typename t> matrix<t>::matrix() { } template<typename t> matrix<t>::matrix(int rows, int columns) { height = rows; width = columns; stride = columns; //in row major order equal # of columns elements.resize(rows*columns); firstelement = elements.data(); size = height*width*sizeof(t); } template<typename t> t &matrix<t>::getelement(int row, int column) { return elements[row*width + column]; //row major order return } template<typename t> vector<t>& matrix<t>::getelements() { return elements; //row major order return } template<typename t> void matrix<t>::setelement(int row, int column, t value) { elements[row*width + column] = value; //row major order return } template<typename t> void matrix<t>::setelements(vector<t> value) { elements = value; } template<typename t> t* matrix<t>::getpointer() { return firstelement; } template<typename t> //matrix multiplication using cuda matrix<t> matrix<t>::cudaprod(matrix<t>&a,matrix<t>&b, matrix<t>&c) { c = matrix<t>(a.height, b.width); //a[m][n]*b[n][k]=c[m][k] int m = a.height; int n = b.height; int k = b.width; float alpha = 1.0f; float beta = 0.0f; //thrust usage /*thrust::device_vector<t> d_a = a.getelements(); t* d_a = thrust::raw_pointer_cast(&d_a[0]); thrust::device_vector<t> d_b = b.getelements(); t* d_b = thrust::raw_pointer_cast(&d_b[0]); thrust::device_vector<t> d_c = c.getelements(); t* d_c = thrust::raw_pointer_cast(&d_c[0]);*/ t* d_a = a.getpointer(); t* d_b = b.getpointer(); t* d_c = c.getpointer(); cudamalloc(&d_a,a.size); cudamalloc(&d_b,b.size); cudamalloc(&d_c,c.size); cudamemcpy(d_a,a.getpointer(),a.size,cudamemcpyhosttodevice); cudamemcpy(d_b,b.getpointer(),b.size,cudamemcpyhosttodevice); cudamemcpy(d_c,c.getpointer(),c.size,cudamemcpyhosttodevice); cublashandle_t handle; cublasstatus_t status = cublascreate(&handle); if (status != cublas_status_success) { std::cerr << "!!!! cublas initialization error\n"; } status = cublassgemm(handle,cublas_op_n,cublas_op_n,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k); if (status != cublas_status_success) { std::cerr << "!!!! kernel execution error.\n"; } status = cublasdestroy(handle); if (status != cublas_status_success) { std::cerr << "!!!! shutdown error (a)\n"; } //thrust::copy(d_c.begin(), d_c.end(), c.getelements().begin()); cudamemcpy(c.getpointer(), d_c, c.size,cudamemcpydevicetohost); cudafree(d_a); cudafree(d_b); cudafree(d_c); return c; } int main() { matrix<float> a(2,2); matrix<float> b(2,2); matrix<float> c; vector<float> ae(4,2); vector<float> be(4,4); a.setelements(ae); b.setelements(be); c = c.cudaprod(a, b, c); //function call cudaprod() for(int row = 0; row < a.height; ++row) { for(int col = 0; col < a.width; ++col) { cout<<a.getelement(row, col)<<" "; //h_c stored on device in column major order, need switch row major order } printf("\n"); } printf("\n"); for(int row = 0; row < b.height; ++row) { for(int col = 0; col < b.width; ++col) { cout<<b.getelement(row, col)<<" "; //h_c stored on device in column major order, need switch row major order } printf("\n"); } printf("\n"); for(int row = 0; row < c.height; ++row) { for(int col = 0; col < c.width; ++col) { cout<<c.getelement(row, col)<<" "; //h_c stored on device in column major order, need switch row major order } printf("\n"); } printf("\n"); }
from std::vector::data documentation, data()
returns both const
, non-const
qualified pointers, depending on fact vector
qualified const
or not. quoting documentation
if vector object const-qualified, function returns pointer const value_type. otherwise, returns pointer value_type.
accordingly, using
firstelement = elements.data();
in matrix
constructor fine read/write data.
the main problem code declaring c
in main
, passing reference c
cudaprod
method , internally using
c = matrix<t>(a.height, b.width);
which redeclare matrix
.
if change definition of cudaprod
method to
template<typename t> void cudaprod(matrix<t>&a,matrix<t>&b, matrix<t>&c)
remove the
return c;
statement , allocate space c
in main as
matrix<float> c(2,2); vector<float> ce(4,10); c.setelements(ce);
your code should work correctly.
Comments
Post a Comment