CPU到GPU
在CUDA程序中,将数据从CPU传输到GPU,或者从GPU传输到CPU的时候,需要调用底层的内存拷贝函数。当有很多不同类型的数据的时候,这个过程会非常繁琐。于是,我专门写了个内存拷贝的模板函数,使这个过程变得十分方便。
// Copies `num` objects of type T from host memory to a freshly allocated
// device buffer and returns the device pointer (NULL on failure).
// The caller owns the returned buffer and must release it with cudaFree.
// If isDelete is true the host buffer is released after the copy.
template <typename T> T* valueHostToDevice(T *value, const int &num = 1, bool isDelete = false){
    T *devValue = NULL;
    // Check the allocation: the original returned an uninitialized pointer
    // when cudaMalloc failed, which is undefined behavior for the caller.
    if(cudaMalloc((void**)&devValue, num * sizeof(T)) != cudaSuccess){
        return NULL;
    }
    if(cudaMemcpy(devValue, value, num * sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess){
        cudaFree(devValue);
        devValue = NULL;
    }
    if(isDelete){
        // NOTE(review): this assumes num == 1 means the buffer came from
        // `new` and num > 1 means `new T[num]`. A single element allocated
        // with `new T[1]` would be freed with the wrong operator — callers
        // must follow that convention.
        if(num == 1){delete value;}
        else{delete[] value;}
    }
    return devValue;
}
以上的函数能够将数据从CPU拷贝到GPU,而且能够支持任意类型的数据,比如整型int、浮点型float或自定义的类型,同时既能拷贝单个对象,也能拷贝一个对象数组,最后还可以销毁CPU内存数据,其调用方法也是非常方便
// 任意类型的数组指针
A *m_a = new A[3];
// 从CPU的m_a拷贝到GPU的a
A *a = valueHostToDevice(m_a, 3);
以上是带有返回值的函数,也可以写成是不带返回值的函数
// Allocates device memory for a single T and copies `value` into it.
// The device pointer is written through `devValue` (pointer-to-pointer so
// the caller's own pointer variable gets updated); it is set to NULL on
// failure. The caller must release *devValue with cudaFree.
template <typename T> void valueHostToDevice(T &value, T **devValue){
    *devValue = NULL;
    if(cudaMalloc((void**)devValue, sizeof(T)) != cudaSuccess){
        return;
    }
    // Copy straight from the caller's object. The original routed the value
    // through a per-call pinned staging buffer (cudaHostAlloc + extra copy
    // + cudaFreeHost), which only adds overhead for a single synchronous
    // transfer — pinning pays off for repeated/async copies, not here.
    if(cudaMemcpy(*devValue, &value, sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess){
        cudaFree(*devValue);
        *devValue = NULL;
    }
}
// 调用
valueHostToDevice(m_a, &a);
以上这个函数就有点繁琐了。为了给GPU开辟内存空间,需要使用cudaMalloc函数操作GPU指针a。cudaMalloc函数需要传入指针a的地址,而直接传入a的话,这是作为形参的,所以函数里的指针是个临时变量,并不是原始的指针a的地址,所以在函数中直接传入指针a是无法操作的。所以需要定义二级指针,再让这个二级指针指向a指针,才能对其进行操作,所以调用时需要传入的形参是&a。
GPU到CPU
类似的还有从GPU传到CPU的过程,其调用方法也是类似的。
// Copies `num` objects of type T from device memory into a freshly
// allocated pinned host buffer and returns it (NULL on failure).
// NOTE(review): the buffer comes from cudaHostAlloc, so the caller must
// release it with cudaFreeHost — NOT delete[] — unlike the new[]-based
// buffers elsewhere in this file; worth documenting at every call site.
template <typename T> T* valueDeviceToHost(T *devValue, const int &num = 1){
    T *hostValue = NULL;
    if(cudaHostAlloc((void**)&hostValue, num * sizeof(T), cudaHostAllocDefault) != cudaSuccess){
        return NULL;
    }
    if(cudaMemcpy(hostValue, devValue, num * sizeof(T), cudaMemcpyDeviceToHost) != cudaSuccess){
        cudaFreeHost(hostValue);
        return NULL;
    }
    return hostValue;
}
// Copies a single T from device memory into the caller's object `value`.
// On copy failure `value` is left untouched.
template <typename T> void valueDeviceToHost(T *devValue, T &value){
    // Copy directly into the caller's object. The original staged the
    // transfer through a per-call pinned buffer (cudaHostAlloc + extra
    // copy + cudaFreeHost), which is pure overhead for one synchronous
    // sizeof(T)-byte copy.
    cudaMemcpy(&value, devValue, sizeof(T), cudaMemcpyDeviceToHost);
}
二维数组
以上只是拷贝一维数组,同时还可以拷贝二维数组。
// Copies a rowNum x colNum host array into pitched device memory and
// builds a device-side table of row pointers so kernels can index
// data[row][col]. The pitched data block is written through devData and a
// device array of row pointers is returned. hostData may be NULL, in which
// case the device block is only zero-filled.
// NOTE(review): the caller must release both *devData and the returned
// pointer with cudaFree.
template <typename T> T** array2DHostToDevice(T *hostData, T **devData,
                                              const int &rowNum, const int &colNum){
    // Pinned host-side staging table for the row pointers.
    T **hostArray;
    cudaHostAlloc((void**)&hostArray, rowNum * sizeof(T*), cudaHostAllocDefault);
    T **devArray;
    cudaMalloc((void**)&devArray, rowNum * sizeof(T*));
    // cudaMallocPitch aligns each row; `pitch` is the row stride in BYTES.
    size_t pitch;
    cudaMallocPitch(devData, &pitch, colNum * sizeof(T), rowNum);
    cudaMemset2D(*devData, pitch, 0, colNum * sizeof(T), rowNum);
    // Row i starts i*pitch bytes past the base, so do the arithmetic on
    // char*. The original `(float*)(*devData) + i*pitch/sizeof(float)`
    // silently assumed pitch was a multiple of sizeof(float) and was
    // type-confused for non-float T.
    for(int i = 0; i < rowNum; i++){
        hostArray[i] = (T*)((char*)(*devData) + (size_t)i * pitch);
    }
    // Copy the payload (if any), then publish the row-pointer table.
    if(hostData != NULL)
        cudaMemcpy2D(*devData, pitch, hostData, colNum * sizeof(T), colNum * sizeof(T), rowNum, cudaMemcpyHostToDevice);
    cudaMemcpy(devArray, hostArray, rowNum * sizeof(T*), cudaMemcpyHostToDevice);
    cudaFreeHost(hostArray);
    return devArray;
}
// 调用
vec **image2D = NULL, *gpu_image2DData = NULL;
vec *cpu_image2DData = NULL;
image2D = array2DHostToDevice(cpu_image2DData, &gpu_image2DData, height, width);
// Void-return variant of array2DHostToDevice: the device row-pointer table
// is written through devArray and the pitched data block through devData.
// hostData may be NULL, in which case the device block is only zero-filled
// (mirrors the returning overload, which already had this guard).
// NOTE(review): the caller must release both *devData and *devArray with
// cudaFree.
template <typename T> void array2DHostToDevice(T ***devArray,
                                               T *hostData, T **devData,
                                               const int &rowNum, const int &colNum){
    // Pinned host-side staging table for the row pointers.
    T **hostArray;
    cudaHostAlloc((void**)&hostArray, rowNum * sizeof(T*), cudaHostAllocDefault);
    cudaMalloc((void**)devArray, rowNum * sizeof(T*));
    // cudaMallocPitch aligns each row; `pitch` is the row stride in BYTES.
    size_t pitch;
    cudaMallocPitch(devData, &pitch, colNum * sizeof(T), rowNum);
    cudaMemset2D(*devData, pitch, 0, colNum * sizeof(T), rowNum);
    // Byte-based row arithmetic — the original float*-based expression
    // assumed pitch % sizeof(float) == 0 and was wrong in general.
    for(int i = 0; i < rowNum; i++){
        hostArray[i] = (T*)((char*)(*devData) + (size_t)i * pitch);
    }
    // Copy the payload (if any), then publish the row-pointer table.
    if(hostData != NULL)
        cudaMemcpy2D(*devData, pitch, hostData, colNum * sizeof(T), colNum * sizeof(T), rowNum, cudaMemcpyHostToDevice);
    cudaMemcpy(*devArray, hostArray, rowNum * sizeof(T*), cudaMemcpyHostToDevice);
    cudaFreeHost(hostArray);
}
封装成类之后,可以更加方便。
// Thin class wrapper over the free-function copy helpers: bundles the
// host-to-device overloads (flat 1-D buffer and pitched 2-D array) behind
// one object. Holds no state; methods are defined out-of-line below.
class MemCopy{
public:
MemCopy(){}
// Copy `num` T objects host->device; returns the device pointer.
// Optionally deletes the host buffer after the copy.
template <typename T> T* hostToDevice(T *value,
const int &num = 1,
bool isDelete = false);
// Copy a rowNum x colNum host array into pitched device memory
// (*devData) and return a device table of row pointers.
template <typename T> T** hostToDevice(T *hostData,
T **devData,
const int &rowNum,
const int &colNum);
};
// Copies `num` objects of type T from host memory to a freshly allocated
// device buffer and returns the device pointer (NULL on failure).
// The caller owns the returned buffer and must release it with cudaFree.
// If isDelete is true the host buffer is released after the copy.
//
// Usage (array of any type):
//   A *m_a = new A[3];
//   A *a = copier.hostToDevice(m_a, 3);
template <typename T> T* MemCopy::hostToDevice(T *value,
                                               const int &num,
                                               bool isDelete){
    T *devValue = NULL;
    // Check the allocation: the original returned an uninitialized pointer
    // when cudaMalloc failed, which is undefined behavior for the caller.
    if(cudaMalloc((void**)&devValue, num * sizeof(T)) != cudaSuccess){
        return NULL;
    }
    if(cudaMemcpy(devValue, value, num * sizeof(T), cudaMemcpyHostToDevice) != cudaSuccess){
        cudaFree(devValue);
        devValue = NULL;
    }
    if(isDelete){
        // NOTE(review): assumes num == 1 means `new` and num > 1 means
        // `new T[num]`; a `new T[1]` buffer would be freed with the wrong
        // operator. Callers must follow that convention.
        if(num == 1){delete value;}
        else{delete[] value;}
    }
    return devValue;
}
// Copies a rowNum x colNum host array into pitched device memory and
// builds a device-side table of row pointers. The pitched data block is
// written through devData; the device row-pointer table is returned.
// hostData may be NULL, in which case the block is only zero-filled.
// NOTE(review): the caller must release both *devData and the returned
// pointer with cudaFree.
//
// Usage:
//   vec **image2D = NULL, *gpu_image2DData = NULL;
//   vec *cpu_image2DData = NULL;
//   image2D = copier.hostToDevice(cpu_image2DData, &gpu_image2DData, height, width);
template <typename T> T** MemCopy::hostToDevice(T *hostData,
                                                T **devData,
                                                const int &rowNum,
                                                const int &colNum){
    // Pinned host-side staging table for the row pointers.
    T **hostArray;
    cudaHostAlloc((void**)&hostArray, rowNum * sizeof(T*), cudaHostAllocDefault);
    T **devArray;
    cudaMalloc((void**)&devArray, rowNum * sizeof(T*));
    // cudaMallocPitch aligns each row; `pitch` is the row stride in BYTES.
    size_t pitch;
    cudaMallocPitch(devData, &pitch, colNum * sizeof(T), rowNum);
    cudaMemset2D(*devData, pitch, 0, colNum * sizeof(T), rowNum);
    // Byte-based row arithmetic — the original float*-based expression
    // assumed pitch % sizeof(float) == 0 and was wrong for general T.
    for(int i = 0; i < rowNum; i++){
        hostArray[i] = (T*)((char*)(*devData) + (size_t)i * pitch);
    }
    // Copy the payload (if any), then publish the row-pointer table.
    if(hostData != NULL)
        cudaMemcpy2D(*devData, pitch, hostData, colNum * sizeof(T), colNum * sizeof(T), rowNum, cudaMemcpyHostToDevice);
    cudaMemcpy(devArray, hostArray, rowNum * sizeof(T*), cudaMemcpyHostToDevice);
    cudaFreeHost(hostArray);
    return devArray;
}
// Demo: fill a 10000x10000 host int array, time the 2-D host->device copy
// through MemCopy, then release everything.
int main(void){
    const int side = 10000;              // square image edge length
    Timing t = Timing();
    // Host payload: every element set to 10.
    int *hostBuf = new int[side * side];
    for(int row = 0; row < side; row++){
        int *rowStart = hostBuf + row * side;
        for(int col = 0; col < side; col++){
            rowStart[col] = 10;
        }
    }
    MemCopy copier = MemCopy();
    t.tic();
    // Do Something
    int *devData;                        // pitched device data block
    int **devRows = copier.hostToDevice(hostBuf, &devData, side, side);
    t.toc();
    // Cleanup: host buffer via delete[], device blocks via cudaFree.
    delete[] hostBuf;
    cudaFree(devData);
    cudaFree(devRows);
    return 0;
}
版权声明:本文为wanchaochaochao原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。