Date: 2016-02-03
Author: kagula
Environment:
[1] Win10
[2] VS2013 Update 5
[3] CUDA 7.5
    cuda_7.5.18_windows.exe and cudatoolkit_3.1_win_64.exe, downloaded from the NVIDIA website
[4] Core i7-4790K + GTX 960
Test method
Multiply two matrices of order 500.
Results for Core i7-4790K + GTX 960 (with GPU Boost enabled)
CPU: 217.866ms
C++ AMP: 35.1236ms
CUDA: 1.969ms
Results for Core i5-2500K + Quadro K600
CPU: 252.744ms
C++ AMP: 73.1175ms
CUDA: 19.706ms
The CUDA kernel tiles the work across thread blocks (see the sketch below), while the CPU and C++ AMP code I wrote here is unoptimized, so the performance gap is large.
The program reported errors in the computed result; GPU Boost has to be turned off to improve the correctness of the GPU computation.
I did not test this further.
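For reference, NVIDIA's matrixMulCUDA kernel gets its speed from shared-memory tiling. The following is a minimal sketch of that technique, not the sample's exact code; it assumes the matrix widths wA and wB are multiples of BLOCK_SIZE.
// Tiled matrix multiply C = A * B (row-major), the technique behind matrixMulCUDA.
template <int BLOCK_SIZE>
__global__ void matrixMulTiled(float *C, const float *A, const float *B, int wA, int wB)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
    const int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float sum = 0.0f;
    // Walk the tiles along the shared dimension.
    for (int t = 0; t < wA / BLOCK_SIZE; ++t)
    {
        // Each thread stages one element of each tile into shared memory.
        As[threadIdx.y][threadIdx.x] = A[row * wA + t * BLOCK_SIZE + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(t * BLOCK_SIZE + threadIdx.y) * wB + col];
        __syncthreads();                      // tile fully loaded before use
        for (int k = 0; k < BLOCK_SIZE; ++k)  // multiply within the tile
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();                      // finish reading before the next load
    }
    C[row * wB + col] = sum;
}
Each element of A and B is read from global memory only width/BLOCK_SIZE times instead of width times, which is the main source of the gap against the naive loops below.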
Code used in the test
C++ source code
// Naive O(n^3) multiply, R = A * B, where all matrices are count x count.
// The rows allocated into *pppR are owned by the caller.
void matrixMultiplication(int count, int **ppA, int **ppB, int ***pppR){
    int **ppMatrix = new int*[count];
    *pppR = ppMatrix;
    for (int row = 0; row < count; row++){
        ppMatrix[row] = new int[count];
    }
    //
    for (int row = 0; row < count; row++)
    {
        for (int col = 0; col < count; col++)
        {
            ppMatrix[row][col] = 0;
            for (int k = 0; k < count; k++)
            {
                // Inner product over the shared dimension k.
                ppMatrix[row][col] += ppA[row][k] * ppB[k][col];
            }
        }
    }
}
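A minimal driver for the CPU path might look like the following. startTiming()/stopTiming() and runCpuTest are hypothetical stand-ins for the harness helpers used by the AMP code below, sketched here with std::chrono; the fill pattern and the order 500 match the test description.
#include <chrono>
#include <cstdio>

// Hypothetical stand-ins for the harness's startTiming()/stopTiming();
// stopTiming() returns the elapsed time in milliseconds.
static std::chrono::steady_clock::time_point g_t0;
void startTiming() { g_t0 = std::chrono::steady_clock::now(); }
double stopTiming()
{
    return std::chrono::duration<double, std::milli>(
        std::chrono::steady_clock::now() - g_t0).count();
}

void runCpuTest()
{
    const int n = 500;                         // matrix order used in the test
    int **a = new int*[n], **b = new int*[n];
    for (int i = 0; i < n; i++)
    {
        a[i] = new int[n];
        b[i] = new int[n];
        for (int j = 0; j < n; j++)
            a[i][j] = b[i][j] = i * n + j;     // same fill pattern as the AMP test
    }
    int **result = NULL;
    startTiming();
    matrixMultiplication(n, a, b, &result);
    printf("CPU: %f ms\n", stopTiming());
    // cleanup of a, b and result omitted for brevity
}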
C++ AMP source code
#include <amp.h>
using namespace concurrency;
// MATRIX_ORDER and the startTiming()/stopTiming() helpers are defined elsewhere in the test harness.
float Matrix_AMP()
{
    float score = .0f;
    int *a = new int[MATRIX_ORDER*MATRIX_ORDER];
    int *b = new int[MATRIX_ORDER*MATRIX_ORDER];
    int *r = new int[MATRIX_ORDER*MATRIX_ORDER];
    for (int row = 0; row < MATRIX_ORDER; row++)
    {
        for (int col = 0; col < MATRIX_ORDER; col++)
        {
            a[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col;
            b[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col;
            r[row*MATRIX_ORDER + col] = 0;
        }
    }
    startTiming();
    //amp.begin
    array_view<const int, 2> src(MATRIX_ORDER, MATRIX_ORDER, a);
    array_view<const int, 2> dst(MATRIX_ORDER, MATRIX_ORDER, b);
    array_view<int, 2> result(MATRIX_ORDER, MATRIX_ORDER, r);
    result.discard_data();
    parallel_for_each(
        result.extent,
        [=](index<2> idx) restrict(amp)
        {
            const unsigned int row = idx[0];
            const unsigned int col = idx[1];
            int r = 0;
            for (unsigned int i = 0; i < MATRIX_ORDER; i++)
            {
                r += src[row][i] * dst[i][col];
            }
            result[idx] = r;
        }
    );
    result.synchronize();
    //amp.end
    score = (float)stopTiming();
    delete[] a;
    delete[] b;
    delete[] r;
    // Return the elapsed time in milliseconds.
    return score;
}
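The same tiling optimization the CUDA sample uses is also available in C++ AMP through a tiled parallel_for_each and tile_static memory. A minimal sketch of a replacement for the parallel_for_each above, reusing src, dst and result, and assuming MATRIX_ORDER is a multiple of the tile size (500 is not, so real code would need padding or edge handling):
static const int TS = 16;   // tile size; assumes MATRIX_ORDER % TS == 0
parallel_for_each(
    result.extent.tile<TS, TS>(),
    [=](tiled_index<TS, TS> tidx) restrict(amp)
    {
        tile_static int localA[TS][TS];
        tile_static int localB[TS][TS];
        int sum = 0;
        for (int t = 0; t < MATRIX_ORDER / TS; t++)
        {
            // Each thread stages one element of the current tile.
            localA[tidx.local[0]][tidx.local[1]] = src(tidx.global[0], t * TS + tidx.local[1]);
            localB[tidx.local[0]][tidx.local[1]] = dst(t * TS + tidx.local[0], tidx.global[1]);
            tidx.barrier.wait();                  // tile fully loaded before use
            for (int k = 0; k < TS; k++)
                sum += localA[tidx.local[0]][k] * localB[k][tidx.local[1]];
            tidx.barrier.wait();                  // finish reading before the next load
        }
        result[tidx.global] = sum;
    }
);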
CUDA source code
Based on the matrixMul sample that ships with the CUDA Toolkit; only the modified part is listed.
/**
 * Run a simple test of matrix multiplication using CUDA
 */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = new float[size_A];    // (float *)malloc(mem_size_A); new[] takes an element count, not a byte count
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = new float[size_B];    // (float *)malloc(mem_size_B)
    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);
    // Allocate device memory
    float *d_A, *d_B, *d_C;
    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int size_C = dimsC.x * dimsC.y;
    unsigned int mem_size_C = size_C * sizeof(float);
    float *h_C = new float[size_C];    // (float *)malloc(mem_size_C)
    if (h_C == NULL)    // kept from the malloc version; new[] reports failure by throwing std::bad_alloc
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }
    // Record the start event
    // (recorded before the device allocations and copies below, so they are included in the measured time)
    cudaError_t error;
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaEventRecord(start, NULL);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **) &d_A, mem_size_A);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **) &d_B, mem_size_B);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **) &d_C, mem_size_C);
    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");
    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16)
    {
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else
    {
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    printf("done\n");
    cudaDeviceSynchronize();
    cudaEvent_t stop;
    error = cudaEventCreate(&stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    // Execute the kernel
    int nIter = 1;
    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else
        {
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }
    // Copy result from device to host
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    // Record the stop event
    error = cudaEventRecord(stop, NULL);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);
    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
//double eps = 1.e-6 ; // machine zero
//for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
//{
// double abs_err = fabs(h_C[i] - (dimsA.x * valB));
// double dot_length = dimsA.x;
// double abs_val = fabs(h_C[i]);
// double rel_err = abs_err/abs_val/dot_length ;
// if (rel_err > eps)
// {
// printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
// correct = false;
// }
//}
//printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
// Clean up memory
delete h_A;//free(h_A);
delete h_B;//free(h_B);
delete h_C;//free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
if (correct)
{
return EXIT_SUCCESS;
}
else
{
return EXIT_FAILURE;
}
}
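As a sanity check on the numbers above: for two matrices of order 500, the formula in the code gives flopsPerMatrixMul = 2 x 500^3 = 2.5e8 floating-point operations, so the 1.969 ms measured on the GTX 960 corresponds to roughly 2.5e8 / 0.001969 s ≈ 127 GFlop/s, assuming the reported time covers exactly one 500-order multiply. Note that since the start event is recorded before the device allocations and copies, the kernel's actual throughput is higher than this figure.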