C++ AMP同CUDA之间的性能比较

Date:2016-02-03
Author:kagula
Environment:
[1]Win10
[2]VS2013 Update5
[3]Cuda 7.5
从Nvidia官网下载的cuda_7.5.18_windows.exe，cudatoolkit_3.1_win_64.exe

[4]Core i7-4790k + GTX960

测试方式

两个500阶矩阵相乘。

Corei7-4790K+GTX960 测试结果（GPU Boost模式打开情况下）

CPU: 217.866ms
C++ AMP: 35.1236ms
CUDA: 1.969ms

Corei5-2500K+Quadro K600测试结果

CPU: 252.744ms
C++ AMP: 73.1175ms
CUDA: 19.706ms

CUDA由于对threads做了分块优化，我这里写的cpu和amp代码没有优化，所以性能差距很大。

程序提示计算结果有错误，需要关闭boost改进GPU计算的正确性。（注：500不是线程块边长16或32的整数倍，而下面的代码用整除来计算grid大小，计算结果有误也可能由此引起，未必与GPU Boost有关。）

这里没有进一步测试。

测试用到的代码段

C++源代码

// Computes the product of two square integer matrices on the CPU.
//
// Parameters:
//   count - order of the matrices (number of rows and of columns).
//   ppA   - left operand, indexed as ppA[row][col].
//   ppB   - right operand, indexed as ppB[row][col].
//   pppR  - out-parameter; receives a newly allocated count x count matrix
//           holding A * B. The row-pointer array and every row are allocated
//           with new[], so the caller must release each row and then the
//           pointer array with delete[].
void matrixMultiplication(int count, int **ppA, int **ppB, int ***pppR){
	int **ppMatrix = new int*[count];
	*pppR = ppMatrix;

	for (int row = 0; row < count; row++){
		ppMatrix[row] = new int[count];
	}

	// Classic triple loop: R[row][col] = sum over k of A[row][k] * B[k][col].
	for (int row = 0; row < count; row++)
	{
		for (int col = 0; col < count; col++)
		{
			// Accumulate in a local so the output cell is written only once.
			int sum = 0;
			for (int k = 0; k < count; k++)
			{
				// The summation index k must walk along A's row and B's column;
				// the previous code ignored k and added A[row][col] * B[col][row]
				// count times, which is not a matrix product.
				sum += ppA[row][k] * ppB[k][col];
			}
			ppMatrix[row][col] = sum;
		}
	}
}

C++AMP源代码

// Multiplies two MATRIX_ORDER x MATRIX_ORDER integer matrices with C++ AMP
// and returns the value reported by stopTiming() (milliseconds, according to
// the original author's comment).
//
// The timed region spans array_view construction, the parallel_for_each
// launch and result.synchronize(); host allocation and initialisation of the
// input arrays happen before startTiming() and are not measured.
float Matrix_AMP()
{
	float score = .0f;

	int *a = new int[MATRIX_ORDER*MATRIX_ORDER];
	int *b = new int[MATRIX_ORDER*MATRIX_ORDER];
	int *r = new int[MATRIX_ORDER*MATRIX_ORDER];

	// Fill both operands with their flat index and clear the result buffer.
	for (int row = 0; row < MATRIX_ORDER; row++)
	{
		for (int col = 0; col < MATRIX_ORDER; col++)
		{
			a[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col;
			b[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col;
			r[row*MATRIX_ORDER + col] = 0;
		}
	}

	startTiming();

	//amp.begin
	// Despite their names, src and dst are the left and right operands.
	array_view<const int, 2> src(MATRIX_ORDER, MATRIX_ORDER, a);
	array_view<const int, 2> dst(MATRIX_ORDER, MATRIX_ORDER, b);
	array_view<int, 2> result(MATRIX_ORDER, MATRIX_ORDER, r);
	// The kernel overwrites every element, so the current contents of r do
	// not need to be made available to it.
	result.discard_data();

	parallel_for_each(
		result.extent,
		[=](index<2> idx) restrict(amp)
	{
		const unsigned int row = idx[0];
		const unsigned int col = idx[1];
		// Named 'sum' so it no longer shadows the host pointer 'r'.
		int sum = 0;

		for (unsigned int i = 0; i < MATRIX_ORDER;i++)
		{
			sum += src[row][i]*dst[i][col];
		}
		result[idx] = sum;
	}
	);

	// Bring the computed values back into the host array behind 'result'.
	result.synchronize();
	//amp.end

	score = (float)stopTiming();

	// The buffers were allocated with new[], so they must be released with
	// delete[]; scalar delete on them is undefined behaviour.
	delete[] a;
	delete[] b;
	delete[] r;

	// Elapsed time, in milliseconds.
	return score;
}

CUDA源代码

源于Nvidia自带的sample，只列出修改部分

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
	float *h_A = new float[mem_size_A];//(float *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
	float *h_B = new float[mem_size_B];// (float *)malloc(mem_size_B);

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
	float *h_C = new float[mem_size_C];//(float *) malloc(mem_size_C);

    if (h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

	// Record the start event
	cudaError_t error;
	// Allocate CUDA events that we'll use for timing
	cudaEvent_t start;
	error = cudaEventCreate(&start);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}
	error = cudaEventRecord(start, NULL);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16)
    {
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else
    {
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");

    cudaDeviceSynchronize();



    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Execute the kernel
    int nIter = 1;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else
        {
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

	// Copy result from device to host
	error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
		exit(EXIT_FAILURE);
	}

    // Record the stop event
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    //double eps = 1.e-6 ; // machine zero

    //for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    //{
    //    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    //    double dot_length = dimsA.x;
    //    double abs_val = fabs(h_C[i]);
    //    double rel_err = abs_err/abs_val/dot_length ;

    //    if (rel_err > eps)
    //    {
    //        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
    //        correct = false;
    //    }
    //}

    //printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
	delete h_A;//free(h_A);
	delete h_B;//free(h_B);
	delete h_C;//free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}

参考资料

[1]https://developer.nvidia.com/cuda-zone

原文链接：https://blog.csdn.net/lee353086/article/details/50629922