CUDA之旅：矩阵相加

225 阅读 0 评论 149 点赞

我是靠谱客的博主冷傲金针菇，这篇文章主要介绍CUDA之旅：矩阵相加，现在分享给大家，希望可以做个参考。

矩阵相加CUDA实现

//矩阵相加的CUDA程序实现
//Author: Eric Lv
//Email: Eric2014_Lv@sjtu.edu.cn
//Date: 6/7/2017
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
//#include <cuda.h>
#define N 32
// Element-wise addition of two matrices: c = a + b.
//
// Each thread handles one element. The global x index selects the column and
// the global y index selects the row, so neighbouring threads in x touch
// neighbouring ints in memory.
//
// Parameters:
//   a, b - input matrices, each row holding N ints (read only)
//   c    - output matrix, each row holding N ints
//
// The number of rows is not part of the parameter types; the guard below
// treats all three matrices as N x N, which is how this file allocates them.
// Threads that fall outside that N x N range do nothing, so the launch grid
// may be larger than the matrix.
__global__ void matrix_add(const int a[][N], const int b[][N], int c[][N])
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip surplus threads instead of reading/writing past the matrices.
    if (row < N && col < N)
    {
        c[row][col] = a[row][col] + b[row][col];
    }
}
// Host driver: fills two N x N int matrices, adds them on the GPU with
// matrix_add, and compares every element against the sum computed on the CPU.
//
// Returns 0 when all elements match, -1 on any allocation, copy, launch or
// verification failure. Host and device buffers are released on every path.
int main(void)
{
    const size_t bytes = sizeof(int) * N * N;
    int i;
    int status = -1;
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int *host_a = NULL, *host_b = NULL, *host_c = NULL;
    // One block of N x N threads. NOTE: CUDA limits a block to 1024 threads,
    // so this configuration only launches for N <= 32; a larger N is reported
    // by the cudaGetLastError() check after the launch.
    dim3 threads_in_block(N, N);
    cudaError_t err = cudaSuccess;

    host_a = (int *)malloc(bytes);
    host_b = (int *)malloc(bytes);
    host_c = (int *)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL)
    {
        printf("Host malloc is failed!\n");
        goto cleanup;
    }

    err = cudaMalloc((void **)&dev_a, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (a) is failed!\n");
        goto cleanup;
    }
    err = cudaMalloc((void **)&dev_b, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (b) is failed!\n");
        goto cleanup;
    }
    err = cudaMalloc((void **)&dev_c, bytes);
    if (err != cudaSuccess)
    {
        printf("cudaMalloc (c) is failed!\n");
        goto cleanup;
    }

    // Deterministic input data so the result can be re-derived on the host.
    for (i = 0; i < N * N; i++)
    {
        host_a[i] = 2 * i + 1;
        host_b[i] = 3 * i - 1;
    }

    err = cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        printf("Host to device (a) is failed!\n");
        goto cleanup;
    }
    err = cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        printf("Host to device (b) is failed!\n");
        goto cleanup;
    }

    // Launch the kernel on the GPU. The flat device buffers are viewed as
    // arrays of N-int rows to match the kernel's parameter types.
    matrix_add<<<1, threads_in_block>>>((int (*)[N])dev_a, (int (*)[N])dev_b, (int (*)[N])dev_c);
    // A launch does not return a status; an invalid configuration only shows
    // up through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Kernel launch is failed: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    // Blocking copy: waits for the kernel to finish and also reports errors
    // raised while it was running.
    err = cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        printf("Device to host (c) is failed!\n");
        goto cleanup;
    }

    // Verify against the CPU result; stop at the first mismatch.
    for (i = 0; i < N * N; i++)
    {
        if (host_a[i] + host_b[i] != host_c[i])
        {
            printf("a[%d]%d + b[%d]%d != c[%d]%d.\n", i, host_a[i], i, host_b[i], i, host_c[i]);
            goto cleanup;
        }
    }
    printf("Congratulations! All entris are correct! You have finished the CUDA code!\n");
    status = 0;

cleanup:
    // free(NULL) and cudaFree(NULL) are no-ops, so this is safe no matter how
    // far the setup got.
    free(host_a);
    free(host_b);
    free(host_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return status;
}