【CUDA C】矩阵的乘与加
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/*
 * Fill a host matrix with the ascending sequence 0, 1, 2, ...
 *
 * The matrix has nx rows and ny columns and is stored row-major in one flat
 * buffer, so element (i, j) lives at matrix[i * ny + j].
 *
 * matrix : host buffer with room for at least nx * ny floats
 * nx     : number of rows
 * ny     : number of columns
 */
void GenerateMatrix(float *matrix, int nx, int ny)
{
    int i, j;
    float cnt = 0.0f;

    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            /* size_t keeps the flat index from overflowing int on big inputs */
            matrix[(size_t)i * ny + j] = cnt++;
        }
    }
    printf("[*] GenerateMatrix has done!\n");
}
/*
 * Print a host matrix to stdout, one matrix row per output line.
 *
 * The matrix has nx rows and ny columns and is stored row-major, so element
 * (i, j) is read from matrix[i * ny + j]. Values are printed with two
 * decimals and separated by tabs.
 *
 * matrix : host buffer holding at least nx * ny floats
 * nx     : number of rows
 * ny     : number of columns
 */
void PrintMatrix(float *matrix, int nx, int ny)
{
    int i, j;

    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            printf("%.2f\t", matrix[(size_t)i * ny + j]);
        }
        /* end the row so the output actually looks like a matrix */
        printf("\n");
    }
    printf("[*] PrintMatrix has done!\n");
}
/************************* matrix summary begin *************************/
/*
 * CPU reference for element-wise matrix addition: C = A + B.
 *
 * All three matrices have nx rows and ny columns and are stored as flat
 * buffers of nx * ny floats. Addition is element-wise, so the buffers are
 * walked linearly.
 *
 * A, B : input host buffers (nx * ny floats each)
 * C    : output host buffer (nx * ny floats)
 * nx   : number of rows
 * ny   : number of columns
 */
void AddMatrixOnCPU(float *A, float *B, float *C, int nx, int ny)
{
    size_t idx;
    size_t total = 0;

    /* treat non-positive dimensions as an empty matrix */
    if (nx > 0 && ny > 0)
    {
        total = (size_t)nx * (size_t)ny;
    }

    for (idx = 0; idx < total; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
    printf("[*] AddMatrix on CPU has done!\n");
}
/*
 * GPU kernel for element-wise matrix addition: C = A + B.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension covers the
 * nx rows and the y dimension covers the ny columns, i.e.
 *   grid = (ceil(nx / block.x), ceil(ny / block.y)).
 * Each thread handles at most one element. No shared memory is used.
 *
 * A, B : input device buffers (nx * ny floats each, row-major)
 * C    : output device buffer (nx * ny floats, row-major)
 * nx   : number of rows
 * ny   : number of columns
 */
__global__ void AddMatrixOnGPU(float *A, float *B, float *C, int nx, int ny)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;

    /* Strict '<': the grid is rounded up, so threads with i == nx or j == ny
       can exist and must not touch memory. */
    if (i < nx && j < ny)
    {
        /* row-major: a row is ny elements long */
        int idx = i * ny + j;
        C[idx] = A[idx] + B[idx];
    }
}
/************************* matrix summary done **************************/
/************************ matrix multiply begin *************************/
/*
 * CPU reference for matrix multiplication: C = A * B.
 *
 * The inner (reduction) dimension is nx, so the shapes are:
 *   A : nx x nx   (element (i, k) at A[i * nx + k])
 *   B : nx x ny   (element (k, j) at B[k * ny + j])
 *   C : nx x ny   (element (i, j) at C[i * ny + j])
 * For the square case nx == ny all three matrices are simply nx x nx.
 *
 * A, B : input host buffers, row-major
 * C    : output host buffer, row-major; must not alias A or B
 * nx   : rows of A, B and C (and columns of A)
 * ny   : columns of B and C
 */
void MulMatrixOnCPU(float *A, float *B, float *C, int nx, int ny)
{
    int i, j, k;

    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            /* float literal avoids a needless round-trip through double */
            float sum = 0.0f;
            for (k = 0; k < nx; k++)
            {
                sum = sum + A[(size_t)i * nx + k] * B[(size_t)k * ny + j];
            }
            C[(size_t)i * ny + j] = sum;
        }
    }
}
/*
 * GPU kernel for matrix multiplication: C = A * B, one thread per output
 * element.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension covers the
 * nx rows of C and the y dimension covers its ny columns, i.e.
 *   grid = (ceil(nx / block.x), ceil(ny / block.y)).
 * No shared memory is used; every operand is read from the A and B buffers.
 *
 * The inner (reduction) dimension is nx, so the shapes are:
 *   A : nx x nx   (element (i, k) at A[i * nx + k])
 *   B : nx x ny   (element (k, j) at B[k * ny + j])
 *   C : nx x ny   (element (i, j) at C[i * ny + j])
 *
 * A, B : input device buffers, row-major
 * C    : output device buffer, row-major; must not alias A or B
 * nx   : rows of A, B and C (and columns of A)
 * ny   : columns of B and C
 */
__global__ void MulMatrixOnGPU(float *A, float *B, float *C, int nx, int ny)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    int k;

    /* The grid is rounded up to whole blocks, so surplus threads must be
       kept away from memory. */
    if (i < nx && j < ny)
    {
        /* float literal: a bare 0.0 is a double */
        float sum = 0.0f;
        for (k = 0; k < nx; k++)
        {
            sum += A[i * nx + k] * B[k * ny + j];
        }
        C[i * ny + j] = sum;
    }
}
/************************ matrix multiply end ***************************/
// compare the result
/*
 * Compare a CPU reference matrix against a GPU result, element by element.
 *
 * Both matrices have nx rows and ny columns and are stored row-major.
 * Elements are considered equal when
 *   |cpu - gpu| <= abs_tol + rel_tol * |cpu|
 * because bit-exact equality between host and device floating-point results
 * is not guaranteed (the device compiler may, for example, fuse a multiply
 * and an add into a single FMA with different rounding).
 * NaN never matches anything, exactly as with a plain '!=' comparison.
 *
 * cpu_ref : host buffer with the reference values (nx * ny floats)
 * gpu_ref : host buffer with the values copied back from the GPU
 * nx      : number of rows
 * ny      : number of columns
 *
 * Returns 1 if every element matches within tolerance, 0 otherwise.
 */
int Compare(float *cpu_ref, float *gpu_ref, int nx, int ny)
{
    const float abs_tol = 1.0e-5f;
    const float rel_tol = 1.0e-5f;
    int i, j;

    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            size_t idx = (size_t)i * ny + j;
            float expected = cpu_ref[idx];
            float actual = gpu_ref[idx];

            /* fast path; also accepts equal infinities (inf - inf is NaN) */
            if (expected == actual)
            {
                continue;
            }
            /* written as !(diff <= tol) so that a NaN difference fails */
            if (!(fabsf(expected - actual) <= abs_tol + rel_tol * fabsf(expected)))
            {
                return 0;
            }
        }
    }
    return 1;
}
/* Abort with a diagnostic if a CUDA runtime call did not succeed. */
static void CheckCuda(cudaError_t status, const char *what, int line)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "[!] CUDA error at line %d (%s): %s\n",
                line, what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Wrap every CUDA runtime call so failures are reported where they happen. */
#define CUDA_CHECK(call) CheckCuda((call), #call, __LINE__)

/*
 * Test driver: builds two nx-by-ny matrices on the host, then runs matrix
 * addition and matrix multiplication on both the CPU and the GPU and reports
 * whether the results agree.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // the size of the elements in the matrix can not be much larger....
    // because of my worse GPU: nVIDIA GeForce GT710
    unsigned int N = 1 << 12;
    int nx = (int)sqrt((float)N);
    int ny = (int)sqrt((float)N);
    size_t bytes = sizeof(float) * N;

    float *A = NULL;
    float *B = NULL;
    float *C = NULL;
    float *gpu_ref = NULL;
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    // allocate zero-initialised memory on CPU
    A = (float *)calloc(N, sizeof(float));
    B = (float *)calloc(N, sizeof(float));
    C = (float *)calloc(N, sizeof(float));
    gpu_ref = (float *)calloc(N, sizeof(float));
    if (A == NULL || B == NULL || C == NULL || gpu_ref == NULL)
    {
        fprintf(stderr, "[!] host memory allocation failed\n");
        free(A);
        free(B);
        free(C);
        free(gpu_ref);
        return EXIT_FAILURE;
    }

    // allocate the memory on GPU
    CUDA_CHECK(cudaMalloc((void **)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_C, bytes));

    // reset the memory to zero
    CUDA_CHECK(cudaMemset(d_A, 0, bytes));
    CUDA_CHECK(cudaMemset(d_B, 0, bytes));
    CUDA_CHECK(cudaMemset(d_C, 0, bytes));

    // generate the matrix on CPU
    GenerateMatrix(A, nx, ny);
    GenerateMatrix(B, nx, ny);

    // transfer the data from CPU to GPU
    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // set the grid number and the block thread number
    // (x covers the nx rows, y covers the ny columns, rounded up to whole blocks)
    dim3 block(32, 32);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // Add the matrix on CPU
    AddMatrixOnCPU(A, B, C, nx, ny);

    // Add the matrix on GPU
    AddMatrixOnGPU<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
    CUDA_CHECK(cudaGetLastError());      // catches a bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize()); // let the CPU wait the GPU to do its calculation.

    // transfer the data from the GPU to CPU
    CUDA_CHECK(cudaMemcpy(gpu_ref, d_C, bytes, cudaMemcpyDeviceToHost));
    if (Compare(C, gpu_ref, nx, ny))
    {
        printf("[*] Compare : Matrix_ADD => the result are the same!\n");
    }
    else
    {
        printf("[*] Compare : Matrix_ADD => the result are NOT the same...\n");
    }

    // test the matrix multiply
    MulMatrixOnCPU(A, B, C, nx, ny);

    // test the matrix multiply on GPU
    MulMatrixOnGPU<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize()); // surfaces errors raised while the kernel ran
    CUDA_CHECK(cudaMemcpy(gpu_ref, d_C, bytes, cudaMemcpyDeviceToHost));

    // make the comparison
    if (Compare(C, gpu_ref, nx, ny))
    {
        printf("[*] Compare : Matrix_MUL => the result are the same!\n");
    }
    else
    {
        printf("[*] Compare : Matrix_MUL => the result are NOT the same...\n");
    }

    // Debug Print
    // PrintMatrix(gpu_ref, nx, ny);
    // PrintMatrix(C, nx, ny);

    // release device and host memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(A);
    free(B);
    free(C);
    free(gpu_ref);

    return 0;
}
// Sample output of a successful run:
// [*] GenerateMatrix has done!
// [*] GenerateMatrix has done!
// [*] AddMatrix on CPU has done!
// [*] Compare : Matrix_ADD => the result are the same!
// [*] Compare : Matrix_MUL => the result are the same!
// Press any key to continue...
很期待您的star! watermelon 发表于 2021-3-23 10:14
话说A5大佬知识是真的丰富,我也怀疑是我的显卡的因素,显卡的CUDA加速器个数、寄存器 ...
其实CUDA的寄存器、加速器、共享内存等概念,和OpenGL、Direct3D的常量寄存器、VS输入输出、GS输入输出、FS输入输出、纹理材质、Buffer等东西是一一对应的。 要知道,我当年还在使用 GTX 780 的时候,我就发现我当时创建的单张纹理的分辨率不能超过 16384 x 16384 。后来我看了DX的Caps结构体,以及OpenGL的 glGetIntegerv(GL_MAX_TEXTURE_SIZE, &xxx); 的数值,发现当时我能创建的纹理的尺寸确实最大只能是 16384 x 16384 。 0xAA55 发表于 2021-3-23 06:56
要知道,我当年还在使用 GTX 780 的时候,我就发现我当时创建的单张纹理的分辨率不能超过 16384 x 16384 。 ...