It's been a long time since I posted on the forum!
So here's a quick post just to pop in and say hi!
It's just simple matrix multiplication and addition, so I won't go into much detail;
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <math.h>
void GenerateMatrix(float *matrix, int nx, int ny)
{
    int i, j;
    float cnt = 0;
    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            matrix[i*nx + j] = cnt++;   // row-major index; works because nx == ny here (i*ny + j in general)
        }
    }
    printf("[*] GenerateMatrix done!\n");
}
void PrintMatrix(float *matrix, int nx, int ny)
{
    int i, j;
    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            printf("%.2f\t", matrix[i*nx + j]);
        }
        printf("\n");
    }
    printf("[*] PrintMatrix done!\n");
}
/************************* matrix addition begin *************************/
void AddMatrixOnCPU(float *A, float *B, float *C, int nx, int ny)
{
    int i, j;
    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            C[i*nx + j] = A[i*nx + j] + B[i*nx + j];
        }
    }
    printf("[*] AddMatrix on CPU done!\n");
}
__global__ void AddMatrixOnGPU(float *A, float *B, float *C, int nx, int ny)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = i*nx + j;
    if (i < nx && j < ny)   // strict bounds check; "<=" would let a thread index past the matrix
    {
        C[idx] = A[idx] + B[idx];
    }
}
/************************* matrix addition end **************************/

/************************ matrix multiply begin *************************/
void MulMatrixOnCPU(float *A, float *B, float *C, int nx, int ny)
{
    int i, j, k;
    float sum = 0.0;
    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            sum = 0.0;
            for (k = 0; k < nx; k++)
            {
                sum = sum + A[i*nx + k] * B[k*nx + j];
            }
            C[i*nx + j] = sum;
        }
    }
}
__global__ void MulMatrixOnGPU(float *A, float *B, float *C, int nx, int ny)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    int k;
    if (i < nx && j < ny) // the "i" and "j" ranges must be checked here
    {
        float sum = 0.0;
        for (k = 0; k < nx; k++)
        {
            sum += A[i*nx + k] * B[k*nx + j];
        }
        C[i*nx + j] = sum;
    }
}
/************************ matrix multiply end ***************************/
// compare the results
int Compare(float *cpu_ref, float *gpu_ref, int nx, int ny)
{
    int i, j;
    for (i = 0; i < nx; i++)
    {
        for (j = 0; j < ny; j++)
        {
            if (cpu_ref[i*nx + j] != gpu_ref[i*nx + j])   // exact float equality
            {
                return 0;
            }
        }
    }
    return 1;
}
int main(int argc, char *argv[])
{
    // the number of elements in the matrix cannot be much larger than this,
    // because of my weak GPU: an NVIDIA GeForce GT710
    unsigned int N = 1 << 12;
    int nx = (int)sqrt((float)N);
    int ny = (int)sqrt((float)N);
    float *A = NULL;
    float *B = NULL;
    float *C = NULL;
    float *gpu_ref = NULL;
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    // allocate the memory on the CPU
    A = (float *)malloc(sizeof(float) * N);
    B = (float *)malloc(sizeof(float) * N);
    C = (float *)malloc(sizeof(float) * N);
    gpu_ref = (float *)malloc(sizeof(float) * N);
    // set the memory to zero
    memset(A, 0, sizeof(float) * N);
    memset(B, 0, sizeof(float) * N);
    memset(C, 0, sizeof(float) * N);
    memset(gpu_ref, 0, sizeof(float) * N);
    // allocate the memory on the GPU
    cudaMalloc((void **)&d_A, sizeof(float) * N);
    cudaMalloc((void **)&d_B, sizeof(float) * N);
    cudaMalloc((void **)&d_C, sizeof(float) * N);
    // reset the memory to zero
    cudaMemset(d_A, 0, sizeof(float) * N);
    cudaMemset(d_B, 0, sizeof(float) * N);
    cudaMemset(d_C, 0, sizeof(float) * N);
    // generate the matrices on the CPU
    GenerateMatrix(A, nx, ny);
    GenerateMatrix(B, nx, ny);
    // transfer the data from the CPU to the GPU
    cudaMemcpy(d_A, A, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, sizeof(float) * N, cudaMemcpyHostToDevice);
    // set the grid and thread-block dimensions
    dim3 block(32, 32);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    // add the matrices on the CPU
    AddMatrixOnCPU(A, B, C, nx, ny);
    // add the matrices on the GPU
    AddMatrixOnGPU<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
    cudaDeviceSynchronize(); // make the CPU wait for the GPU to finish its calculation
    // transfer the data from the GPU back to the CPU
    cudaMemcpy(gpu_ref, d_C, sizeof(float) * N, cudaMemcpyDeviceToHost);
    if (Compare(C, gpu_ref, nx, ny))
    {
        printf("[*] Compare : Matrix_ADD => the results are the same!\n");
    }
    else
    {
        printf("[*] Compare : Matrix_ADD => the results are NOT the same...\n");
    }
    // test the matrix multiply on the CPU
    MulMatrixOnCPU(A, B, C, nx, ny);
    // test the matrix multiply on the GPU
    MulMatrixOnGPU<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
    cudaDeviceSynchronize();
    cudaMemcpy(gpu_ref, d_C, sizeof(float) * N, cudaMemcpyDeviceToHost);
    // make the comparison
    if (Compare(C, gpu_ref, nx, ny))
    {
        printf("[*] Compare : Matrix_MUL => the results are the same!\n");
    }
    else
    {
        printf("[*] Compare : Matrix_MUL => the results are NOT the same...\n");
    }
    // debug print
    // PrintMatrix(gpu_ref, nx, ny);
    // PrintMatrix(C, nx, ny);

    free(A);
    free(B);
    free(C);
    free(gpu_ref);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
// [*] GenerateMatrix done!
// [*] GenerateMatrix done!
// [*] AddMatrix on CPU done!
// [*] Compare : Matrix_ADD => the results are the same!
// [*] Compare : Matrix_MUL => the results are the same!
// Press any key to continue...
```
I've run into one very confusing bug:
GitHub: https://github.com/XingguangZhou/CUDA_Note/blob/main/MulMatrix.cu
I'd really appreciate your star!
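A note on debugging this: the kernel launches and cudaMemcpy calls above are never checked for errors, and Compare uses exact float equality. For the multiplication, the CPU and GPU can legitimately round the 64-term dot products differently (the device compiler may fuse a*b + sum into a single FMA), so an exact `!=` comparison can report a mismatch even when both results are correct. Below is a minimal sketch of a tolerance-based comparison plus a launch check; `CompareWithTolerance` and the `1e-5f` tolerance are illustrative choices, not part of the original code, and this is only a guess at the cause, not a confirmed diagnosis.

```cpp
#include <math.h>
#include <stdio.h>
#include "cuda_runtime.h"

// Sketch: compare with a relative + absolute tolerance instead of exact equality,
// because the CPU and GPU may round the float dot products differently (e.g. FMA).
// The function name and the 1e-5f tolerances are illustrative, not from the post.
int CompareWithTolerance(const float *cpu_ref, const float *gpu_ref, int nx, int ny)
{
    int i;
    for (i = 0; i < nx * ny; i++)
    {
        float diff  = fabsf(cpu_ref[i] - gpu_ref[i]);
        float scale = fmaxf(fabsf(cpu_ref[i]), fabsf(gpu_ref[i]));
        if (diff > 1e-5f * scale + 1e-5f)
        {
            printf("mismatch at element %d: cpu=%f gpu=%f\n", i, cpu_ref[i], gpu_ref[i]);
            return 0;
        }
    }
    return 1;
}

// Also worth adding after each kernel launch, so launch failures are not silent:
//     MulMatrixOnGPU<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
//     cudaError_t err = cudaGetLastError();
//     if (err != cudaSuccess)
//         printf("kernel launch failed: %s\n", cudaGetErrorString(err));
```

Printing the first mismatching element usually makes it clear whether it is a real indexing bug or just rounding noise.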