作者:无声胜有剩 | 来源:互联网 | 2023-09-23 16:36
以一个博文《CUDA例子》来说明。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define N 10
// Kernel: element-wise vector addition, c = a + b.
// Intended launch configuration: N blocks of 1 thread each, so every
// block handles exactly one array element (index = blockIdx.x).
// Preconditions: a, b, c are device pointers to at least gridDim.x ints.
__global__ void add(int *a, int *b, int *c)
{
    const int i = blockIdx.x;  // one element per block in this demo
    c[i] = a[i] + b[i];
}
// Host driver: allocates device buffers, uploads inputs, launches the
// add kernel with one block per element, and prints the results.
int main()
{
    int a[N], b[N], c[N];
    int *deva, *devb, *devc;

    // Allocate device memory for the three arrays.
    cudaMalloc((void **)&deva, N * sizeof(int));
    cudaMalloc((void **)&devb, N * sizeof(int));
    cudaMalloc((void **)&devc, N * sizeof(int));

    // Fill the input arrays on the CPU.
    for (int i = 0; i < N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // Copy the inputs a and b to the GPU.
    // (The original also copied the uninitialized c; that transfer is
    // useless — the kernel overwrites every element of devc.)
    cudaMemcpy(deva, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devb, b, N * sizeof(int), cudaMemcpyHostToDevice);

    // Launch N blocks of 1 thread: block i computes c[i] = a[i] + b[i].
    add<<<N, 1>>>(deva, devb, devc);

    // Copy the result c back from the GPU; cudaMemcpy with
    // cudaMemcpyDeviceToHost blocks until the kernel has finished.
    cudaMemcpy(c, devc, N * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++) {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }

    cudaFree(deva);
    cudaFree(devb);
    cudaFree(devc);
    return 0;
}
——— CPU 版本(把上面的 CUDA 例子改为纯 CPU 串行执行)———
//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#include <stdio.h>
#define N 10
// Global "parallel index" replacing CUDA's built-in blockIdx.x:
// the caller sets it before each call to select which element to process.
int blockIdx_x;
//__global__   (qualifier removed: this is now a plain CPU function)
// Computes one element of the sum: c[blockIdx_x] = a[blockIdx_x] + b[blockIdx_x].
// All other elements of c are left untouched.
void add(int *a, int *b, int *c)
{
    int tid = blockIdx_x; // was blockIdx.x in the CUDA version
    c[tid] = a[tid] + b[tid];
}
int main()
{
int a[N], b[N], c[N];
int *deva, *devb, *devc;
// //在GPU上分配内存
// cudaMalloc((void **)&deva, N*sizeof(int));
// cudaMalloc((void **)&devb, N*sizeof(int));
// cudaMalloc((void **)&devc, N*sizeof(int));
//在CPU上为数组赋值
for (int i = 0; i {
a[i] = -i;
b[i] = i*i;
}
// //将数组a和b传到GPU
// cudaMemcpy(deva, a, N*sizeof(int), cudaMemcpyHostToDevice);
// cudaMemcpy(devb, b, N*sizeof(int), cudaMemcpyHostToDevice);
// cudaMemcpy(devc, c, N*sizeof(int), cudaMemcpyHostToDevice);
deva = a; devb = b; devc = c;
for (int blockIdx_x = 0; blockIdx_x {
//<<> >
add (deva, devb, devc);
//add (a, b, c);//也可以直接这句
}
// //将数组c从GPU传到CPU
// cudaMemcpy(c, devc, N*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i {
printf("%d+%d=%d\n", a[i], b[i], c[i]);
}
// cudaFree(deva);
// cudaFree(devb);
// cudaFree(devc);
return 0;
}
对于OpenMP 加上 “#pragma omp parallel for”就是CPU并行了
总结:
1。去掉数据从cpu-->gpu 和 gpu-->cpu 的相关代码
2。核函数作为cpu并行对象放到for循环中
3。把三重尖括号 <<<N, 1>>> 执行配置中的数据变为 for 的循环变量
------------------------------------------------------分割线----------------------------------------------------
——— GLSL 版本(用 OpenGL 计算着色器实现同一个例子)———
先来对比一下glsl 和 cuda 计算单位结构图
可见,除了名称叫法不同,组成方式是一样的。
//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#include <stdio.h>
#define N 10
// The CUDA kernel
//   __global__ void add(int *a, int *b, int *c)
//   { int tid = blockIdx.x; c[tid] = a[tid] + b[tid]; }
// moves into a string: it is compiled at run time as a GLSL compute
// shader. gl_WorkGroupID.x plays the role of blockIdx.x, and the three
// std430 SSBOs (bindings 0..2) play the role of deva/devb/devc.
// LOCAL_SIZE_X / LOCAL_SIZE_Y are placeholders substituted before
// compilation (see the "set run parameters" step in main).
char *Source =
    "#version 430 core\n"
    "layout (local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = 1) in;\n"
    "layout(std430, binding = 0) buffer buffer0\n"
    "{\n"
    "    int a[];\n"
    "};\n"
    "layout(std430, binding = 1) buffer buffer1\n"
    "{\n"
    "    int b[];\n"
    "};\n"
    "layout(std430, binding = 2) buffer buffer2\n"
    "{\n"
    "    int c[];\n"
    "};\n"
    "void main(void)\n"
    "{\n"
    "    int tid = int(gl_WorkGroupID.x);\n"
    "    c[tid] = a[tid] + b[tid];\n"
    "}\n";
int main(int argc,char **argv)
{
//glsl 初始化
init_glut_glew(argc,argv);
int a[N], b[N], c[N];
int *deva, *devb, *devc;
// //在GPU上分配内存
// cudaMalloc((void **)&deva, N*sizeof(int));
// cudaMalloc((void **)&devb, N*sizeof(int));
// cudaMalloc((void **)&devc, N*sizeof(int));
分配glsl缓存ID();//glGenBuffers()
//分配缓存 和传送数据一起
//在CPU上为数组赋值
for (int i = 0; i {
a[i] = -i;
b[i] = i*i;
}
// //将数组a和b传到GPU
// cudaMemcpy(deva, a, N*sizeof(int), cudaMemcpyHostToDevice);
// cudaMemcpy(devb, b, N*sizeof(int), cudaMemcpyHostToDevice);
// cudaMemcpy(devc, c, N*sizeof(int), cudaMemcpyHostToDevice);
传送数组(a,b,c,N*sizeof(int));
//glBindBuffer();glBufferData();glBindBufferBase();glUnmapBuffer();
设置glsl运行参数(N);
//local_size_x = N, local_size_y = 1, local_size_z = 1;//代入Source中
//NumGroupsX = 1, NumGroupsY = 1, NumGroupsZ = 1;
编译链接源码(Source);
//glCompileShader();glLinkProgram()
运行计算着色器();
//performCompute();
//add (deva, devb, devc);
// //将数组c从GPU传到CPU
// cudaMemcpy(c, devc, N*sizeof(int), cudaMemcpyDeviceToHost);
取回结果(c);
//glBindBuffer();glMapBuffer();glUnmapBuffer();
for (int i = 0; i {
printf("%d+%d=%d\n", a[i], b[i], c[i]);
}
// cudaFree(deva);
// cudaFree(devb);
// cudaFree(devc);
return 0;
}
总结:
1。核函数作为源码放入一个字符串中
2。计算单位的设置都有对应的名称代入
3。分配存储传送取回数据也有对应的函数