Compiling a CUDA Project into a DLL in Visual Studio and Calling It
This post walks through how to compile a CUDA project into a DLL and call it from another project.
Environment: VS2022 + CUDA 12.2. The solution layout is as follows.
Plain C++ project
First, create an empty project or console application named MainTest in VS as the main program.
Next, create another empty project or console application that will build the DLL (CPlusPlusProject).
In that project, write a simple function that prints hello world.
Demo1.h
#pragma once
#include <iostream>
__declspec(dllexport) void PrintfHello();
Demo1.cpp
#include "Demo1.h"void PrintfHello() {for (int i = 0; i < 10; i++) {printf("hello world Cpu\n");}
}
Note: __declspec(dllexport) means "export this function, class, or variable into the DLL, so that other programs can use it through the DLL."
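When the same header is shared by the DLL project and its consumers, a common pattern is to switch between dllexport and dllimport with a preprocessor definition. The following is only a sketch; the DEMO_API and DEMO_EXPORTS names are hypothetical and are not used anywhere in this post.

// Sketch only: DEMO_API and DEMO_EXPORTS are hypothetical names.
// The DLL project defines DEMO_EXPORTS in its preprocessor settings, so it exports
// the symbol; every consumer that includes the header gets dllimport instead.
#pragma once

#ifdef DEMO_EXPORTS
#define DEMO_API __declspec(dllexport)
#else
#define DEMO_API __declspec(dllimport)
#endif

DEMO_API void PrintfHello();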
step1:
In the project properties, change the Configuration Type to Dynamic Library (.dll). The Output Directory setting controls where the generated DLL and .lib files are stored.
Build the CPlusPlusProject project and you will get the following files (the path is most likely the \x64\Release folder under the MainTest project directory).
step2:
Configure MainTest the same way as any other third-party library, filling in the three settings below (a code-based alternative to the last one is sketched after the list).
Include directories:
Library directories:
Additional dependencies:
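As an alternative to filling in Additional Dependencies by hand, the import library can also be pulled in directly from source. A minimal sketch, assuming the generated import library is named CPlusPlusProject.lib and the Library Directories setting already points at its folder:

// Sketch: link the import library from code instead of via Additional Dependencies.
// Assumes CPlusPlusProject.lib is the import library produced by the DLL project.
#pragma comment(lib, "CPlusPlusProject.lib")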
Add a main function in MainTest that calls the exported function.
MainTest.cpp
#include <iostream>
#include "Demo1.h"
//#include "ParalleReduction.h"int main() {PrintfHello();//paralleReduction();
}
The output is as follows.
Compiling the CUDA program
First, right-click the solution -> Add -> New Project -> CUDA project (CudaRuntime), and add the code files.
ParalleReduction.h
#pragma once
#include <iostream>

extern "C"
__declspec(dllexport) void paralleReduction();
Notice that, compared with the plain C++ version, there is an extra extern "C". It tells the compiler to export the function the C way, without name mangling, so the exported name does not change across compilers.
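One way to see why the unmangled name matters is to resolve the function at runtime by its exported name. The following is a sketch, not part of the original post; it assumes the CUDA project builds a DLL named CudaRuntime.dll placed next to the executable. GetProcAddress only finds "paralleReduction" because extern "C" kept the name unmangled; with C++ mangling the lookup string would be compiler-specific.

// Sketch: load the DLL at runtime and resolve the export by its plain name.
#include <windows.h>
#include <cstdio>

int main() {
    HMODULE dll = LoadLibraryA("CudaRuntime.dll");   // assumed DLL name
    if (!dll) {
        printf("failed to load DLL\n");
        return 1;
    }
    using ReductionFn = void (*)();
    // With extern "C", the export table contains the plain name "paralleReduction".
    ReductionFn fn = (ReductionFn)GetProcAddress(dll, "paralleReduction");
    if (fn) {
        fn();
    } else {
        printf("export not found\n");
    }
    FreeLibrary(dll);
    return 0;
}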
To save effort, the parallel reduction code from an earlier post is reused for testing; the code is listed below.
ParallelReduction.cu
//#define Time

// Parallel reduction
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include "ParalleReduction.h"// 每个内核只计算一个block中的和
__global__ void reduceNeighbored(int *g_idata, int *g_odata, unsigned int n) {int idx = threadIdx.x;unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;int* idata = g_idata + blockIdx.x * blockDim.x;if (index > n){return; }for (int stride = 1; stride < blockDim.x; stride *= 2) {if (idx % (2 * stride) == 0) {idata[idx] += idata[idx + stride];}// 等待线程同步__syncthreads();}if (idx == 0){g_odata[blockIdx.x] = idata[0];}
}// 相邻配对方法
__global__ void reduceNeighboredLess(int* g_idata, int* g_odata, unsigned int n) { // 避免线程分化版本unsigned int tid = threadIdx.x;unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;// get dataint* idata = g_idata + blockIdx.x * blockDim.x; // 对应的数据if (index > n) {return;}for (int stride = 1; stride < blockDim.x; stride *= 2) {int sumIndex = tid * 2 * stride;if (sumIndex < blockDim.x) {idata[sumIndex] += idata[sumIndex + stride];}// 等待线程同步__syncthreads();}if (tid == 0) {g_odata[blockIdx.x] = idata[0];}}// 交错规约
__global__ void reduceInterleaved(int* g_idata, int* g_odata, unsigned int n) {unsigned int tid = threadIdx.x;unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;int* idata = g_idata + blockIdx.x * blockDim.x;if (index >= n) {return;}int dataSize = blockDim.x;for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {if (tid < stride) {idata[tid] += idata[tid + stride];}// 等待线程同步__syncthreads();}if (tid == 0) {g_odata[blockIdx.x] = idata[0];}
}// 循环展开
__global__ void reduceUnrolling2(int* g_idata, int* g_odata, unsigned int n) { // 将相邻两个block数据合并成一组数据, 对合并后的数据进行求和计算unsigned int tid = threadIdx.x;unsigned int idx = blockIdx.x * blockDim.x * 2 + threadIdx.x; // 全局索引int* idata = g_idata + blockIdx.x * blockDim.x * 2; // 每次归并两个block blockId.x 取值范围从0-nif (idx + blockDim.x < n) // block0 与 block1 相同的thread 对应的数据 全局数据g_idata[idx] += g_idata[idx + blockDim.x];__syncthreads();//for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {if (tid < stride) {idata[tid] += idata[tid + stride];}__syncthreads();}if (tid == 0) {g_odata[blockIdx.x] = idata[0];}
}// reduceUnrolling8
__global__ void reduceUnrolling8(int* g_idata, int* g_odata, unsigned int n) {unsigned int tid = threadIdx.x;unsigned int index = blockIdx.x * blockDim.x * 8 + threadIdx.x;int* idata = g_idata + blockIdx.x * blockDim.x * 8;if (index + 7*blockDim.x < n) { // 越界条件检查int a1 = g_idata[index];int a2 = g_idata[index + blockDim.x];int a3 = g_idata[index + blockDim.x * 2];int a4 = g_idata[index + blockDim.x * 3];int a5 = g_idata[index + blockDim.x * 4];int a6 = g_idata[index + blockDim.x * 5];int a7 = g_idata[index + blockDim.x * 6];int a8 = g_idata[index + blockDim.x * 7];g_idata[index] = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;}__syncthreads();for (int stride = blockDim.x / 2; stride > 32; stride >>= 1) {if (tid < stride) {idata[tid] += idata[tid + stride];}__syncthreads();}if (tid < 32) {volatile int* vmem = idata;vmem[tid] += vmem[tid + 32];vmem[tid] += vmem[tid + 16];vmem[tid] += vmem[tid + 8];vmem[tid] += vmem[tid + 4];vmem[tid] += vmem[tid + 2];vmem[tid] += vmem[tid + 1];}if (tid == 0) {g_odata[blockIdx.x] = idata[0];}
}__host__ void GetSumCpu(int* idata, int& odata, const int n) {for (int i = 0; i < n; i++) {odata += idata[i];}
}

void paralleReduction() {
    // Select the device and print its info
    int dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    printf("device %d: %s \n", dev, deviceProp.name);

    // Allocate host memory
    int size = 1 << 24;
    int nByte = size * sizeof(int);
    dim3 block(512, 1);
    dim3 grid((size + block.x - 1) / block.x, 1);
    int* idata_h = (int*)malloc(nByte);
    int* odata_h = (int*)malloc(grid.x * sizeof(int));
    int* temp = (int*)malloc(nByte);

    // Initialize the array
    for (int i = 0; i < size; i++) {
        idata_h[i] = (int)(rand() & 0xFF);
    }
    memcpy(temp, idata_h, nByte);

#ifdef Time
    auto start = std::chrono::high_resolution_clock::now();
#endif // Time
    int result_cpu = 0;
    GetSumCpu(temp, result_cpu, size);
    printf("CPU result: %d \n", result_cpu);
#ifdef Time
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> elapsed = end - start;
    printf("CPU use Time: %f \n", elapsed.count());
#endif // Time

    // CUDA memory
    int* idata_d = nullptr;
    int* odata_d = nullptr;
    cudaMalloc((int**)&idata_d, nByte);
    cudaMalloc((int**)&odata_d, grid.x * sizeof(int));
    // Memory copy -- time-consuming
    cudaMemcpy(idata_d, idata_h, nByte, cudaMemcpyHostToDevice);

#ifdef Time
    float milliseconds = 0;
    cudaEvent_t k_start, k_stop;
    cudaEventCreate(&k_start);
    cudaEventCreate(&k_stop);
    cudaEventRecord(k_start);
#endif // Time
    reduceNeighbored<<<grid, block>>>(idata_d, odata_d, size);
    cudaDeviceSynchronize();
#ifdef Time
    cudaEventRecord(k_stop);
    cudaEventSynchronize(k_stop);                          // Wait for the event to complete
    cudaEventElapsedTime(&milliseconds, k_start, k_stop);  // Elapsed time in milliseconds
    printf("GPU use Time %.5f ms\n", milliseconds);
    cudaEventDestroy(k_start);
    cudaEventDestroy(k_stop);
#endif // Time
    cudaMemcpy(odata_h, odata_d, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_sum = 0;
    for (int i = 0; i < grid.x; i++) {
        gpu_sum += odata_h[i];
    }
    printf("GPU result %d \n", gpu_sum);

    // Improved parallel reduction
    cudaMemcpy(idata_d, idata_h, nByte, cudaMemcpyHostToDevice);
#ifdef Time
    float milliseconds_L = 0;
    cudaEvent_t kl_start, kl_stop;
    cudaEventCreate(&kl_start);
    cudaEventCreate(&kl_stop);
    cudaEventRecord(kl_start);
#endif // Time
    reduceNeighboredLess<<<grid, block>>>(idata_d, odata_d, size);
    cudaDeviceSynchronize();
#ifdef Time
    cudaEventRecord(kl_stop);
    cudaEventSynchronize(kl_stop);                             // Wait for the event to complete
    cudaEventElapsedTime(&milliseconds_L, kl_start, kl_stop);  // Elapsed time in milliseconds
    printf("GPU use Time reduceNeighboredLess %.5f ms\n", milliseconds_L);
    cudaEventDestroy(kl_start);                                // Destroy the events
    cudaEventDestroy(kl_stop);
#endif // Time
    cudaMemcpy(odata_h, odata_d, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++) {
        gpu_sum += odata_h[i];
    }
    printf("GPU result reduceNeighboredLess %d \n", gpu_sum);

    // Interleaved reduction
    cudaMemcpy(idata_d, idata_h, nByte, cudaMemcpyHostToDevice);
#ifdef Time
    float milliseconds_2 = 0;
    cudaEvent_t k2_start, k2_stop;
    cudaEventCreate(&k2_start);
    cudaEventCreate(&k2_stop);
    cudaEventRecord(k2_start);
#endif // Time
    reduceInterleaved<<<grid, block>>>(idata_d, odata_d, size);
    cudaDeviceSynchronize();
#ifdef Time
    cudaEventRecord(k2_stop);
    cudaEventSynchronize(k2_stop);                             // Wait for the event to complete
    cudaEventElapsedTime(&milliseconds_2, k2_start, k2_stop);  // Elapsed time in milliseconds
    printf("GPU use Time reduceInterleaved %.5f ms\n", milliseconds_2);
    cudaEventDestroy(k2_start);                                // Destroy the events
    cudaEventDestroy(k2_stop);
#endif // Time
    cudaMemcpy(odata_h, odata_d, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x; i++) {
        gpu_sum += odata_h[i];
    }
    printf("GPU result reduceInterleaved %d \n", gpu_sum);

    // Unrolled reduction: reduce two blocks per thread block.
    // Why it is faster: each thread issues more independent loads/stores, and the GPU
    // merges multiple memory requests into fewer transactions, using bandwidth more efficiently.
    cudaMemcpy(idata_d, idata_h, nByte, cudaMemcpyHostToDevice);
    reduceUnrolling2<<<grid.x / 2, block>>>(idata_d, odata_d, size);
    cudaMemcpy(odata_h, odata_d, grid.x / 2 * sizeof(int), cudaMemcpyDeviceToHost);
    gpu_sum = 0;
    for (int i = 0; i < grid.x / 2; i++) {
        gpu_sum += odata_h[i];
    }
    printf("GPU result reduceUnrolling2 %d \n", gpu_sum);

    free(idata_h);
    free(odata_h);
    free(temp);
    cudaFree(idata_d);
    cudaFree(odata_d);
}
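The listing above does not check the return codes of the CUDA runtime calls. A small helper along the following lines is my addition, not part of the original code, and makes failures in cudaMalloc, cudaMemcpy, or kernel launches easier to spot.

// Optional helper (not in the original code): report CUDA runtime errors
// instead of silently ignoring them.
#include <cstdio>
#include "cuda_runtime.h"

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            printf("CUDA error '%s' at %s:%d\n",                      \
                   cudaGetErrorString(err_), __FILE__, __LINE__);     \
        }                                                             \
    } while (0)

// Example usage inside paralleReduction():
//   CUDA_CHECK(cudaMalloc((void**)&idata_d, nByte));
//   reduceNeighbored<<<grid, block>>>(idata_d, odata_d, size);
//   CUDA_CHECK(cudaGetLastError());          // check the launch
//   CUDA_CHECK(cudaDeviceSynchronize());     // check execution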
Configuring the CUDA project properties
step1:
- Right-click the CudaRuntime project -> Build Dependencies -> Build Customizations -> check the installed CUDA version, then click Apply.
step2:
Change the Configuration Type to Dynamic Library (.dll).
step3:
Following the same steps used for the plain C++ DLL, add the CUDA project's include directory, library directory, and additional dependencies to MainTest.
After the build succeeds, run the main function:
#include <iostream>
#include "Demo1.h"
#include "ParalleReduction.h"int main() {PrintfHello();paralleReduction();
}
The output is as follows: