当前位置: 首页 > news >正文

【cuda学习日记】4.3 结构体数组与数组结构体

4.3 数组结构体(AoS)和结构体数组(SoA)

AoS方法进行存储

struct innerStruct{float x;float y;
};struct innerStruct myAOS[N];

SoA方法来存储数据

struct innerArray{float x[N];float y[N];
};struct innerArray moa;

如图说明了AoS和SoA方法的内存布局,用AoS模式在GPU上存储示例数据并执行一个只有x字段的应用程序,将导致50%的带宽损失
在这里插入图片描述
4.3.1 简单示例AoS

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>#define LEN 1 << 20struct innerStruct{float x;float y;
};struct innerArray{float x[LEN];float y[LEN];
};__global__ void testInnerStruct(innerStruct *data, innerStruct *result, const int n){unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;if (i < n){innerStruct tmp = data[i];tmp.x += 10.f;tmp.y += 20.f;result[i] = tmp;}
}__global__ void warmup(innerStruct *data, innerStruct *result, const int n){unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;if (i < n){innerStruct tmp = data[i];tmp.x += 10.f;tmp.y += 20.f;result[i] = tmp;}
}void testInnerStructHost(innerStruct *data, innerStruct *result, const int n){for (int i = 0; i < n ; i ++){innerStruct tmp = data[i];tmp.x += 10.f;tmp.y += 20.f;result[i] = tmp;}
}void initialInnerStruct(innerStruct *ip,  int size)
{for (int i = 0; i < size; i++){ip[i].x = (float)(rand() & 0xFF) / 100.0f;ip[i].y = (float)(rand() & 0xFF) / 100.0f;}return;
}void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{double epsilon = 1.0E-8;bool match = 1;for (int i = 0; i < N; i++){if (abs(hostRef[i].x - gpuRef[i].x) > epsilon){match = 0;printf("different on %dth element: host %f gpu %f\n", i,hostRef[i].x, gpuRef[i].x);break;}if (abs(hostRef[i].y - gpuRef[i].y) > epsilon){match = 0;printf("different on %dth element: host %f gpu %f\n", i,hostRef[i].y, gpuRef[i].y);break;}}if (!match)  printf("Arrays do not match.\n\n");
}int main(int argc, char ** argv){int dev = 0;cudaSetDevice(dev);cudaDeviceProp deviceprop;CHECK(cudaGetDeviceProperties(&deviceprop,dev));printf("device %d: %s \n", dev, deviceprop.name);int nElem = LEN;size_t nBytes = nElem * sizeof(innerStruct);innerStruct     *h_A = (innerStruct *)malloc(nBytes);innerStruct *hostRef = (innerStruct *)malloc(nBytes);innerStruct *gpuRef  = (innerStruct *)malloc(nBytes);initialInnerStruct(h_A, nElem);testInnerStructHost(h_A, hostRef, nElem);innerStruct *d_A, *d_C;cudaMalloc((innerStruct**)&d_A, nBytes);cudaMalloc((innerStruct**)&d_C, nBytes);cudaMemcpy(d_A, h_A, nBytes,cudaMemcpyHostToDevice);int blocksize = 128;if (argc > 1) blocksize = atoi(argv[1]);dim3 block(blocksize,1);dim3 grid((nElem + block.x - 1)/block.x, 1);Timer timer;timer.start();warmup<<<grid,block>>>(d_A, d_C, nElem);cudaDeviceSynchronize();timer.stop();float elapsedTime = timer.elapsedms();printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);timer.start();testInnerStruct<<<grid,block>>>(d_A, d_C, nElem);cudaDeviceSynchronize();timer.stop();elapsedTime = timer.elapsedms();printf("testInnerStruct <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);checkInnerStruct(hostRef, gpuRef, nElem);cudaFree(d_A);cudaFree(d_C);free(h_A);free(hostRef);free(gpuRef);cudaDeviceReset();return 0;
}

用NCU查看加载内存效率,只有50%:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 50

执行时间:
testInnerStruct <<<8192, 128>>> elapsed 0.036864 ms

4.3.1 简单示例SoA

#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>#define LEN 1 << 20struct innerStruct{float x;float y;
};struct innerArray{float x[LEN];float y[LEN];
};__global__ void testInnerArray( innerArray *data, innerArray *result, const int n){unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;if (i < n) {float tmpx = data -> x[i];float tmpy = data -> y[i];tmpx += 10.0f;tmpy += 20.0f;result -> x[i] = tmpx;result -> y[i] = tmpy;}
}__global__ void warmup( innerArray *data, innerArray *result, const int n){unsigned int i  = blockIdx.x * blockDim.x + threadIdx.x;if (i < n) {float tmpx = data -> x[i];float tmpy = data -> y[i];tmpx += 10.0f;tmpy += 20.0f;result -> x[i] = tmpx;result -> y[i] = tmpy;}
}// functions for inner array outer struct
void initialInnerArray(innerArray *ip,  int size)
{for (int i = 0; i < size; i++){ip->x[i] = (float)( rand() & 0xFF ) / 100.0f;ip->y[i] = (float)( rand() & 0xFF ) / 100.0f;}return;
}void testInnerArrayHost(innerArray *A, innerArray *C, const int n)
{for (int idx = 0; idx < n; idx++){C->x[idx] = A->x[idx] + 10.f;C->y[idx] = A->y[idx] + 20.f;}return;
}void checkInnerArray(innerArray *hostRef, innerArray *gpuRef, const int N)
{double epsilon = 1.0E-8;bool match = 1;for (int i = 0; i < N; i++){if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon){match = 0;printf("different on x %dth element: host %f gpu %f\n", i,hostRef->x[i], gpuRef->x[i]);break;}if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon){match = 0;printf("different on y %dth element: host %f gpu %f\n", i,hostRef->y[i], gpuRef->y[i]);break;}}if (!match)  printf("Arrays do not match.\n\n");
}int main(int argc, char ** argv){int dev = 0;cudaSetDevice(dev);cudaDeviceProp deviceprop;CHECK(cudaGetDeviceProperties(&deviceprop,dev));printf("device %d: %s \n", dev, deviceprop.name);int nElem = LEN;size_t nBytes =   sizeof(innerArray);innerArray     *h_A = (innerArray *)malloc(nBytes);innerArray *hostRef = (innerArray *)malloc(nBytes);innerArray *gpuRef  = (innerArray *)malloc(nBytes);initialInnerArray(h_A, nElem);testInnerArrayHost(h_A, hostRef, nElem);innerArray *d_A, *d_C;cudaMalloc((innerArray**)&d_A, nBytes);cudaMalloc((innerArray**)&d_C, nBytes);cudaMemcpy(d_A, h_A, nBytes,cudaMemcpyHostToDevice);int blocksize = 128;if (argc > 1) blocksize = atoi(argv[1]);dim3 block(blocksize,1);dim3 grid((nElem + block.x - 1)/block.x, 1);Timer timer;timer.start();warmup<<<grid,block>>>(d_A, d_C, nElem);cudaDeviceSynchronize();timer.stop();float elapsedTime = timer.elapsedms();printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);timer.start();testInnerArray<<<grid,block>>>(d_A, d_C, nElem);cudaDeviceSynchronize();timer.stop();elapsedTime = timer.elapsedms();printf("testInnerArray <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);checkInnerArray(hostRef, gpuRef, nElem);cudaFree(d_A);cudaFree(d_C);free(h_A);free(hostRef);free(gpuRef);cudaDeviceReset();return 0;
}

NCU查看内存加载效率:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 100

优化设备内存带宽利用率有两个目标:
·对齐及合并内存访问,以减少带宽的浪费
·足够的并发内存操作,以隐藏内存延迟

http://www.lryc.cn/news/542398.html

相关文章:

  • 2025最新高维多目标优化:基于城市场景下无人机三维路径规划的导航变量的多目标粒子群优化算法(NMOPSO),MATLAB代码
  • 数字IC后端设计实现OCC(On-chip Clock Controller)电路介绍及时钟树综合案例
  • Linux内核,slub分配流程
  • 本地部署DeepSeek-R1(Ollama+Docker+OpenWebUI知识库)
  • Java 实现快速排序算法:一条快速通道,分而治之
  • 20250223下载并制作RTX2080Ti显卡的显存的测试工具mats
  • element-ui的组件使用
  • 医疗AI领域中GPU集群训练的关键技术与实践经验探究(上)
  • 详解Redis淘汰策略
  • HarmonyOS 5.0应用开发——鸿蒙接入高德地图实现POI搜索
  • nginx关于配置SSL后启动失败原因分析
  • 【自学嵌入式(9)ESP8266网络服务器的使用】
  • 危化品经营单位安全管理人员的职责及注意事项
  • 项目实战--网页五子棋(匹配模块)(5)
  • mysql 迁移到人大金仓数据库
  • uniapp 网络请求封装(uni.request 与 uView-Plus)
  • 计算机网络与通讯知识总结
  • DPVS-2:单臂负载均衡测试
  • open webui 部署 以及解决,首屏加载缓慢,nginx反向代理访问404,WebSocket后端服务器链接失败等问题
  • 交通物联网:概念、历史、现状与展望
  • 如何实现应用程序与中间件的类进行隔离
  • MySQL 数据库基础
  • 微服务即时通信系统---(三)框架学习
  • 解决Spring Data JPA set值后自动更新到数据库问题
  • 心理咨询小程序的未来发展
  • STM32-智能台灯项目
  • c# —— StringBuilder 类
  • Linux 核心架构与组件(2025更新中)
  • Unity打包APK报错 using a newer Android Gradle plugin to use compileSdk = 35
  • 陀螺匠·企业助手v1.8 产品介绍