【cuda学习日记】4.3 结构体数组与数组结构体
4.3 结构体数组(AoS)和数组结构体(SoA)
AoS方法进行存储
struct innerStruct{float x;float y;
};struct innerStruct myAOS[N];
SoA方法来存储数据
struct innerArray{float x[N];float y[N];
};struct innerArray moa;
如图说明了AoS和SoA方法的内存布局,用AoS模式在GPU上存储示例数据并执行一个只有x字段的应用程序,将导致50%的带宽损失
4.3.1 简单示例AoS
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
#include <math.h>  // fabs: plain `abs` from <stdlib.h> is the int overload and truncates float diffs to 0

// Element count. Parenthesized so the macro expands safely inside larger
// expressions (an unparenthesized `1 << 20` breaks e.g. `2 * LEN`).
#define LEN (1 << 20)

// Array-of-Structures element: x and y are interleaved in memory, so a
// warp touching only one field strides through global memory and wastes
// half of every loaded cache line.
struct innerStruct
{
    float x;
    float y;
};

// Structure-of-Arrays counterpart (declared for comparison; unused here).
struct innerArray
{
    float x[LEN];
    float y[LEN];
};

// AoS kernel: one thread per element. Loads the whole struct, bumps both
// fields, stores it back. Bounds-checked for the grid tail.
__global__ void testInnerStruct(innerStruct *data, innerStruct *result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

// Same body as testInnerStruct; launched once beforehand so one-time GPU
// startup cost does not pollute the timed run.
__global__ void warmup(innerStruct *data, innerStruct *result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

// CPU reference implementation used to validate the GPU output.
void testInnerStructHost(innerStruct *data, innerStruct *result, const int n)
{
    for (int i = 0; i < n; i++)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

// Fill both fields with small pseudo-random values in [0, 2.55].
void initialInnerStruct(innerStruct *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i].x = (float)(rand() & 0xFF) / 100.0f;
        ip[i].y = (float)(rand() & 0xFF) / 100.0f;
    }
    return;
}

// Compare host and GPU results field by field; report the first mismatch.
// NOTE: fabs, not int abs — the integer overload truncates sub-1.0
// differences to zero and would make this check pass unconditionally.
void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i].x - gpuRef[i].x) > epsilon)
        {
            match = false;
            printf("different on %dth element: host %f gpu %f\n", i, hostRef[i].x, gpuRef[i].x);
            break;
        }
        if (fabs(hostRef[i].y - gpuRef[i].y) > epsilon)
        {
            match = false;
            printf("different on %dth element: host %f gpu %f\n", i, hostRef[i].y, gpuRef[i].y);
            break;
        }
    }
    if (!match) printf("Arrays do not match.\n\n");
}

// Driver: allocate, initialize, compute CPU reference, time warmup + AoS
// kernel, copy back, validate, clean up. Optional argv[1] overrides the
// block size (default 128).
int main(int argc, char **argv)
{
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef = (innerStruct *)malloc(nBytes);

    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((void **)&d_A, nBytes));
    CHECK(cudaMalloc((void **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);  // ceil-div so the tail is covered

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());         // catch launch-configuration errors
    CHECK(cudaDeviceSynchronize());    // kernel is async; sync before stopping the timer
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    timer.start();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerStruct <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return 0;
}
用NCU查看加载内存效率,只有50%:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 50
执行时间:
testInnerStruct <<<8192, 128>>> elapsed 0.036864 ms
4.3.2 简单示例SoA
#include <cuda_runtime.h>
#include <stdio.h>
#include "../common/common.h"
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
#include <math.h>  // fabs: plain `abs` from <stdlib.h> is the int overload and truncates float diffs to 0

// Element count. Parenthesized so the macro expands safely inside larger
// expressions (an unparenthesized `1 << 20` breaks e.g. `2 * LEN`).
#define LEN (1 << 20)

// Array-of-Structures element (declared for comparison; unused here).
struct innerStruct
{
    float x;
    float y;
};

// Structure-of-Arrays layout: all x values are contiguous, then all y
// values, so a warp reading x[i]..x[i+31] produces fully coalesced loads.
struct innerArray
{
    float x[LEN];
    float y[LEN];
};

// SoA kernel: one thread per element. Adjacent threads read adjacent
// x (and y) elements, so every loaded byte is used. Bounds-checked.
__global__ void testInnerArray(innerArray *data, innerArray *result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];
        tmpx += 10.0f;
        tmpy += 20.0f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

// Same body as testInnerArray; launched once beforehand so one-time GPU
// startup cost does not pollute the timed run.
__global__ void warmup(innerArray *data, innerArray *result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];
        tmpx += 10.0f;
        tmpy += 20.0f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

// functions for inner array outer struct

// Fill both arrays with small pseudo-random values in [0, 2.55].
void initialInnerArray(innerArray *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip->x[i] = (float)(rand() & 0xFF) / 100.0f;
        ip->y[i] = (float)(rand() & 0xFF) / 100.0f;
    }
    return;
}

// CPU reference implementation used to validate the GPU output.
void testInnerArrayHost(innerArray *A, innerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C->x[idx] = A->x[idx] + 10.f;
        C->y[idx] = A->y[idx] + 20.f;
    }
    return;
}

// Compare host and GPU results element by element; report the first mismatch.
// NOTE: fabs, not int abs — the integer overload truncates sub-1.0
// differences to zero and would make this check pass unconditionally.
void checkInnerArray(innerArray *hostRef, innerArray *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;
    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef->x[i] - gpuRef->x[i]) > epsilon)
        {
            match = false;
            printf("different on x %dth element: host %f gpu %f\n", i, hostRef->x[i], gpuRef->x[i]);
            break;
        }
        if (fabs(hostRef->y[i] - gpuRef->y[i]) > epsilon)
        {
            match = false;
            printf("different on y %dth element: host %f gpu %f\n", i, hostRef->y[i], gpuRef->y[i]);
            break;
        }
    }
    if (!match) printf("Arrays do not match.\n\n");
}

// Driver: allocate, initialize, compute CPU reference, time warmup + SoA
// kernel, copy back, validate, clean up. Optional argv[1] overrides the
// block size (default 128).
int main(int argc, char **argv)
{
    int dev = 0;
    CHECK(cudaSetDevice(dev));
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("device %d: %s \n", dev, deviceprop.name);

    int nElem = LEN;
    size_t nBytes = sizeof(innerArray);  // one struct already holds all LEN x's and y's
    innerArray *h_A = (innerArray *)malloc(nBytes);
    innerArray *hostRef = (innerArray *)malloc(nBytes);
    innerArray *gpuRef = (innerArray *)malloc(nBytes);

    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    innerArray *d_A, *d_C;
    CHECK(cudaMalloc((void **)&d_A, nBytes));
    CHECK(cudaMalloc((void **)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    int blocksize = 128;
    if (argc > 1) blocksize = atoi(argv[1]);
    dim3 block(blocksize, 1);
    dim3 grid((nElem + block.x - 1) / block.x, 1);  // ceil-div so the tail is covered

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());         // catch launch-configuration errors
    CHECK(cudaDeviceSynchronize());    // kernel is async; sync before stopping the timer
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    timer.start();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    timer.stop();
    elapsedTime = timer.elapsedms();
    printf("testInnerArray <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x, elapsedTime);

    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);
    CHECK(cudaDeviceReset());
    return 0;
}
NCU查看内存加载效率:
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct [%] 100
优化设备内存带宽利用率有两个目标:
·对齐及合并内存访问,以减少带宽的浪费
·足够的并发内存操作,以隐藏内存延迟