当前位置: 首页 > news >正文

debug mccl 02 —— 环境搭建及初步调试

1, 搭建nccl 调试环境

下载 nccl 源代码

 git clone --recursive https://github.com/NVIDIA/nccl.git


只debug host代码,故将设备代码的编译标志改成 -O3

 

(base) hipper@hipper-G21:~/let_debug_nccl/nccl$ git diff 
diff --git a/makefiles/common.mk b/makefiles/common.mk
index a037cf3..ee2aa8e 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -82,7 +82,8 @@ ifeq ($(DEBUG), 0)NVCUFLAGS += -O3CXXFLAGS  += -O3 -gelse
-NVCUFLAGS += -O0 -G -g
+#NVCUFLAGS += -O0 -G -g
+NVCUFLAGS += -O3CXXFLAGS  += -O0 -g -ggdb3endif

修改后变成如下:

nccl$ vim makefiles/common.mk

ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS  += -O3 -g
else
#NVCUFLAGS += -O0 -G -g
NVCUFLAGS += -O3
CXXFLAGS  += -O0 -g -ggdb3
endif

构建 nccl shared library:

机器上是几张sm_85 的卡,故:

$ cd nccl
$ make -j src.build  DEBUG=1       NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"

到此即可,不需要安装nccl,直接过来使用即可;

2, 创建调试APP


在nccl所在的目录中创建app文件夹:

$ mkdir app$ cd app$ vim sp_md_nccl.cpp$ vim Makefile

sp_md_nccl.cpp:

#include <stdlib.h>
#include <stdio.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include <time.h>
#include <sys/time.h>#define CUDACHECK(cmd) do {                         \cudaError_t err = cmd;                            \if (err != cudaSuccess) {                         \printf("Failed: Cuda error %s:%d '%s'\n",       \__FILE__,__LINE__,cudaGetErrorString(err)); \exit(EXIT_FAILURE);                             \}                                                 \
} while(0)#define NCCLCHECK(cmd) do {                         \ncclResult_t res = cmd;                           \if (res != ncclSuccess) {                         \printf("Failed, NCCL error %s:%d '%s'\n",       \__FILE__,__LINE__,ncclGetErrorString(res)); \exit(EXIT_FAILURE);                             \}                                                 \
} while(0)void  get_seed(long long &seed)
{struct timeval tv;gettimeofday(&tv, NULL);seed = (long long)tv.tv_sec * 1000*1000 + tv.tv_usec;//only second and usecond;printf("useconds:%lld\n", seed);
}void  init_vector(float* A, int n)
{long long seed = 0;get_seed(seed);srand(seed);for(int i=0; i<n; i++){A[i] = (rand()%100)/100.0f;}
}void print_vector(float* A, float size)
{for(int i=0; i<size; i++)printf("%.2f ", A[i]);printf("\n");
}void vector_add_vector(float* sum, float* A, int n)
{for(int i=0; i<n; i++){sum[i] += A[i];}
}int main(int argc, char* argv[])
{ncclComm_t comms[4];printf("ncclComm_t is a pointer type, sizeof(ncclComm_t)=%lu\n", sizeof(ncclComm_t));//managing 4 devices//int nDev = 4;int nDev = 2;//int size = 32*1024*1024;int size = 16*16;int devs[4] = { 0, 1, 2, 3 };float** sendbuff_host = (float**)malloc(nDev * sizeof(float*));float** recvbuff_host = (float**)malloc(nDev * sizeof(float*));for(int dev=0; dev<nDev; dev++){sendbuff_host[dev] = (float*)malloc(size*sizeof(float));recvbuff_host[dev] = (float*)malloc(size*sizeof(float));init_vector(sendbuff_host[dev], size);init_vector(recvbuff_host[dev], size);}//sigma(sendbuff_host[i]); i = 0, 1, ..., nDev-1float* result = (float*)malloc(size*sizeof(float));memset(result, 0, size*sizeof(float));for(int dev=0; dev<nDev; dev++){vector_add_vector(result, sendbuff_host[dev], size);printf("sendbuff_host[%d]=\n", dev);print_vector(sendbuff_host[dev], size);}printf("result=\n");print_vector(result, size);//allocating and initializing device buffersfloat** sendbuff = (float**)malloc(nDev * sizeof(float*));float** recvbuff = (float**)malloc(nDev * sizeof(float*));cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);for (int i = 0; i < nDev; ++i) {CUDACHECK(cudaSetDevice(i));CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));CUDACHECK(cudaMemcpy(sendbuff[i], sendbuff_host[i], size*sizeof(float), cudaMemcpyHostToDevice));CUDACHECK(cudaMemcpy(recvbuff[i], recvbuff_host[i], size*sizeof(float), cudaMemcpyHostToDevice));CUDACHECK(cudaStreamCreate(s+i));}//initializing NCCLNCCLCHECK(ncclCommInitAll(comms, nDev, devs));//calling NCCL communication API. Group API is required when using//multiple devices per threadNCCLCHECK(ncclGroupStart());printf("blocked ncclAllReduce will be calleded\n");fflush(stdout);for (int i = 0; i < nDev; ++i)NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum, comms[i], s[i]));printf("blocked ncclAllReduce is calleded nDev =%d\n", nDev);fflush(stdout);NCCLCHECK(ncclGroupEnd());//synchronizing on CUDA streams to wait for completion of NCCL operationfor (int i = 0; i < nDev; ++i) {CUDACHECK(cudaSetDevice(i));CUDACHECK(cudaStreamSynchronize(s[i]));}for (int i = 0; i < nDev; ++i) {CUDACHECK(cudaSetDevice(i));CUDACHECK(cudaMemcpy(recvbuff_host[i], recvbuff[i], size*sizeof(float), cudaMemcpyDeviceToHost));}for (int i = 0; i < nDev; ++i) {CUDACHECK(cudaSetDevice(i));CUDACHECK(cudaStreamSynchronize(s[i]));}for(int i=0; i<nDev; i++) {printf("recvbuff_dev2host[%d]=\n", i);print_vector(recvbuff_host[i], size);}//free device buffersfor (int i = 0; i < nDev; ++i) {CUDACHECK(cudaSetDevice(i));CUDACHECK(cudaFree(sendbuff[i]));CUDACHECK(cudaFree(recvbuff[i]));}//finalizing NCCLfor(int i = 0; i < nDev; ++i)ncclCommDestroy(comms[i]);printf("Success \n");return 0;
}

Makefile:


INC := -I /usr/local/cuda/include -I ../nccl/build/include
LD_FLAGS := -L ../nccl/build/lib -lnccl -L /usr/local/cuda/lib64 -lcudartEXE := singleProc_multiDev_ncclall: $(EXE)%: %.cppg++ -g -ggdb3  $<  -o  $@  $(INC)  $(LD_FLAGS).PHONY: clean
clean: -rm -rf $(EXE)

export LD_LIBRARY_PATH=../nccl/build/lib

3, 开始调试


$ cuda-gdb sp_md_nccl(cuda-gdb) start (cuda-gdb) rbreak doLauches(cuda-gdb) c(cuda-gdb) p ncclGroupCommHead->tasks.collQueue.head->op 

初步实现了可debug的效果:

现在想要搞清楚在程序调用 ncclAllReduce(..., ncclSum,  ... ) 后,是如何映射到 cudaLaunchKernel调用到了正确的 cuda kernel 函数的。

在doLaunches函数中,作如下debug动作:

查看 doLaunches(ncclComm*) 的函数参数,即,gropu.cc中的变量:ncclGroupCommHead的某个成员的成员的值:op

其结果如下:

(cuda-gdb) p ncclGroupCommHead                           
$5 = (ncclComm *) 0x5555563231e0
(cuda-gdb) p ncclGroupCommHead->tasks.collQueue.head->op 
$6 = {op = ncclDevSum, proxyOp = ncclSum, scalarArgIsPtr = false, scalarArg = 256}
(cuda-gdb) 

不过这依然只停留在了 ncclSum的这个枚举类型上,还没锁定对应的cudaKernel。

接下来继续努力 ...

http://www.lryc.cn/news/274858.html

相关文章:

  • ros python 接收GPS RTK 串口消息再转发 ros 主题消息
  • 2024年网络安全竞赛-页面信息发现任务解析
  • ARM DMA使用整理
  • 移动通信原理与关键技术学习(第四代蜂窝移动通信系统)
  • 光明源@智慧厕所技术:优化生活,提升卫生舒适度
  • 【Bootstrap学习 day13】
  • Shell:常用命令之dirname与basename
  • Linux-v4l2框架
  • VPC网络架构下的网络上数据采集
  • 模拟算法(模拟算法 == 依葫芦画瓢)万字
  • QtApplets-SystemInfo
  • vue3防抖函数封装与使用,以指令的形式使用
  • Hive学习(13)lag和lead函数取偏移量
  • Centos Unable to verify the graphical display setup
  • Java 说一下 synchronized 底层实现原理?
  • nginx访问路径匹配方法
  • 偌依 项目部署及上线步骤
  • PHP特性知识点扫盲 - 上篇
  • Docker一键极速安装Nacos,并配置数据库!
  • 交换机04_远程连接
  • ES6定义一个类(函数内部定义属性,,原型定义方法 ), 实现继承?
  • 使用 Process Explorer 和 Windbg 排查软件线程堵塞案例分享
  • “智慧”千里眼助力水泵站
  • C++多态性——(5)运算符重载(第二节)
  • ES -极客学习
  • 【大厂秘籍】系列 - Java多线程面试题
  • vue实现画笔回放,canvas转视频播放功能
  • Docker中镜像的相关操作
  • [python]python利用pyaudio录制系统声音没有立体声混音怎么录制系统音频
  • 使用echarts的bmap配置项绘制区域轮廓遮罩