当前位置: 首页 > news >正文

昇腾310内存拷贝测试

目的

从服务器将数据拷贝到昇腾310 ai卡

结论

数据进入到服务器后的内存不能直接用昇腾acl接口拷贝到AI卡。

需要

1)先用acl接口申请内存;

2) 将数据拷贝到acl申请的内存

3)用acl接口将数据拷贝到AI卡

makefile

# Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.# CMake lowest version requirement
cmake_minimum_required(VERSION 3.5.1)# project information
project(mem310p)set(LIB_PATH $ENV{NPU_HOST_LIB})
# Dynamic libraries in the stub directory can only be used for compilation
if (NOT DEFINED ENV{NPU_HOST_LIB})#set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/CANN-6.4/runtime/lib64/stub/aarch64")set(LIB_PATH "/usr/local/Ascend/nnrt/latest/runtime/lib64/stub/")message(STATUS "set default LIB_PATH: ${LIB_PATH}")
else ()message(STATUS "env LIB_PATH: ${LIB_PATH}")
endif()set(INC_PATH $ENV{NPU_HOST_INC})
# Dynamic libraries in the stub directory can only be used for compilation
if (NOT DEFINED ENV{NPU_HOST_INC})#set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest/CANN-6.4/runtime/include")# set(INC_PATH "/usr/local/Ascend/nnrt/6.2.RC2/aarch64-linux/include")#  set(INC_PATH "/usr/local/Ascend/nnrt/7.0.0/aarch64-linux/include")set(INC_PATH "/usr/local/Ascend/nnrt/latest/aarch64-linux/include")message(STATUS "set default INC_PATH: ${INC_PATH}")
else ()message(STATUS "env INC_PATH: ${INC_PATH}")
endif()link_directories(${LIB_PATH}${LIB_PATH}/stub
)add_executable(mem310p#add_library(zh_sensor SHARED mem310p.c
)include_directories(mem310p${INC_PATH}${INC_PATH}/acl/media)add_compile_options(mem310p-O2-Wall-fpic
)# sns_zhtarget_link_libraries(mem310pacl_dvpp_mpi   ascendclpthread
)

测试代码


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdint.h>
#include "acl/acl.h"
#define SAMPLE_PRT(fmt...)   \do { \printf("[%s]-%d: ", __FUNCTION__, __LINE__); \printf(fmt); \} while (0)int SUCCESS = 0;
int FAILED = 1;
aclrtContext g_context      = NULL;
uint32_t g_device_id        = 0;typedef uint64_t u64;
typedef uint32_t u32;int32_t setup_acl_device()
{aclError aclRet = aclInit(NULL);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclInit fail with %d.\n", aclRet);return aclRet;}SAMPLE_PRT("aclInit succ.\n");aclRet = aclrtSetDevice(g_device_id);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclrtSetDevice %u fail with %d.\n", g_device_id, aclRet);aclFinalize();return aclRet;}SAMPLE_PRT("aclrtSetDevice(%u) succ.\n", g_device_id);aclRet = aclrtCreateContext(&g_context, g_device_id);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("acl create context failed with %d.\n", aclRet);aclrtResetDevice(g_device_id);aclFinalize();return aclRet;}SAMPLE_PRT("create context success\n");aclRet = aclrtGetCurrentContext(&g_context);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("get current context failed\n");aclrtDestroyContext(g_context);g_context = NULL;aclrtResetDevice(g_device_id);aclFinalize();return aclRet;}SAMPLE_PRT("get current context success\n");return SUCCESS;
}void destroy_acl_device()
{if (g_context){aclrtDestroyContext(g_context);g_context = NULL;aclrtResetDevice(g_device_id);aclFinalize();}
}struct dev_addr
{u64  phy_addr;void  *virt_addr;u32  reg_len;void *mapbase;
};static void *ioremap_system_mem(struct dev_addr *addr)
{void *map_base;int fd;off_t target;target = addr->phy_addr;// if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1)if ((fd = open("/dev/mem", O_RDWR | O_NONBLOCK)) == -1){fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \__LINE__, __FILE__, errno, strerror(errno));return NULL ;}fflush(stdout);map_base = mmap(0, addr->reg_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd,target);if (map_base == (void *) -1){fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \__LINE__, __FILE__, errno, strerror(errno));return NULL ;}fflush(stdout);addr->mapbase = map_base;addr->virt_addr = map_base;//(map_base + (target & MAP_MASK));close(fd);return addr->virt_addr;
}//#ifdef  __ARM__
#if 1
void neon_memcpy(volatile void *dst, volatile void *src, int sz)
{if (sz & 63){sz = (sz & -64) + 64;}asm volatile("NEONCopyPLD: \n""sub %[dst], %[dst], #64 \n""1: \n""ldnp q0, q1, [%[src]] \n""ldnp q2, q3, [%[src], #32] \n""add %[dst], %[dst], #64 \n""subs %[sz], %[sz], #64 \n""add %[src], %[src], #64 \n""stnp q0, q1, [%[dst]] \n""stnp q2, q3, [%[dst], #32] \n""b.gt 1b \n": [dst]"+r"(dst), [src]"+r"(src), [sz]"+r"(sz) : : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
}#else#include <arm_neon.h>void *neon_memcpy(void *dest, void *src, size_t count)
{int i;unsigned long *s = (unsigned long *)src;unsigned long *d = (unsigned long *)dest;for (i = 0; i < count / 64; i++){vst1q_u64(&d[0], vld1q_u64(&s[0]));vst1q_u64(&d[2], vld1q_u64(&s[2]));vst1q_u64(&d[4], vld1q_u64(&s[4]));vst1q_u64(&d[6], vld1q_u64(&s[6]));d += 8;s += 8;}return dest;
}#endif
//#endifstruct timeval start_time;
struct timeval end_time;
double elapsed_time = 0.0;static double elapsed(struct timeval start_time, struct timeval end_time)
{elapsed_time = (end_time.tv_sec * 1000000 + end_time.tv_usec - start_time.tv_sec* 1000000 - start_time.tv_usec);return elapsed_time;
}int32_t main(int32_t argc, char *argv[])
{int ret = SUCCESS;aclError aclRet = ACL_SUCCESS;// acl资源初始化ret = setup_acl_device();if (ret != SUCCESS){SAMPLE_PRT("Setup Device failed! ret code:%#x\n", ret);return FAILED;}struct dev_addr atu_addr;//atu_addr.phy_addr = 0x2000000000;atu_addr.phy_addr = 0x2200000000;  //0x22 0000 0000atu_addr.virt_addr = 0;atu_addr.reg_len = 0x2000000;size_t g_memory_size        = 4752 * 3795;if (NULL == ioremap_system_mem(&atu_addr)){printf("ioremap fail\n");return -1;}void *host_buffer = NULL;void *host_buffer1 = NULL;void *device_buffer = NULL;void *my_host_buffer = NULL;my_host_buffer = malloc(g_memory_size);if (NULL == my_host_buffer){printf("malloc fail\n");return 0;}memset((unsigned char *)my_host_buffer, 0xaa, g_memory_size);memset((unsigned char *)atu_addr.virt_addr, 0x55, g_memory_size);// 申请device内存aclRet = aclrtMalloc(&device_buffer, g_memory_size, ACL_MEM_MALLOC_HUGE_FIRST);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclrtMalloc failed\n");}// 申请host内存aclRet = aclrtMallocHost(&host_buffer, g_memory_size);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclrtMallocHost failed\n");}// 申请host内存aclRet = aclrtMallocHost(&host_buffer1, g_memory_size);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclrtMallocHost failed\n");}gettimeofday(&(start_time), NULL);memcpy(host_buffer1, (unsigned char *)atu_addr.virt_addr, g_memory_size);gettimeofday(&(end_time), NULL);elapsed_time = elapsed(start_time, end_time);// 内存拷贝H2DaclRet = aclrtMemcpy(device_buffer, g_memory_size,host_buffer1 /* atu_addr.virt_addr*/, g_memory_size, ACL_MEMCPY_HOST_TO_DEVICE);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclrtMemcpy H2D failed\n");}printf("....,dma time %f\n", elapsed_time);// 内存拷贝D2HaclRet = aclrtMemcpy(host_buffer, g_memory_size, device_buffer,g_memory_size, ACL_MEMCPY_DEVICE_TO_HOST);if (aclRet != ACL_SUCCESS){SAMPLE_PRT("aclrtMemcpy D2H failed\n");}SAMPLE_PRT("\data  is %x\n",  *(unsigned char *)host_buffer);if (munmap(atu_addr.mapbase, atu_addr.reg_len) == -1){printf("un ioremap fail\n");return -1;}// 释放al资源destroy_acl_device();SAMPLE_PRT("run success!\n");return SUCCESS;
}

上述测试代码,测试从mmap映射的内存到acl分配的内存,耗时70几个ms,这是不正确的。

耗时分析

待续。。。。

http://www.lryc.cn/news/435530.html

相关文章:

  • ‘$store‘ is not defined.
  • 如何利用Linux提升工作效率和安全性?
  • 初始Linux 和 各种常见指令
  • 【稀疏矩阵】使用torch.sparse模块
  • 如何增加谷歌网站曝光率?
  • 虚幻中的c++(持续更新)
  • 83-MySQL 索引有几种
  • 文献解读-The trans-omics landscape of COVID-19
  • Unity核心实践小项目
  • Avaloia 实现国产麒麟系统中文显示界面
  • pytest 生成allure测试报告
  • 查询GPU版本以及PyTorch中使用单GPU和多GPU
  • 基于SpringBoot+Vue的线上考试系统
  • 动手学深度学习(pytorch土堆)-02TensorBoard的使用
  • STM3学习记录
  • 【网络】应用层协议-http协议
  • 【python】OpenCV—Mask RCNN for Object Detection and Instance Segmentation
  • 通过 Python 使用 Pexels图片库 API 打造个性化壁纸应用
  • 多线程篇(其它容器- CopyOnWriteArrayList)(持续更新迭代)
  • OPENAIGC开发者大赛高校组金奖 | 知洞—基于大模型的智慧题库
  • java服务CPU使用率高排查
  • 聚焦:clicOH 借助 NVIDIA cuOpt 实现最后一英里交付速度 20 倍提升
  • 从头开始嵌入式第三十八天(数据结构 双向链表)
  • chapter14-集合——(List-HashSet)——day18
  • 企业会议室预约管理系统
  • 安全API
  • 【论文阅读】视觉分割新SOTA: Segment Anything(SAM)
  • redis之list核心命令演示与细节探索
  • [数据集][目标检测]智慧农业草莓叶子病虫害检测数据集VOC+YOLO格式4040张9类别
  • Lua 与 C#交互