当前位置: 首页 > news >正文

将 magma example 改写成 cusolver example eqrf

1,简单安装Magma

1.1 下载编译 OpenBLAS

$ git clone https://github.com/OpenMathLib/OpenBLAS.git
$ cd OpenBLAS/
$ make -j DEBUG=1
$ make install PREFIX=/home/hipper/ex_magma/local_d/OpenBLAS/

1.2 下载编译 magma

$ git clone https://bitbucket.org/icl/magma.git
$ cd magma/
$ cp make.inc-examples/make.inc.openblas ./make.inc
$ vim make.inc
// # edit openblasdir to abouve
// # -O2 -> -g
$ make -j

vim make.inc

2. 改写 testing_xxxqr_gpu.cpp

testing/testing_sgeqrf_gpu.cpp

运行效果:

原始代码: 

/*-- MAGMA (version 2.0) --Univ. of Tennessee, KnoxvilleUniv. of California, BerkeleyUniv. of Colorado, Denver@date@generated from testing/testing_zgeqrf_gpu.cpp, normal z -> s, Mon Jul 29 01:23:15 2024
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>// includes, project
#include "flops.h"
#include "magma_v2.h"
#include "magma_lapack.h"
#include "testings.h"/* -- Testing sgeqrf
*/
int main( int argc, char** argv)
{TESTING_CHECK( magma_init() );magma_print_environment();const float             d_neg_one = MAGMA_D_NEG_ONE;const float             d_one     = MAGMA_D_ONE;const float c_neg_one = MAGMA_S_NEG_ONE;const float c_one     = MAGMA_S_ONE;const float c_zero    = MAGMA_S_ZERO;const magma_int_t        ione      = 1;real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;float           Anorm, error=0, error2=0;float *h_A, *h_R, *tau, *h_work, tmp[1], unused[1];magmaFloat_ptr d_A, dT;magma_int_t M, N, n2, lda, ldda, lwork, info, min_mn, nb, size;magma_int_t ISEED[4] = {0,0,0,1};magma_opts opts;opts.parse_opts( argc, argv );int status = 0;float tol = opts.tolerance * lapackf77_slamch("E");// for expert API testingmagma_device_t cdev;magma_queue_t queues[2];magma_getdevice( &cdev );magma_queue_create( cdev, &queues[0] );magma_queue_create( cdev, &queues[1] );// version 3 can do either checkif (opts.check == 1 && ( opts.version == 1 || opts.version == 4 ) ) {opts.check = 2;printf( "%% versions 1 and 4 requires check 2 (solve A*x=b)\n" );}if (opts.check == 2 && opts.version == 2) {opts.check = 1;printf( "%% version 2 requires check 1 (R - Q^H*A)\n" );}printf( "%% version %lld\n", (long long) opts.version );if ( opts.check == 1 ) {printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)   |R - Q^H*A|   |I - Q^H*Q|\n");printf("%%==============================================================================\n");}else {printf("%%   M     N   CPU Gflop/s (sec)   GPU Gflop/s (sec)    |b - A*x|\n");printf("%%===============================================================\n");}for( int itest = 0; itest < opts.ntest; ++itest ) {for( int iter = 0; iter < opts.niter; ++iter ) {M = opts.msize[itest];N = opts.nsize[itest];min_mn = min( M, N );lda    = M;n2     = lda*N;ldda   = magma_roundup( M, opts.align );  // multiple of 32 by defaultnb     = magma_get_sgeqrf_nb( M, N );gflops = FLOPS_SGEQRF( M, N ) / 1e9;// query for workspace sizelwork = -1;lapackf77_sgeqrf( &M, &N, unused, &M, unused, tmp, &lwork, &info );lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );TESTING_CHECK( magma_smalloc_cpu( &tau,    min_mn ));TESTING_CHECK( magma_smalloc_cpu( &h_A,    n2     ));TESTING_CHECK( magma_smalloc_cpu( &h_work, lwork  ));TESTING_CHECK( magma_smalloc_pinned( &h_R,    n2     ));TESTING_CHECK( magma_smalloc( &d_A,    ldda*N ));if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {size = (2*min(M, N) + magma_roundup( N, 32 ) )*nb;TESTING_CHECK( magma_smalloc( &dT, size ));magmablas_slaset( MagmaFull, size, 1, c_zero, c_zero, dT, size, opts.queue );}/* Initialize the matrix */magma_generate_matrix( opts, M, N, h_A, lda );lapackf77_slacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda );magma_ssetmatrix( M, N, h_R, lda, d_A, ldda, opts.queue );/* ====================================================================Performs operation using MAGMA=================================================================== */if ( opts.version == 1 ) {// stores dT, V blocks have zeros, R blocks inverted & stored in dTgpu_time = magma_wtime();magma_sgeqrf_gpu( M, N, d_A, ldda, tau, dT, &info );gpu_time = magma_wtime() - gpu_time;}else if ( opts.version == 2 ) {// LAPACK complaint argumentsgpu_time = magma_wtime();magma_sgeqrf2_gpu( M, N, d_A, ldda, tau, &info );gpu_time = magma_wtime() - gpu_time;}#if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)else if ( opts.version == 3 ) {// stores dT, V blocks have zeros, R blocks stored in dTgpu_time = magma_wtime();magma_sgeqrf3_gpu( M, N, d_A, ldda, tau, dT, &info );gpu_time = magma_wtime() - gpu_time;}#endifelse if (opts.version == 4) {// expert API for magma_sgeqrf_gpumagma_mode_t mode = MagmaHybrid;// query workspacevoid *host_work = NULL, *device_work=NULL;magma_int_t lhwork[1] = {-1}, ldwork[1] = {-1};magma_sgeqrf_expert_gpu_work(M, N, NULL, ldda,NULL, NULL, &info,mode, nb,NULL, lhwork,NULL, ldwork, queues );// alloc workspaceif( lhwork[0] > 0 ) {magma_malloc_pinned( (void**)&host_work, lhwork[0] );}if( ldwork[0] > 0 ) {magma_malloc( (void**)&device_work, ldwork[0] );}// time actual call onlygpu_time = magma_wtime();magma_sgeqrf_expert_gpu_work(M, N, d_A, ldda, tau, dT, &info,mode, nb,host_work, lhwork, device_work, ldwork, queues );magma_queue_sync( queues[0] );magma_queue_sync( queues[1] );gpu_time = magma_wtime() - gpu_time;// free workspaceif( host_work != NULL) {magma_free_pinned( host_work );}if( device_work != NULL ) {magma_free( device_work );}}else {printf( "Unknown version %lld\n", (long long) opts.version );return -1;}gpu_perf = gflops / gpu_time;if (info != 0) {printf("magma_sgeqrf returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}if ( opts.check == 1 && (opts.version == 2 || opts.version == 3) ) {if ( opts.version == 3 ) {// copy diagonal blocks of R back to Afor( int i=0; i < min_mn-nb; i += nb ) {magma_int_t ib = min( min_mn-i, nb );magmablas_slacpy( MagmaUpper, ib, ib, &dT[min_mn*nb + i*nb], nb, &d_A[ i + i*ldda ], ldda, opts.queue );}}/* =====================================================================Check the result, following zqrt01 except using the reduced Q.This works for any M,N (square, tall, wide).Only for version 2, which has LAPACK complaint output.Or   for version 3, after restoring diagonal blocks of A above.=================================================================== */magma_sgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue );magma_int_t ldq = M;magma_int_t ldr = min_mn;float *Q, *R;float *work;TESTING_CHECK( magma_smalloc_cpu( &Q,    ldq*min_mn ));  // M by KTESTING_CHECK( magma_smalloc_cpu( &R,    ldr*N ));       // K by NTESTING_CHECK( magma_smalloc_cpu( &work, min_mn ));// generate M by K matrix Q, where K = min(M,N)lapackf77_slacpy( "Lower", &M, &min_mn, h_R, &lda, Q, &ldq );lapackf77_sorgqr( &M, &min_mn, &min_mn, Q, &ldq, tau, h_work, &lwork, &info );assert( info == 0 );// copy K by N matrix Rlapackf77_slaset( "Lower", &min_mn, &N, &c_zero, &c_zero, R, &ldr );lapackf77_slacpy( "Upper", &min_mn, &N, h_R, &lda,        R, &ldr );// error = || R - Q^H*A || / (N * ||A||)blasf77_sgemm( "Conj", "NoTrans", &min_mn, &N, &M,&c_neg_one, Q, &ldq, h_A, &lda, &c_one, R, &ldr );Anorm = lapackf77_slange( "1", &M,      &N, h_A, &lda, work );error = lapackf77_slange( "1", &min_mn, &N, R,   &ldr, work );if ( N > 0 && Anorm > 0 )error /= (N*Anorm);// set R = I (K by K identity), then R = I - Q^H*Q// error = || I - Q^H*Q || / Nlapackf77_slaset( "Upper", &min_mn, &min_mn, &c_zero, &c_one, R, &ldr );blasf77_ssyrk( "Upper", "Conj", &min_mn, &M, &d_neg_one, Q, &ldq, &d_one, R, &ldr );error2 = safe_lapackf77_slansy( "1", "Upper", &min_mn, R, &ldr, work );if ( N > 0 )error2 /= N;magma_free_cpu( Q    );  Q    = NULL;magma_free_cpu( R    );  R    = NULL;magma_free_cpu( work );  work = NULL;}else if ( opts.check == 2 && M >= N && (opts.version == 1 || opts.version == 3 || opts.version == 4) ) {/* =====================================================================Check the result by solving consistent linear system, A*x = b.Only for versions 1 & 3 with M >= N.=================================================================== */magma_int_t lwork2;float *x, *b, *hwork;magmaFloat_ptr d_B;// initialize RHS, b = A*randomTESTING_CHECK( magma_smalloc_cpu( &x, N ));TESTING_CHECK( magma_smalloc_cpu( &b, M ));lapackf77_slarnv( &ione, ISEED, &N, x );blasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_zero, b, &ione );// copy to GPUTESTING_CHECK( magma_smalloc( &d_B, M ));magma_ssetvector( M, b, 1, d_B, 1, opts.queue );if ( opts.version == 1 || opts.version == 4) {// allocate hworkmagma_sgeqrs_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, tmp, -1, &info );lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));// solve linear systemmagma_sgeqrs_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, hwork, lwork2, &info );if (info != 0) {printf("magma_sgeqrs returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}magma_free_cpu( hwork );}#if defined(MAGMA_HAVE_CUDA) || defined(MAGMA_HAVE_HIP)else if ( opts.version == 3 ) {// allocate hworkmagma_sgeqrs3_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, tmp, -1, &info );lwork2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );TESTING_CHECK( magma_smalloc_cpu( &hwork, lwork2 ));// solve linear systemmagma_sgeqrs3_gpu( M, N, 1,d_A, ldda, tau, dT,d_B, M, hwork, lwork2, &info );if (info != 0) {printf("magma_sgeqrs3 returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}magma_free_cpu( hwork );}#endifelse {printf( "Unknown version %lld\n", (long long) opts.version );return -1;}magma_sgetvector( N, d_B, 1, x, 1, opts.queue );// compute r = Ax - b, saved in bblasf77_sgemv( "Notrans", &M, &N, &c_one, h_A, &lda, x, &ione, &c_neg_one, b, &ione );// compute residual |Ax - b| / (max(m,n)*|A|*|x|)float norm_x, norm_A, norm_r, work[1];norm_A = lapackf77_slange( "F", &M, &N, h_A, &lda, work );norm_r = lapackf77_slange( "F", &M, &ione, b, &M, work );norm_x = lapackf77_slange( "F", &N, &ione, x, &N, work );magma_free_cpu( x );magma_free_cpu( b );magma_free( d_B );error = norm_r / (max(M,N) * norm_A * norm_x);}/* =====================================================================Performs operation using LAPACK=================================================================== */if ( opts.lapack ) {cpu_time = magma_wtime();lapackf77_sgeqrf( &M, &N, h_A, &lda, tau, h_work, &lwork, &info );cpu_time = magma_wtime() - cpu_time;cpu_perf = gflops / cpu_time;if (info != 0) {printf("lapackf77_sgeqrf returned error %lld: %s.\n",(long long) info, magma_strerror( info ));}}/* =====================================================================Print performance and error.=================================================================== */printf("%5lld %5lld   ", (long long) M, (long long) N );if ( opts.lapack ) {printf( "%7.2f (%7.2f)", cpu_perf, cpu_time );}else {printf("  ---   (  ---  )" );}printf( "   %7.2f (%7.2f)   ", gpu_perf, gpu_time );if ( opts.check == 1 ) {bool okay = (error < tol && error2 < tol);status += ! okay;printf( "%11.2e   %11.2e   %s\n", error, error2, (okay ? "ok" : "failed") );}else if ( opts.check == 2 ) {if ( M >= N ) {bool okay = (error < tol);status += ! okay;printf( "%10.2e   %s\n", error, (okay ? "ok" : "failed") );}else {printf( "(error check only for M >= N)\n" );}}else {printf( "    ---\n" );}magma_free_cpu( tau    );magma_free_cpu( h_A    );magma_free_cpu( h_work );magma_free_pinned( h_R );magma_free( d_A );if ( opts.version == 1 || opts.version == 3 || opts.version == 4 ) {magma_free( dT );}fflush( stdout );}if ( opts.niter > 1 ) {printf( "\n" );}}magma_queue_destroy( queues[0] );magma_queue_destroy( queues[1] );opts.cleanup();TESTING_CHECK( magma_finalize() );return status;
}

流程分析:

改写为:

testing_cusolver_sgeqrf_gpu.cpp

#include <>

待补。。。

http://www.lryc.cn/news/409818.html

相关文章:

  • 微信小程序教程007:数据绑定
  • Git -- git stash 暂存
  • 基于YOLO的植物病害识别系统:从训练到部署全攻略
  • 数据库开发:MySQL基础(二)
  • 实现物理数据库迁移到云上
  • [Spring] MyBatis操作数据库(进阶)
  • 【Websim.ai】一句话让AI帮你生成一个网页
  • 云计算实训16——关于web,http协议,https协议,apache,nginx的学习与认知
  • 2024年必备技能:小红书笔记评论自动采集,零基础也能学会的方法
  • 【Gitlab】SSH配置和克隆仓库
  • [Day 35] 區塊鏈與人工智能的聯動應用:理論、技術與實踐
  • Vue 3 中使用 inMap.js 实现蜂窝热力图的可视化
  • nginx隐藏server及版本号
  • Oracle DBMS_XPLAN包
  • 【ffmpeg命令入门】分离音视频流
  • 小红书笔记评论采集全攻略:三种高效方法教你批量导出
  • 实战:ZooKeeper 操作命令和集群部署
  • linux运维一天一个shell命令之 top详解
  • 大模型微调:参数高效微调(PEFT)方法总结
  • Spark+实例解读
  • WPF多语言国际化,中英文切换
  • Halcon深度学习分类模型
  • 洗地机哪种牌子好?洗地机排行榜前十名公布
  • C++中的虚函数与多态机制如何工作?
  • 《LeetCode热题100》---<哈希三道>
  • 秒懂C++之string类(下)
  • github简单地操作
  • 模型改进-损失函数合集
  • C++模板(初阶)
  • 下面关于Date类的描述错误的一项是?