北京并行科技GPU-CUDA Fortran计算平台配置

启用MPI OpenMP

module load nvhpc/nvhpc/21.7_cuda11.0_openmpi4.0.5_gcc7.5_qd


调用ifort 编译器

 module load intel/parallelstudio/2017.1.5



Step1 抢占一个GPU计算节点

[scv1172@ln01 xyqtest2022]$ salloc --gpus=1

salloc:Pending job allocation 308347

salloc:job 308347 queued and waiting for resources

salloc:job 308347 has been allocated resources

salloc:Granted job allocation 308347

salloc:Waiting for resource configuration

salloc:Nodes g0001 are ready for job  (找到的计算节点为g0001)

 

Step2 ssh命令加载找到的节点 g0001

[scv1172@ln01 xyqtest2022]$ ssh g0001

Warning: Permanently added 'g0001,192.168.11.1' (ECDSA) to the list of known hosts.

[scv1172@g0001 ~]$

以上结果表明加载g0001节点成功

 

Step3 查询NVHPC:用 module ava命令查找 是否安装nvhpc,以及版本

[scv1172@g0001 ~]$ module ava nvhpc

------------------- /data/apps/modulefiles -------------------

nvhpc/nvhpc/20.7_cuda10.1_openmpi3.1.5

nvhpc/nvhpc/20.7_cuda10.2_openmpi3.1.5

nvhpc/nvhpc/21.7_cuda11.0_openmpi4.0.5_gcc7.5_qd

nvhpc/nvhpc_byo_compiler/20.7_cuda10.1

nvhpc/nvhpc_byo_compiler/20.7_cuda10.2

nvhpc/nvhpc_byo_compiler/21.7_cuda11.0_gcc7.5

nvhpc/nvhpc_nompi/20.7_cuda10.1

nvhpc/nvhpc_nompi/20.7_cuda10.2

nvhpc/nvhpc_nompi/21.7_cuda11.0_gcc7.5_qd

 

Step 4 module load加载相应版本的nvhpc

[scv1172@g0001 ~]$ module load nvhpc/nvhpc_nompi/21.7_cuda11.0_gcc7.5_qd

 

Step 5 which命令检查nvhpc的加载是否成功

[scv1172@g0001 xyqtest2022]$ which nvfortran

/data/apps/nvhpc/nvhpc_21.7/Linux_x86_64/21.7/compilers/bin/nvfortran

[scv1172@g0001 xyqtest2022]$

以上结果表明相应版本的nvhpc加载成功

利用以上5步即可完成对CUDA-GPU计算节点的配置

~~~~~~测试代码(命名为test.cuf)~~~~~~~~~~~~

program deviceQuery
  ! Enumerates the CUDA-capable devices visible to this process and prints
  ! a short summary of each one (name, compute capability, multiprocessor
  ! count, memory size and launch-configuration limits).
  !
  ! Build with the NVIDIA HPC SDK compiler, e.g.  nvfortran test.cuf -o out
  use cudafor
  implicit none

  type (cudaDeviceProp) :: prop
  integer :: nDevices = 0, i, ierr

  ! Number of CUDA-capable devices
  ierr = cudaGetDeviceCount(nDevices)
  if (ierr /= cudaSuccess) then
    ! Report why the query failed, then fall through to the
    ! "no devices" branch so the program still stops cleanly.
    write(*,"('cudaGetDeviceCount failed: ',a)") trim(cudaGetErrorString(ierr))
    nDevices = 0
  end if

  if (nDevices == 0) then
    write(*,"(/,'No CUDA devices found ',/)")
    stop
  else if (nDevices == 1) then
    write(*,"(/,'One CUDA device found ',/)")
  else
    write(*,"(/,i0,' CUDA devices found ',/)") nDevices
  end if

  ! Loop over devices (CUDA device ordinals are zero-based)
  do i = 0, nDevices - 1
    write(*,"('Device Number: ',i0)") i
    ierr = cudaGetDeviceProperties(prop, i)
    if (ierr /= cudaSuccess) then
      ! Skip this device rather than printing an undefined prop structure.
      write(*,"(' cudaGetDeviceProperties failed: ',a,/)") &
        trim(cudaGetErrorString(ierr))
      cycle
    end if

    ! General device info
    write(*,"(' Device Name: ',a)") trim(prop%name)
    write(*,"(' Compute Capability: ',i0,'.',i0)") &
      prop%major, prop%minor
    write(*,"(' Number of Multiprocessors: ',i0)") &
      prop%multiProcessorCount
    write(*,"(' Max Threads per Multiprocessor: ',i0)") &
      prop%maxThreadsPerMultiprocessor
    ! totalGlobalMem is divided by 1024**3 to print the size in GB
    write(*,"(' Global Memory (GB): ',f9.3,/)") &
      prop%totalGlobalMem/1024.0**3

    ! Execution Configuration
    write(*,"(' Execution Configuration Limits ')")
    write(*,"(' Max Grid Dims: ',2(i0,' x '),i0)") &
      prop%maxGridSize
    write(*,"(' Max Block Dims: ',2(i0,' x '),i0)") &
      prop%maxThreadsDim
    write(*,"(' Max Threads per Block: ',i0,/)") &
      prop%maxThreadsPerBlock
  end do
end program deviceQuery

~~~~~~~~~~~~~编译运行~~~~~~~~~~~~~~~~~

 [scv1172@g0001 xyqtest2022]$ nvfortran *.cuf -o out

 [scv1172@g0001 xyqtest2022]$ ./out
One CUDA device found 
Device Number: 0
 Device Name: Tesla V100-SXM2-32GB
 Compute Capability: 7.0
 Number of Multiprocessors: 80
 Max Threads per Multiprocessor: 2048
 Global Memory (GB):    31.749
 Execution Configuration Limits 
 Max Grid Dims: 2147483647 x 65535 x 65535
 Max Block Dims: 1024 x 1024 x 64
 Max Threads per Block: 1024