Enable MPI/OpenMP:
module load nvhpc/nvhpc/21.7_cuda11.0_openmpi4.0.5_gcc7.5_qd
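This module provides nvfortran together with a bundled OpenMPI 4.0.5, so hybrid MPI + OpenMP programs can be built with the MPI wrapper compilers. As a minimal sketch (illustrative only; the file name hybrid_hello.f90 and the process/thread counts below are placeholders, not part of the cluster setup):

! hybrid_hello.f90 - each MPI rank reports how many OpenMP threads it runs
program hybrid_hello
  use mpi
  use omp_lib
  implicit none
  integer :: ierr, rank, nthreads
  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
  !$omp parallel
  !$omp master
  nthreads = omp_get_num_threads()
  write(*,"('rank ',i0,' running ',i0,' OpenMP threads')") rank, nthreads
  !$omp end master
  !$omp end parallel
  call MPI_Finalize(ierr)
end program hybrid_hello

Compile and run (on an allocated node, after the module load):
mpifort -mp hybrid_hello.f90 -o hybrid_hello
export OMP_NUM_THREADS=4
mpirun -np 2 ./hybrid_hello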
Use the ifort compiler:
module load intel/parallelstudio/2017.1.5
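After this module is loaded, ifort should be available on the PATH. A quick check and an illustrative OpenMP compile line (omp_prog.f90 is a placeholder source file, not part of this guide):

which ifort
ifort -qopenmp omp_prog.f90 -o omp_prog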
Step 1: Allocate a GPU compute node
[scv1172@ln01 xyqtest2022]$ salloc --gpus=1
salloc: Pending job allocation 308347
salloc: job 308347 queued and waiting for resources
salloc: job 308347 has been allocated resources
salloc: Granted job allocation 308347
salloc: Waiting for resource configuration
salloc: Nodes g0001 are ready for job   (the allocated compute node is g0001)
Step 2: Log in to the allocated node g0001 with ssh
[scv1172@ln01 xyqtest2022]$ ssh g0001
Warning: Permanently added 'g0001,192.168.11.1' (ECDSA) to the list of known hosts.
[scv1172@g0001 ~]$
The output above shows that logging in to node g0001 succeeded.
Step 3: Query NVHPC: use the module avail command to check whether nvhpc is installed and which versions are available
[scv1172@g0001 ~]$ module ava nvhpc
-------------------------- /data/apps/modulefiles --------------------------
nvhpc/nvhpc/20.7_cuda10.1_openmpi3.1.5
nvhpc/nvhpc/20.7_cuda10.2_openmpi3.1.5
nvhpc/nvhpc/21.7_cuda11.0_openmpi4.0.5_gcc7.5_qd
nvhpc/nvhpc_byo_compiler/20.7_cuda10.1
nvhpc/nvhpc_byo_compiler/20.7_cuda10.2
nvhpc/nvhpc_byo_compiler/21.7_cuda11.0_gcc7.5
nvhpc/nvhpc_nompi/20.7_cuda10.1
nvhpc/nvhpc_nompi/20.7_cuda10.2
nvhpc/nvhpc_nompi/21.7_cuda11.0_gcc7.5_qd
Step 4: Load the desired nvhpc version with module load
[scv1172@g0001 ~]$ module load nvhpc/nvhpc_nompi/21.7_cuda11.0_gcc7.5_qd
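As a quick cross-check before Step 5, the currently loaded modules can be listed (the exact output depends on what else has been loaded, but it should now include the nvhpc module chosen above):

[scv1172@g0001 ~]$ module list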
Step 5: Check that nvhpc was loaded successfully with the which command
[scv1172@g0001 xyqtest2022]$ which nvfortran
/data/apps/nvhpc/nvhpc_21.7/Linux_x86_64/21.7/compilers/bin/nvfortran
[scv1172@g0001 xyqtest2022]$
The output above shows that the selected nvhpc version was loaded successfully.
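An additional sanity check is to print the compiler version; the banner should match the 21.7 release named in the module:

[scv1172@g0001 xyqtest2022]$ nvfortran --version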
With the five steps above, a CUDA-GPU compute node is configured and ready to use.
~~~~~~ Test code (save as test.cuf) ~~~~~~~~~~~~
program deviceQuery
  use cudafor
  implicit none
  type (cudaDeviceProp) :: prop
  integer :: nDevices=0, i, ierr

  ! Number of CUDA-capable devices
  ierr = cudaGetDeviceCount(nDevices)

  if (nDevices == 0) then
     write(*,"(/,'No CUDA devices found',/)")
     stop
  else if (nDevices == 1) then
     write(*,"(/,'One CUDA device found',/)")
  else
     write(*,"(/,i0,' CUDA devices found',/)") nDevices
  end if

  ! Loop over devices
  do i = 0, nDevices-1
     write(*,"('Device Number: ',i0)") i
     ierr = cudaGetDeviceProperties(prop, i)

     ! General device info
     write(*,"(' Device Name: ',a)") trim(prop%name)
     write(*,"(' Compute Capability: ',i0,'.',i0)") &
          prop%major, prop%minor
     write(*,"(' Number of Multiprocessors: ',i0)") &
          prop%multiProcessorCount
     write(*,"(' Max Threads per Multiprocessor: ',i0)") &
          prop%maxThreadsPerMultiprocessor
     write(*,"(' Global Memory (GB): ',f9.3,/)") &
          prop%totalGlobalMem/1024.0**3

     ! Execution Configuration
     write(*,"(' Execution Configuration Limits')")
     write(*,"('   Max Grid Dims: ',2(i0,' x '),i0)") &
          prop%maxGridSize
     write(*,"('   Max Block Dims: ',2(i0,' x '),i0)") &
          prop%maxThreadsDim
     write(*,"('   Max Threads per Block: ',i0,/)") &
          prop%maxThreadsPerBlock
  enddo
end program deviceQuery
~~~~~~~~~~~~~ Compile and run ~~~~~~~~~~~~~~~~~
[scv1172@g0001 xyqtest2022]$ nvfortran *.cuf -o out
[scv1172@g0001 xyqtest2022]$ ./out
One CUDA device found
Device Number: 0
Device Name: Tesla V100-SXM2-32GB
Compute Capability: 7.0
Number of Multiprocessors: 80
Max Threads per Multiprocessor: 2048
Global Memory (GB): 31.749
Execution Configuration Limits
Max Grid Dims: 2147483647 x 65535 x 65535
Max Block Dims: 1024 x 1024 x 64
Max Threads per Block: 1024
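When testing is finished, the compute node should be released so the GPU returns to the pool. A sketch, using the job ID printed by salloc in Step 1:

[scv1172@g0001 xyqtest2022]$ exit              (leave the compute node)
[scv1172@ln01 xyqtest2022]$ scancel 308347     (or exit the salloc shell to release the allocation)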