!> Element-wise device operations written in CUDA Fortran.
module simpleOps_m
  implicit none
contains

  !> Kernel: add the scalar `b` to every element of the rank-2 array `a`.
  !!
  !! Each thread computes one global (i, j) index from its block and
  !! thread indices and updates at most one element of `a`.
  !!
  !! @param a  array updated in place (assumed shape, rank 2)
  !! @param b  increment, passed by value
  attributes(global) subroutine inc(a, b)
    implicit none
    real, intent(inout) :: a(:,:)
    real, value :: b
    integer :: i, j, n(2)

    ! threadIdx%x, threadIdx%y : index of the thread inside its block (2-D)
    ! blockDim%x,  blockDim%y  : number of threads per block (2-D)
    ! blockIdx%x,  blockIdx%y  : index of the block inside the grid (2-D)
    ! All three are 1-based, hence the "-1" on the block index.
    i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
    j = (blockIdx%y - 1)*blockDim%y + threadIdx%y

    ! Extent of each dimension. size(a) without a dim argument returns the
    ! total element count, which would make the bounds test below useless.
    n(1) = size(a, 1)
    n(2) = size(a, 2)

    ! Threads that fall outside the array must not touch memory.
    if (i <= n(1) .and. j <= n(2)) a(i,j) = a(i,j) + b
  end subroutine inc

end module simpleOps_m
!> Host driver: fills an nx-by-ny array with ones, adds b on the device with
!> the `inc` kernel, copies the result back and prints the maximum deviation
!> from the expected value 4.
program incTest
  use cudafor
  use simpleOps_m
  implicit none

  ! Matrix with 1024 rows and 512 columns.
  integer, parameter :: nx = 1024, ny = 512
  real :: a(nx,ny), b
  real, device :: a_d(nx,ny)
  type(dim3) :: grid, tBlock

  a = 1
  b = 3

  ! 32 threads per block along the first dimension, 8 along the second.
  tBlock = dim3(32, 8, 1)

  ! Round up so the grid covers the whole array: 32 blocks along the first
  ! dimension and 64 along the second for the sizes above.
  ! The block counts must not exceed the device's maximum grid dimensions.
  grid = dim3(ceiling(real(nx)/tBlock%x), ceiling(real(ny)/tBlock%y), 1)

  ! Host-to-device copy by assignment.
  a_d = a

  ! Kernel launch syntax: kernel<<<dimGrid, dimBlock>>>(arguments)
  call inc<<<grid,tBlock>>>(a_d, b)

  ! Device-to-host copy by assignment.
  a = a_d

  ! Every element started at 1 and had 3 added, so 0 is expected here.
  write(*,*) 'Max error: ', maxval(abs(a-4))
end program incTest
! Source: http://blog.163.com/zpfzcjndx@126/blog/static/635456812014121104122345/