【发布时间】:2013-09-12 17:14:37
【问题描述】:
我将在这里发布整个代码段,但唯一的问题是最后的嵌套循环。所有读入矩阵的尺寸为 180x180,循环速度慢得令人难以忍受。我看不到简化计算的简单方法,因为索引的三次出现,得到矩阵“AnaInt”的索引乘法不是简单的矩阵乘积。有什么想法吗?谢谢!
program AC
implicit none
integer, parameter :: dp = selected_real_kind(15, 307)
integer :: n, ndim, k, j, i, o, l, m, steps
real(dp) :: emax, omega, pi, EFermi, auev
complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone
complex(dp), allocatable :: GammaL(:,:)
complex(dp), allocatable :: GammaL_EB(:,:)
complex(dp), allocatable :: GammaR(:,:)
complex(dp), allocatable :: R(:,:)
complex(dp), allocatable :: Yc(:,:)
complex(dp), allocatable :: Yd(:,:)
complex(dp), allocatable :: AnaInt(:,:)
complex(dp), allocatable :: H(:,:)
complex(dp), allocatable :: HamEff(:,:)
complex(dp), allocatable :: EigVec(:,:)
complex(dp), allocatable :: InvEigVec(:,:)
complex(dp), allocatable :: EigVal(:)
complex(dp), allocatable :: ctemp(:,:)
complex(dp), allocatable :: ctemp2(:,:)
complex(dp), allocatable :: S(:,:)
complex(dp), allocatable :: SelfL(:,:)
complex(dp), allocatable :: SelfR(:,:)
complex(dp), allocatable :: SHalf(:,:)
complex(dp), allocatable :: InvSHalf(:,:)
complex(dp), allocatable :: HEB(:,:)
complex(dp), allocatable :: Integrand(:,:)
!Lapack arrays and variables
integer :: info, lwork
complex(dp), allocatable :: work(:)
real(dp), allocatable :: rwork(:)
integer,allocatable :: ipiv(:)
!########################################################################
!Constants
auev = 27.211385
pi = 3.14159265359
cunit = (0,1)
czero = (0,0)
cone = (1,0)
tinyc = (0.0, 0.000000000001)
!System and calculation parameters
open(unit=123, file="ForAC.dat", action='read', form='formatted')
read(123,*) ndim, EFermi
lwork = ndim*ndim
emax = 5.0/auev
steps = 1000
allocate(HEB(ndim,ndim))
allocate(H(ndim,ndim))
allocate(Yc(ndim,ndim))
allocate(Yd(ndim,ndim))
allocate(S(ndim,ndim))
allocate(SelfL(ndim,ndim))
allocate(SelfR(ndim,ndim))
allocate(HamEff(ndim,ndim))
allocate(GammaR(ndim,ndim))
allocate(GammaL(ndim,ndim))
allocate(AnaInt(ndim,ndim))
allocate(EigVec(ndim,ndim))
allocate(EigVal(ndim))
allocate(InvEigVec(ndim,ndim))
allocate(R(ndim,ndim))
allocate(GammaL_EB(ndim,ndim))
allocate(Integrand(ndim,ndim))
!################################################
read(123,*) H, S, SelfL, SelfR
close(unit=123)
HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))
allocate(SHalf(ndim, ndim))
allocate(InvSHalf(ndim,ndim))
SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))
call zpotrf('l', ndim, SHalf, ndim, info)
InvSHalf(:,:) = SHalf(:,:)
call ztrtri('l', 'n', ndim, InvSHalf, ndim, info)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
deallocate(SHalf)
deallocate(InvSHalf)
!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal
allocate(ctemp(ndim,ndim))
ctemp(:,:) = HamEff(:,:)
allocate(work(lwork),rwork(2*ndim))
call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim, EigVec, ndim, work, lwork, rwork, info)
if(info/=0)write(*,*) "Warning: zgeev info=", info
deallocate(work,rwork)
deallocate(ctemp)
InvEigVec(:,:)=EigVec(:,:)
lwork = 3*ndim
allocate(ipiv(ndim))
allocate(work(lwork))
call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
if(info/=0)write(*,*) "Warning: zgetrf info=", info ! LU decomposition
call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU decomposition (Building of InvEigVec)
deallocate(work)
deallocate(ipiv)
R(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
do k=1,ndim
do l=1,ndim
R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
end do
end do
end do
end do
!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000
open(unit=125,file="ACCond.dat")
!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)
AnaInt(:,:) = 0.0_dp
do i=1,ndim
do n=1,ndim
do j=1,ndim
do m=1,ndim
Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m) * Integrand(j,m) * conjg(EigVec(n,m))
end do
end do
end do
end do
Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)
ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do
!#############################################
deallocate(Integrand)
deallocate(HEB)
deallocate(Yc)
deallocate(Yd)
deallocate(HamEff)
deallocate(GammaR)
deallocate(GammaL)
deallocate(AnaInt)
deallocate(EigVec)
deallocate(EigVal)
deallocate(InvEigVec)
deallocate(H)
deallocate(S)
deallocate(SelfL)
deallocate(SelfR)
deallocate(R)
deallocate(GammaL_EB)
end program AC
所以,这里是根据建议的第一个改编:
HermEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermEigVec(i,j) = conjg(EigVec(j,i))
end do
end do
HermInvEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermInvEigVec(i,j) = conjg(InvEigVec(j,i))
end do
end do
R(:,:) = 0.0_dp
R = matmul(InvEigVec,matmul(GammaR,HermInvEigVec))
open(unit=125,file="ACCond.dat")
!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)
AnaInt(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand(j,m) = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
T(j,m) = R(j,m) * Integrand(j,m)
end do
end do
AnaInt = matmul(EigVec,matmul(T,HermEigVec))
Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)
ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do
【问题讨论】:
-
你使用什么编译器标志?
-
我是编译编码语言的初学者,所以我只在 ifort 中使用“检查”,因为您在另一个线程中推荐了它。
-
标志
check可以大大降低代码速度,因为它确保满足数组的边界(以及其他一些事情)。由于您显然没有超出界限,请删除-check并添加-O3(大写 o,而不是 0)。 -
并没有真正帮助,我认为必须以某种方式重组整个循环,但我不知道如何。编辑:O3 很有趣,我看到它正在优化循环。奇怪还是这么慢
-
我在
do j=1,ndim之前加入了一个call cpu_time(start),在enddo之后加入了另一个j循环;它说(对于ndim=180)每个j循环大约需要 1.1 秒,导致大约 10 小时的总运行时间。