如何提高这个巨大的嵌套循环的性能？ (Fortran 90)答案

【问题标题】：How can I improve the performance of this huge nested loop? (Fortran 90)如何提高这个巨大的嵌套循环的性能？ (Fortran 90)
【发布时间】：2013-09-12 17:14:37
【问题描述】：

我将在这里发布整个代码段，但唯一的问题是最后的嵌套循环。所有读入矩阵的尺寸为 180x180，循环速度慢得令人难以忍受。我看不到简化计算的简单方法，因为索引的三次出现，得到矩阵“AnaInt”的索引乘法不是简单的矩阵乘积。有什么想法吗？谢谢！

program AC
 implicit none
  integer, parameter :: dp = selected_real_kind(15, 307)
  integer :: n, ndim, k, j, i, o, l, m, steps
  real(dp) :: emax, omega, pi, EFermi, auev
  complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone

  complex(dp), allocatable :: GammaL(:,:)     
  complex(dp), allocatable :: GammaL_EB(:,:)  
  complex(dp), allocatable :: GammaR(:,:)     
  complex(dp), allocatable :: R(:,:)  
  complex(dp), allocatable :: Yc(:,:)         
  complex(dp), allocatable :: Yd(:,:)         
  complex(dp), allocatable :: AnaInt(:,:)     
  complex(dp), allocatable :: H(:,:)         
  complex(dp), allocatable :: HamEff(:,:)     
  complex(dp), allocatable :: EigVec(:,:)    
  complex(dp), allocatable :: InvEigVec(:,:)  
  complex(dp), allocatable :: EigVal(:)       
  complex(dp), allocatable :: ctemp(:,:)      
  complex(dp), allocatable :: ctemp2(:,:)      
  complex(dp), allocatable :: S(:,:)          
  complex(dp), allocatable :: SelfL(:,:)     
  complex(dp), allocatable :: SelfR(:,:)     
  complex(dp), allocatable :: SHalf(:,:)      
  complex(dp), allocatable :: InvSHalf(:,:)   
  complex(dp), allocatable :: HEB(:,:)
  complex(dp), allocatable :: Integrand(:,:)


!Lapack arrays and variables
  integer :: info, lwork
  complex(dp), allocatable :: work(:)       
  real(dp), allocatable :: rwork(:)    
  integer,allocatable :: ipiv(:)

!########################################################################

!Constants
    auev = 27.211385
    pi = 3.14159265359
    cunit = (0,1)
    czero = (0,0)
    cone = (1,0)
    tinyc = (0.0, 0.000000000001)


!System and calculation parameters
    open(unit=123, file="ForAC.dat", action='read', form='formatted')
    read(123,*) ndim, EFermi
    lwork = ndim*ndim

    emax = 5.0/auev
    steps = 1000 


    allocate(HEB(ndim,ndim))
    allocate(H(ndim,ndim))
    allocate(Yc(ndim,ndim))
    allocate(Yd(ndim,ndim))
    allocate(S(ndim,ndim))
    allocate(SelfL(ndim,ndim))
    allocate(SelfR(ndim,ndim))
    allocate(HamEff(ndim,ndim))
    allocate(GammaR(ndim,ndim))
    allocate(GammaL(ndim,ndim))
    allocate(AnaInt(ndim,ndim))
    allocate(EigVec(ndim,ndim))
    allocate(EigVal(ndim))
    allocate(InvEigVec(ndim,ndim))
    allocate(R(ndim,ndim))
    allocate(GammaL_EB(ndim,ndim))
    allocate(Integrand(ndim,ndim))

!################################################



    read(123,*) H, S, SelfL, SelfR
    close(unit=123)

    HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))   



    allocate(SHalf(ndim, ndim))
    allocate(InvSHalf(ndim,ndim))
    SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))

    call zpotrf('l', ndim, SHalf, ndim, info)         
    InvSHalf(:,:) = SHalf(:,:)
    call ztrtri('l', 'n', ndim, InvSHalf, ndim, info) 

    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim) 
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim) 
    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim) 
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim) 
    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)

    deallocate(SHalf)
    deallocate(InvSHalf)




!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal

    allocate(ctemp(ndim,ndim))
    ctemp(:,:) = HamEff(:,:)
    allocate(work(lwork),rwork(2*ndim))
    call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim, EigVec, ndim, work, lwork, rwork, info)
    if(info/=0)write(*,*) "Warning: zgeev info=", info
    deallocate(work,rwork)
    deallocate(ctemp) 

    InvEigVec(:,:)=EigVec(:,:)
    lwork = 3*ndim
    allocate(ipiv(ndim))
    allocate(work(lwork))
    call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
    if(info/=0)write(*,*) "Warning: zgetrf info=", info   ! LU decomposition
    call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
    if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU decomposition (Building of InvEigVec)
    deallocate(work)
    deallocate(ipiv)


 R(:,:) = 0.0_dp
 do j=1,ndim
 do m=1,ndim
 do k=1,ndim
 do l=1,ndim
 R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
 end do
 end do
 end do
 end do





!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000

 open(unit=125,file="ACCond.dat")

     !Looping over omega
     do o=1,steps
         omega=real(o,dp)*emax/real(steps,dp) 
         AnaInt(:,:) = 0.0_dp
         do i=1,ndim
             do n=1,ndim
                 do j=1,ndim
                      do m=1,ndim
                           Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
                           Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
                           Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

                           AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m) * Integrand(j,m) * conjg(EigVec(n,m))
                      end do
                 end do
             end do
        end do 

         Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
         Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

          ACCond = czero
          do k=1,ndim
              ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
          end do
          write(125,*) omega, real(ACCond,dp), aimag(ACCond)
      end do



!#############################################

    deallocate(Integrand)
    deallocate(HEB)
    deallocate(Yc)
    deallocate(Yd)
    deallocate(HamEff)
    deallocate(GammaR)
    deallocate(GammaL)
    deallocate(AnaInt)
    deallocate(EigVec)
    deallocate(EigVal)
    deallocate(InvEigVec)
    deallocate(H)
    deallocate(S)
    deallocate(SelfL)
    deallocate(SelfR)
    deallocate(R)
    deallocate(GammaL_EB)
end program AC

所以，这里是根据建议的第一个改编：

HermEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermEigVec(i,j) = conjg(EigVec(j,i))
end do
end do

HermInvEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermInvEigVec(i,j) = conjg(InvEigVec(j,i))
end do
end do


R(:,:) = 0.0_dp

R = matmul(InvEigVec,matmul(GammaR,HermInvEigVec))


open(unit=125,file="ACCond.dat")

    !Looping over omega
     do o=1,steps
         omega=real(o,dp)*emax/real(steps,dp)

         AnaInt(:,:) = 0.0_dp
             do j=1,ndim
             do m=1,ndim
                 Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
                 Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
                 Integrand(j,m) = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
                 T(j,m) = R(j,m) * Integrand(j,m)
             end do
             end do
         AnaInt = matmul(EigVec,matmul(T,HermEigVec))


         Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)                      
         Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

         ACCond = czero
         do k=1,ndim
             ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
         end do
       write(125,*) omega, real(ACCond,dp), aimag(ACCond)
     end do

【问题讨论】：

你使用什么编译器标志？
我是编译编码语言的初学者，所以我只在 ifort 中使用“检查”，因为您在另一个线程中推荐了它。
标志 check 可以大大降低代码速度，因为它确保满足数组的边界（以及其他一些事情）。由于您显然没有超出界限，请删除 -check 并添加 -O3（大写 o，而不是 0）。
并没有真正帮助，我认为必须以某种方式重组整个循环，但我不知道如何。编辑：O3 很有趣，我看到它正在优化循环。奇怪还是这么慢
我在do j=1,ndim之前加入了一个call cpu_time(start)，在enddo之后加入了另一个j循环；它说（对于 ndim=180）每个 j 循环大约需要 1.1 秒，导致大约 10 小时的总运行时间。

标签： fortran fortran90

【解决方案1】：

您的代码中有几个问题。让我们从你强调的循环之前的循环开始（它更容易理解，但下面的大循环或多或少存在相同的问题）。

所以我们在 i、j、k、l 上有一个循环。

您可以考虑重新排序循环，以便更好地访问缓存。您最内部的循环位于 l 上，它仅显示为列索引。使用 Fortran 中的 column-major 数组，您可能会因此预期性能不佳。 j 上的内部循环可能会更好。

更糟糕的是，您的整个循环是由三个矩阵 (InvEigVec * GammaR * InvEigVec^H) 的乘积进行的矩阵更新，但您在 O(ndim^4) 中执行此操作。每个矩阵乘积为 O(n^3)（或者如果您使用 Strassen algorithm 调用优化的ZGEMM，则可能更少）。因此，通过存储矩阵乘积，两个乘积应该是 O(n^3)，而不是 O(n^4)。也就是说，您可以先进行矩阵乘积，然后再进行矩阵乘积更新。

现在，您的大循环：steps 次超过 i、n、j、m。

如果我读得好，你写

Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

右侧的所有变量都是标量，但 Integrand 是 ndim*ndim 矩阵。在多个地方复制一个值需要做很多工作。但是然后你在积分上循环，在那里你可以简单地使用一个标量。或者也许这是一个错误，你应该在左侧有 Integrand(j, m) 或类似的？

那么，你的四个内部循环就像在前面的 cmets 中一样，更新带有数组乘积 EigVec * (R .* Integrand) * EigVec^H 的 AnaInt，带有 .* 数组的（逐项）标量积（或者如果 Integrand 只是一个标量，则只是 EigVec * R * EigVec^H）。

同样，尝试使用 ZGEMM 编写此代码可能会很好，从而大大降低了复杂性。

【讨论】：

哦，有很多东西可以尝试，谢谢。给我一分钟来检查你的建议
所以，我认为我错过的主要事情是，如果您在 j-m-loop 中定义 Integrand，它确实是一个标量。因此，一旦将其设置为某个数字，我就可以进行矩阵乘法 (matmul,lapack) EigVec R EigVec^H 然后简单地将其乘以 Integrand 的标量值？
哇。设置 Integrand(j,m) 从对所有 j 值进行 1.1 秒的循环变为需要 4e-3 秒。
我不确定，但在我看来，您应该将 Integrand 设为一个数组，根据索引 j、m 存储其可能的值（在 4 个内部循环之外，或者甚至通过乘以更新 R被积分的相应元素）。然后我认为您可以使用矩阵产品更轻松（并且更快）编写整个 4 个循环。
该死的我没想到。显然，在内循环中进行整个矩阵乘法并不好。我仍然不确定如何在 lapack/matmul-statments 中处理这一点，因为不能简单地将 j 和 m 相加。编辑：啊arbautjc，好主意。我会试试的。 EDIT2：顺便说一句，积分取决于欧米茄，它在“o”循环中循环

【解决方案2】：

您是否考虑过使用 OPENMP 对循环进行并行化？这很容易实现。如果有兴趣我可以给你一些提示。

试试看这里：openMP DO tutorial

【讨论】：

一旦算法从 O(n^4) 转换为 O(n^3)，调用并行化 ZGEMM 可能是一个好主意，以进一步优化......