Python：加速我的 Runge-Kutta 集成代码挑战答案

【问题标题】：Python : Speeding up my Runge-Kutta integration code challengePython：加速我的 Runge-Kutta 集成代码挑战
【发布时间】：2016-11-29 12:51:24
【问题描述】：

我正在使用附件代码集成Fitzhugh-Nagumo模型的一个版本：

from scipy.integrate import odeint
import numpy as np
import time

P = {'epsilon':0.1,
     'a1':1.0,
     'a2':1.0,
     'b':2.0,
     'c':0.2}

def fhn_rhs(V,t,P):
    u,v = V[0],V[1]
    u_t = u - u**3 - v
    v_t = P['epsilon']*(u - P['b']*v - P['c'])
    return np.stack((u_t,v_t))

def integrate(func,V0,t,args,step='RK4'):
    start = time.clock()
    P = args[0]
    result=[V0]
    for i,tmp in enumerate(t[1:]):
        result.append(RK4step(func,result[i],tmp,P,(t[i+1]-t[i])))
    print "Integration took ",time.clock() - start, " s"
    return np.array(result)


def RK4step(rhs,V,t,P,dt):
    k_1 = dt*rhs(V,t,P)
    k_2 = dt*rhs((V+(1.0/2.0)*k_1),t,P)
    k_3 = dt*rhs((V+(1.0/2.0)*k_2),t,P)
    k_4 = dt*rhs((V+k_3),t,P)
    return V+(1.0/6.0)*k_1+(1.0/3.0)*k_2+(1.0/3.0)*k_3+(1.0/6.0)*k_4

比较我的integrate 和scipy.integrate.odeint 得出以下结果：

In [8]: import cProfile

In [9]: %timeit integrate(fhn_rhs,np.stack((0.1,0.2)),np.linspace(0,100,1000),args=(P,))
10 loops, best of 3: 36.4 ms per loop

In [10]: %timeit odeint(fhn_rhs,np.stack((0.1,0.2)),np.linspace(0,100,1000),args=(P,))
100 loops, best of 3: 3.45 ms per loop

In [11]: cProfile.run('integrate(fhn_rhs,np.stack((0.1,0.2)),np.linspace(0,100,1000),args=(P,))')
         45972 function calls in 0.098 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.098    0.098 <string>:1(<module>)
     3996    0.011    0.000    0.078    0.000 fhn.py:20(fhn_rhs)
        1    0.002    0.002    0.097    0.097 fhn.py:42(integrate)
      999    0.016    0.000    0.094    0.000 fhn.py:52(RK4step)
        1    0.000    0.000    0.000    0.000 function_base.py:9(linspace)
     7994    0.011    0.000    0.021    0.000 numeric.py:484(asanyarray)
     3997    0.029    0.000    0.067    0.000 shape_base.py:282(stack)
    11991    0.008    0.000    0.008    0.000 shape_base.py:337(<genexpr>)
     3997    0.002    0.000    0.002    0.000 {len}
      999    0.001    0.000    0.001    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'astype' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.000    0.000    0.000    0.000 {numpy.core.multiarray.arange}
     7995    0.010    0.000    0.010    0.000 {numpy.core.multiarray.array}
     3997    0.006    0.000    0.006    0.000 {numpy.core.multiarray.concatenate}
        1    0.000    0.000    0.000    0.000 {numpy.core.multiarray.result_type}

关于如何让我的代码运行得更快有什么建议吗？我可以使用numba 以某种方式加速它吗？

【问题讨论】：

一般来说最慢的操作是控制台和文件输出。您在使用timeit 之前删除了它们吗？
是的，我在比较之前确实删除了控制台输出
那就像我写的一样，只是差别太大了。编译与解释，隐式多步与显式一步，自适应与固定步长。 -- 下一步可能是将方法更改为嵌入式 RK 方法，如 Runge-Kutta-Fehlberg 或 Dormand-Price。使用 RK4，您可以通过将两个大小为 h 的步骤与一个大小为 2h 的并行步骤相结合来模拟一种嵌入式方法，并使用 Richardson 外推法计算错误。

标签： python python-2.7 numpy numba runge-kutta

【解决方案1】：

你不是在比较相同的东西。要查看 odeint 实际评估 ODE 函数的时间点，请将 print t 语句放入（当然，不要计时）。 odeint 和通常具有自适应时间步长的方法会生成一个稀疏的积分样本列表，并从中插入所需的输出。

您必须为 RK4 方法使用误差估计器，并在此基础上复制此自适应方案。

当然，使用向量对象解释的 Python 代码永远不会与在执行期间使用简单数组从 odeint 调用的 lsoda 的编译 FORTRAN 代码竞争。

在带有插值的自适应步长方案中使用 RK4 的示例：

def RK4Step(f, x, y, h, k1):
    k2=f(x+0.5*h, y+0.5*h*k1)
    k3=f(x+0.5*h, y+0.5*h*k2)
    k4=f(x+    h, y+    h*k3)
    return (k1+2*(k2+k3)+k4)/6.0

def RK4TwoStep(f, x, y, h, k1):
    step1 = RK4Step(f, x , y , 0.5*h, k1        )
    x1, y1 = x+0.5*h, y+0.5*h*step1;
    step2 = RK4Step(f, x1, y1, 0.5*h, f(x1, y1) )
    return (step1+step2)/2

def RK4odeint(fin,y0,times, tol=1e-6, args=None):
    # numpy-ify the inputs
    if args:
        f = lambda t,y : np.array(fin(y,t,*args));
    else:
        f = lambda t,y : np.array(fin(y,t))
    y0 = np.array(y0)
    # allocate output structure
    yout = np.array([y0]*len(times));
    # in consequence, yout[0] = y0;
    # initialize integrator variables
    h = times[1]-times[0];
    hmax = abs(times[-1]-times[0]);

    # last and current point of the numerical integration
    ycurr = ylast = qcurr = qlast = y0; 
    tcurr = tlast = times[0];
    fcurr = flast = f(tcurr, ycurr);
    totalerr = 0.0
    totalvar = 0.0
    for i, t in enumerate(times[1:]):
        # remember that t == t[i+1], result goes to yout[i+1]
        while (t-tcurr)*h>0:
            # advance the integration                
            k1, k2 = RK4Step(f,tcurr,ycurr,h, fcurr), RK4TwoStep(f,tcurr,ycurr,h, fcurr);
            # RK4 is of fourth order, that is,
            # k1 = (y(x+h)-y(x))/h + C*h^4
            # k2 = (y(x+h)-y(x))/h + C*h^4/16
            # Using the double step k2 gives  
            # C*h^4/16 = (k2-k1)/15 as local error density
            # change h to match the global relative error density tol
            # use |k2| as scale for the absolute error
            # |k1-k2|/15*hfac^4 = tol*|k2|, h <- h*hfac

            scale = max(abs(k2))
            steperr = max(abs(k1-k2))/2
            # compute the ideal step size factor and sanitize the result to prevent ridiculous changes
            hfac = (  tol*scale / ( 1e-16+steperr)  )**0.25
            hfac = min(10, max(0.01, hfac) )

            # repeat the step if there is a significant step size correction
            if ( abs(h*hfac)<hmax and (0.6 > hfac or hfac > 3 )):
                # recompute with new step size
                h *= hfac;
                k2 = RK4TwoStep(f, tcurr, ycurr, h, fcurr) ;
            # update and cycle the integration points
            ylast = ycurr; ycurr = ycurr + h*k2;
            tlast = tcurr; tcurr += h;
            flast = fcurr; fcurr = f(tcurr, ycurr);
            # cubic Bezier control points
            qlast = ylast + (tcurr-tlast)/3*flast;
            qcurr = ycurr - (tcurr-tlast)/3*fcurr;

            totalvar += h*scale;
            totalerr = (1+h*scale)*totalerr + h*steperr;
            reportstr = "internal step to t=%12.8f \t" % tcurr;

        #now tlast <= t <= tcurr, can interpolate the value for yout[i+1] using the cubic Bezier formula
        s = (t - tlast)/(tcurr - tlast);
        yout[i+1] = (1-s)**2*((1-s)*ylast + 3*s*qlast) + s**2*(3*(1-s)*qcurr + s*ycurr)

    return np.array(yout)

这可以被称为类似于scipy.integrate.odeint，具有相同的导数函数的参数顺序约定（接口是根据该要求构造的，没有内在的必要性，可以随意更改）

P = {'epsilon':0.1,
     'a1':1.0,
     'a2':1.0,
     'b':2.0,
     'c':0.2}

def fhn_rhs(V,t,P):
    u,v = V
    u_t = u - u**3 - v
    v_t = P['epsilon']*(u - P['b']*v - P['c'])
    return np.array([u_t,v_t])


V0 = np.array([0.1,0.2])
time = np.linspace(0,100,1000)

V = RK4odeint(fhn_rhs,V0,time,args=(P,))

u,v = V.T
plt.plot(time, v) # or plt.plot(u,v) or ...

【讨论】：