实施逻辑回归——为什么不收敛？答案

【问题标题】：Implementing logistic regression -- why does this not converge?实施逻辑回归——为什么不收敛？
【发布时间】：2016-11-09 22:51:20
【问题描述】：

我正在调整现有的逻辑回归实现，但我不知道我做错了什么。

这是我的实现：

from scipy.optimize import fmin_bfgs
import numpy as np
import pandas as pd
# With help from http://stackoverflow.com/questions/13794754/logistic-regression-using-scipy
# as well as https://bryantravissmith.com/2015/12/29/implementing-logistic-regression-from-scratch-part-2-python-code/

def sigma(features, weights):
    """returns sigma(<w,x>)"""
    return 1 / (1 + np.exp(-features.dot(weights)))


def log_likelihood(weights, features, labels):
    """calculates -ln p(t|w)"""
    s = sigma(features, weights)
    #s += 1e-24  # pseudocount to prevent logs of 0
    t = labels * np.log(s + 1e-24)
    t2 = (1 - labels) * (np.log((1 - s) + 1e-24))
    ll = (t + t2).sum()
    print -ll
    return -ll


def gradient_log_likelihood(weights, features, labels):
    """calculates the gradient (Jacobian) of the log likelihood"""
    error = labels - sigma(features, weights)
    grad = (error * features).sum(axis=0)
    return grad.reshape(grad.shape[0], 1)

这是一个示例数据集：

labels = np.array([0, 1, 1]).reshape(3, 1)
df = pd.DataFrame.from_dict({'a': [1,2,3], 'b': [2,3,4], 'c': [6,7,8]})

n, m = df.shape
weights = np.zeros(m + 1).reshape(m + 1, 1)  # zero vector of starting weights

# add the intercept column
features = np.ones((n, m + 1))  # make matrix with all 1's
features[:,1:] = df  # replace the 1's in all columns after column 0 with actual data

如果我在起始权重向量上单独运行这些方法中的每一个，它们就会运行。但是一旦我尝试优化，就会出现形状错误：

optimized = fmin_bfgs(log_likelihood, x0=weights, args=(features, labels), gtol=1e-4, fprime=gradient_log_likelihood)

ValueError                                Traceback (most recent call last)
<ipython-input-26-34c3cde48ac4> in <module>()
----> 1 optimized = fmin_bfgs(log_likelihood, x0=weights, args=(features, labels), gtol=1e-4, fprime=gradient_log_likelihood)

/Users/ifiddes/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in fmin_bfgs(f, x0, fprime, args, gtol, norm, epsilon, maxiter, full_output, disp, retall, callback)
    791             'return_all': retall}
    792
--> 793     res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)
    794
    795     if full_output:

/Users/ifiddes/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_bfgs(fun, x0, args, jac, callback, gtol, norm, eps, maxiter, disp, return_all, **unknown_options)
    845     else:
    846         grad_calls, myfprime = wrap_function(fprime, args)
--> 847     gfk = myfprime(x0)
    848     k = 0
    849     N = len(x0)

/Users/ifiddes/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args)
    287     def function_wrapper(*wrapper_args):
    288         ncalls[0] += 1
--> 289         return function(*(wrapper_args + args))
    290
    291     return ncalls, function_wrapper

<ipython-input-3-9678bc972b41> in gradient_log_likelihood(weights, features, labels)
      2         """calculates the gradient (Jacobian) of the log likelihood"""
      3         error = labels - sigma(features, weights)
----> 4         grad = (error * features).sum(axis=0)
      5         return grad.reshape(grad.shape[0], 1)
      6

ValueError: operands could not be broadcast together with shapes (3,3) (3,4)

【问题讨论】：

标签： python numpy logistic-regression

【解决方案1】：

问题在于这条线：

error = (labels - sigma(features, weights))

将error 从 3 x 1 向量转换为 3 x 3 矩阵。

请注意，如果您打印 error 并运行 gradient_log_likelihood(weights, features, labels)，则会得到输出：

[[-0.5]
 [ 0.5]
 [ 0.5]]

如果你运行优化，你会得到：

[[-0.5 -0.5 -0.5]
 [ 0.5  0.5  0.5]
 [ 0.5  0.5  0.5]]

除了 ValueError。这是因为labels - sigma(features, weights) 改变了形状。

您可以调查原因，但如果您绕过它，您可以将第一列拉出，error = (labels - sigma(features, weights)).T[0].reshape(3,1)，当您运行 gradient_log_likelihood(weights, features, labels) 时，它会为您提供相同的解决方案，但您会在优化函数中遇到一个新错误。

optimized = fmin_bfgs(log_likelihood, x0=weights, args=(features, labels), gtol=1e-3, fprime=gradient_log_likelihood)

6.23832462504
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-135-d7e8b04daeba> in <module>()
    ----> 1 optimized = fmin_bfgs(log_likelihood, x0=weights, args=(features, labels), gtol=1e-3, fprime=gradient_log_likelihood)

    /Library/Python/2.7/site-packages/scipy/optimize/optimize.pyc in fmin_bfgs(f, x0, fprime, args, gtol, norm, epsilon, maxiter, full_output, disp, retall, callback)
        791             'return_all': retall}
        792 
    --> 793     res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)
        794 
        795     if full_output:

    /Library/Python/2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_bfgs(fun, x0, args, jac, callback, gtol, norm, eps, maxiter, disp, return_all, **unknown_options)
        863             alpha_k, fc, gc, old_fval, old_old_fval, gfkp1 = \
        864                      _line_search_wolfe12(f, myfprime, xk, pk, gfk,
    --> 865                                           old_fval, old_old_fval)
        866         except _LineSearchError:
        867             # Line search failed to find a better solution.

    /Library/Python/2.7/site-packages/scipy/optimize/optimize.pyc in _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)
        697     ret = line_search_wolfe1(f, fprime, xk, pk, gfk,
        698                              old_fval, old_old_fval,
    --> 699                              **kwargs)
        700 
        701     if ret[0] is None:

    /Library/Python/2.7/site-packages/scipy/optimize/linesearch.pyc in line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, args, c1, c2, amax, amin, xtol)
         95         return np.dot(gval[0], pk)
         96 
    ---> 97     derphi0 = np.dot(gfk, pk)
         98 
         99     stp, fval, old_fval = scalar_search_wolfe1(

    ValueError: shapes (4,1) and (4,1) not aligned: 1 (dim 1) != 4 (dim 0)

【讨论】：