【发布时间】:2021-02-05 08:20:49
【问题描述】:
我正在尝试在 Python 上使用线性回归运行逐步自动搜索过程,我的代码如下所示,使用来自 https://datascience.stackexchange.com/a/24447 的代码我没有更改贡献者提供的任何代码,但仍然遇到错误:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import statsmodels.api as sm
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
def stepwise_selection(X, y,
initial_list=[],
threshold_in=0.01,
threshold_out = 0.05,
verbose=True):
""" Perform a forward-backward feature selection
based on p-value from statsmodels.api.OLS
Arguments:
X - pandas.DataFrame with candidate features
y - list-like with the target
initial_list - list of features to start with (column names of X)
threshold_in - include a feature if its p-value < threshold_in
threshold_out - exclude a feature if its p-value > threshold_out
verbose - whether to print the sequence of inclusions and exclusions
Returns: list of selected features
Always set threshold_in < threshold_out to avoid infinite looping.
See https://en.wikipedia.org/wiki/Stepwise_regression for the details
"""
included = list(initial_list)
while True:
changed=False
# forward step
excluded = list(set(X.columns)-set(included))
new_pval = pd.Series(index=excluded)
for new_column in excluded:
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
new_pval[new_column] = model.pvalues[new_column]
best_pval = new_pval.min()
if best_pval < threshold_in:
best_feature = new_pval.argmin()
included.append(best_feature)
changed=True
if verbose:
print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
# backward step
model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
# use all coefs except intercept
pvalues = model.pvalues.iloc[1:]
worst_pval = pvalues.max() # null if pvalues is empty
if worst_pval > threshold_out:
changed=True
worst_feature = pvalues.argmax()
included.remove(worst_feature)
if verbose:
print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
if not changed:
break
return included
result = stepwise_selection(X, y)
print('resulting features:')
print(result)
但是,我遇到了以下错误:
--------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-782c721f1ba0> in <module>
59 return included
60
---> 61 result = stepwise_selection(X, y)
62
63 print('resulting features:')
<ipython-input-21-782c721f1ba0> in stepwise_selection(X, y, initial_list, threshold_in, threshold_out, verbose)
45
46 # backward step
---> 47 model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
48 # use all coefs except intercept
49 pvalues = model.pvalues.iloc[1:]
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1551
1552 self._validate_read_indexer(
-> 1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
1555 return keyarr, indexer
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Int64Index([8], dtype='int64')] are in the [columns]"
预期的输出应该是这样的:
Add LSTAT with p-value 5.0811e-88
Add RM with p-value 3.47226e-27
Add PTRATIO with p-value 1.64466e-14
Add DIS with p-value 1.66847e-05
Add NOX with p-value 5.48815e-08
Add CHAS with p-value 0.000265473
Add B with p-value 0.000771946
Add ZN with p-value 0.00465162
resulting features:
['LSTAT', 'RM', 'PTRATIO', 'DIS', 'NOX', 'CHAS', 'B', 'ZN']
感谢您提供的任何帮助,谢谢!
【问题讨论】:
标签: python scikit-learn linear-regression statsmodels