结合 Fred Foo 的答案以及 nopper、ihadanny 和 jimijazz 的评论,以下代码得到的结果与《An Introduction to Statistical Learning with Applications in R》一书 Lab 1(6.5.1 Best Subset Selection)中第一个示例里 R 函数 regsubsets()(leaps 库的一部分)的结果相同。
from itertools import combinations

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import cross_val_score
def best_subset(estimator, X, y, max_size=8, cv=5):
    """Exhaustively search for the best feature subset of up to ``max_size`` columns.

    The search runs in two stages:

    1. For every subset size ``1..min(n_features, max_size)``, fit
       ``estimator`` on each combination of columns and keep the combination
       with the highest ``estimator.score`` on the same data it was fitted on.
    2. Compare the per-size winners with each other using the mean
       ``cross_val_score``, because the stage-1 scores come from the fitting
       data and are not used to rank subsets of different sizes.

    Every combination of columns is fitted, so the number of fits grows
    combinatorially with the number of columns and ``max_size``.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``score(X, y)``. It is
            refitted in place for every candidate subset.
        X: Feature table; must support positional column selection through
            ``.iloc`` (e.g. a pandas DataFrame).
        y: Target values passed unchanged to ``fit``, ``score`` and
            ``cross_val_score``.
        max_size: Largest subset size to consider.
        cv: Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns:
        A tuple ``(best_subset, best_score, best_size_subset, list_scores)``:

        * ``best_subset`` -- tuple of column positions with the highest mean
          cross-validated score, or ``None`` if no subset was evaluated.
        * ``best_score`` -- that mean score (``-inf`` if nothing was evaluated).
        * ``best_size_subset`` -- list holding the stage-1 winner of each
          size, in increasing size order.
        * ``list_scores`` -- mean cross-validated score of each entry of
          ``best_size_subset``, in the same order.
    """
    # Plain-float sentinel: any real score compares greater than it, and it
    # does not require numpy to be imported by the surrounding module.
    neg_inf = float("-inf")

    n_features = X.shape[1]
    largest_size = min(n_features, max_size)

    # Stage 1: best subset of each size, ranked by the estimator's own score.
    best_size_subset = []
    for size in range(1, largest_size + 1):
        top_score, top_subset = neg_inf, None
        for subset in combinations(range(n_features), size):
            X_subset = X.iloc[:, list(subset)]  # select once, reuse for fit and score
            estimator.fit(X_subset, y)
            score = estimator.score(X_subset, y)
            if score > top_score:
                top_score, top_subset = score, subset
        best_size_subset.append(top_subset)

    # Stage 2: subsets of different sizes are compared by cross-validation.
    winning_score, winning_subset = neg_inf, None
    list_scores = []
    for subset in best_size_subset:
        score = cross_val_score(estimator, X.iloc[:, list(subset)], y, cv=cv).mean()
        list_scores.append(score)
        if score > winning_score:
            winning_score, winning_subset = score, subset

    return winning_subset, winning_score, best_size_subset, list_scores
完整示例请参见这个 notebook:http://nbviewer.jupyter.org/github/pedvide/ISLR_Python/blob/master/Chapter6_Linear_Model_Selection_and_Regularization.ipynb#6.5.1-Best-Subset-Selection