堆叠分类器无法识别 Keras答案

【问题标题】：Stacking Classifier doesn't recognize Keras（堆叠分类器无法识别 Keras）
【发布时间】：2020-08-28 17:20:36
【问题描述】：

我在 5 个 scikit-learn 分类器和一个 Keras 分类器上使用 StackingClassifier。然而，它似乎没有将 Keras 识别为分类器。

相关代码：

from tensorflow.keras import layers
from tensorflow import keras
from keras.constraints import maxnorm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation,  Flatten, Input
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import metrics
import joblib
from joblib import parallel_backend
np.random.seed(42)
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import sklearn
from sklearn.ensemble import StackingClassifier
def create_model():
    """Build and compile the network handed to ``KerasClassifier``.

    Takes no arguments. Layer width, initializer, weight constraint, dropout
    rate and learning rate are read from the global ``best_*`` names, and the
    input width from ``X_train.shape[1]``; all of these must already exist
    when the function is called.

    Returns:
        A compiled ``Sequential`` model ending in a single sigmoid unit,
        trained with binary cross-entropy and reporting AUC and accuracy.
    """
    model = Sequential()
    model.add(
        Dense(
            best_neurons,
            input_shape=(X_train.shape[1],),
            kernel_initializer=best_init_mode,
            activation='relu',
            kernel_constraint=maxnorm(best_weight_constraint),
        )
    )
    model.add(Dropout(best_dropout_rate))
    model.add(Flatten())
    model.add(Dense(units=1, kernel_initializer=best_init_mode, activation='sigmoid'))

    # Go through the `keras` module this snippet imports (`tf` itself is never
    # imported here), and use `learning_rate`, the current name of the
    # deprecated `lr` argument.
    optimizer = keras.optimizers.RMSprop(learning_rate=best_learn_rate)

    # Compile model
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=[keras.metrics.AUC(), 'accuracy'],
    )
    return model


# Base learners for the stack. Every `best_*` value is defined outside this
# snippet (presumably the result of an earlier hyperparameter search).

# Keras network wrapped in the scikit-learn estimator interface.
NN_clf = KerasClassifier(
    build_fn=create_model,
    epochs=best_epochs,
    batch_size=best_batch_size,
)

RF_clf = RandomForestClassifier(
    max_depth=best_max_depth_rf,
    n_estimators=best_n_estimators_rf,
    min_samples_leaf=best_min_samples_leaf_rf,
    max_features=best_max_features_rf,
    class_weight=best_class_weight_rf,
    max_samples=best_max_samples_rf,
    random_state=42,
    oob_score=True,
)

KN_clf = KNeighborsClassifier(
    n_neighbors=best_n_neighbors,
    p=best_p,
    leaf_size=best_leaf_size,
)

# Decision tree left disabled by the author:
#DT_clf = DecisionTreeClassifier(max_depth=best_max_depth_dt, min_samples_leaf=best_min_samples_leaf_dt)

SV_clf = SVC(
    gamma=best_gamma_sv,
    C=best_c_sv,
    kernel=best_kernel_sv,
    random_state=42,
    probability=True,
)

GBC_clf = xgb.XGBClassifier(
    learning_rate=best_learning_rate_gbc,
    random_state=42,
    colsample_bytree=best_colsample_bytree_gbc,
    max_depth=best_max_depth_gbc,
    n_estimators=best_n_estimators_gbc,
    gamma=best_gamma_gbc,
    subsample=best_subsample_gbc,
)

EX_clf = ExtraTreesClassifier(
    max_depth=best_max_depth_ex,
    n_estimators=best_n_estimators_ex,
    min_samples_leaf=best_min_samples_leaf_ex,
    max_features=best_max_features_ex,
    warm_start=False,
    oob_score=True,
    bootstrap=True,
    random_state=42,
)

LR_clf = LogisticRegression(
    random_state=42,
    solver=best_solver,
    penalty=best_penalty,
    class_weight=best_class_weight,
    C=best_log_C,
)

# (name, estimator) pairs passed to the stack; 'NN' is the KerasClassifier wrapper.
estimators= [('RF', RF_clf), ('GBC', GBC_clf),  ('EX', EX_clf), ('LR',LR_clf), ('KN', KN_clf),
            ('SV', SV_clf), ('NN', NN_clf) ]
# Combine the base learners under a fresh LogisticRegression final estimator.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), n_jobs=-1)
# This call is where the reported error is raised:
# "ValueError: The estimator KerasClassifier should be a classifier."
# y_train is presumably a pandas object; .values.ravel() passes it as a flat array.
clf.fit(X_train, y_train.values.ravel())
# Print the score on the held-out split (never reached while fit() fails).
print("Stacking model score: %.3f" % clf.score(X_test, y_test.values.ravel()))

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-41-272df6aa838e> in <module>
      2             ('SV', SV_clf), ('NN', NN_clf) ]
      3 clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), n_jobs=-1)
----> 4 clf.fit(X_train, y_train.values.ravel())
      5 print("Stacking model score: %.3f" % clf.score(X_test, y_test.values.ravel()))

~\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py in fit(self, X, y, sample_weight)
    411         self._le = LabelEncoder().fit(y)
    412         self.classes_ = self._le.classes_
--> 413         return super().fit(X, self._le.transform(y), sample_weight)
    414 
    415     @if_delegate_has_method(delegate='final_estimator_')

~\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py in fit(self, X, y, sample_weight)
    129         # all_estimators contains all estimators, the one to be fitted and the
    130         # 'drop' string.
--> 131         names, all_estimators = self._validate_estimators()
    132         self._validate_final_estimator()
    133 

~\Anaconda3\lib\site-packages\sklearn\ensemble\_base.py in _validate_estimators(self)
    247                 raise ValueError(
    248                     "The estimator {} should be a {}.".format(
--> 249                         est.__class__.__name__, is_estimator_type.__name__[3:]
    250                     )
    251                 )

ValueError: The estimator KerasClassifier should be a classifier.

我使用的是 scikit-learn 0.22 版（原文写作“2.2”，应为笔误）和 TF 2.x 版。我在此处（here）看到过类似的错误，但不想为此重写代码、改用 mlxtend 库。

【问题讨论】：

标签： tensorflow keras scikit-learn ensemble-learning

【解决方案1】：

出现这个问题的原因，与此处（here）针对 VotingClassifier 报告的类似问题相同。

解决方法是给 KerasClassifier 实例加上属性 _estimator_type = 'classifier'，这样 scikit-learn 在校验时就会把它当作分类器。

注意：提问时请只提供能够重现问题的最少代码。

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from tensorflow.keras import layers
from tensorflow import keras
from keras.constraints import maxnorm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation,  Flatten, Input
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras import metrics
import joblib
from joblib import parallel_backend
np.random.seed(42)
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import sklearn
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

def create_model():
    """Build and compile the small binary classifier used in the stack.

    Takes no arguments, as required for use as ``build_fn`` of
    ``KerasClassifier``. The network expects 20 input features
    (``input_dim=20``).

    Returns:
        A compiled ``Sequential`` model: Dense(20, relu) -> Dropout(0.2) ->
        Flatten -> Dense(1, sigmoid), trained with binary cross-entropy and
        reporting AUC and accuracy.
    """
    model = Sequential()
    model.add(Dense(20, input_dim=20, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(units=1, activation='sigmoid'))

    # `learning_rate` is the current name of the deprecated `lr` argument.
    optimizer = keras.optimizers.RMSprop(learning_rate=0.001)

    # Compile model
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=[keras.metrics.AUC(), 'accuracy'],
    )
    return model


from sklearn.model_selection import train_test_split
# SVC is used below but is not among the imports at the top of this snippet;
# without this line the example stops with a NameError.
from sklearn.svm import SVC

# Wrap the Keras model in the scikit-learn estimator interface.
NN_clf = KerasClassifier(build_fn=create_model, epochs=15, batch_size=32)
# The fix: StackingClassifier rejects the wrapper with "The estimator
# KerasClassifier should be a classifier." unless it is tagged as one.
NN_clf._estimator_type = "classifier"

RF_clf = RandomForestClassifier(random_state=42, oob_score=True)
KN_clf = KNeighborsClassifier()
SV_clf = SVC(random_state=42, probability=True)
EX_clf = ExtraTreesClassifier(random_state=42)
LR_clf = LogisticRegression(random_state=42)

# (name, estimator) pairs for the stack, including the tagged Keras wrapper.
estimators = [
    ('RF', RF_clf),
    ('EX', EX_clf),
    ('LR', LR_clf),
    ('KN', KN_clf),
    ('SV', SV_clf),
    ('NN', NN_clf),
]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Synthetic data with make_classification's defaults, split 70/30.
X, y = make_classification()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf.fit(X_train, y_train)
print("Stacking model score: %.3f" % clf.score(X_test, y_test))

# Example output from the author's run:
# Stacking model score: 0.967

【讨论】：