【发布时间】:2020-09-25 22:12:56
【问题描述】:
我正在尝试使用 Scikit-learn 构建 SVM 集成模型,特别是优化其超参数。我会随机地遇到以下错误:
File "C:\Users\jakub\anaconda3\envs\SVM_ensembles\lib\site-packages\sklearn\svm\_base.py", line 250, in _dense_fit
self.probB_, self.fit_status_ = libsvm.fit(
File "sklearn\svm\_libsvm.pyx", line 191, in sklearn.svm._libsvm.fit
ValueError: Invalid input - all samples with positive weights have the same label.
据我了解,这个错误来自文件 https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/svm/src/libsvm/svm.cpp,原因是传入 SVM 的样本全部属于同一个类别。我正在使用分层 K 折交叉验证,并且数据集相当平衡(一类占 45%,另一类占 55%),所以按理说不应该发生这种情况。
我能做什么?
优化引发错误的代码:
def get_best_ensemble_params(X_train, y_train, X_test, y_test, n_tries=5, max_retries=10):
    """Run several Bayesian hyper-parameter searches and return the best parameters.

    Each try fits a fresh ``BayesSearchCV`` over ``SVMEnsemble`` on the training
    data and scores the refitted search on ``(X_test, y_test)``. The parameters
    of the search with the highest test accuracy are returned.

    Args:
        X_train: Training features passed to ``BayesSearchCV.fit``.
        y_train: Training labels passed to ``BayesSearchCV.fit``.
        X_test: Features used to compare the searches against each other.
        y_test: Labels used to compare the searches against each other.
        n_tries: Number of independent searches to run.
        max_retries: How many times a single try may be restarted after
            ``fit`` raises ``ValueError`` before the error is propagated.

    Returns:
        The ``best_params_`` of the search with the highest test accuracy.

    Raises:
        ValueError: If ``n_tries`` is smaller than 1, or if ``fit`` keeps
            failing after ``max_retries`` restarts of the same try.
    """
    if n_tries < 1:
        raise ValueError("n_tries must be at least 1.")

    search_spaces = {
        "max_samples": Real(0.1, 1, "uniform"),
        "max_features": Real(0.1, 1, "uniform"),
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-6, 1e+6, "log-uniform"),
        "gamma": Real(1e-6, 1e+1, "log-uniform")
    }

    best_accuracy = 0
    best_model = None
    for i in range(n_tries):
        failures = 0
        while True:
            optimizer = BayesSearchCV(SVMEnsemble(), search_spaces, cv=3, n_iter=10, n_jobs=-1, n_points=10,
                                      verbose=1)
            try:
                optimizer.fit(X_train, y_train)
            except ValueError as error:
                # The fit fails intermittently (e.g. libsvm rejecting a
                # single-class sample), so restart the search, but only a
                # bounded number of times and never silently.
                failures += 1
                if failures > max_retries:
                    raise
                print(i, "fit failed, retrying:", error)
                continue
            break

        # accuracy_score needs predicted labels, not the search object itself.
        accuracy = accuracy_score(y_test, optimizer.predict(X_test))
        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = optimizer
        print(i, "job done")

    return best_model.best_params_
if __name__ == "__main__":
    # Script entry point: load one dataset, tune the ensemble, then train and
    # evaluate a final model with the tuned parameters.
    dataset_name = "acute_inflammations"
    loading_functions = {
        "acute_inflammations": load_acute_inflammations,
        "breast_cancer_coimbra": load_breast_cancer_coimbra,
        "breast_cancer_wisconsin": load_breast_cancer_wisconsin
    }
    X, y = loading_functions[dataset_name]()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # Reuse the statistics learned on the training split; re-fitting on the
    # test split would scale the two splits differently.
    X_test = scaler.transform(X_test)

    params = get_best_ensemble_params(X_train, y_train, X_test, y_test)
    params["n_jobs"] = -1
    params["random_state"] = 0

    model = SVMEnsemble(n_estimators=20, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
我的自定义 SVMEnsemble 只是一个把基估计器硬编码为 SVC 的 BaggingClassifier:
import inspect
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from skopt import BayesSearchCV
# Keyword names that SVMEnsemble forwards to the SVC base estimator.
svm_possible_args = {
    "C",
    "kernel",
    "degree",
    "gamma",
    "coef0",
    "shrinking",
    "probability",
    "tol",
    "cache_size",
    "class_weight",
    "max_iter",
    "decision_function_shape",
    "break_ties",
}

# Keyword names that SVMEnsemble forwards to the BaggingClassifier constructor.
bagging_possible_args = {
    "n_estimators",
    "max_samples",
    "max_features",
    "bootstrap",
    "bootstrap_features",
    "oob_score",
    "warm_start",
    "n_jobs",
}

# Keyword names that are forwarded to both the SVC and the BaggingClassifier.
common_possible_args = {
    "random_state",
    "verbose",
}
class SVMEnsemble(BaggingClassifier):
    """Bagging ensemble whose base estimator is always an ``SVC``.

    Keyword arguments are split by name between the ``SVC`` template and the
    ``BaggingClassifier`` constructor using the module-level sets
    ``svm_possible_args``, ``bagging_possible_args`` and
    ``common_possible_args``. ``C``, ``gamma``, ``kernel`` and
    ``voting_method`` are exposed as properties; assigning to one of them
    rebuilds the ``SVC`` template so the next ``fit`` uses the new value.

    Args:
        voting_method: ``"hard"`` for majority voting or ``"soft"`` for
            voting on summed class probabilities (enables
            ``probability=True`` on the SVC).
        n_jobs, n_estimators, max_samples, max_features: Forwarded to
            ``BaggingClassifier``.
        C, kernel, gamma: Forwarded to ``SVC``.
        **kwargs: Any other name listed in the three module-level sets.

    Raises:
        ValueError: If ``voting_method`` is not ``"hard"``/``"soft"`` or a
            keyword argument is not in any of the known name sets.
    """

    def __init__(self, voting_method="hard", n_jobs=-1,
                 n_estimators=10, max_samples=1.0, max_features=1.0,
                 C=1.0, kernel="linear", gamma="scale",
                 **kwargs):
        if voting_method not in {"hard", "soft"}:
            raise ValueError(f"voting_method {voting_method} is not recognized.")

        self._voting_method = voting_method
        self._C = C
        self._gamma = gamma
        self._kernel = kernel

        # 1024 is only a default: an explicitly passed cache_size is kept.
        kwargs.setdefault("cache_size", 1024)
        kwargs.update({
            "n_jobs": n_jobs,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "C": C,
            "gamma": gamma,
        })

        svm_args = {
            "probability": voting_method == "soft",
            "kernel": kernel
        }
        bagging_args = dict()

        # Route every argument to the estimator(s) that understand it.
        for arg_name, arg_val in kwargs.items():
            if arg_name in svm_possible_args:
                svm_args[arg_name] = arg_val
            elif arg_name in bagging_possible_args:
                bagging_args[arg_name] = arg_val
            elif arg_name in common_possible_args:
                svm_args[arg_name] = arg_val
                bagging_args[arg_name] = arg_val
            else:
                # Report the offending argument name, not the voting method.
                raise ValueError(f"argument {arg_name} is not recognized.")

        self.svm_args = svm_args
        self.bagging_args = bagging_args

        base_estimator = SVC(**svm_args)
        super().__init__(base_estimator=base_estimator, **bagging_args)

    def _refresh_base_estimator(self):
        """Replace the SVC template with one built from the current ``svm_args``.

        Only the template is swapped. Re-running the bagging constructor here
        would reset bagging attributes (e.g. ``max_samples``) that were changed
        after construction back to the values stored in ``bagging_args``.
        """
        self.base_estimator = SVC(**self.svm_args)

    @property
    def voting_method(self):
        """Current voting scheme, ``"hard"`` or ``"soft"``."""
        return self._voting_method

    @voting_method.setter
    def voting_method(self, new_voting_method):
        if new_voting_method not in {"hard", "soft"}:
            raise ValueError(f"voting_method {new_voting_method} is not recognized.")

        was_soft = self._voting_method == "soft"
        self._voting_method = new_voting_method
        # The SVC only needs rebuilding when probability estimates are being
        # switched on, or switched off after having been on.
        if new_voting_method == "soft" or was_soft:
            self.svm_args["probability"] = new_voting_method == "soft"
            self._refresh_base_estimator()

    @property
    def C(self):
        """Regularization parameter ``C`` of the SVC template."""
        return self._C

    @C.setter
    def C(self, new_C):
        self._C = new_C
        self.svm_args["C"] = new_C
        self._refresh_base_estimator()

    @property
    def gamma(self):
        """Kernel coefficient ``gamma`` of the SVC template."""
        return self._gamma

    @gamma.setter
    def gamma(self, new_gamma):
        self._gamma = new_gamma
        self.svm_args["gamma"] = new_gamma
        self._refresh_base_estimator()

    @property
    def kernel(self):
        """Kernel name of the SVC template."""
        return self._kernel

    @kernel.setter
    def kernel(self, new_kernel):
        self._kernel = new_kernel
        self.svm_args["kernel"] = new_kernel
        self._refresh_base_estimator()

    def predict(self, X):
        """Predict class labels for ``X`` using the configured voting scheme.

        Args:
            X: Samples to classify.

        Returns:
            Predicted class label for each sample.

        Raises:
            ValueError: If the stored voting method is neither ``"hard"`` nor
                ``"soft"``.
        """
        if self._voting_method == "hard":
            return super().predict(X)
        elif self._voting_method == "soft":
            # Let the bagging implementation combine the per-estimator
            # probabilities: it feeds each sub-estimator only the feature
            # subset it was trained on, which matters when max_features < 1.
            probabilities = super().predict_proba(X)
            return self.classes_[np.argmax(probabilities, axis=1)]
        else:
            raise ValueError(f"voting_method {self._voting_method} is not recognized.")
【问题讨论】:
-
您应该发布更多代码和最小可验证示例,以便其他人可以对代码有所了解。
-
@hadik 我照你说的做了
-
您的完整数据集中有多少示例? (你提到的那个在问题中被分成 45/55)。
-
@AlexanderPivovarov 我已经在 3 个数据集上进行了尝试,每个数据集都用于二分类,两个类别的比例约为 45/55。一个有 120 个样本,一个有 160 个,还有一个有 600 个。每个数据集都会出现这个问题。我不明白为什么即使使用分层 K 折交叉验证也会发生这种情况,因为它应该让每一折都同时包含这两个类。
标签: python machine-learning scikit-learn svm