【问题标题】:IndexError while fitting pipeline with FeatureUnion使用 FeatureUnion 拟合管道时出现 IndexError
【发布时间】:2017-11-25 13:49:47
【问题描述】:

我一直收到一个

IndexError: only integers, slices (:), ellipsis (...), numpy.newaxis (None) and integer or boolean arrays are valid indices

在尝试使我的数据框适合以下管道时。训练和测试是具有相同列的两个数据框。有不同的列,但我只想通过 ItemSelector 关注其中的三个。

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import  OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

    class ItemSelector(BaseEstimator, TransformerMixin):

        def __init__(self, column):
            self.column = column

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X[self.column]


    def predictCases(train, test):
        target_names = sorted(list(set(train['TARGET'].values)))
        y_train  = np.array([target_names.index(x) for x in train['TARGET'].values])
        y_test   = np.array([target_names.index(x) for x in test['TARGET'].values])

        # train and predict
        classifier = Pipeline([
                    ('union', FeatureUnion([

                            ('text', Pipeline([
                                ('selector', ItemSelector(column='TEXT')),
                                ('tfidf_vec', TfidfVectorizer())
                            ])),

                            ('feature1', Pipeline([
                                ('selector', ItemSelector(column='CATEG_FEAT1')),
                                ('lbe', LabelEncoder())
                            ])),

                            ('feature2', Pipeline([
                                ('selector', ItemSelector(column='CATEG_FEAT2')),
                                ('lbe', LabelEncoder())
                            ]))
                    ])),
                    ('clf', OneVsRestClassifier(LinearSVC()))])
        classifier.fit(train.values, y_train)
        predicted = classifier.predict(test.values)
        return(metrics.precision_recall_fscore_support(y_test, predicted))

完全错误:

IndexError                                Traceback (most recent call last)
<ipython-input-19-95d9d0c337f4> in <module>()
----> 1 tt = predictCases(train_resampled, validate)

<ipython-input-17-efc951f4192e> in predictCases(train, test)
     24                 ])),
     25                 ('clf', OneVsRestClassifier(LinearSVC()))])
---> 26     classifier.fit(train.values, y_train)
     27     predicted = classifier.predict(test.values)
     28     return(metrics.precision_recall_fscore_support(y_test, predicted))

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    266             This estimator
    267         """
--> 268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
    270             self._final_estimator.fit(Xt, y, **fit_params)

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
    232                 pass
    233             elif hasattr(transform, "fit_transform"):
--> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    235             else:
    236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
    732             delayed(_fit_transform_one)(trans, name, weight, X, y,
    733                                         **fit_params)
--> 734             for name, trans, weight in self._iter())
    735 
    736         if not result:

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
        324         # Don't delay the application, to avoid keeping the input
        325         # arguments in memory
    --> 326         self.results = batch()
        327 
        328     def get(self):

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
        129 
        130     def __call__(self):
    --> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        132 
        133     def __len__(self):

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params)
        575                        **fit_params):
        576     if hasattr(transformer, 'fit_transform'):
    --> 577         res = transformer.fit_transform(X, y, **fit_params)
        578     else:
        579         res = transformer.fit(X, y, **fit_params).transform(X)

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
        299         """
        300         last_step = self._final_estimator
    --> 301         Xt, fit_params = self._fit(X, y, **fit_params)
        302         if hasattr(last_step, 'fit_transform'):
        303             return last_step.fit_transform(Xt, y, **fit_params)

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
        232                 pass
        233             elif hasattr(transform, "fit_transform"):
    --> 234                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        235             else:
        236                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

    C:\\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
        495         else:
        496             # fit method of arity 2 (supervised transformation)
    --> 497             return self.fit(X, y, **fit_params).transform(X)
        498 
        499 

    <ipython-input-2-fdc42fd9d831> in transform(self, X)
         10 
         11     def transform(self, X):
    ---> 12         return X[self.column]

    IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

编辑:

如果我在 fit 中使用 train 而不是 train.values,我会收到以下错误:

TypeError: fit_transform() takes 2 positional arguments but 3 were given

【问题讨论】:

    标签: python python-3.x scikit-learn pipeline


    【解决方案1】:

    您将 test.values(即具有原始 DataFrame 值的 numpy 数组)传递给 classifier.predict 和 classifier.fit,而您的转换器需要一个 DataFrame 对象。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2021-10-12
      • 2019-11-08
      • 2017-12-20
      • 2018-12-18
      • 2019-06-15
      • 1970-01-01
      相关资源
      最近更新 更多