如何使用 sklearn 预处理管道中的标签？答案

【问题标题】：How do you preprocess labels in a pipeline with sklearn?如何使用 sklearn 预处理管道中的标签？
【发布时间】：2021-12-18 01:34:27
【问题描述】：

我有一个预处理脚本，它从钻石数据集中获取数据并预处理数据。我显然也需要它来预处理标签。

这是我的代码：

# Data Preprocessing

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from icecream import ic


def diamond_preprocess(data_dir):
    data = pd.read_csv(data_dir)
    cleaned_data = data.drop(['id', 'depth_percent'], axis=1)  # Features I don't want

    x = cleaned_data.drop(['price'], axis=1)  # Train data
    y = cleaned_data['price']  # Label data

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=99)

    numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Fill in missing data with median
        ('scaler', StandardScaler())  # Scale data
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill in missing data with 'missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical data
    ])

    preprocessor_pipeline = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit to the training data
    preprocessor_pipeline.fit(x_train)
    preprocessor_pipeline.fit(y_train)

    # Apply the pipeline to the training and test data
    x_train_pipe = preprocessor_pipeline.transform(x_train)
    x_test_pipe = preprocessor_pipeline.transform(x_test)
    y_train_pipe = preprocessor_pipeline.transform(y_train)
    y_test_pipe = preprocessor_pipeline.transform(y_test)

    x_train = pd.DataFrame(data=x_train_pipe)
    x_test = pd.DataFrame(data=x_test_pipe)
    y_train = pd.DataFrame(data=y_train_pipe)
    y_test = pd.DataFrame(data=y_test_pipe)

    return x_train, x_test, y_train, y_test

我不太确定我的代码是否正确，或者我对 sklearn 中管道和预处理的工作方式有很好的了解。显然，当我收到此错误时，口译员同意：

     File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 470, in fit
    self.fit_transform(X, y=y)
  File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 502, in fit_transform
    self._check_n_features(X, reset=True)
  File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\base.py", line 352, in _check_n_features
    n_features = X.shape[1]
IndexError: tuple index out of range

如何像处理训练数据一样正确预处理标签？一个解释也很好！

【问题讨论】：

标签： python pandas dataframe machine-learning scikit-learn

【解决方案1】：

如果您想单独应用转换，您可以为目标列创建额外的管道，请参见下面的示例。

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# generate the data
data = pd.DataFrame({
    'y':  [1, 2, np.nan, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f']
})

# extract the features and target
x = data.drop(labels=['y'], axis=1)
y = data[['y']]  # note that this is a data frame, not a series

# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=99)

# map the features to the corresponding types (numerical or categorical)
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()

# define the features pipeline
numerical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

features_pipeline = ColumnTransformer(transformers=[
    ('num_features', numerical_features_transformer, numerical_features),
    ('cat_features', categorical_features_transformer, categorical_features)
])

# define the target pipeline
target_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# fit the pipelines to the training data
features_pipeline.fit(x_train)
target_pipeline.fit(y_train)

# apply the pipelines to the training and test data
x_train_pipe = features_pipeline.transform(x_train)
x_test_pipe = features_pipeline.transform(x_test)

y_train_pipe = target_pipeline.transform(y_train)
y_test_pipe = target_pipeline.transform(y_test)

x_train = pd.DataFrame(data=x_train_pipe)
x_test = pd.DataFrame(data=x_test_pipe)

y_train = pd.DataFrame(data=y_train_pipe)
y_test = pd.DataFrame(data=y_test_pipe)

【讨论】：