【发布时间】:2021-07-11 05:50:18
【问题描述】:
我正在基于 UCI 机器学习库(UCI ML Repository)的汽车数据集做一个机器学习项目。
数据集有 6 个特征和 1 个标签。问题是这 6 个特征列都是 object(字符串)类型,我该如何使用 OneHotEncoder 和 ColumnTransformer 把它们转换为浮点数?请在回答中附上示例代码。
下面是我用来做转换的代码。有人可以解释一下其中所有参数以及它们的用途吗?
# Build the feature matrix X by running the whole `cars` table through a
# ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# transformers: a list of (name, transformer, columns) tuples. Here a single
# step named 'encoder' applies OneHotEncoder to the column at index 3 only.
# remainder='passthrough': every column NOT listed above is kept unchanged
# instead of being dropped.
# NOTE(review): because only column 3 is encoded, the other columns are
# passed through as-is; the printed output below still shows string values
# such as 'small', 'low' and 'unacc'.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
#print(ct)
# NOTE(review): `np` and `cars` are not defined in this snippet; presumably
# numpy and the loaded car dataset - confirm against the full notebook.
# fit_transform is given the entire `cars` object, so no column is removed
# before building X.
X = np.array(ct.fit_transform(cars))
print(X)
运行这段代码后,我得到了以下输出:
[[1.0 0.0 1.0 ... 'small' 'low' 'unacc']
[1.0 0.0 1.0 ... 'small' 'med' 'unacc']
[1.0 0.0 1.0 ... 'small' 'high' 'unacc']
...
[1.0 0.0 1.0 ... 'big' 'low' 'unacc']
[1.0 0.0 1.0 ... 'big' 'med' 'good']
[1.0 0.0 1.0 ... 'big' 'high' 'vgood']]
对 y 我也做了同样的处理:
# Build y with the same approach, this time one-hot encoding the column at
# index 6.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Single 'encoder' step: OneHotEncoder on column index 6; all remaining
# columns are passed through unchanged (remainder='passthrough').
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [6])], remainder='passthrough')
#print(ct)
# NOTE(review): fit_transform is again applied to the whole `cars` object, so
# y holds the encoded column plus every passed-through column rather than a
# single target column; the printed array below has dtype=object and still
# contains strings.
y = np.array(ct.fit_transform(cars))
print(y)
而输出是
array([[0.0, 1.0, 1.0, ..., 'small', 'low', 'unacc'],
[0.0, 1.0, 1.0, ..., 'small', 'med', 'unacc'],
[0.0, 1.0, 1.0, ..., 'small', 'high', 'unacc'],
...,
[0.0, 1.0, 1.0, ..., 'big', 'low', 'unacc'],
[0.0, 1.0, 1.0, ..., 'big', 'med', 'good'],
[0.0, 1.0, 1.0, ..., 'big', 'high', 'vgood']], dtype=object)
但是当我尝试训练模型时
# Split X and y into train/test parts and fit a linear regression model.
from sklearn.model_selection import train_test_split
# test_size = 0.2 holds out 20% of the rows for testing; random_state = 0
# fixes the shuffle seed so the split is reproducible.
X_train, X_test, y_train , y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Bare expression: does nothing in a plain script; presumably used to display
# the value in a notebook cell (the traceback below shows an ipython input).
y_test
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# This call raises the ValueError shown below
# ("could not convert string to float: 'low'"): the arrays built above still
# contain un-encoded string values.
# NOTE(review): the label values seen in the output ('unacc', 'good',
# 'vgood') look categorical, so a classifier may be more appropriate than
# LinearRegression - confirm the intended task.
regressor.fit(X_train, y_train)
错误:
ValueError Traceback (most recent call last)
<ipython-input-127-19f7ef69ea28> in <module>
1 from sklearn.linear_model import LinearRegression
2 regressor = LinearRegression()
----> 3 regressor.fit(X_train, y_train)
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\sklearn\linear_model\_base.py in fit(self, X, y, sample_weight)
517
518 X, y = self._validate_data(X, y, accept_sparse=accept_sparse,
--> 519 y_numeric=True, multi_output=True)
520
521 if sample_weight is not None:
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
431 y = check_array(y, **check_y_params)
432 else:
--> 433 X, y = check_X_y(X, y, **check_params)
434 out = X, y
435
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
876 ensure_min_samples=ensure_min_samples,
877 ensure_min_features=ensure_min_features,
--> 878 estimator=estimator)
879 if multi_output:
880 y = check_array(y, accept_sparse='csr', force_all_finite=True,
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671 array = array.astype(dtype, casting="unsafe", copy=False)
672 else:
--> 673 array = np.asarray(array, order=order, dtype=dtype)
674 except ComplexWarning as complex_warning:
675 raise ValueError("Complex data not supported\n"
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order, like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
ValueError: could not convert string to float: 'low'
【问题讨论】:
-
请阅读ml标签的描述。
标签: python machine-learning scikit-learn one-hot-encoding