已有人在 scikit-learn 上请求过这个功能,我也为此提交了一个 PR,目前代码正在等待审查。
这段代码曾在最近的一次 Kaggle 竞赛中使用,并取得了不错的效果。
import numpy as np
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.

    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.

    Groups are ordered by their first appearance in ``groups``. In each
    split, the test groups come after all training groups in that order,
    so shuffling in the cross validator is inappropriate.

    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.

    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds, which
    is ``n_splits + 1``). When the number of groups is not a multiple of
    the number of folds, the surplus groups are the earliest ones and are
    part of every training set.

    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them
    (unless ``max_train_size`` truncates them).

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.

    max_train_size : int, default=None
        Maximum number of samples in a single training set. When a
        training set is larger, only the samples with the highest indices
        are kept. ``None`` (or ``0``) disables the limit.

    Examples
    --------
    >>> import numpy as np
    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',
    ...                    'b', 'b', 'b', 'b', 'b',
    ...                    'c', 'c', 'c', 'c',
    ...                    'd', 'd', 'd'])
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    ...     print("TRAIN GROUP:", groups[train_idx])
    ...     print("TEST GROUP:", groups[test_idx])
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']
    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']
    TEST GROUP: ['c' 'c' 'c' 'c']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] TEST: [15, 16, 17]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']
    TEST GROUP: ['d' 'd' 'd']
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset
            into train/test set. Required.

        Yields
        ------
        train : list of int
            The training set indices for that split, in ascending order.

        test : list of int
            The testing set indices for that split, in ascending order.

        Raises
        ------
        ValueError
            If ``groups`` is None, or if ``n_splits + 1`` exceeds the
            number of distinct groups.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Work on a plain array so that iteration and indexing are purely
        # positional, whatever container the caller passed in.
        groups = np.asarray(groups)

        n_splits = self.n_splits
        n_folds = n_splits + 1

        # np.unique returns sorted labels; reorder them by the position of
        # their first occurrence so groups keep their order of appearance.
        labels, first_seen = np.unique(groups, return_index=True)
        unique_groups = labels[np.argsort(first_seen)]
        n_groups = len(unique_groups)

        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Map every group label to the positions of its samples.
        group_indices = {}
        for idx, label in enumerate(groups):
            group_indices.setdefault(label, []).append(idx)

        group_test_size = n_groups // n_folds
        # The first test fold starts late enough that exactly n_splits test
        # folds of group_test_size groups fit before the end.
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)

        for group_test_start in group_test_starts:
            train_indices = sorted(
                idx
                for label in unique_groups[:group_test_start]
                for idx in group_indices[label])
            n_train = len(train_indices)
            if self.max_train_size and self.max_train_size < n_train:
                # Keep only the max_train_size highest sample indices.
                train_indices = train_indices[
                    n_train - self.max_train_size:n_train]

            group_test_stop = group_test_start + group_test_size
            test_indices = sorted(
                idx
                for label in unique_groups[group_test_start:group_test_stop]
                for idx in group_indices[label])

            yield train_indices, test_indices
下面以 GridSearchCV 为例,代码改编自 Stack Overflow 上的一个帖子(原文此处附有链接)。
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np
# Toy data set: six samples that fall into three groups ('a', 'b', 'c').
groups = np.array(['a', 'a', 'a', 'b', 'b', 'c'])
features_by_row = [
    [4, 5, 6, 1, 0, 2],
    [3.1, 3.5, 1.0, 2.1, 8.3, 1.1],
]
# Transpose so that each sample is a row with two features.
X = np.array(features_by_row).T
y = np.array([1, 6, 7, 1, 2, 3])

# Two splits require three folds, i.e. at least three distinct groups.
tscv = GroupTimeSeriesSplit(n_splits=2)
param_search = {'max_depth': [3, 5]}
model = xgb.XGBRegressor()

gsearch = GridSearchCV(estimator=model,
                       param_grid=param_search,
                       cv=tscv)
# The group labels are handed to fit() so the splitter can use them.
gsearch.fit(X, y, groups=groups)