admin 管理员组文章数量: 887021
2024年3月1日发(作者:mind 编程教程)
# NOTE(review): this section was recovered from a scraped article in which the
# code had been collapsed onto a few lines and most dotted references
# (``np.random``, ``self.test_size``, ``x.shape`` ...) had been stripped.
# The references below were restored from the surrounding usage; the header of
# ``check_random_state`` was outside the visible text and was rebuilt from its
# body and from the call in ``ShuffleSplit._iter_indices`` -- please confirm.
import numbers
from itertools import chain, compress
from math import ceil, floor

import numpy as np
import scipy.sparse as sp


def check_random_state(seed):
    """Turn ``seed`` into a ``numpy.random.RandomState`` instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    numpy.random.RandomState

    Raises
    ------
    ValueError
        If ``seed`` is none of the accepted kinds.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # ``(seed,)`` keeps the %-formatting valid when ``seed`` is itself a
    # tuple, so the caller always gets the intended ValueError.
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % (seed,))


class BaseShuffleSplit:
    """Base class for ShuffleSplit and StratifiedShuffleSplit.

    Subclasses implement ``_iter_indices``; ``split`` validates the inputs
    and relays the (train, test) index pairs it produces.
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        self.n_splits = n_splits
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self._default_test_size = 0.1

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset
            into train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call
        of split. You can make the results identical by setting
        `random_state` to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        for train, test in self._iter_indices(X, y, groups):
            yield train, test

    def _iter_indices(self, X, y=None, groups=None):
        """Generate (train, test) indices; must be provided by subclasses."""
        # Returning None here would make ``split`` fail with an opaque
        # "'NoneType' object is not iterable"; fail explicitly instead.
        raise NotImplementedError(
            "%s must implement _iter_indices" % type(self).__name__)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the
            cross-validator.
        """
        return self.n_splits

    def __repr__(self):
        # NOTE(review): ``_build_repr`` is not defined in this section; it
        # has to be provided elsewhere in the file for repr() to work.
        return _build_repr(self)


class ShuffleSplit(BaseShuffleSplit):
    """Random permutation cross-validator

    Yields indices to split data into training and test sets.

    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Parameters
    ----------
    n_splits : int, default=10
        Number of (train, test) pairs produced by ``split``.

    test_size : float, int or None, default=None
        Passed to ``_validate_shuffle_split``; a float is a proportion in
        (0, 1), an int an absolute number of samples. When both sizes are
        None, ``_default_test_size`` (0.1) is used for the test set.

    train_size : float, int or None, default=None
        Same conventions as ``test_size``, for the training set.

    random_state : None, int or RandomState, default=None
        Passed to ``check_random_state``.

    Notes
    -----
    The remainder of the original docstring (user-guide reference and usage
    example) was truncated in the recovered text.
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
        self._default_test_size = 0.1

    def _iter_indices(self, X, y=None, groups=None):
        """Yield ``n_splits`` random (train, test) index arrays for ``X``."""
        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size,
            default_test_size=self._default_test_size)

        rng = check_random_state(self.random_state)
        for _ in range(self.n_splits):
            # random partition: the head of the permutation is the test
            # set, the next ``n_train`` entries are the training set.
            permutation = rng.permutation(n_samples)
            ind_test = permutation[:n_test]
            ind_train = permutation[n_test:(n_test + n_train)]
            yield ind_train, ind_test


def _num_samples(x):
    """Return number of samples in array-like x.

    Raises
    ------
    TypeError
        If ``x`` has a callable ``fit`` attribute, exposes neither a length
        nor a shape nor ``__array__``, or is a 0-d (singleton) array.
    """
    message = 'Expected sequence or array-like, got %s' % type(x)
    if hasattr(x, 'fit') and callable(x.fit):
        # Don't get num_samples from an ensembles length!
        raise TypeError(message)

    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        if hasattr(x, '__array__'):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, 'shape') and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error


def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length. ``None``
        entries are skipped.

    Raises
    ------
    ValueError
        If the non-None inputs do not all have the same number of samples.
    """
    lengths = [_num_samples(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(l) for l in lengths])


def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterable to
    arrays. Let `None` and indexable objects (e.g. pandas dataframes) pass
    unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)


def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that
    everything can be indexed by converting sparse matrices to csr and
    converting non-iterable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    list
        The converted objects, in the order they were passed.
    """
    result = [_make_indexable(X) for X in iterables]
    check_consistent_length(*result)
    return result


def _validate_shuffle_split(n_samples, test_size, train_size,
                            default_test_size=None):
    """
    Validation helper to check if the train/test sizes are meaningful wrt to
    the size of the data (n_samples)

    Returns
    -------
    n_train, n_test : int
        Absolute number of training and test samples.

    Raises
    ------
    ValueError
        If a size is out of range or of an unsupported type, if the two
        sizes together exceed the data, or if the training set would be
        empty.
    """
    if test_size is None and train_size is None:
        test_size = default_test_size

    # Get the dtype kind of each size: 'i' for ints, 'f' for floats
    # (None yields a kind that is neither).
    test_size_type = np.asarray(test_size).dtype.kind
    train_size_type = np.asarray(train_size).dtype.kind

    if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0)
            or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)):
        raise ValueError('test_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(test_size, n_samples))

    if (train_size_type == 'i'
            and (train_size >= n_samples or train_size <= 0)
            or train_size_type == 'f'
            and (train_size <= 0 or train_size >= 1)):
        raise ValueError('train_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(train_size, n_samples))

    if train_size is not None and train_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for train_size: {}".format(train_size))
    if test_size is not None and test_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for test_size: {}".format(test_size))

    if (train_size_type == 'f' and test_size_type == 'f' and
            train_size + test_size > 1):
        raise ValueError(
            'The sum of test_size and train_size = {}, should be in the (0, 1)'
            ' range. Reduce test_size and/or train_size.'
            .format(train_size + test_size))

    if test_size_type == 'f':
        n_test = ceil(test_size * n_samples)
    elif test_size_type == 'i':
        n_test = float(test_size)

    if train_size_type == 'f':
        n_train = floor(train_size * n_samples)
    elif train_size_type == 'i':
        n_train = float(train_size)

    # Whichever size was left unspecified takes the complement.
    if train_size is None:
        n_train = n_samples - n_test
    elif test_size is None:
        n_test = n_samples - n_train

    if n_train + n_test > n_samples:
        raise ValueError('The sum of train_size and test_size = %d, '
                         'should be smaller than the number of '
                         'samples %d. Reduce test_size and/or '
                         'train_size.' % (n_train + n_test, n_samples))

    n_train, n_test = int(n_train), int(n_test)

    if n_train == 0:
        raise ValueError(
            'With n_samples={}, test_size={} and train_size={}, the '
            'resulting train set will be empty. Adjust any of the '
            'aforementioned parameters.'.format(n_samples, test_size,
                                                train_size)
        )

    return n_train, n_test


def _list_indexing(X, key, key_dtype):
    """Index a Python list."""
    if np.isscalar(key) or isinstance(key, slice):
        # key is a slice or a scalar
        return X[key]
    if key_dtype == 'bool':
        # key is a boolean array-like
        return list(compress(X, key))
    # key is an integer array-like
    return [X[idx] for idx in key]


def _determine_key_type(key, accept_slice=True):
    """Determine the data type of key.

    Parameters
    ----------
    key : scalar, slice or array-like
        The key from which we want to infer the data type.
    accept_slice : bool, default=True
        Whether or not to raise an error if the key is a slice.

    Returns
    -------
    dtype : {'int', 'str', 'bool', None}
        Returns the data type of key.

    Raises
    ------
    TypeError
        If ``key`` is a slice and ``accept_slice`` is False.
    ValueError
        If the type of ``key`` is unsupported or its elements are mixed.
    """
    err_msg = ("No valid specification of the columns. Only a scalar, list or "
               "slice of all integers or all strings, or boolean mask is "
               "allowed")

    dtype_to_str = {int: 'int', str: 'str', bool: 'bool', np.bool_: 'bool'}
    array_dtype_to_str = {'i': 'int', 'u': 'int', 'b': 'bool', 'O': 'str',
                          'U': 'str', 'S': 'str'}

    if key is None:
        return None
    if isinstance(key, tuple(dtype_to_str.keys())):
        try:
            return dtype_to_str[type(key)]
        except KeyError:
            # isinstance matched through a subclass not listed in the table
            raise ValueError(err_msg)
    if isinstance(key, slice):
        if not accept_slice:
            raise TypeError(
                'Only array-like or scalar are supported. '
                'A Python slice was given.'
            )
        if key.start is None and key.stop is None:
            return None
        key_start_type = _determine_key_type(key.start)
        key_stop_type = _determine_key_type(key.stop)
        if key_start_type is not None and key_stop_type is not None:
            if key_start_type != key_stop_type:
                raise ValueError(err_msg)
        if key_start_type is not None:
            return key_start_type
        return key_stop_type
    if isinstance(key, (list, tuple)):
        unique_key = set(key)
        key_type = {_determine_key_type(elt) for elt in unique_key}
        if not key_type:
            return None
        if len(key_type) != 1:
            raise ValueError(err_msg)
        return key_type.pop()
    if hasattr(key, 'dtype'):
        try:
            return array_dtype_to_str[key.dtype.kind]
        except KeyError:
            raise ValueError(err_msg)
    raise ValueError(err_msg)


def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    NOTE(review): this trimmed version validates ``axis`` and ``indices``
    but always delegates to ``_list_indexing``, i.e. it indexes ``X`` along
    its first axis and returns a list for array-like ``indices``.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` are only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiples columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, 'bool' and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if indices is None:
        return X

    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            " column). Got {} instead.".format(axis)
        )

    indices_dtype = _determine_key_type(indices)

    if axis == 0 and indices_dtype == 'str':
        raise ValueError(
            "String indexing is not supported with 'axis=0'"
        )

    if axis == 1 and X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )

    if axis == 1 and indices_dtype == 'str' and not hasattr(X, 'loc'):
        raise ValueError(
            "Specifying the columns using strings is only supported for "
            "pandas DataFrames"
        )

    return _list_indexing(X, indices, indices_dtype)


def train_test_split(*arrays, test_size=None, train_size=None,
                     random_state=None, shuffle=True, stratify=None):
    """Split each of ``arrays`` into a train part and a test part.

    Parameters
    ----------
    *arrays : sequence of indexables with the same length
        At least one is required.
    test_size, train_size : float, int or None, default=None
        Interpreted by ``_validate_shuffle_split``; when both are None the
        test set defaults to a proportion of 0.25.
    random_state : None, int or RandomState, default=None
        Passed to ``ShuffleSplit`` when ``shuffle`` is true.
    shuffle : bool, default=True
        When ``False`` the first ``n_train`` samples form the training set
        and the following ``n_test`` samples the test set.
    stratify : array-like or None, default=None
        NOTE(review): only forwarded as ``y`` to ``ShuffleSplit.split``,
        whose ``_iter_indices`` does not use ``y``, so no stratification is
        performed by this version.

    Returns
    -------
    list
        ``[a_train, a_test, b_train, b_test, ...]`` for the given arrays.

    Raises
    ------
    ValueError
        If no array is given, if the sizes are invalid, or if ``stratify``
        is combined with ``shuffle=False``.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    # indexable() has already verified that all arrays share one length.
    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])

    # Resolve valid absolute train/test sample counts.
    n_train, n_test = _validate_shuffle_split(n_samples, test_size,
                                              train_size,
                                              default_test_size=0.25)

    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        CVClass = ShuffleSplit
        cv = CVClass(test_size=n_test,
                     train_size=n_train,
                     random_state=random_state)
        train, test = next(cv.split(X=arrays[0], y=stratify))

    return list(chain.from_iterable(
        (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays))


# Usage example from the article. Guarded so that importing this module
# neither runs a split nor consumes the global NumPy random state.
if __name__ == "__main__":
    X, y = np.arange(10).reshape((5, 2)), range(5)
    train_test_split(X, y)

# References (the article's reference list is not part of this section).
版权声明:本文标题:阅读源码系列:sklearn中train_test_split使用及源码学习 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.freenas.com.cn/free/1709247620h541289.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论