阅读源码系列:sklearn中train_test_split使用及源码学习-FreeNAS中文网

admin 管理员组

文章数量: 887021

2024年3月1日发(作者：mind 编程教程)

    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)


class BaseShuffleSplit():
    """Base class for ShuffleSplit and StratifiedShuffleSplit"""

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        self.n_splits = n_splits
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self._default_test_size = 0.1

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        for train, test in self._iter_indices(X, y, groups):
            yield train, test

    def _iter_indices(self, X, y=None, groups=None):
        """Generate (train, test) indices"""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits

    def __repr__(self):
        return _build_repr(self)


class ShuffleSplit(BaseShuffleSplit):
    """Random permutation cross-validator

    Yields indices to split data into training and test sets.

    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    # >>> import numpy as np
    # >>> from sklearn.model_selection import ShuffleSplit
    # >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])
    # >>> y = np.array([1, 2, 1, 2, 1, 2])
    # >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
    # >>> rs.get_n_splits(X)
    # 5
    # >>> print(rs)
    # ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None)
    # >>> for train_index, test_index in rs.split(X):
    # ...     print("TRAIN:", train_index, "TEST:", test_index)
    # TRAIN: [1 3 0 4] TEST: [5 2]
    # TRAIN: [4 0 2 5] TEST: [1 3]
    # TRAIN: [1 2 4 0] TEST: [3 5]
    # TRAIN: [3 4 1 0] TEST: [5 2]
    # TRAIN: [3 5 1 0] TEST: [2 4]
    # >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,
    # ...                   random_state=0)
    # >>> for train_index, test_index in rs.split(X):
    # ...     print("TRAIN:", train_index, "TEST:", test_index)
    TRAIN: [1 3 0] TEST: [5 2]
    TRAIN: [4 0 2] TEST: [1 3]
    TRAIN: [1 2 4] TEST: [3 5]
    TRAIN: [3 4 1] TEST: [5 2]
    TRAIN: [3 5 1] TEST: [2 4]
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
        self._default_test_size = 0.1

    def _iter_indices(self, X, y=None, groups=None):
        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size,
            default_test_size=self._default_test_size)

        rng = check_random_state(self.random_state)
        for i in range(self.n_splits):
            # random partition
            permutation = rng.permutation(n_samples)
            ind_test = permutation[:n_test]
            ind_train = permutation[n_test:(n_test + n_train)]
            yield ind_train, ind_test


def _num_samples(x):
    """Return number of samples in array-like x."""
    message = 'Expected sequence or array-like, got %s' % type(x)
    if hasattr(x, 'fit') and callable(x.fit):
        # Don't get num_samples from an ensembles length!
        raise TypeError(message)

    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        if hasattr(x, '__array__'):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, 'shape') and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error


def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.
    """
    lengths = [_num_samples(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)

    if len(uniques) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(l) for l in lengths])


def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterable to arrays.
    Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)


def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-interable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.
    """
    result = [_make_indexable(X) for X in iterables]
    check_consistent_length(*result)
    return result


def _validate_shuffle_split(n_samples, test_size, train_size,
                            default_test_size=None):
    """
    Validation helper to check if the test/test sizes are meaningful wrt to the
    size of the data (n_samples)
    """
    if test_size is None and train_size is None:
        test_size = default_test_size

    # 获得数据类型
    test_size_type = np.asarray(test_size).dtype.kind
    train_size_type = np.asarray(train_size).dtype.kind

    if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0)
            or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)):
        raise ValueError('test_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(test_size, n_samples))

    if (train_size_type == 'i' and (train_size >= n_samples or train_size <= 0)
            or train_size_type == 'f' and (train_size <= 0 or train_size >= 1)):
        raise ValueError('train_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(train_size, n_samples))

    if train_size is not None and train_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for train_size: {}".format(train_size))
    if test_size is not None and test_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for test_size: {}".format(test_size))

    if (train_size_type == 'f' and test_size_type == 'f' and
            train_size + test_size > 1):
        raise ValueError(
            'The sum of test_size and train_size = {}, should be in the (0, 1)'
            ' range. Reduce test_size and/or train_size.'
            .format(train_size + test_size))

    if test_size_type == 'f':
        n_test = ceil(test_size * n_samples)
    elif test_size_type == 'i':
        n_test = float(test_size)

    if train_size_type == 'f':
        n_train = floor(train_size * n_samples)
    elif train_size_type == 'i':
        n_train = float(train_size)

    if train_size is None:
        n_train = n_samples - n_test
    elif test_size is None:
        n_test = n_samples - n_train

    if n_train + n_test > n_samples:
        raise ValueError('The sum of train_size and test_size = %d, '
                         'should be smaller than the number of '
                         'samples %d. Reduce test_size and/or '
                         'train_size.' % (n_train + n_test, n_samples))

    n_train, n_test = int(n_train), int(n_test)

    if n_train == 0:
        raise ValueError(
            'With n_samples={}, test_size={} and train_size={}, the '
            'resulting train set will be empty. Adjust any of the '
            'aforementioned parameters.'.format(n_samples, test_size,
                                                train_size)
        )

    return n_train, n_test


def _list_indexing(X, key, key_dtype):
    """Index a Python list."""
    if np.isscalar(key) or isinstance(key, slice):
        # key is a slice or a scalar
        return X[key]
    if key_dtype == 'bool':
        # key is a boolean array-like
        return list(compress(X, key))
    # key is a integer array-like of key
    return [X[idx] for idx in key]


def _determine_key_type(key, accept_slice=True):
    """Determine the data type of key.

    Parameters
    ----------
    key : scalar, slice or array-like
        The key from which we want to infer the data type.

    accept_slice : bool, default=True
        Whether or not to raise an error if the key is a slice.


    Returns
    -------
    dtype : {'int', 'str', 'bool', None}
        Returns the data type of key.
    """
    err_msg = ("No valid specification of the columns. Only a scalar, list or "
               "slice of all integers or all strings, or boolean mask is "
               "allowed")

    dtype_to_str = {int: 'int', str: 'str', bool: 'bool', np.bool_: 'bool'}
    array_dtype_to_str = {'i': 'int', 'u': 'int', 'b': 'bool', 'O': 'str',
                          'U': 'str', 'S': 'str'}

    if key is None:
        return None
    if isinstance(key, tuple(dtype_to_str.keys())):
        try:
            return dtype_to_str[type(key)]
        except KeyError:
            raise ValueError(err_msg)
    if isinstance(key, slice):
        if not accept_slice:
            raise TypeError(
                'Only array-like or scalar are supported. '
                'A Python slice was given.'
            )
        if key.start is None and key.stop is None:
            return None
        key_start_type = _determine_key_type(key.start)
        key_stop_type = _determine_key_type(key.stop)
        if key_start_type is not None and key_stop_type is not None:
            if key_start_type != key_stop_type:
                raise ValueError(err_msg)
        if key_start_type is not None:
            return key_start_type
        return key_stop_type
    if isinstance(key, (list, tuple)):
        unique_key = set(key)
        key_type = {_determine_key_type(elt) for elt in unique_key}
        if not key_type:
            return None
        if len(key_type) != 1:
            raise ValueError(err_msg)
        return key_type.pop()
    if hasattr(key, 'dtype'):
        try:
            return array_dtype_to_str[key.dtype.kind]
        except KeyError:
            raise ValueError(err_msg)
    raise ValueError(err_msg)


def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` are only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiples columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, 'bool' and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if indices is None:
        return X

    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            " column). Got {} instead.".format(axis)
        )

    indices_dtype = _determine_key_type(indices)

    if axis == 0 and indices_dtype == 'str':
        raise ValueError(
            "String indexing is not supported with 'axis=0'"
        )

    if axis == 1 and X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )

    if axis == 1 and indices_dtype == 'str' and not hasattr(X, 'loc'):
        raise ValueError(
            "Specifying the columns using strings is only supported for "
            "pandas DataFrames"
        )

    return _list_indexing(X, indices, indices_dtype)


def train_test_split(*arrays, test_size=None, train_size=None,
                     random_state=None, shuffle=True, stratify=None):
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    # 已经判断都具有相同长度
    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])

    # 获得train和test合法数字
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        CVClass = ShuffleSplit
        cv = CVClass(test_size=n_test,
                     train_size=n_train,
                     random_state=random_state)
        train, test = next(cv.split(X=arrays[0], y=stratify))

    return list(chain.from_iterable(
        (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays))


X, y = np.arange(10).reshape((5, 2)), range(5)
train_test_split(X, y)

参考资料

本文标签：编程获得合法长度判断

版权声明：本文标题：《阅读源码系列:sklearn中train_test_split使用及源码学习》。内容由网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：http://www.freenas.com.cn/free/1709247620h541289.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

技术交流 – FreeNAS中文网

阅读源码系列:sklearn中train_test_split使用及源码学习

更多相关文章

S7-200的编程软件STEP-MicroWIN V4.0的安装与使用

数控加工技术学习中的常用软件介绍

IF函数对多个条件的判断编写方法

少儿编程的好处

学习编程对孩子的好处

小孩学编程好吗

少儿编程教育的现状与前景

少儿编程:提升孩子的逻辑思维

2345网址大全-注册码-注册地址-免费下载

编程信息学学习高中生的故事

C语言书籍推荐

常用的计算机程序设计语言

小学五年级编程游戏代码

Excel VBA编程 典型实例—拼图游戏

funcode飞机大战编程c语言

魔兽世界宏命令大全

2020年春【中石油】商务英语阅读第一次在线作业(标准)

Windows文件名长度限制

解除Windows系统中文件名和目录路径的最大长度限制

windows@环境变量配置和使用@命令行配置环境变量@环境变量路径长度限制解除

发表评论

推荐文章

中国金贤敏量子计算机量子加速技术,我国科学家首次实现快速到达量子加速算法...

统计列表加小计

2855 游乐园的迷宫

理清SpringBoot CURD处理逻辑、顺序

win7任务栏时钟显示秒_如何使Windows 10的任务栏时钟显示秒数

热门文章

智慧城市建设，选择紫光云的连云港有啥不一样？

Semantic Textual Similarity (STS)

数学图形网站推荐转载 http:xuxzmail.blog.163.comblogstatic2513191620097983241171

Python 打包 出现 NameError: name 'help' is not defined 问题记录

CBNData发布在线票务平台数据报告 淘票票份额升至第二

如何提高信用分，跨过借钱难的门槛

DM数据库安装（windows单机）

解决Windows7 Path环境变量过长的方法

windows7英文版，变为中文版

在其它品牌电脑，非小米电脑安装小米电脑管家，实现与小米设备跨端互联

最新文章

Raid技术

LSI_阵列卡操作手册

破解Centos7_root用户密码

Redhat重置Root用户密码方法

远程批量修改linux服务器密码的脚本

Windows7 系统安全设置权限技巧

（Windows系统）详细介绍Windows系统 含有英文版

最新Windows 11教育版下载：专为教育设计的系统！

Win7系统下搭建NFS服务器

零基础使用UltraISO制作并安装纯净Win10系统指南

Excel VBA编程典型实例—拼图游戏

Python 打包出现 NameError: name 'help' is not defined 问题记录

CBNData发布在线票务平台数据报告淘票票份额升至第二

（Windows系统）详细介绍Windows系统含有英文版