  1. """This file includes multilabel cross validators based on an implementation of
  2. the Iterative Stratification algorithm described in the following paper:
  3. Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-
  4. Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M. (eds)
  5. Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2011. Lecture
  6. Notes in Computer Science, vol 6913. Springer, Berlin, Heidelberg.
  7. From scikit-learn 0.19.0, StratifiedKFold, RepeatedStratifiedKFold, and
  8. StratifiedShuffleSplit were copied and modified, retaining compatibility
  9. with scikit-learn.
  10. Attribution to authors of scikit-learn/model_selection/_split.py under BSD 3 clause:
  11. Alexandre Gramfort <alexandre.gramfort@inria.fr>,
  12. Gael Varoquaux <gael.varoquaux@normalesup.org>,
  13. Olivier Grisel <olivier.grisel@ensta.org>,
  14. Raghav RV <rvraghav93@gmail.com>
  15. """
  16. # Author: Trent J. Bradberry <trentjason@hotmail.com>
  17. # License: BSD 3 clause
  18. import numpy as np
  19. from sklearn.utils import check_random_state
  20. from sklearn.utils.validation import _num_samples, check_array
  21. from sklearn.utils.multiclass import type_of_target
  22. from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \
  23. BaseShuffleSplit, _validate_shuffle_split


def IterativeStratification(labels, r, random_state):
  25. """This function implements the Iterative Stratification algorithm described
  26. in the following paper:
  27. Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
  28. Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
  29. (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
  30. 2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
  31. Heidelberg.
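
    Parameters
    ----------
    labels : array-like, shape (n_samples, n_labels)
        Boolean multilabel indicator matrix to stratify.
    r : array-like, shape (n_folds,)
        Desired proportion of samples in each fold; the entries should sum
        to one.
    random_state : RandomState instance
        Random number generator used to break ties.

    Returns
    -------
    test_folds : ndarray, shape (n_samples,)
        The fold index assigned to each sample.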
  32. """
    n_samples = labels.shape[0]
    test_folds = np.zeros(n_samples, dtype=int)

    # Calculate the desired number of examples at each subset
    c_folds = r * n_samples

    # Calculate the desired number of examples of each label at each subset
    c_folds_labels = np.outer(r, labels.sum(axis=0))
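    # c_folds_labels has shape (n_folds, n_labels): entry (i, j) is the number
    # of examples with label j still wanted in fold i.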

    labels_not_processed_mask = np.ones(n_samples, dtype=bool)

    while np.any(labels_not_processed_mask):
        # Find the label with the fewest (but at least one) remaining examples,
        # breaking ties randomly
        num_labels = labels[labels_not_processed_mask].sum(axis=0)

        # Handle the case where only all-zero labels are left by distributing
        # the samples across all folds as evenly as possible (not in the
        # original algorithm but mentioned in the text). (Handling this case
        # separately introduces some code redundancy; however, it decreases
        # execution time when there is a relatively large number of all-zero
        # labels.)
        if num_labels.sum() == 0:
            sample_idxs = np.where(labels_not_processed_mask)[0]

            for sample_idx in sample_idxs:
                fold_idx = np.where(c_folds == c_folds.max())[0]

                if fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])]

                test_folds[sample_idx] = fold_idx
                c_folds[fold_idx] -= 1

            break

        label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0]
        if label_idx.shape[0] > 1:
            label_idx = label_idx[random_state.choice(label_idx.shape[0])]

        sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(),
                                              labels_not_processed_mask))[0]

        for sample_idx in sample_idxs:
            # Find the subset(s) with the largest number of desired examples
            # for this label, breaking ties by considering the largest number
            # of desired examples overall, breaking further ties randomly
            label_folds = c_folds_labels[:, label_idx]
            fold_idx = np.where(label_folds == label_folds.max())[0]

            if fold_idx.shape[0] > 1:
                temp_fold_idx = np.where(c_folds[fold_idx] ==
                                         c_folds[fold_idx].max())[0]
                fold_idx = fold_idx[temp_fold_idx]

                if temp_fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])]

            test_folds[sample_idx] = fold_idx
            labels_not_processed_mask[sample_idx] = False

            # Update the desired number of examples
            c_folds_labels[fold_idx, labels[sample_idx]] -= 1
            c_folds[fold_idx] -= 1

    return test_folds
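

# A minimal sketch (not part of the library API) of calling
# IterativeStratification directly; the cross validators below are the
# intended entry points. `r` holds the desired fold proportions, so two
# equal folds use r = np.array([0.5, 0.5]):
#
#     rng = np.random.RandomState(0)
#     y = np.array([[0, 1], [1, 0], [1, 1], [0, 1]], dtype=bool)
#     fold_of_sample = IterativeStratification(labels=y,
#                                              r=np.array([0.5, 0.5]),
#                                              random_state=rng)
#     # fold_of_sample[i] in {0, 1} is the fold assigned to sample i.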


class MultilabelStratifiedKFold(_BaseKFold):
    """Multilabel stratified K-Folds cross-validator.

    Provides train/test indices to split multilabel data into train/test sets.

    This cross-validation object is a variation of KFold that returns
    stratified folds for multilabel data. The folds are made by preserving
    the percentage of samples for each label.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, optional
        Whether to shuffle each stratification of the data before splitting
        into batches.

    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedKFold, which only uses random_state
        when ``shuffle == True``, this multilabel implementation always uses
        random_state because the iterative stratification algorithm breaks
        ties randomly.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
    >>> mskf.get_n_splits(X, y)
    2
    >>> print(mskf) # doctest: +NORMALIZE_WHITESPACE
    MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False)
    >>> for train_index, test_index in mskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]

    Notes
    -----
    Train and test sizes may be slightly different in each fold.

    See also
    --------
    RepeatedMultilabelStratifiedKFold : Repeats Multilabel Stratified K-Fold
        n times.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        super(MultilabelStratifiedKFold, self).__init__(n_splits, shuffle,
                                                        random_state)

    def _make_test_folds(self, X, y):
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)
        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. '
                'Got {!r} instead.'.format(type_of_target_y))

        num_samples = y.shape[0]

        rng = check_random_state(self.random_state)
        indices = np.arange(num_samples)

        if self.shuffle:
            rng.shuffle(indices)
            y = y[indices]

        r = np.asarray([1 / self.n_splits] * self.n_splits)

        test_folds = IterativeStratification(labels=y, r=r, random_state=rng)
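        # `indices` is the (possibly shuffled) permutation of the samples;
        # indexing with np.argsort(indices) inverts that permutation so the
        # fold ids line up with the original sample order.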
        return test_folds[np.argsort(indices)]

    def _iter_test_masks(self, X=None, y=None, groups=None):
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedKFold, self).split(X, y, groups)
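

# Because MultilabelStratifiedKFold follows the scikit-learn splitter
# interface, it can be passed via the ``cv`` parameter to utilities such as
# sklearn.model_selection.cross_validate. A sketch (assuming an estimator
# that supports multilabel targets, e.g. a decision tree):
#
#     from sklearn.tree import DecisionTreeClassifier
#     from sklearn.model_selection import cross_validate
#     cv = MultilabelStratifiedKFold(n_splits=3, shuffle=True, random_state=0)
#     results = cross_validate(DecisionTreeClassifier(), X, y, cv=cv)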


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
    """Repeated Multilabel Stratified K-Fold cross validator.

    Repeats Multilabel Stratified K-Fold n times with different randomization
    in each repetition.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=10
        Number of times the cross-validator needs to be repeated.

    random_state : None, int or RandomState, default=None
        Random state used to generate the random state for each repetition,
        as well as to randomly break ties within the iterative stratification
        algorithm.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=0)
    >>> for train_index, test_index in rmskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [0 1 4 5] TEST: [2 3 6 7]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]

    See also
    --------
    RepeatedStratifiedKFold : Repeats (non-multilabel) Stratified K-Fold
        n times.
    """

    def __init__(self, n_splits=5, n_repeats=10, random_state=None):
        super(RepeatedMultilabelStratifiedKFold, self).__init__(
            MultilabelStratifiedKFold, n_repeats, random_state,
            n_splits=n_splits)


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
    """Multilabel Stratified ShuffleSplit cross-validator.

    Provides train/test indices to split data into train/test sets.

    This cross-validation object is a merge of MultilabelStratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds for multilabel
    data. The folds are made by preserving the percentage of each label.

    Note: like the ShuffleSplit strategy, multilabel stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.

    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.

    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedShuffleSplit, which only uses
        random_state when ``shuffle == True``, this multilabel implementation
        always uses random_state because the iterative stratification
        algorithm breaks ties randomly.

    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
    ...     random_state=0)
    >>> msss.get_n_splits(X, y)
    3
    >>> print(msss) # doctest: +ELLIPSIS
    MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
            train_size=None)
    >>> for train_index, test_index in msss.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    TRAIN: [1 2 5 6] TEST: [0 3 4 7]

    Notes
    -----
    Train and test sizes may be slightly different from the desired sizes
    because stratification is preferred over perfectly sized folds.
    """

    def __init__(self, n_splits=10, test_size="default", train_size=None,
                 random_state=None):
        super(MultilabelStratifiedShuffleSplit, self).__init__(
            n_splits, test_size, train_size, random_state)

    def _iter_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)

        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)
        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. '
                'Got {!r} instead.'.format(type_of_target_y))

        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        r = np.array([n_train, n_test]) / (n_train + n_test)
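
        # Each split re-runs two-"fold" iterative stratification with the
        # (train, test) proportions in r; the samples assigned fold id 1
        # form the test set below.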
        for _ in range(self.n_splits):
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = IterativeStratification(labels=y, r=r,
                                                 random_state=rng)

            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups)
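

if __name__ == "__main__":
    # A quick smoke test, not part of the library: check that each fold of
    # MultilabelStratifiedKFold roughly preserves the per-label counts from
    # the docstring example above (4 positives per label, split 2/2).
    X = np.zeros((8, 2))  # placeholder features; only y drives the split
    y = np.array([[0, 0], [0, 0], [0, 1], [0, 1],
                  [1, 1], [1, 1], [1, 0], [1, 0]])
    mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
    for train_index, test_index in mskf.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        print("  per-label totals in test fold:", y[test_index].sum(axis=0))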