# ref: https://github.com/thomas-young-2013/open-box/blob/master/openbox/surrogate/base/rf_with_instances_sklearn.py

import logging
import threading
import typing

import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils.fixes import _joblib_parallel_args
from sklearn.utils.validation import check_is_fitted

from xbbo.configspace.space import DenseConfigurationSpace
from xbbo.utils.util import get_types

try:
    # Older scikit-learn releases expose this helper under sklearn.ensemble.base.
    from sklearn.ensemble.base import _partition_estimators
    old_sk_version = True
except ImportError:
    from sklearn.ensemble._base import _partition_estimators
    old_sk_version = False

from xbbo.surrogate.base import BaseRF

logger = logging.getLogger(__name__)


def _collect_prediction(predict, X, out, lock):
    """Append one estimator's prediction to a shared list.

    This is a utility function for joblib's Parallel. It cannot be defined
    locally in ForestClassifier or ForestRegressor, because joblib complains
    that it cannot pickle it when placed there.
    """
    prediction = predict(X, check_input=False)
    with lock:
        out.append(prediction)

class skRandomForestWithInstances(BaseRF):
    """Random forest that takes instance features into account.

    Implemented on top of sklearn.ensemble.RandomForestRegressor.

    Attributes
    ----------
    n_points_per_tree : int
    rf : RandomForestRegressor
        Only available after training.
    log_y : bool
    types : np.ndarray
    bounds : list
    rng : np.random.RandomState
    logger : logging.Logger
    """

    def __init__(self, configspace: DenseConfigurationSpace,
                 log_y: bool = False,
                 num_trees: int = 10,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = -1,
                 ratio_features: float = 5. / 6.,
                 min_samples_split: int = 3,
                 min_samples_leaf: int = 3,
                 max_depth: int = 2**20,
                 eps_purity: float = 1e-8,
                 max_num_nodes: int = 2**20,
                 rng: np.random.RandomState = np.random.RandomState(42),
                 n_jobs: typing.Optional[int] = None,
                 types=None, bounds=None,
                 **kwargs):
- """
- Parameters
- ----------
- types : np.ndarray (D)
- Specifies the number of categorical values of an input dimension where
- the i-th entry corresponds to the i-th input dimension. Let's say we
- have 2 dimension where the first dimension consists of 3 different
- categorical choices and the second dimension is continuous than we
- have to pass np.array([2, 0]). Note that we count starting from 0.
- bounds : list
- Specifies the bounds for continuous features.
- log_y: bool
- y values (passed to this RF) are expected to be log(y) transformed;
- this will be considered during predicting
- num_trees : int
- The number of trees in the random forest.
- do_bootstrapping : bool
- Turns on / off bootstrapping in the random forest.
- n_points_per_tree : int
- Number of points per tree. If <= 0 X.shape[0] will be used
- in _train(X, y) instead
- ratio_features : float
- The ratio of features that are considered for splitting.
- min_samples_split : int
- The minimum number of data points to perform a split.
- min_samples_leaf : int
- The minimum number of data points in a leaf.
- max_depth : int
- The maximum depth of a single tree.
- eps_purity : float
- The minimum difference between two target values to be considered
- different
- max_num_nodes : int
- The maxmimum total number of nodes in a tree
- seed : int
- The seed that is passed to the random_forest_run library.
- n_jobs : int, default=None
- The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
- :meth:`decision_path` and :meth:`apply` are all parallelized over the
- trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
- context. ``-1`` means using all processors. See :term:`Glossary
- <n_jobs>` for more details.
- """
        if types is None or bounds is None:
            types, bounds = get_types(configspace)
        super().__init__(configspace, types, bounds, **kwargs)

        self.log_y = log_y
        if self.log_y:
            raise NotImplementedError
        self.rng = rng

        self.num_trees = num_trees
        self.do_bootstrapping = do_bootstrapping
        max_features = None if ratio_features > 1.0 else \
            int(max(1, types.shape[0] * ratio_features))
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_depth = max_depth
        self.epsilon_purity = eps_purity
        self.max_num_nodes = max_num_nodes

        self.n_points_per_tree = n_points_per_tree
        self.n_jobs = n_jobs

        self.rf = None  # type: RandomForestRegressor

    def _train(self, X: np.ndarray, y: np.ndarray, **kwargs):
        """Train the random forest on X and y.

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Input data points.
        y : np.ndarray [n_samples, ]
            The corresponding target values.

        Returns
        -------
        self
        """
        X = self._impute_inactive(X)
        self.X = X
        self.y = y.flatten()

        if self.n_points_per_tree <= 0:
            self.num_data_points_per_tree = self.X.shape[0]
        else:
            self.num_data_points_per_tree = self.n_points_per_tree
        if old_sk_version:
            # The max_samples argument is not available in older scikit-learn
            # releases, so it is left out here.
            self.rf = RandomForestRegressor(
                n_estimators=self.num_trees,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                # max_samples=self.num_data_points_per_tree,
                max_leaf_nodes=self.max_num_nodes,
                min_impurity_decrease=self.epsilon_purity,
                bootstrap=self.do_bootstrapping,
                n_jobs=self.n_jobs,
                random_state=self.rng,
            )
        else:
            self.rf = RandomForestRegressor(
                n_estimators=self.num_trees,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                max_samples=self.num_data_points_per_tree,
                max_leaf_nodes=self.max_num_nodes,
                min_impurity_decrease=self.epsilon_purity,
                bootstrap=self.do_bootstrapping,
                n_jobs=self.n_jobs,
                random_state=self.rng,
            )
        self.rf.fit(self.X, self.y)
        return self

    def predict_mean_var(self, X: np.ndarray):
        """Return the empirical mean and variance of the per-tree predictions."""
        if old_sk_version:
            check_is_fitted(self.rf, 'estimators_')
        else:
            check_is_fitted(self.rf)
        # Check the input data
        if X.ndim == 1:
            X = X.reshape((1, -1))
        X = self.rf._validate_X_predict(X)

        # Assign chunks of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.rf.n_estimators, self.rf.n_jobs)

        # Collect the output of every estimator
        all_y_preds = list()

        # Parallel loop over the individual trees
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs, verbose=self.rf.verbose,
                 **_joblib_parallel_args(require="sharedmem"))(
            delayed(_collect_prediction)(e.predict, X, all_y_preds, lock)
            for e in self.rf.estimators_)
        all_y_preds = np.asarray(all_y_preds, dtype=np.float64)

        m = np.mean(all_y_preds, axis=0)
        v = np.var(all_y_preds, axis=0)
        return m, v
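
    # For reference, the parallel aggregation above is equivalent to this
    # serial sketch over self.rf.estimators_ (illustrative only, not part of
    # the code path):
    #
    #     preds = np.stack([est.predict(X) for est in self.rf.estimators_])
    #     m, v = preds.mean(axis=0), preds.var(axis=0)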

    def _predict(self, X: np.ndarray, **kwargs) -> typing.Tuple[np.ndarray, np.ndarray]:
        """Predict means and variances for given X.

        Parameters
        ----------
        X : np.ndarray of shape = [n_samples,
                                   n_features (config + instance features)]

        Returns
        -------
        means : np.ndarray of shape = [n_samples, 1]
            Predictive mean
        vars : np.ndarray of shape = [n_samples, 1]
            Predictive variance
        """
        if len(X.shape) != 2:
            raise ValueError(
                'Expected 2d array, got %dd array!' % len(X.shape))
        if X.shape[1] != self.types.shape[0]:
            raise ValueError('Rows in X should have %d entries but have %d!' %
                             (self.types.shape[0], X.shape[1]))
        X = self._impute_inactive(X)
        if self.log_y:
            raise NotImplementedError
        else:
            means, vars_ = self.predict_mean_var(X)

        return means.reshape((-1, 1)), vars_.reshape((-1, 1))

    def predict_marginalized_over_instances(self, X: np.ndarray):
        """Predict mean and variance marginalized over all instances.

        Returns the predictive mean and variance marginalized over all
        instances for a set of configurations.

        Note
        ----
        This method overrides the same method of ~smac.epm.base_epm.AbstractEPM;
        the following method is random forest specific and follows the SMAC2
        implementation; it requires no distribution assumption to marginalize
        the uncertainty estimates.

        Parameters
        ----------
        X : np.ndarray
            [n_samples, n_features (config)]

        Returns
        -------
        means : np.ndarray of shape = [n_samples, 1]
            Predictive mean
        vars : np.ndarray of shape = [n_samples, 1]
            Predictive variance
        """
        if self.log_y:
            raise NotImplementedError
        else:
            return super().predict_marginalized_over_instances(X)
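

# A minimal, self-contained sketch of the uncertainty estimate this surrogate
# produces: the predictive mean and variance are the empirical mean and
# variance of the individual trees' predictions. The demo below uses plain
# scikit-learn on synthetic data purely for illustration; it is not part of
# the xbbo API.
if __name__ == "__main__":
    demo_rng = np.random.RandomState(0)
    X_demo = demo_rng.rand(50, 3)
    y_demo = np.sin(X_demo).sum(axis=1) + 0.1 * demo_rng.randn(50)

    forest = RandomForestRegressor(n_estimators=10, bootstrap=True,
                                   random_state=demo_rng)
    forest.fit(X_demo, y_demo)

    X_query = demo_rng.rand(5, 3)
    per_tree = np.stack([tree.predict(X_query) for tree in forest.estimators_])
    print("predictive mean:", per_tree.mean(axis=0))
    print("predictive variance:", per_tree.var(axis=0))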