noxer.pipelines module
A set of helper classes for better pipelining of data preprocessing for machine learning and beyond.
""" A set of helper classes for better pipelining of data preprocessing for machine learning and beyond. """ import numpy as np from sklearn.base import BaseEstimator from sklearn.pipeline import Pipeline from sklearn.ensemble import GradientBoostingRegressor from sklearn.linear_model import Lasso from sklearn.dummy import DummyRegressor from searchgrid import set_grid class IOTransform(BaseEstimator): """ A base class for training. Implements a set of useful methods and variables, such that preprocessing of the data can be done using scikit-learn like class instances. Parameters ---------- X_prep : BaseEstimator Class instance that will be fitted to the input X for the model. This transformer is applied to the input X before it is fed into the model. Y_prep : BaseEstimator Class instance that will be fitted to the output values Y for the model. This transformer is applied to the values of Y when it is used for training. Y_post : BaseEstimator Class instance that will be fitted to the output values Y for the model. This transformer is applied after the values are generated. model : BaseEstimator Instance of a class that is used for mapping from inputs to outputs. metric : callable with two arguments Scorer that is used to evaluate predictions of the model. If None, the score function of the model will be used. """ _estimator_type = "generator" def __init__(self, model, metric=None, augm=None, X_prep=None, Y_prep=None, Y_post=None): self.X_prep = X_prep self.Y_prep = Y_prep self.Y_post = Y_post if not isinstance(model, BaseEstimator): raise TypeError('Model should be an instance of BaseEstimator, got %s' % model) self.model = model self.metric = metric self.augm = augm def set_params(self, **params): """ Custom setting of parameters for generative models. All parameters that start with 'x_prep', 'y_prep', 'y_post' are delegated to respective preprocessors. 
""" elements = {'augm', 'X_prep', 'Y_prep', 'Y_post', 'model'} self_params = { k:v for k, v in params.items() if not any( k.startswith(p.lower()) for p in elements ) } BaseEstimator.set_params(self, **self_params) # set attributes of elements for e in elements: element = getattr(self, e) if isinstance(element, BaseEstimator): subprm = { k[len(e)+2:]: v for k, v in params.items() if k.startswith(e.lower()) } element.set_params(**subprm) return self def _fit_preprocessors(self, X, Y): """Fits all preprocessors to the data. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model. """ if self.augm is not None: X, Y = self.augm.fit_transform(X, Y) if self.X_prep is not None: X = self.X_prep.fit_transform(X, Y) if self.Y_post is not None: self.Y_post.fit(Y, X) if self.Y_prep is not None: Y = self.Y_prep.fit_transform(Y, X) return X, Y def _transform_inputs(self, X, Y=None): """Transforms inputs so that they can be used for estimations with generative model Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model. 
""" if self.X_prep is not None: # account for some transformers taking only single argument if 'Y' in self.X_prep.transform.__code__.co_varnames: X = self.X_prep.transform(X, Y) else: X = self.X_prep.transform(X) if Y is None: return X if self.Y_prep is not None: # account for some transformers taking only single argument if 'Y' in self.Y_prep.transform.__code__.co_varnames: Y = self.Y_prep.transform(Y, X) else: Y = self.Y_prep.transform(Y) return X, Y def _transform_generated_outputs(self, Y, X=None): """Apply output transformers to the generated values Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model. """ if self.Y_prep is not None: if 'Y' in self.Y_prep.inverse_transform.__code__.co_varnames: Y = self.Y_prep.inverse_transform(Y, X) else: Y = self.Y_prep.inverse_transform(Y) if self.Y_post is not None: if 'Y' in self.Y_post.transform.__code__.co_varnames: Y = self.Y_post.transform(Y, X) else: Y = self.Y_post.transform(Y) return Y def fit(self, X, Y, *args, **kwargs): """ Complete fitting pipeline with data preprocessing for generative models. Includes data augmentation. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model. """ X, Y = self._fit_preprocessors(X, Y) self.model.fit(X, Y, *args, **kwargs) return self def predict(self, X, *args, **kwargs): """ Full generation pipeline with all necessary steps such as data preprocessing. IMPORTANT: this function does not do augmentation of input values! Hence a particular form of X should be the one that self.augm returns. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. 
""" X = self._transform_inputs(X) Y = self.model.predict(X, *args, **kwargs) Y = self._transform_generated_outputs(Y, X) return Y def score_no_augmentation(self, X, Y, *args, **kwargs): """ Evaluates the quality of the model using comparison to real data. DOES NOT include the data augmentation. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model. Returns ------- score : float Score from 0.0 to 1.0 that indicates quality of estimations. """ if self.metric: Yp = self.predict(X, *args, **kwargs) score = self.metric(Y, Yp) else: score = self.model.score(X, Y) return score def score(self, X, Y, *args, **kwargs): """ Evaluates the quality of the model using comparison to real data. Includes data augmentation. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generatie model's outputs. Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model. Returns ------- score : float Score from 0.0 to 1.0 that indicates quality of estimations. 
""" if self.augm is not None: X, Y = self.augm.transform(X, Y) return self.score_no_augmentation(X, Y, *args, **kwargs) def make_regressors(subset=None): available_regressors = { 'gbrt': set_grid(GradientBoostingRegressor(), n_estimators=[2 ** i for i in range(1, 11)], learning_rate=[0.1, 0.01, 0.001], ), 'lasso': set_grid(Lasso(), alpha=np.exp(np.linspace(-8, 8)), ) } if subset is None: subset = list(available_regressors.keys()) result = [available_regressors[k] for k in subset] pipe = Pipeline( [('finmodel', DummyRegressor())] ) pipe = set_grid(pipe, finmodel=result) return pipe def make_dummy_regressor(subset=None): available_regressors = { 'dummy': set_grid(DummyRegressor(), strategy=['mean', 'median'] ), } if subset is None: subset = list(available_regressors.keys()) result = [available_regressors[k] for k in subset] pipe = Pipeline( [('model', Lasso())] ) pipe = set_grid(pipe, model=result) return pipe
Functions
def make_dummy_regressor(
subset=None)
def make_dummy_regressor(subset=None):
    """Build a one step pipeline with a grid over dummy regressors.

    Parameters
    ----------
    subset : list of str, optional
        Names of the regressors to include; only 'dummy' is
        available. If None, all of them are used. An unknown name
        raises ``KeyError``.

    Returns
    -------
    pipe : Pipeline
        Pipeline with the single step 'model', annotated through
        ``set_grid`` with the selected regressors as candidates.
    """
    dummy = set_grid(DummyRegressor(), strategy=['mean', 'median'])
    catalogue = {'dummy': dummy}

    names = list(catalogue) if subset is None else subset
    candidates = [catalogue[name] for name in names]

    # Lasso is only the initial content of the step; the candidates
    # are supplied as the grid of 'model'.
    placeholder = Pipeline([('model', Lasso())])
    return set_grid(placeholder, model=candidates)
def make_regressors(
subset=None)
def make_regressors(subset=None):
    """Build a one step pipeline with a grid over regression models.

    Parameters
    ----------
    subset : list of str, optional
        Names of the regressors to include, any of 'gbrt' and
        'lasso'. If None, all of them are used. An unknown name
        raises ``KeyError``.

    Returns
    -------
    pipe : Pipeline
        Pipeline with the single step 'finmodel', annotated through
        ``set_grid`` with the selected regressors as candidates.
    """
    gbrt = set_grid(
        GradientBoostingRegressor(),
        n_estimators=[2 ** power for power in range(1, 11)],
        learning_rate=[0.1, 0.01, 0.001],
    )
    lasso = set_grid(Lasso(), alpha=np.exp(np.linspace(-8, 8)))
    catalogue = {'gbrt': gbrt, 'lasso': lasso}

    names = list(catalogue) if subset is None else subset
    candidates = [catalogue[name] for name in names]

    # DummyRegressor is only the initial content of the step; the
    # candidates are supplied as the grid of 'finmodel'.
    placeholder = Pipeline([('finmodel', DummyRegressor())])
    return set_grid(placeholder, finmodel=candidates)
Classes
class IOTransform
A base class for training. Implements a set of useful methods and variables, such that preprocessing of the data can be done using scikit-learn like class instances.
Parameters
X_prep : BaseEstimator Class instance that will be fitted to the input X for the model. This transformer is applied to the input X before it is fed into the model.
Y_prep : BaseEstimator Class instance that will be fitted to the output values Y for the model. This transformer is applied to the values of Y when it is used for training.
Y_post : BaseEstimator Class instance that will be fitted to the output values Y for the model. This transformer is applied after the values are generated.
model : BaseEstimator Instance of a class that is used for mapping from inputs to outputs.
metric : callable with two arguments Scorer that is used to evaluate predictions of the model. If None, the score function of the model will be used.
class IOTransform(BaseEstimator):
    """
    Wrap a model together with optional input / output preprocessing.

    Preprocessing of the data is done using scikit-learn like class
    instances that are fitted and applied around the wrapped model.

    Parameters
    ----------
    model : BaseEstimator
        Instance of a class that is used for mapping from inputs to
        outputs.

    metric : callable with two arguments, optional
        Scorer called as ``metric(Y, Y_predicted)`` to evaluate
        predictions of the model. If None, the score function of the
        model is used.

    augm : object, optional
        Data augmentation step. Its ``fit_transform(X, Y)`` is called
        while fitting and its ``transform(X, Y)`` while scoring; both
        are expected to return an ``(X, Y)`` pair.

    X_prep : BaseEstimator, optional
        Fitted to the input X of the model and applied to X before it
        is fed into the model.

    Y_prep : BaseEstimator, optional
        Fitted to the output values Y and applied to Y used for
        training. Its ``inverse_transform`` is applied to the values
        produced by the model.

    Y_post : BaseEstimator, optional
        Fitted to the output values Y (before ``Y_prep`` is applied)
        and applied to generated values after ``Y_prep`` was inverted.

    Raises
    ------
    TypeError
        If ``model`` is not an instance of BaseEstimator.
    """
    _estimator_type = "generator"

    def __init__(self, model, metric=None, augm=None,
                 X_prep=None, Y_prep=None, Y_post=None):
        self.X_prep = X_prep
        self.Y_prep = Y_prep
        self.Y_post = Y_post

        if not isinstance(model, BaseEstimator):
            # Wrap in a 1-tuple so that a tuple passed as `model` is
            # formatted as a value instead of being unpacked by `%`.
            raise TypeError(
                'Model should be an instance of BaseEstimator, got %s'
                % (model,)
            )

        self.model = model
        self.metric = metric
        self.augm = augm

    def set_params(self, **params):
        """
        Custom setting of parameters for generative models.

        Parameters named ``augm__*``, ``x_prep__*``, ``y_prep__*``,
        ``y_post__*`` and ``model__*`` are delegated to the respective
        element. Every other parameter (including ``model=...`` or
        ``augm=...`` that replace an element as a whole) is handled by
        ``BaseEstimator.set_params``.

        Delegated parameters whose element is not a BaseEstimator
        instance (for example an element that is None) are ignored.

        Returns
        -------
        self
        """
        elements = ('augm', 'X_prep', 'Y_prep', 'Y_post', 'model')
        prefixes = {name: name.lower() + '__' for name in elements}

        own_params = {}
        delegated = {name: {} for name in elements}
        for key, value in params.items():
            for name in elements:
                prefix = prefixes[name]
                if key.startswith(prefix):
                    delegated[name][key[len(prefix):]] = value
                    break
            else:
                own_params[key] = value

        # Replace whole elements first, so that delegated parameters
        # are applied to the elements that end up on the instance.
        BaseEstimator.set_params(self, **own_params)

        for name in elements:
            element = getattr(self, name)
            sub_params = delegated[name]
            if sub_params and isinstance(element, BaseEstimator):
                element.set_params(**sub_params)

        return self

    @staticmethod
    def _call_step(method, data, other):
        """Call ``method(data, other)`` or ``method(data)``.

        Some transformers take only a single argument. The second
        argument is passed only if ``method`` is a Python callable
        that declares a positional parameter named ``Y``.
        """
        code = getattr(method, '__code__', None)
        if code is not None:
            # Only look at declared parameters: co_varnames also lists
            # local variables of the method.
            if 'Y' in code.co_varnames[:code.co_argcount]:
                return method(data, other)
        return method(data)

    def _fit_preprocessors(self, X, Y):
        """Fits all preprocessors to the data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to the model.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        Returns
        -------
        X, Y : the augmented and transformed inputs and targets.
        """
        if self.augm is not None:
            X, Y = self.augm.fit_transform(X, Y)

        if self.X_prep is not None:
            X = self.X_prep.fit_transform(X, Y)

        # Y_post is fitted on targets that Y_prep did not touch yet.
        if self.Y_post is not None:
            self.Y_post.fit(Y, X)

        if self.Y_prep is not None:
            Y = self.Y_prep.fit_transform(Y, X)

        return X, Y

    def _transform_inputs(self, X, Y=None):
        """Transforms inputs so that they can be used for estimations
        with the model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to the model.

        Y : {array-like, sparse matrix}, shape [n_samples, ...], optional
            The target values estimated by the model.

        Returns
        -------
        The transformed X if Y is None, otherwise the pair of
        transformed ``(X, Y)``.
        """
        if self.X_prep is not None:
            X = self._call_step(self.X_prep.transform, X, Y)

        if Y is None:
            return X

        if self.Y_prep is not None:
            Y = self._call_step(self.Y_prep.transform, Y, X)

        return X, Y

    def _transform_generated_outputs(self, Y, X=None):
        """Apply output transformers to the generated values.

        ``Y_prep`` is inverted first, after that ``Y_post`` is applied.

        Parameters
        ----------
        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The values generated by the model.

        X : {array-like, sparse matrix}, shape [n_samples, ...], optional
            The data that was used as inputs to the model.

        Returns
        -------
        Y : the transformed generated values.
        """
        if self.Y_prep is not None:
            Y = self._call_step(self.Y_prep.inverse_transform, Y, X)

        if self.Y_post is not None:
            Y = self._call_step(self.Y_post.transform, Y, X)

        return Y

    def fit(self, X, Y, *args, **kwargs):
        """
        Complete fitting pipeline with data preprocessing for
        generative models. Includes data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to the model.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        *args, **kwargs
            Passed to the ``fit`` method of the model.

        Returns
        -------
        self
        """
        X, Y = self._fit_preprocessors(X, Y)
        self.model.fit(X, Y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        """
        Full generation pipeline with all necessary steps such as
        data preprocessing.

        IMPORTANT: this function does not do augmentation of input
        values! Hence a particular form of X should be the one that
        self.augm returns.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to the model.

        *args, **kwargs
            Passed to the ``predict`` method of the model.

        Returns
        -------
        Y : the generated values after all output transformations.
        """
        X = self._transform_inputs(X)
        Y = self.model.predict(X, *args, **kwargs)
        Y = self._transform_generated_outputs(Y, X)
        return Y

    def score_no_augmentation(self, X, Y, *args, **kwargs):
        """
        Evaluates the quality of the model using comparison to real
        data. DOES NOT include the data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to the model.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        *args, **kwargs
            Passed to ``predict`` when a metric is used.

        Returns
        -------
        score :
            ``metric(Y, predictions)`` if a metric was given, otherwise
            the result of the ``score`` method of the model.
        """
        if self.metric is not None:
            Yp = self.predict(X, *args, **kwargs)
            return self.metric(Y, Yp)

        # The model was fitted on preprocessed data, hence it has to
        # be scored on data that went through the same preprocessing.
        prepared = self._transform_inputs(X, Y)
        if Y is None:
            X = prepared
        else:
            X, Y = prepared
        return self.model.score(X, Y)

    def score(self, X, Y, *args, **kwargs):
        """
        Evaluates the quality of the model using comparison to real
        data. Includes data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to the model.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        Returns
        -------
        score : the value returned by ``score_no_augmentation``.
        """
        if self.augm is not None:
            X, Y = self.augm.transform(X, Y)

        return self.score_no_augmentation(X, Y, *args, **kwargs)
Ancestors (in MRO)
- IOTransform
- sklearn.base.BaseEstimator
- builtins.object
Static methods
def __init__(
self, model, metric=None, augm=None, X_prep=None, Y_prep=None, Y_post=None)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, model, metric=None, augm=None,
             X_prep=None, Y_prep=None, Y_post=None):
    """Store the model and the optional preprocessing elements.

    Parameters
    ----------
    model : BaseEstimator
        Instance used for mapping from inputs to outputs.

    metric : callable with two arguments, optional
        Scorer used to evaluate predictions of the model.

    augm : object, optional
        Data augmentation step.

    X_prep, Y_prep, Y_post : BaseEstimator, optional
        Transformers of the inputs, of the training targets and of
        the generated outputs respectively.

    Raises
    ------
    TypeError
        If ``model`` is not an instance of BaseEstimator.
    """
    self.X_prep = X_prep
    self.Y_prep = Y_prep
    self.Y_post = Y_post

    if not isinstance(model, BaseEstimator):
        # Wrap in a 1-tuple so that a tuple passed as `model` is
        # formatted as a value instead of being unpacked by `%`.
        raise TypeError(
            'Model should be an instance of BaseEstimator, got %s'
            % (model,)
        )

    self.model = model
    self.metric = metric
    self.augm = augm
def fit(
self, X, Y, *args, **kwargs)
Complete fitting pipeline with data preprocessing for generative models.
Includes data augmentation.
Parameters
X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.
Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model.
def fit(self, X, Y, *args, **kwargs):
    """
    Run the preprocessing (augmentation included) and fit the model.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to the model.

    Y : {array-like, sparse matrix}, shape [n_samples, ...]
        The target values estimated by the model.

    *args, **kwargs
        Forwarded to the ``fit`` method of the model.

    Returns
    -------
    self
    """
    inputs, targets = self._fit_preprocessors(X, Y)
    self.model.fit(inputs, targets, *args, **kwargs)
    return self
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    params = dict()
    for name in self._get_param_names():
        # Deprecation warnings have to be always on here, otherwise
        # deprecated param values could not be detected. The filter
        # is put in front of the list and removed again in `finally`.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as caught:
                value = getattr(self, name, None)
            deprecated = (len(caught)
                          and caught[0].category == DeprecationWarning)
            if deprecated:
                # a deprecated parameter is not reported at all
                continue
        finally:
            warnings.filters.pop(0)

        # Nested parameters are stored before the parameter itself.
        if deep and hasattr(value, 'get_params'):
            for sub_name, sub_value in value.get_params().items():
                params[name + '__' + sub_name] = sub_value
        params[name] = value
    return params
def predict(
self, X, *args, **kwargs)
Full generation pipeline with all necessary steps such as data preprocessing.
IMPORTANT: this function does not do augmentation of input values! Hence a particular form of X should be the one that self.augm returns.
Parameters
X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.
def predict(self, X, *args, **kwargs):
    """
    Preprocess the inputs, query the model and transform its outputs.

    IMPORTANT: no augmentation of the input values is done here!
    Hence X should have the form that self.augm returns.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to the model.

    *args, **kwargs
        Forwarded to the ``predict`` method of the model.

    Returns
    -------
    The values generated by the model after the output
    transformations were applied to them.
    """
    prepared = self._transform_inputs(X)
    generated = self.model.predict(prepared, *args, **kwargs)
    # the output transformers receive the already transformed inputs
    return self._transform_generated_outputs(generated, prepared)
def score(
self, X, Y, *args, **kwargs)
Evaluates the quality of the model using comparison to real data.
Includes data augmentation.
Parameters
X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.
Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model.
Returns
score : float Score from 0.0 to 1.0 that indicates quality of estimations.
def score(self, X, Y, *args, **kwargs):
    """
    Score the model against real data, with data augmentation.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to the model.

    Y : {array-like, sparse matrix}, shape [n_samples, ...]
        The target values estimated by the model.

    *args, **kwargs
        Forwarded to ``score_no_augmentation``.

    Returns
    -------
    score : the value returned by ``score_no_augmentation`` for the
        (possibly augmented) data.
    """
    if self.augm is None:
        return self.score_no_augmentation(X, Y, *args, **kwargs)

    augmented_X, augmented_Y = self.augm.transform(X, Y)
    return self.score_no_augmentation(
        augmented_X, augmented_Y, *args, **kwargs
    )
def score_no_augmentation(
self, X, Y, *args, **kwargs)
Evaluates the quality of the model using comparison to real data.
DOES NOT include the data augmentation.
Parameters
X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.
Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model.
Returns
score : float Score from 0.0 to 1.0 that indicates quality of estimations.
def score_no_augmentation(self, X, Y, *args, **kwargs):
    """
    Evaluates the quality of the model using comparison to real
    data. DOES NOT include the data augmentation.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to the model.

    Y : {array-like, sparse matrix}, shape [n_samples, ...]
        The target values estimated by the model.

    *args, **kwargs
        Passed to ``predict`` when a metric is used.

    Returns
    -------
    score :
        ``metric(Y, predictions)`` if a metric was given, otherwise
        the result of the ``score`` method of the model.
    """
    if self.metric is not None:
        Yp = self.predict(X, *args, **kwargs)
        return self.metric(Y, Yp)

    # The model was fitted on preprocessed data, hence it has to be
    # scored on data that went through the same preprocessing.
    prepared = self._transform_inputs(X, Y)
    if Y is None:
        # _transform_inputs returns only X when no targets are given
        X = prepared
    else:
        X, Y = prepared
    return self.model.score(X, Y)
def set_params(
self, **params)
Custom setting of parameters for generative models. All parameters that start with 'x_prep', 'y_prep', 'y_post' are delegated to respective preprocessors.
def set_params(self, **params):
    """
    Custom setting of parameters for generative models.

    Parameters named ``augm__*``, ``x_prep__*``, ``y_prep__*``,
    ``y_post__*`` and ``model__*`` are delegated to the respective
    element. Every other parameter (including ``model=...`` or
    ``augm=...`` that replace an element as a whole) is handled by
    ``BaseEstimator.set_params``.

    Delegated parameters whose element is not a BaseEstimator
    instance (for example an element that is None) are ignored.

    Returns
    -------
    self
    """
    elements = ('augm', 'X_prep', 'Y_prep', 'Y_post', 'model')
    prefixes = {name: name.lower() + '__' for name in elements}

    own_params = {}
    delegated = {name: {} for name in elements}
    for key, value in params.items():
        for name in elements:
            prefix = prefixes[name]
            # Require the '__' separator: a bare prefix match would
            # also capture 'model' itself and produce an empty
            # sub-parameter name.
            if key.startswith(prefix):
                delegated[name][key[len(prefix):]] = value
                break
        else:
            own_params[key] = value

    # Replace whole elements first, so that delegated parameters are
    # applied to the elements that end up on the instance.
    BaseEstimator.set_params(self, **own_params)

    for name in elements:
        element = getattr(self, name)
        sub_params = delegated[name]
        if sub_params and isinstance(element, BaseEstimator):
            element.set_params(**sub_params)

    return self
Instance variables
var X_prep
var Y_post
var Y_prep
var augm
var metric
var model