
noxer.pipelines module

A set of helper classes for better pipelining of data preprocessing for machine learning and beyond.

"""
A set of helper classes for better pipelining of data preprocessing
for machine learning and beyond.
"""
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.dummy import DummyRegressor

from searchgrid import set_grid


class IOTransform(BaseEstimator):
    """
    A base class for training models with input and output preprocessing.
    Implements a set of helper methods and attributes so that preprocessing
    of the data can be done with scikit-learn compatible transformer
    instances.

    Parameters
    ----------
    X_prep : BaseEstimator
        Class instance that will be fitted to the input X
        for the model. This transformer is applied to the
        input X before it is fed into the model.

    Y_prep : BaseEstimator
        Class instance that will be fitted to the output values Y
        for the model. This transformer is applied to the values of
        Y when it is used for training.

    Y_post : BaseEstimator
        Class instance that will be fitted to the output values Y
        for the model. This transformer is applied after the values
        are generated.

    model : BaseEstimator
        Instance of a class that is used for mapping from inputs to
        outputs.

    metric : callable with two arguments
        Scorer that is used to evaluate predictions of the model. If
        None, the score function of the model will be used.

    """

    _estimator_type = "generator"

    def __init__(self, model, metric=None, augm=None, X_prep=None, Y_prep=None, Y_post=None):
        self.X_prep = X_prep
        self.Y_prep = Y_prep
        self.Y_post = Y_post

        if not isinstance(model, BaseEstimator):
            raise TypeError('Model should be an instance of BaseEstimator, got %s' % model)

        self.model = model
        self.metric = metric
        self.augm = augm

    def set_params(self, **params):
        """
        Custom setting of parameters for generative models.
        All parameters prefixed with 'augm__', 'x_prep__', 'y_prep__',
        'y_post__' or 'model__' are delegated to the corresponding component.
        """

        elements = {'augm', 'X_prep', 'Y_prep', 'Y_post', 'model'}

        self_params = {
            k:v for k, v in params.items()
            if not any(
                k.startswith(p.lower()) for p in elements
            )
        }

        BaseEstimator.set_params(self, **self_params)

        # set attributes of elements
        for e in elements:
            element = getattr(self, e)

            if isinstance(element, BaseEstimator):
                subprm = {
                    k[len(e)+2:]: v for k, v in params.items()
                    if k.startswith(e.lower())
                }

                element.set_params(**subprm)

        return self

    def _fit_preprocessors(self, X, Y):
        """Fits all preprocessors to the data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """

        if self.augm is not None:
            X, Y = self.augm.fit_transform(X, Y)

        if self.X_prep is not None:
            X = self.X_prep.fit_transform(X, Y)

        if self.Y_post is not None:
            self.Y_post.fit(Y, X)

        if self.Y_prep is not None:
            Y = self.Y_prep.fit_transform(Y, X)

        return X, Y

    def _transform_inputs(self, X, Y=None):
        """Transforms inputs so that they can be used for estimations
        with generative model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """
        if self.X_prep is not None:
            # account for some transformers taking only single argument
            if 'Y' in self.X_prep.transform.__code__.co_varnames:
                X = self.X_prep.transform(X, Y)
            else:
                X = self.X_prep.transform(X)

        if Y is None:
            return X

        if self.Y_prep is not None:
            # account for some transformers taking only single argument
            if 'Y' in self.Y_prep.transform.__code__.co_varnames:
                Y = self.Y_prep.transform(Y, X)
            else:
                Y = self.Y_prep.transform(Y)

        return X, Y

    def _transform_generated_outputs(self, Y, X=None):
        """Apply output transformers to the generated values

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """
        if self.Y_prep is not None:
            if 'Y' in self.Y_prep.inverse_transform.__code__.co_varnames:
                Y = self.Y_prep.inverse_transform(Y, X)
            else:
                Y = self.Y_prep.inverse_transform(Y)

        if self.Y_post is not None:
            if 'Y' in self.Y_post.transform.__code__.co_varnames:
                Y = self.Y_post.transform(Y, X)
            else:
                Y = self.Y_post.transform(Y)

        return Y

    def fit(self, X, Y, *args, **kwargs):
        """
        Complete fitting pipeline with data preprocessing for generative
        models.

        Includes data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """
        X, Y = self._fit_preprocessors(X, Y)
        self.model.fit(X, Y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        """
        Full generation pipeline with all necessary steps such as data
        preprocessing.

        IMPORTANT: this function does not perform augmentation of the
        input values! Hence X is expected to already be in the form
        that self.augm returns.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.
        """
        X = self._transform_inputs(X)
        Y = self.model.predict(X, *args, **kwargs)
        Y = self._transform_generated_outputs(Y, X)
        return Y

    def score_no_augmentation(self, X, Y, *args, **kwargs):
        """
        Evaluates the quality of the model using comparison
        to real data.

        DOES NOT include the data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        Returns
        -------
        score : float
            Score from 0.0 to 1.0 that indicates quality of estimations.
        """

        if self.metric:
            Yp = self.predict(X, *args, **kwargs)
            score = self.metric(Y, Yp)
        else:
            # the inner model was fitted on preprocessed data, so apply the
            # same input / output transformations before delegating to it
            Xt, Yt = self._transform_inputs(X, Y)
            score = self.model.score(Xt, Yt)

        return score

    def score(self, X, Y, *args, **kwargs):
        """
        Evaluates the quality of the model using comparison
        to real data.

        Includes data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        Returns
        -------
        score : float
            Score from 0.0 to 1.0 that indicates quality of estimations.
        """

        if self.augm is not None:
            X, Y = self.augm.transform(X, Y)

        return self.score_no_augmentation(X, Y, *args, **kwargs)


def make_regressors(subset=None):
    available_regressors = {
        'gbrt': set_grid(GradientBoostingRegressor(),
                         n_estimators=[2 ** i for i in range(1, 11)],
                         learning_rate=[0.1, 0.01, 0.001],
                         ),
        'lasso': set_grid(Lasso(),
                         alpha=np.exp(np.linspace(-8, 8)),
                         )
    }

    if subset is None:
        subset = list(available_regressors.keys())

    result = [available_regressors[k] for k in subset]

    pipe = Pipeline(
        [('finmodel', DummyRegressor())]
    )

    pipe = set_grid(pipe, finmodel=result)

    return pipe


def make_dummy_regressor(subset=None):
    available_regressors = {
        'dummy': set_grid(DummyRegressor(),
                         strategy=['mean', 'median']
                         ),
    }

    if subset is None:
        subset = list(available_regressors.keys())

    result = [available_regressors[k] for k in subset]

    pipe = Pipeline(
        [('model', Lasso())]
    )

    pipe = set_grid(pipe, model=result)

    return pipe

Functions

def make_dummy_regressor(subset=None)

def make_dummy_regressor(subset=None):
    available_regressors = {
        'dummy': set_grid(DummyRegressor(),
                         strategy=['mean', 'median']
                         ),
    }

    if subset is None:
        subset = list(available_regressors.keys())

    result = [available_regressors[k] for k in subset]

    pipe = Pipeline(
        [('model', Lasso())]
    )

    pipe = set_grid(pipe, model=result)

    return pipe
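
The returned pipeline carries its parameter grid via searchgrid's set_grid, so it can be tuned directly. A minimal usage sketch, assuming toy data and searchgrid's make_grid_search (neither is part of this module):

import numpy as np
from searchgrid import make_grid_search

from noxer.pipelines import make_dummy_regressor

X = np.random.rand(32, 4)                   # toy inputs
y = np.random.rand(32)                      # toy targets

search = make_grid_search(make_dummy_regressor())  # searches 'mean' vs 'median'
search.fit(X, y)
print(search.best_params_)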

def make_regressors(subset=None)

def make_regressors(subset=None):
    available_regressors = {
        'gbrt': set_grid(GradientBoostingRegressor(),
                         n_estimators=[2 ** i for i in range(1, 11)],
                         learning_rate=[0.1, 0.01, 0.001],
                         ),
        'lasso': set_grid(Lasso(),
                         alpha=np.exp(np.linspace(-8, 8)),
                         )
    }

    if subset is None:
        subset = list(available_regressors.keys())

    result = [available_regressors[k] for k in subset]

    pipe = Pipeline(
        [('finmodel', DummyRegressor())]
    )

    pipe = set_grid(pipe, finmodel=result)

    return pipe
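
The subset argument restricts which candidate regressors enter the search. A hedged sketch along the same lines, again assuming toy data and searchgrid's make_grid_search:

import numpy as np
from searchgrid import make_grid_search

from noxer.pipelines import make_regressors

X = np.random.rand(64, 5)
y = X @ np.random.rand(5)

pipe = make_regressors(subset=['lasso'])    # search only the Lasso grid
search = make_grid_search(pipe)
search.fit(X, y)
print(search.best_estimator_)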

Classes

class IOTransform

A base class for training models with input and output preprocessing. Implements a set of helper methods and attributes so that preprocessing of the data can be done with scikit-learn compatible transformer instances.

Parameters

X_prep : BaseEstimator Class instance that will be fitted to the input X for the model. This transformer is applied to the input X before it is fed into the model.

Y_prep : BaseEstimator Class instance that will be fitted to the output values Y for the model. This transformer is applied to the values of Y when it is used for training.

Y_post : BaseEstimator Class instance that will be fitted to the output values Y for the model. This transformer is applied after the values are generated.

model : BaseEstimator Instance of a class that is used for mapping from inputs to outputs.

metric : callable with two arguments Scorer that is used to evaluate predictions of the model. If None, the score function of the model will be used.
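
A minimal end-to-end sketch of wiring these pieces together (the scaler, regressor and toy data below are illustrative assumptions, not part of this module):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

from noxer.pipelines import IOTransform

X = np.random.rand(128, 3)
Y = X.sum(axis=1)

est = IOTransform(
    model=GradientBoostingRegressor(),
    metric=r2_score,            # called as metric(Y_true, Y_pred)
    X_prep=StandardScaler(),    # fitted to X and applied before the model
)
est.fit(X, Y)
Y_hat = est.predict(X)          # X is scaled internally before prediction
print(est.score(X, Y))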

class IOTransform(BaseEstimator):
    """
    A base class for training models with input and output preprocessing.
    Implements a set of helper methods and attributes so that preprocessing
    of the data can be done with scikit-learn compatible transformer
    instances.

    Parameters
    ----------
    X_prep : BaseEstimator
        Class instance that will be fitted to the input X
        for the model. This transformer is applied to the
        input X before it is fed into the model.

    Y_prep : BaseEstimator
        Class instance that will be fitted to the output values Y
        for the model. This transformer is applied to the values of
        Y when it is used for training.

    Y_post : BaseEstimator
        Class instance that will be fitted to the output values Y
        for the model. This transformer is applied after the values
        are generated.

    model : BaseEstimator
        Instance of a class that is used for mapping from inputs to
        outputs.

    metric : callable with two arguments
        Scorer that is used to evaluate predictions of the model. If
        None, the score function of the model will be used.

    """

    _estimator_type = "generator"

    def __init__(self, model, metric=None, augm=None, X_prep=None, Y_prep=None, Y_post=None):
        self.X_prep = X_prep
        self.Y_prep = Y_prep
        self.Y_post = Y_post

        if not isinstance(model, BaseEstimator):
            raise TypeError('Model should be an instance of BaseEstimator, got %s' % model)

        self.model = model
        self.metric = metric
        self.augm = augm

    def set_params(self, **params):
        """
        Custom setting of parameters for generative models.
        All parameters prefixed with 'augm__', 'x_prep__', 'y_prep__',
        'y_post__' or 'model__' are delegated to the corresponding component.
        """

        elements = {'augm', 'X_prep', 'Y_prep', 'Y_post', 'model'}

        self_params = {
            k:v for k, v in params.items()
            if not any(
                k.startswith(p.lower()) for p in elements
            )
        }

        BaseEstimator.set_params(self, **self_params)

        # set attributes of elements
        for e in elements:
            element = getattr(self, e)

            if isinstance(element, BaseEstimator):
                subprm = {
                    k[len(e)+2:]: v for k, v in params.items()
                    if k.startswith(e.lower())
                }

                element.set_params(**subprm)

        return self

    def _fit_preprocessors(self, X, Y):
        """Fits all preprocessors to the data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """

        if self.augm is not None:
            X, Y = self.augm.fit_transform(X, Y)

        if self.X_prep is not None:
            X = self.X_prep.fit_transform(X, Y)

        if self.Y_post is not None:
            self.Y_post.fit(Y, X)

        if self.Y_prep is not None:
            Y = self.Y_prep.fit_transform(Y, X)

        return X, Y

    def _transform_inputs(self, X, Y=None):
        """Transforms inputs so that they can be used for estimations
        with generative model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """
        if self.X_prep is not None:
            # account for some transformers taking only single argument
            if 'Y' in self.X_prep.transform.__code__.co_varnames:
                X = self.X_prep.transform(X, Y)
            else:
                X = self.X_prep.transform(X)

        if Y is None:
            return X

        if self.Y_prep is not None:
            # account for some transformers taking only single argument
            if 'Y' in self.Y_prep.transform.__code__.co_varnames:
                Y = self.Y_prep.transform(Y, X)
            else:
                Y = self.Y_prep.transform(Y)

        return X, Y

    def _transform_generated_outputs(self, Y, X=None):
        """Apply output transformers to the generated values

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """
        if self.Y_prep is not None:
            if 'Y' in self.Y_prep.inverse_transform.__code__.co_varnames:
                Y = self.Y_prep.inverse_transform(Y, X)
            else:
                Y = self.Y_prep.inverse_transform(Y)

        if self.Y_post is not None:
            if 'Y' in self.Y_post.transform.__code__.co_varnames:
                Y = self.Y_post.transform(Y, X)
            else:
                Y = self.Y_post.transform(Y)

        return Y

    def fit(self, X, Y, *args, **kwargs):
        """
        Complete fitting pipeline with data preprocessing for generative
        models.

        Includes data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.
        """
        X, Y = self._fit_preprocessors(X, Y)
        self.model.fit(X, Y, *args, **kwargs)
        return self

    def predict(self, X, *args, **kwargs):
        """
        Full generation pipeline with all necessary steps such as data
        preprocessing.

        IMPORTANT: this function does not perform augmentation of the
        input values! Hence X is expected to already be in the form
        that self.augm returns.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.
        """
        X = self._transform_inputs(X)
        Y = self.model.predict(X, *args, **kwargs)
        Y = self._transform_generated_outputs(Y, X)
        return Y

    def score_no_augmentation(self, X, Y, *args, **kwargs):
        """
        Evaluates the quality of the model using comparison
        to real data.

        DOES NOT include the data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        Returns
        -------
        score : float
            Score from 0.0 to 1.0 that indicates quality of estimations.
        """

        if self.metric:
            Yp = self.predict(X, *args, **kwargs)
            score = self.metric(Y, Yp)
        else:
            # the inner model was fitted on preprocessed data, so apply the
            # same input / output transformations before delegating to it
            Xt, Yt = self._transform_inputs(X, Y)
            score = self.model.score(Xt, Yt)

        return score

    def score(self, X, Y, *args, **kwargs):
        """
        Evaluates the quality of the model using comparison
        to real data.

        Includes data augmentation.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, ...]
            The data used as inputs to generate the model's outputs.

        Y : {array-like, sparse matrix}, shape [n_samples, ...]
            The target values estimated by the model.

        Returns
        -------
        score : float
            Score from 0.0 to 1.0 that indicates quality of estimations.
        """

        if self.augm is not None:
            X, Y = self.augm.transform(X, Y)

        return self.score_no_augmentation(X, Y, *args, **kwargs)

Ancestors (in MRO)

  • IOTransform
  • sklearn.base.BaseEstimator
  • builtins.object

Methods

def __init__(self, model, metric=None, augm=None, X_prep=None, Y_prep=None, Y_post=None)

Initialize self. See help(type(self)) for accurate signature.

def __init__(self, model, metric=None, augm=None, X_prep=None, Y_prep=None, Y_post=None):
    self.X_prep = X_prep
    self.Y_prep = Y_prep
    self.Y_post = Y_post
    if not isinstance(model, BaseEstimator):
        raise TypeError('Model should be an instance of BaseEstimator, got %s' % model)
    self.model = model
    self.metric = metric
    self.augm = augm

def fit(self, X, Y, *args, **kwargs)

Complete fitting pipeline with data preprocessing for generative models.

Includes data augmentation.

Parameters

X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.

Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model.

def fit(self, X, Y, *args, **kwargs):
    """
    Complete fitting pipeline with data preprocessing for generative
    models.
    Includes data augmentation.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to generate the model's outputs.
    Y : {array-like, sparse matrix}, shape [n_samples, ...]
        The target values estimated by the model.
    """
    X, Y = self._fit_preprocessors(X, Y)
    self.model.fit(X, Y, *args, **kwargs)
    return self
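
The augm object only needs to expose fit_transform(X, Y) (used here during fitting) and transform(X, Y) (used by score), each returning an (X, Y) pair. A hypothetical sketch of such an augmenter; the class name and the noise scheme are assumptions:

import numpy as np
from sklearn.base import BaseEstimator

class GaussianNoiseAugmenter(BaseEstimator):
    """Appends a jittered copy of the training data; identity at test time."""

    def __init__(self, scale=0.01):
        self.scale = scale

    def fit_transform(self, X, Y):
        X, Y = np.asarray(X), np.asarray(Y)
        X_aug = np.concatenate([X, X + self.scale * np.random.randn(*X.shape)])
        Y_aug = np.concatenate([Y, Y])
        return X_aug, Y_aug

    def transform(self, X, Y):
        # no augmentation when evaluating on real data
        return X, Y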

def get_params(self, deep=True)

Get parameters for this estimator.

Parameters

deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returns

params : mapping of string to any Parameter names mapped to their values.

def get_params(self, deep=True):
    """Get parameters for this estimator.
    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.
    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            warnings.filters.pop(0)
        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out

def predict(self, X, *args, **kwargs)

Full generation pipeline with all necessary steps such as data preprocessing.

IMPORTANT: this function does not perform augmentation of the input values! Hence X is expected to already be in the form that self.augm returns.

Parameters

X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.

def predict(self, X, *args, **kwargs):
    """
    Full generation pipeline with all necessary steps such as data
    preprocessing.
    IMPORTANT: this function does not perform augmentation of the
    input values! Hence X is expected to already be in the form
    that self.augm returns.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to generate the model's outputs.
    """
    X = self._transform_inputs(X)
    Y = self.model.predict(X, *args, **kwargs)
    Y = self._transform_generated_outputs(Y, X)
    return Y
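
Generated values are passed back through Y_prep.inverse_transform (and then through Y_post, if set), so target preprocessing is undone automatically. A hedged sketch; the scaler, regressor and toy data are assumptions:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

from noxer.pipelines import IOTransform

X = np.random.rand(64, 2)
Y = 100.0 * X[:, :1] + 5.0         # large-scale targets, shape (64, 1)

est = IOTransform(model=Lasso(alpha=0.01), Y_prep=StandardScaler())
est.fit(X, Y)                      # the model is trained on scaled targets
print(est.predict(X)[:3])          # predictions come back on the original scale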

def score(self, X, Y, *args, **kwargs)

Evaluates the quality of the model using comparison to real data.

Includes data augmentation.

Parameters

X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.

Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model.

Returns

score : float Score from 0.0 to 1.0 that indicates quality of estimations.

def score(self, X, Y, *args, **kwargs):
    """
    Evaluates the quality of the model using comparison
    to real data.
    Includes data augmentation.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to generatie model's outputs.
    Y : {array-like, sparse matrix}, shape [n_samples, ...]
        The target values estimated by the model.
    Returns
    -------
    score : float
        Score from 0.0 to 1.0 that indicates quality of estimations.
    """
    if self.augm is not None:
        X, Y = self.augm.transform(X, Y)
    return self.score_no_augmentation(X, Y, *args, **kwargs)

def score_no_augmentation(self, X, Y, *args, **kwargs)

Evaluates the quality of the model using comparison to real data.

DOES NOT include the data augmentation.

Parameters

X : {array-like, sparse matrix}, shape [n_samples, ...] The data used as inputs to generate the model's outputs.

Y : {array-like, sparse matrix}, shape [n_samples, ...] The target values estimated by the model.

Returns

score : float Score from 0.0 to 1.0 that indicates quality of estimations.

def score_no_augmentation(self, X, Y, *args, **kwargs):
    """
    Evaluates the quality of the model using comparison
    to real data.
    DOES NOT include the data augmentation.
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, ...]
        The data used as inputs to generatie model's outputs.
    Y : {array-like, sparse matrix}, shape [n_samples, ...]
        The target values estimated by the model.
    Returns
    -------
    score : float
        Score from 0.0 to 1.0 that indicates quality of estimations.
    """
    if self.metric:
        Yp = self.predict(X, *args, **kwargs)
        score = self.metric(Y, Yp)
    else:
        # the inner model was fitted on preprocessed data, so apply the
        # same input / output transformations before delegating to it
        Xt, Yt = self._transform_inputs(X, Y)
        score = self.model.score(Xt, Yt)
    return score

def set_params(self, **params)

Custom setting of parameters for generative models. All parameters prefixed with 'augm__', 'x_prep__', 'y_prep__', 'y_post__' or 'model__' are delegated to the corresponding component.

def set_params(self, **params):
    """
    Custom setting of parameters for generative models.
    All parameters prefixed with 'augm__', 'x_prep__', 'y_prep__',
    'y_post__' or 'model__' are delegated to the corresponding component.
    """
    elements = {'augm', 'X_prep', 'Y_prep', 'Y_post', 'model'}
    self_params = {
        k:v for k, v in params.items()
        if not any(
            k.startswith(p.lower()) for p in elements
        )
    }
    BaseEstimator.set_params(self, **self_params)
    # set attributes of elements
    for e in elements:
        element = getattr(self, e)
        if isinstance(element, BaseEstimator):
            subprm = {
                k[len(e)+2:]: v for k, v in params.items()
                if k.startswith(e.lower())
            }
            element.set_params(**subprm)
    return self
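
For example, parameters of the wrapped components can be set through the usual double-underscore convention (the particular model and preprocessor below are illustrative assumptions):

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

from noxer.pipelines import IOTransform

est = IOTransform(model=GradientBoostingRegressor(), X_prep=StandardScaler())

# 'model__...' goes to self.model, 'x_prep__...' goes to self.X_prep
est.set_params(model__n_estimators=64, x_prep__with_mean=False)
print(est.model.n_estimators, est.X_prep.with_mean)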

Instance variables

var X_prep

var Y_post

var Y_prep

var augm

var metric

var model