
noxer.preprocessing module

Feature preprocessing of data, such as expanding categorical features to numerical ones.

"""
Feature preprocessing of data, such as expanding
categorical features to numerical ones.
"""

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import numpy as np

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Selects a single column with index `key` from some matrix X"""
    def __init__(self, key, row_space=True, as_matrix=True):
        self.key = key
        self.row_space = row_space
        self.as_matrix = as_matrix

    def fit(self, X, y=None):
        return self  # do nothing during fitting procedure

    def transform(self, data_matrix):
        # return a matrix with single column
        if self.row_space:
            R = data_matrix[:, [self.key]] # eg numpy array
        else:
            R = data_matrix[[self.key]] # eg pandas dataframe

        R = np.array(R)

        if not self.as_matrix:
            R = R[:, 0]

        return R



class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single
    column matrix of categorical values."""
    def fit(self, X, y=None):
        # create label encoder
        M = [x[0] for x in X]
        self.encoder = LabelBinarizer()
        self.encoder.fit(M)
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X[:,0])


class IntegerEncoder(BaseEstimator, TransformerMixin):
    """Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single
    column matrix of categorical values."""
    def fit(self, X, y=None):
        # create label encoder
        M = X[:, 0]
        self.encoder = LabelEncoder()
        self.encoder.fit(M)
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X[:,0])[:, np.newaxis]
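
The transformers above can be combined with scikit-learn pipelines. A minimal sketch, assuming noxer is installed; the toy data and the make_union / make_pipeline composition are illustrative and not part of this module:

import numpy as np
from sklearn.pipeline import make_pipeline, make_union

# OneHotEncoder here is the noxer wrapper, not sklearn.preprocessing.OneHotEncoder
from noxer.preprocessing import ColumnSelector, OneHotEncoder

# Toy data: column 0 is categorical, column 1 is numeric.
X = np.array([['cat', 1.0],
              ['dog', 2.0],
              ['fish', 3.0]], dtype=object)

# One-hot encode column 0 and pass column 1 through unchanged.
preprocessor = make_union(
    make_pipeline(ColumnSelector(0), OneHotEncoder()),
    ColumnSelector(1),
)

X_new = preprocessor.fit_transform(X)
# X_new stacks one indicator column per category ('cat', 'dog', 'fish')
# with the untouched numeric column.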

Classes

class ColumnSelector

Selects a single column with index key from some matrix X

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Selects a single column with index `key` from some matrix X"""
    def __init__(self, key, row_space=True, as_matrix=True):
        self.key = key
        self.row_space = row_space
        self.as_matrix = as_matrix

    def fit(self, X, y=None):
        return self  # do nothing during fitting procedure

    def transform(self, data_matrix):
        # return a matrix with single column
        if self.row_space:
            R = data_matrix[:, [self.key]] # eg numpy array
        else:
            R = data_matrix[[self.key]] # eg pandas dataframe

        R = np.array(R)

        if not self.as_matrix:
            R = R[:, 0]

        return R
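
A short usage sketch of the key / row_space / as_matrix options; the toy arrays are illustrative:

import numpy as np
import pandas as pd
from noxer.preprocessing import ColumnSelector

X = np.array([['a', 1],
              ['b', 2]], dtype=object)

# row_space=True (default): positional indexing, data_matrix[:, [key]]
ColumnSelector(key=0).fit_transform(X)                    # (2, 1) column matrix [['a'], ['b']]
ColumnSelector(key=0, as_matrix=False).fit_transform(X)   # flat array ['a', 'b']

# row_space=False: label indexing, data_matrix[[key]], e.g. a pandas DataFrame column
df = pd.DataFrame({'color': ['a', 'b'], 'size': [1, 2]})
ColumnSelector(key='color', row_space=False).fit_transform(df)  # (2, 1) column matrix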

Ancestors (in MRO)

  • ColumnSelector
  • sklearn.base.BaseEstimator
  • sklearn.base.TransformerMixin
  • builtins.object

Methods

def __init__(self, key, row_space=True, as_matrix=True)

Initialize self. See help(type(self)) for accurate signature.

def __init__(self, key, row_space=True, as_matrix=True):
    self.key = key
    self.row_space = row_space
    self.as_matrix = as_matrix

def fit(self, X, y=None)

def fit(self, X, y=None):
    return self  # do nothing during fitting procedure

def fit_transform(self, X, y=None, **fit_params)

Fit to data, then transform it.

Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.

Parameters

X : numpy array of shape [n_samples, n_features]
    Training set.

y : numpy array of shape [n_samples]
    Target values.

Returns

X_new : numpy array of shape [n_samples, n_features_new]
    Transformed array.

def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.
    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.
    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.
    y : numpy array of shape [n_samples]
        Target values.
    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # non-optimized default implementation; override when a better
    # method is possible for a given clustering algorithm
    if y is None:
        # fit method of arity 1 (unsupervised transformation)
        return self.fit(X, **fit_params).transform(X)
    else:
        # fit method of arity 2 (supervised transformation)
        return self.fit(X, y, **fit_params).transform(X)

def get_params(self, deep=True)

Get parameters for this estimator.

Parameters

deep : boolean, optional
    If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returns

params : mapping of string to any
    Parameter names mapped to their values.

def get_params(self, deep=True):
    """Get parameters for this estimator.
    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.
    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            warnings.filters.pop(0)
        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out

def set_params(self, **params)

Set the parameters of this estimator.

The method works on simple estimators as well as on nested objects (such as pipelines). The latter have parameters of the form <component>__<parameter> so that it's possible to update each component of a nested object.

Returns

self

def set_params(self, **params):
    """Set the parameters of this estimator.
    The method works on simple estimators as well as on nested objects
    (such as pipelines). The latter have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.
    Returns
    -------
    self
    """
    if not params:
        # Simple optimization to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)
    nested_params = defaultdict(dict)  # grouped by prefix
    for key, value in params.items():
        key, delim, sub_key = key.partition('__')
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (key, self))
        if delim:
            nested_params[key][sub_key] = value
        else:
            setattr(self, key, value)
    for key, sub_params in nested_params.items():
        valid_params[key].set_params(**sub_params)
    return self

def transform(self, data_matrix)

def transform(self, data_matrix):
    # return a matrix with single column
    if self.row_space:
        R = data_matrix[:, [self.key]] # eg numpy array
    else:
        R = data_matrix[[self.key]] # eg pandas dataframe
    R = np.array(R)
    if not self.as_matrix:
        R = R[:, 0]
    return R

Instance variables

var as_matrix

var key

var row_space

class IntegerEncoder

Wrapper around LabelEncoder. Assumes that input X to fit and transform is a single column matrix of categorical values.

class IntegerEncoder(BaseEstimator, TransformerMixin):
    """Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single
    column matrix of categorical values."""
    def fit(self, X, y=None):
        # create label encoder
        M = X[:, 0]
        self.encoder = LabelEncoder()
        self.encoder.fit(M)
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X[:,0])[:, np.newaxis]
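
A short sketch of how IntegerEncoder maps a single categorical column to integer codes; the toy data is illustrative, and LabelEncoder assigns codes in sorted class order:

import numpy as np
from noxer.preprocessing import IntegerEncoder

X = np.array([['low'], ['high'], ['medium'], ['low']])

enc = IntegerEncoder().fit(X)
enc.transform(X)
# -> [[1], [0], [2], [1]]
#    classes sorted: 'high' -> 0, 'low' -> 1, 'medium' -> 2; the trailing
#    np.newaxis keeps the single-column matrix shape.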

Ancestors (in MRO)

  • IntegerEncoder
  • sklearn.base.BaseEstimator
  • sklearn.base.TransformerMixin
  • builtins.object

Methods

def fit(self, X, y=None)

def fit(self, X, y=None):
    # create label encoder
    M = X[:, 0]
    self.encoder = LabelEncoder()
    self.encoder.fit(M)
    return self

def fit_transform(self, X, y=None, **fit_params)

Fit to data, then transform it.

Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.

Parameters

X : numpy array of shape [n_samples, n_features]
    Training set.

y : numpy array of shape [n_samples]
    Target values.

Returns

X_new : numpy array of shape [n_samples, n_features_new]
    Transformed array.

def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.
    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.
    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.
    y : numpy array of shape [n_samples]
        Target values.
    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # non-optimized default implementation; override when a better
    # method is possible for a given clustering algorithm
    if y is None:
        # fit method of arity 1 (unsupervised transformation)
        return self.fit(X, **fit_params).transform(X)
    else:
        # fit method of arity 2 (supervised transformation)
        return self.fit(X, y, **fit_params).transform(X)

def get_params(self, deep=True)

Get parameters for this estimator.

Parameters

deep : boolean, optional
    If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returns

params : mapping of string to any
    Parameter names mapped to their values.

def get_params(self, deep=True):
    """Get parameters for this estimator.
    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.
    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            warnings.filters.pop(0)
        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out

def set_params(self, **params)

Set the parameters of this estimator.

The method works on simple estimators as well as on nested objects (such as pipelines). The latter have parameters of the form <component>__<parameter> so that it's possible to update each component of a nested object.

Returns

self

def set_params(self, **params):
    """Set the parameters of this estimator.
    The method works on simple estimators as well as on nested objects
    (such as pipelines). The latter have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.
    Returns
    -------
    self
    """
    if not params:
        # Simple optimization to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)
    nested_params = defaultdict(dict)  # grouped by prefix
    for key, value in params.items():
        key, delim, sub_key = key.partition('__')
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (key, self))
        if delim:
            nested_params[key][sub_key] = value
        else:
            setattr(self, key, value)
    for key, sub_params in nested_params.items():
        valid_params[key].set_params(**sub_params)
    return self

def transform(self, X, y=None)

def transform(self, X, y=None):
    return self.encoder.transform(X[:,0])[:, np.newaxis]

class OneHotEncoder

Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single column matrix of categorical values.

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single
    column matrix of categorical values."""
    def fit(self, X, y=None):
        # create label encoder
        M = [x[0] for x in X]
        self.encoder = LabelBinarizer()
        self.encoder.fit(M)
        return self

    def transform(self, X, y=None):
        return self.encoder.transform(X[:,0])
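
A short sketch of the expected output on a single categorical column; the toy data is illustrative. Note that with exactly two classes LabelBinarizer, and hence this wrapper, produces a single 0/1 column rather than two indicator columns:

import numpy as np
from noxer.preprocessing import OneHotEncoder

X = np.array([['red'], ['green'], ['blue'], ['red']])

enc = OneHotEncoder().fit(X)
enc.transform(X)
# classes are sorted: 'blue', 'green', 'red'
# -> [[0, 0, 1],
#     [0, 1, 0],
#     [1, 0, 0],
#     [0, 0, 1]]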

Ancestors (in MRO)

  • OneHotEncoder
  • sklearn.base.BaseEstimator
  • sklearn.base.TransformerMixin
  • builtins.object

Methods

def fit(self, X, y=None)

def fit(self, X, y=None):
    # create label encoder
    M = [x[0] for x in X]
    self.encoder = LabelBinarizer()
    self.encoder.fit(M)
    return self

def fit_transform(self, X, y=None, **fit_params)

Fit to data, then transform it.

Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.

Parameters

X : numpy array of shape [n_samples, n_features]
    Training set.

y : numpy array of shape [n_samples]
    Target values.

Returns

X_new : numpy array of shape [n_samples, n_features_new]
    Transformed array.

def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.
    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.
    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.
    y : numpy array of shape [n_samples]
        Target values.
    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # non-optimized default implementation; override when a better
    # method is possible for a given clustering algorithm
    if y is None:
        # fit method of arity 1 (unsupervised transformation)
        return self.fit(X, **fit_params).transform(X)
    else:
        # fit method of arity 2 (supervised transformation)
        return self.fit(X, y, **fit_params).transform(X)

def get_params(self, deep=True)

Get parameters for this estimator.

Parameters

deep : boolean, optional
    If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returns

params : mapping of string to any
    Parameter names mapped to their values.

def get_params(self, deep=True):
    """Get parameters for this estimator.
    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.
    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            warnings.filters.pop(0)
        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out

def set_params(self, **params)

Set the parameters of this estimator.

The method works on simple estimators as well as on nested objects (such as pipelines). The latter have parameters of the form <component>__<parameter> so that it's possible to update each component of a nested object.

Returns

self

def set_params(self, **params):
    """Set the parameters of this estimator.
    The method works on simple estimators as well as on nested objects
    (such as pipelines). The latter have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.
    Returns
    -------
    self
    """
    if not params:
        # Simple optimization to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)
    nested_params = defaultdict(dict)  # grouped by prefix
    for key, value in params.items():
        key, delim, sub_key = key.partition('__')
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (key, self))
        if delim:
            nested_params[key][sub_key] = value
        else:
            setattr(self, key, value)
    for key, sub_params in nested_params.items():
        valid_params[key].set_params(**sub_params)
    return self

def transform(self, X, y=None)

def transform(self, X, y=None):
    return self.encoder.transform(X[:,0])