noxer.preprocessing module
Feature preprocessing of data, such as expanding categorical features to numerical ones.
""" Feature preprocessing of data, such as expanding categorical features to numerical ones. """ from sklearn.base import ClassifierMixin, BaseEstimator, TransformerMixin from sklearn.preprocessing import LabelBinarizer, LabelEncoder import numpy as np class ColumnSelector(BaseEstimator, TransformerMixin): """Selects a single column with index `key` from some matrix X""" def __init__(self, key, row_space=True, as_matrix=True): self.key = key self.row_space = row_space self.as_matrix = as_matrix def fit(self, X, y=None): return self # do nothing during fitting procedure def transform(self, data_matrix): # return a matrix with single column if self.row_space: R = data_matrix[:, [self.key]] # eg numpy array else: R = data_matrix[[self.key]] # eg pandas dataframe R = np.array(R) if not self.as_matrix: R = R[:, 0] return R class OneHotEncoder(BaseEstimator, TransformerMixin): """Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single column matrix of categorical values.""" def fit(self, X, y=None): # create label encoder M = [x[0] for x in X] self.encoder = LabelBinarizer() self.encoder.fit(M) return self def transform(self, X, y=None): return self.encoder.transform(X[:,0]) class IntegerEncoder(BaseEstimator, TransformerMixin): """Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single column matrix of categorical values.""" def fit(self, X, y=None): # create label encoder M = X[:, 0] self.encoder = LabelEncoder() self.encoder.fit(M) return self def transform(self, X, y=None): return self.encoder.transform(X[:,0])[:, np.newaxis]
Classes
class ColumnSelector
Selects a single column with index key
from some matrix X
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Selects a single column with index `key` from some matrix X"""

    def __init__(self, key, row_space=True, as_matrix=True):
        self.key = key
        self.row_space = row_space
        self.as_matrix = as_matrix

    def fit(self, X, y=None):
        # Nothing is learned from the data; fitting is a no-op.
        return self

    def transform(self, data_matrix):
        """Pick out the configured column and return it as a numpy array."""
        picked = (
            data_matrix[:, [self.key]]  # positional, eg numpy array
            if self.row_space
            else data_matrix[[self.key]]  # by label, eg pandas dataframe
        )
        picked = np.array(picked)
        return picked if self.as_matrix else picked[:, 0]
Ancestors (in MRO)
- ColumnSelector
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Methods
def __init__(
self, key, row_space=True, as_matrix=True)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, key, row_space=True, as_matrix=True):
    """Remember the selection settings; no validation is performed here."""
    self.key, self.row_space, self.as_matrix = key, row_space, as_matrix
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """No-op fit: the selector is stateless, so nothing is learned."""
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # Default implementation: fit on the data, then transform that same
    # data. Override when a more efficient combined path exists.
    if y is None:
        # unsupervised transformation: fit method of arity 1
        fitted = self.fit(X, **fit_params)
    else:
        # supervised transformation: fit method of arity 2
        fitted = self.fit(X, y, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    # NOTE(review): relies on a module-level `warnings` import (present in
    # sklearn.base, not visible in this dump) — confirm if relocating.
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # undo the simplefilter above so global state is restored
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # flatten nested estimator params as "<name>__<param>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    The method works on simple estimators as well as on nested objects
    (such as pipelines). The latter have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.

    Returns
    -------
    self
    """
    # NOTE(review): relies on a module-level `defaultdict` import (present
    # in sklearn.base, not visible in this dump) — confirm if relocating.
    if not params:
        # Simple optimization to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)

    nested_params = defaultdict(dict)  # grouped by prefix
    for key, value in params.items():
        # split "component__parameter" into its two halves
        key, delim, sub_key = key.partition('__')
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (key, self))
        if delim:
            # defer nested assignment until all simple params are set
            nested_params[key][sub_key] = value
        else:
            setattr(self, key, value)

    for key, sub_params in nested_params.items():
        valid_params[key].set_params(**sub_params)

    return self
def transform(
self, data_matrix)
def transform(self, data_matrix):
    """Return the configured column of `data_matrix` as a numpy array."""
    if self.row_space:
        column = data_matrix[:, [self.key]]  # positional, eg numpy array
    else:
        column = data_matrix[[self.key]]  # by label, eg pandas dataframe
    column = np.array(column)
    # Collapse to 1d unless a single-column matrix was requested.
    return column if self.as_matrix else column[:, 0]
Instance variables
var as_matrix
var key
var row_space
class IntegerEncoder
Wrapper around LabelEncoder. Assumes that input X to fit and transform is a single column matrix of categorical values.
class IntegerEncoder(BaseEstimator, TransformerMixin):
    """Wrapper around LabelEncoder. Assumes that input X to fit and
    transform is a single column matrix of categorical values."""

    def fit(self, X, y=None):
        # create label encoder over the single (first) column
        M = X[:, 0]
        self.encoder = LabelEncoder()
        self.encoder.fit(M)
        return self

    def transform(self, X, y=None):
        # LabelEncoder yields a 1d array; restore the column-matrix shape.
        return self.encoder.transform(X[:, 0])[:, np.newaxis]
Ancestors (in MRO)
- IntegerEncoder
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Methods
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """Fit a LabelEncoder on the first (only) column of X."""
    self.encoder = LabelEncoder()
    self.encoder.fit(X[:, 0])
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # Default implementation: fit on the data, then transform that same
    # data. Override when a more efficient combined path exists.
    if y is None:
        # unsupervised transformation: fit method of arity 1
        fitted = self.fit(X, **fit_params)
    else:
        # supervised transformation: fit method of arity 2
        fitted = self.fit(X, y, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    # NOTE(review): relies on a module-level `warnings` import (present in
    # sklearn.base, not visible in this dump) — confirm if relocating.
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # undo the simplefilter above so global state is restored
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # flatten nested estimator params as "<name>__<param>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    The method works on simple estimators as well as on nested objects
    (such as pipelines). The latter have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.

    Returns
    -------
    self
    """
    # NOTE(review): relies on a module-level `defaultdict` import (present
    # in sklearn.base, not visible in this dump) — confirm if relocating.
    if not params:
        # Simple optimization to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)

    nested_params = defaultdict(dict)  # grouped by prefix
    for key, value in params.items():
        # split "component__parameter" into its two halves
        key, delim, sub_key = key.partition('__')
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (key, self))
        if delim:
            # defer nested assignment until all simple params are set
            nested_params[key][sub_key] = value
        else:
            setattr(self, key, value)

    for key, sub_params in nested_params.items():
        valid_params[key].set_params(**sub_params)

    return self
def transform(
self, X, y=None)
def transform(self, X, y=None):
    """Integer-encode the first column of X, returned as a column vector."""
    encoded = self.encoder.transform(X[:, 0])
    return encoded[:, np.newaxis]
class OneHotEncoder
Wrapper around LabelBinarizer. Assumes that input X to fit and transform is a single column matrix of categorical values.
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Wrapper around LabelBinarizer. Assumes that input X to fit and
    transform is a single column matrix of categorical values."""

    def fit(self, X, y=None):
        # Extract the single column with numpy indexing, consistent with
        # transform() and with IntegerEncoder.fit.
        M = X[:, 0]
        self.encoder = LabelBinarizer()
        self.encoder.fit(M)
        return self

    def transform(self, X, y=None):
        # Binarize the first (only) column; result is a 2d one-hot matrix.
        return self.encoder.transform(X[:, 0])
Ancestors (in MRO)
- OneHotEncoder
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Methods
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """Fit a LabelBinarizer on the single (first) column of X."""
    # Use numpy column indexing, consistent with transform(), instead of
    # a Python-level list comprehension over the rows.
    M = X[:, 0]
    self.encoder = LabelBinarizer()
    self.encoder.fit(M)
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # Default implementation: fit on the data, then transform that same
    # data. Override when a more efficient combined path exists.
    if y is None:
        # unsupervised transformation: fit method of arity 1
        fitted = self.fit(X, **fit_params)
    else:
        # supervised transformation: fit method of arity 2
        fitted = self.fit(X, y, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    # NOTE(review): relies on a module-level `warnings` import (present in
    # sklearn.base, not visible in this dump) — confirm if relocating.
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # undo the simplefilter above so global state is restored
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # flatten nested estimator params as "<name>__<param>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    The method works on simple estimators as well as on nested objects
    (such as pipelines). The latter have parameters of the form
    ``<component>__<parameter>`` so that it's possible to update each
    component of a nested object.

    Returns
    -------
    self
    """
    # NOTE(review): relies on a module-level `defaultdict` import (present
    # in sklearn.base, not visible in this dump) — confirm if relocating.
    if not params:
        # Simple optimization to gain speed (inspect is slow)
        return self
    valid_params = self.get_params(deep=True)

    nested_params = defaultdict(dict)  # grouped by prefix
    for key, value in params.items():
        # split "component__parameter" into its two halves
        key, delim, sub_key = key.partition('__')
        if key not in valid_params:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (key, self))
        if delim:
            # defer nested assignment until all simple params are set
            nested_params[key][sub_key] = value
        else:
            setattr(self, key, value)

    for key, sub_params in nested_params.items():
        valid_params[key].set_params(**sub_params)

    return self
def transform(
self, X, y=None)
def transform(self, X, y=None):
    """One-hot encode the first column of X via the fitted binarizer."""
    first_column = X[:, 0]
    return self.encoder.transform(first_column)