noxer.sequences module
Sequence preprocessing functionality. Extends sklearn transformers to sequences.
""" Sequence preprocessing functionality. Extends sklearn transformers to sequences. """ import numpy as np from sklearn.base import ClassifierMixin, BaseEstimator, TransformerMixin, clone from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.metrics import accuracy_score from sklearn.svm import LinearSVC # A set of procedures for preprocessing of sequences def make_subsequences(x, y, step=1, max_len=2 ** 31): """ Creates views to all subsequences of the sequence x. For example if x = [1,2,3,4] y = [1,1,0,0] step = 1 the result is a tuple a, b, where: a = [[1], [1,2], [1,2,3], [1,2,3,4] ] b = [1,1,0,0] Note that only a view into x is created, but not a copy of elements of x. Parameters ---------- X : array [seq_length, n_features] y : numpy array of shape [n_samples] Target values. Can be string, float, int etc. step : int Step with which to subsample the sequence. max_len : int, default 2 ** 31 Step with which to subsample the sequence. Returns ------- a, b : a is all subsequences of x taken with some step, and b is labels assigned to these sequences. """ r = range(step-1, len(x), step) X = [] Y = [] for i in r: start = max(0, i - max_len) stop = i+1 X.append(x[start:stop]) Y.append(y[i]) return X, Y class PadSubsequence(BaseEstimator, TransformerMixin): """ Takes subsequences of fixed length from input list of sequences. If sequence is not long enough, it is left padded with zeros. Parameters ---------- length : float, length of the subsequence to take """ def __init__(self, length=10, step=1): self.length = length self.step = step def _check_input(self, X): if len(X.shape) < 2: raise ValueError("The input should be a sequence, found shape %s" % X.shape) def fit(self,X,y=None): # remeber the num. 
of features self.n_features = X[0].shape[-1] return self def transform(self, X, y=None): if not hasattr(self, 'step'): self.step = 1 # X might be a list R = [] for x in X: if len(x) >= self.length: R.append(x[-self.length::self.step]) else: z = np.zeros((self.length - len(x), x.shape[-1])) zx = np.row_stack((z,x)) zx = zx[::self.step] R.append(zx) R = np.array(R) return R class CalculateSpectrum(BaseEstimator, TransformerMixin): """Calculates spectrum of sequence. """ def __init__(self, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std def fit(self, X, y=None): return self def transform(self, X, y=None): """Perform fft on sequence along every feature Parameters ---------- X : array-like, shape [n_samples, seq_len, n_features] The data used to fft along the features axis. """ from scipy import fftpack X = abs(fftpack.fft(X, axis=1)) return X class FlattenShape(BaseEstimator, TransformerMixin): """ Flattens the shape of samples to a single vector. This is useful in cases when "classic" models like SVM are used. Parameters ---------- """ def fit(self, X, y=None): self.shape = X[0].shape return self def transform(self, X, y=None): V = np.array([np.ravel(x) for x in X]) return V def inverse_transform(self, X, y=None): V = np.array([np.reshape(x, self.shape) for x in X]) return V # Wrapper for the standard classes of sklearn to work with sequence labeling class SequenceTransformer(BaseEstimator, TransformerMixin): def __init__(self, transformer, mode='stack'): """ Applies transformer to every element in input sequence. transformer: TransformerMixin mode: How to preprocess sequences for transformer fitting. default: stack all sequences into one huge sequence so that then it looks like a normal 2d training set """ self.transformer = transformer self.mode = mode self.transformer_ = None def fit(self, X, y=None): """ Fit base transformer to the set of sequences. X: iterable of shape [n_samples, ...] 
y: iterable of shape [n_samples, ...] """ # stack all the elements into one huge dataset self.transformer_ = clone(self.transformer) if self.mode == 'stack': X_conc = np.row_stack(x for x in X) # might have bugs here in future :( if y is not None: y_conc = np.concatenate([[v] * len(x) for x, v in zip(X, y)]) else: X_conc = X y_conc = y if y is None: self.transformer_.fit(X_conc) else: self.transformer_.fit(X_conc, y_conc) return self def transform(self, X, y=None): if y is None: result = [self.transformer_.transform(xx) for xx in X] else: result = [self.transformer_.transform(xx, [yy] * len(xx)) for xx, yy in zip(X, y)] result = np.array(result) return result def set_params(self, **params): self.base_transformer.set_params(**params) return self class Seq1Dto2D(BaseEstimator, TransformerMixin): """ Useful for working with text sequences. Such sequence is just a list of characters. This converts a sequence of elements to a sequence of lists of size 1 of characters. So "abc" -> [['a'], ['b'], ['c']] Useful for applications where you do not want to convert text to features explicitly. """ def __init__(self): pass def fit(self, X, y=None): return self def transform(self, X, y=None): return [np.array(list(xx))[:, np.newaxis] for xx in X] class Subsequensor(BaseEstimator, TransformerMixin): """ Creates views in all subsequences of a numpy sequence. Parameters ---------- step: int, step with which the subsequences are taken. max_subsequence: int or None, maximum subsequence size that is used in order to predict a certain output value. """ def __init__(self, step, max_subsequence=None): self.step = step self.max_subsequence = max_subsequence def fit(self, X, Y): """Fit the transformer according to the given training data. Parameters ---------- X : list of numpy arrays List of sequences, where every sequence is a 2d numpy array, where the first dimension corresponds to time, and last for features. Y : list of object List of labels assigned to corresponding sequences in X. 
Returns ------- self : object Returns self. """ return self def transform(self, X, Y=None): """Transform the input data. Parameters ---------- X : list of numpy arrays List of sequences, where every sequence is a 2d numpy array, where the first dimension corresponds to time, and last for features. Y : list of object List of labels assigned to corresponding sequences in X. Returns ------- X : list Returns list of views into the sequences. """ test_time = Y is None if test_time: Y = [[None]*len(x) for x in X] if self.max_subsequence is None: args = (self.step, ) else: args = (self.step, self.max_subsequence) XY = [make_subsequences(*((x, y, ) + args)) for x, y in zip(X, Y)] X = [z[0] for z in XY] if test_time: return X return X, [z[1] for z in XY] class SequenceEstimator(BaseEstimator): """ This generic estimator class can be used to label every element in a sequence using underlying subsequence estimator. One example would be labeling which parts of sensory data correspond to what kind of activity of the user. Consider the following example: X = [[1,2,3]] y = [[0,0,1]] fit() will train the estimator to classify properly the following data: X = [[1], [1,2], [1,2,3]] y = [[0, 0, 1]] predict() on X will return labels for every element in a sequence. Parameters ---------- estimator: BaseEstimator, model which is used to do estimations on subsequences. step: int, step with which the subsequences are taken for training of internal sestimator. 
""" def __init__(self, estimator, step=1, max_subsequence=None): self.estimator = estimator self.step = step self.max_subsequence = max_subsequence self.subsequencer = None # class instance that is responsible for getting views into the sequence def set_params(self, **params): step_name = self.__class__.__name__.lower() + "__step" if step_name in params: self.step = params[step_name] params = params.copy() del params[step_name] self.estimator.set_params(**params) return self def fit(self, X, y): X, y = Subsequensor(step=self.step, max_subsequence=self.max_subsequence).transform(X, y) X, y = sum(X, []), sum(y, []) # concat all data together self.estimator.fit(X, y) return self def predict(self, X): X = Subsequensor(step=1).transform(X) R = [self.estimator.predict(x) for x in X] return R def score(self, X, y): X, y = Subsequensor(step=self.step, max_subsequence=self.max_subsequence).transform(X, y) X, y = sum(X, []), sum(y, []) # concat all data together return self.estimator.score(X, y) # Classes that work with sequences directly # Readers def read_wav(filename, mono=False): """ Reads a wav file into a sequence of vectors, which represent the intensity of sound at some time. Every vector has a lenght of 1 if mono mode is used, else 2. Parameters ---------- filename : string, file to read mono: bool, whether to read audio as mono or stereo. Mono files are always read as mono. Returns ------- numpy array containing sequence of audio intensities. 
""" import scipy.io.wavfile as scw framerate, data = scw.read(filename) if len(data.shape) < 2: data = data[:,np.newaxis] if mono: data = np.mean(data, axis=1) data = data[:,np.newaxis] return data # Example pipelines def rnn_pipe(): pipe = make_pipeline( PadSubsequence(), RNNClassifier() ) grid = [ { "paddedsubsequence__length":[2,4], "rnnclassifier__n_neurons":[32] } ] return pipe, grid def svm_pipe(): pipe = make_pipeline( PadSubsequence(), FlattenShape(), StandardScaler(), LinearSVC(), ) grid = [ { "paddedsubsequence__length":[1,2,4,8,16], "linearsvc__C":10 ** np.linspace(-10, 10, 51) } ] return pipe, grid if __name__ == "__main__": pass
Functions
def make_subsequences(
x, y, step=1, max_len=2147483648)
Creates views to all subsequences of the sequence x. For example if x = [1,2,3,4] y = [1,1,0,0] step = 1 the result is a tuple a, b, where: a = [[1], [1,2], [1,2,3], [1,2,3,4] ] b = [1,1,0,0]
Note that only a view into x is created, but not a copy of elements of x.
Parameters
X : array [seq_length, n_features]
y : numpy array of shape [n_samples] Target values. Can be string, float, int etc.
step : int Step with which to subsample the sequence.
max_len : int, default 2 ** 31 Limits how far back from its last element each subsequence may extend.
Returns
a, b : a is all subsequences of x taken with some step, and b is labels assigned to these sequences.
def make_subsequences(x, y, step=1, max_len=2 ** 31):
    """
    Creates views to all subsequences of the sequence x. For example if
    x = [1,2,3,4]
    y = [1,1,0,0]
    step = 1
    the result is a tuple a, b, where:
    a = [[1], [1,2], [1,2,3], [1,2,3,4]]
    b = [1,1,0,0]

    Note that for numpy input only a view into x is created, but not a copy
    of elements of x.

    Parameters
    ----------
    x : array [seq_length, n_features]

    y : sequence of length seq_length
        Target value for every element of x. Can be string, float, int etc.

    step : int
        Step with which to subsample the sequence.

    max_len : int, default 2 ** 31
        Maximum number of elements in any returned subsequence.

    Returns
    -------
    a, b : a is all subsequences of x taken with some step, and b is labels
        assigned to these sequences.
    """
    X = []
    Y = []
    for i in range(step - 1, len(x), step):
        stop = i + 1
        # measure the window back from `stop` so it holds at most max_len
        # elements (measuring from `i` yielded max_len + 1)
        start = max(0, stop - max_len)
        X.append(x[start:stop])
        Y.append(y[i])
    return X, Y
def read_wav(
filename, mono=False)
Reads a wav file into a sequence of vectors, which represent the intensity of sound at some time. Every vector has a length of 1 if mono mode is used, else 2.
Parameters
filename : string, file to read
mono: bool, whether to read audio as mono or stereo. Mono files are always read as mono.
Returns
numpy array containing sequence of audio intensities.
def read_wav(filename, mono=False):
    """
    Load a wav file as a 2d array of samples: one row per time step and
    one column per channel.

    Parameters
    ----------
    filename : string, file to read

    mono: bool, if True the channels are averaged into a single column.
        Single-channel files always produce a single column.

    Returns
    -------
    numpy array containing sequence of audio intensities.
    """
    from scipy.io import wavfile

    # the sample rate is read but not returned
    _rate, samples = wavfile.read(filename)

    # single-channel data comes back 1d; give it an explicit channel axis
    if samples.ndim < 2:
        samples = samples[:, np.newaxis]

    if not mono:
        return samples

    averaged = np.mean(samples, axis=1)
    return averaged[:, np.newaxis]
def rnn_pipe(
)
def rnn_pipe():
    """Return an example (pipeline, parameter grid) pair for an RNN model.

    Returns
    -------
    pipe, grid : the pipeline PadSubsequence -> RNNClassifier and a list of
        parameter grids whose keys address the steps of that pipeline.
    """
    # NOTE(review): RNNClassifier is neither defined nor imported in the
    # visible module -- it has to be brought into scope before calling this.
    pipe = make_pipeline(
        PadSubsequence(),
        RNNClassifier()
    )
    # make_pipeline names every step after its lowercased class name, so the
    # padding step is "padsubsequence" (not "paddedsubsequence")
    grid = [
        {
            "padsubsequence__length": [2, 4],
            "rnnclassifier__n_neurons": [32]
        }
    ]
    return pipe, grid
def svm_pipe(
)
def svm_pipe():
    """Return an example (pipeline, parameter grid) pair for a linear SVM.

    Returns
    -------
    pipe, grid : the pipeline PadSubsequence -> FlattenShape ->
        StandardScaler -> LinearSVC and a list of parameter grids whose keys
        address the steps of that pipeline.
    """
    pipe = make_pipeline(
        PadSubsequence(),
        FlattenShape(),
        StandardScaler(),
        LinearSVC(),
    )
    # make_pipeline names every step after its lowercased class name, so the
    # padding step is "padsubsequence" (not "paddedsubsequence")
    grid = [
        {
            "padsubsequence__length": [1, 2, 4, 8, 16],
            "linearsvc__C": 10 ** np.linspace(-10, 10, 51)
        }
    ]
    return pipe, grid
Classes
class CalculateSpectrum
Calculates spectrum of sequence.
class CalculateSpectrum(BaseEstimator, TransformerMixin):
    """Calculates spectrum of sequence.

    Parameters
    ----------
    copy, with_mean, with_std : stored on the instance but not used by
        `transform`.
    """

    def __init__(self, copy=True, with_mean=True, with_std=True):
        # every constructor argument must be kept as an attribute, otherwise
        # get_params()/clone() cannot round-trip the estimator
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X, y=None):
        """Return the magnitude of the FFT of every sequence.

        Parameters
        ----------
        X : array-like, shape [n_samples, seq_len, n_features]
            The FFT is taken along axis 1 (the sequence axis).
        """
        from scipy import fftpack
        return abs(fftpack.fft(X, axis=1))
Ancestors (in MRO)
- CalculateSpectrum
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Static methods
def __init__(
self, copy=True, with_mean=True, with_std=True)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, copy=True, with_mean=True, with_std=True):
    """Store the constructor arguments on the instance.

    `copy` used to be accepted but dropped; it is now kept like the other
    arguments so that parameter introspection sees the value passed in.
    """
    self.copy = copy
    self.with_mean = with_mean
    self.with_std = with_std
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """Do nothing and return the instance itself.

    X and y are accepted but ignored.
    """
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Calls `fit` with X (and y when it is not None) plus `fit_params`, then
    returns `transform(X)` of the object that `fit` returned.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # y is only forwarded when given, so fit methods that take X alone
    # (unsupervised transformation) keep working
    positional = (X,) if y is None else (X, y)
    fitted = self.fit(*positional, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # record warnings raised while reading the attribute;
            # a missing attribute reads as None
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # drops the first global filter entry -- presumably the one
            # added by simplefilter above (runs on `continue` as well)
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # expose nested parameters as "<key>__<nested name>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    Plain names are assigned as attributes of this object. Names of the
    form ``<component>__<parameter>`` are grouped by component and handed
    to the ``set_params`` method of that component, so that every part of
    a nested object (such as a pipeline) can be updated.

    Returns
    -------
    self
    """
    if not params:
        # nothing to do; skips the comparatively slow get_params call
        return self

    known = self.get_params(deep=True)
    per_component = defaultdict(dict)  # component name -> its parameters

    for full_name, value in params.items():
        name, separator, remainder = full_name.partition('__')
        if name not in known:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (name, self))
        if separator:
            per_component[name][remainder] = value
        else:
            setattr(self, name, value)

    for name, component_params in per_component.items():
        known[name].set_params(**component_params)

    return self
def transform(
self, X, y=None)
Perform fft on sequence along every feature
Parameters
X : array-like, shape [n_samples, seq_len, n_features] The data to transform; the FFT is taken along axis 1 (the sequence axis), separately for every feature.
def transform(self, X, y=None):
    """Return the magnitude of the FFT of every sequence.

    Parameters
    ----------
    X : array-like, shape [n_samples, seq_len, n_features]
        The FFT is taken along axis 1 (the sequence axis), separately for
        every feature.
    """
    from scipy import fftpack

    spectrum = fftpack.fft(X, axis=1)
    # abs() of the complex coefficients gives their magnitudes
    return abs(spectrum)
Instance variables
var with_mean
var with_std
class FlattenShape
Flattens the shape of samples to a single vector. This is useful in cases when "classic" models like SVM are used.
Parameters
class FlattenShape(BaseEstimator, TransformerMixin):
    """
    Turns every sample into a single flat vector, which is handy when
    "classic" models like SVM are used on multi-dimensional samples.
    """

    def fit(self, X, y=None):
        """Remember the shape of the first sample for inverse_transform."""
        first = X[0]
        self.shape = first.shape
        return self

    def transform(self, X, y=None):
        """Return an array holding the flattened version of every sample."""
        flattened = [np.ravel(sample) for sample in X]
        return np.array(flattened)

    def inverse_transform(self, X, y=None):
        """Reshape every flat sample back to the shape seen in fit."""
        target = self.shape
        restored = [np.reshape(flat, target) for flat in X]
        return np.array(restored)
Ancestors (in MRO)
- FlattenShape
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Static methods
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """Remember the shape of the first sample in X and return self.

    The stored `shape` attribute is what inverse_transform reshapes to.
    """
    first = X[0]
    self.shape = first.shape
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Calls `fit` with X (and y when it is not None) plus `fit_params`, then
    returns `transform(X)` of the object that `fit` returned.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # y is only forwarded when given, so fit methods that take X alone
    # (unsupervised transformation) keep working
    positional = (X,) if y is None else (X, y)
    fitted = self.fit(*positional, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # record warnings raised while reading the attribute;
            # a missing attribute reads as None
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # drops the first global filter entry -- presumably the one
            # added by simplefilter above (runs on `continue` as well)
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # expose nested parameters as "<key>__<nested name>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def inverse_transform(
self, X, y=None)
def inverse_transform(self, X, y=None):
    """Reshape every flat sample in X to the shape stored by fit.

    Returns a numpy array built from the reshaped samples.
    """
    target = self.shape
    restored = [np.reshape(flat, target) for flat in X]
    return np.array(restored)
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    Plain names are assigned as attributes of this object. Names of the
    form ``<component>__<parameter>`` are grouped by component and handed
    to the ``set_params`` method of that component, so that every part of
    a nested object (such as a pipeline) can be updated.

    Returns
    -------
    self
    """
    if not params:
        # nothing to do; skips the comparatively slow get_params call
        return self

    known = self.get_params(deep=True)
    per_component = defaultdict(dict)  # component name -> its parameters

    for full_name, value in params.items():
        name, separator, remainder = full_name.partition('__')
        if name not in known:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (name, self))
        if separator:
            per_component[name][remainder] = value
        else:
            setattr(self, name, value)

    for name, component_params in per_component.items():
        known[name].set_params(**component_params)

    return self
def transform(
self, X, y=None)
def transform(self, X, y=None):
    """Flatten every sample in X and return the results as one array."""
    flattened = [np.ravel(sample) for sample in X]
    return np.array(flattened)
class PadSubsequence
Takes subsequences of fixed length from input list of sequences. If sequence is not long enough, it is left padded with zeros.
Parameters
length : int, length of the subsequence to take
class PadSubsequence(BaseEstimator, TransformerMixin):
    """
    Takes subsequences of fixed length from input list of sequences.
    If sequence is not long enough, it is left padded with zeros.

    Parameters
    ----------
    length : int, length of the subsequence to take

    step : int, stride with which the fixed-length window is subsampled
    """
    def __init__(self, length=10, step=1):
        self.length = length
        self.step = step

    def _check_input(self, X):
        """Raise ValueError if X has fewer than two dimensions."""
        if len(X.shape) < 2:
            # wrap the shape in a 1-tuple: a bare tuple on the right of %
            # is unpacked as the argument list
            raise ValueError("The input should be a sequence, found shape %s" % (X.shape,))

    def fit(self, X, y=None):
        """Remember the number of features of the first sequence."""
        self.n_features = X[0].shape[-1]
        return self

    def transform(self, X, y=None):
        """Return an array with the last `length` steps of every sequence.

        Shorter sequences are left padded with zeros; the resulting window
        is then subsampled with stride `step`.
        """
        # presumably guards instances created before `step` existed -- confirm
        if not hasattr(self, 'step'):
            self.step = 1

        # X might be a list
        R = []
        for x in X:
            if len(x) >= self.length:
                R.append(x[-self.length::self.step])
            else:
                z = np.zeros((self.length - len(x), x.shape[-1]))
                zx = np.vstack((z, x))  # np.row_stack is a deprecated alias
                R.append(zx[::self.step])
        return np.array(R)
Ancestors (in MRO)
- PadSubsequence
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Static methods
def __init__(
self, length=10, step=1)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, length=10, step=1):
    """Store the window length and the subsampling step unchanged."""
    self.step = step
    self.length = length
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """Remember the size of the last axis of the first sequence in X.

    The value is stored as `n_features`; returns self.
    """
    first = X[0]
    self.n_features = first.shape[-1]
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Calls `fit` with X (and y when it is not None) plus `fit_params`, then
    returns `transform(X)` of the object that `fit` returned.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # y is only forwarded when given, so fit methods that take X alone
    # (unsupervised transformation) keep working
    positional = (X,) if y is None else (X, y)
    fitted = self.fit(*positional, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # record warnings raised while reading the attribute;
            # a missing attribute reads as None
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # drops the first global filter entry -- presumably the one
            # added by simplefilter above (runs on `continue` as well)
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # expose nested parameters as "<key>__<nested name>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    Plain names are assigned as attributes of this object. Names of the
    form ``<component>__<parameter>`` are grouped by component and handed
    to the ``set_params`` method of that component, so that every part of
    a nested object (such as a pipeline) can be updated.

    Returns
    -------
    self
    """
    if not params:
        # nothing to do; skips the comparatively slow get_params call
        return self

    known = self.get_params(deep=True)
    per_component = defaultdict(dict)  # component name -> its parameters

    for full_name, value in params.items():
        name, separator, remainder = full_name.partition('__')
        if name not in known:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (name, self))
        if separator:
            per_component[name][remainder] = value
        else:
            setattr(self, name, value)

    for name, component_params in per_component.items():
        known[name].set_params(**component_params)

    return self
def transform(
self, X, y=None)
def transform(self, X, y=None):
    """Return an array with the last `length` steps of every sequence.

    Sequences shorter than `length` are left padded with rows of zeros.
    The resulting window is then subsampled with stride `step`.

    Parameters
    ----------
    X : iterable of numpy arrays, one array per sequence, with time on the
        first axis.

    y : ignored.

    Returns
    -------
    numpy array built from the fixed-length windows.
    """
    # presumably guards instances created before `step` existed -- confirm
    if not hasattr(self, 'step'):
        self.step = 1

    # X might be a list
    R = []
    for x in X:
        if len(x) >= self.length:
            R.append(x[-self.length::self.step])
        else:
            z = np.zeros((self.length - len(x), x.shape[-1]))
            # np.vstack replaces np.row_stack, a deprecated alias of it
            zx = np.vstack((z, x))
            R.append(zx[::self.step])
    return np.array(R)
Instance variables
var length
var step
class Seq1Dto2D
Useful for working with text sequences. Such sequence is just a list of characters. This converts a sequence of elements to a sequence of lists of size 1 of characters. So "abc" -> [['a'], ['b'], ['c']] Useful for applications where you do not want to convert text to features explicitly.
class Seq1Dto2D(BaseEstimator, TransformerMixin):
    """
    Helper for text-like sequences, i.e. sequences that are plain lists
    of characters. Every element of a sequence is wrapped into its own
    row of size 1, so
    "abc" -> [['a'], ['b'], ['c']]
    Handy when the text should not be converted to features explicitly.
    """
    def __init__(self):
        """There is nothing to configure."""
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a list with one column array per input sequence."""
        columns = []
        for sequence in X:
            elements = np.array(list(sequence))
            columns.append(np.expand_dims(elements, axis=1))
        return columns
Ancestors (in MRO)
- Seq1Dto2D
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Static methods
def __init__(
self)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self):
    """There is nothing to configure, so nothing is stored."""
    return None
def fit(
self, X, y=None)
def fit(self, X, y=None):
    """Do nothing and return the instance itself.

    X and y are accepted but ignored.
    """
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Calls `fit` with X (and y when it is not None) plus `fit_params`, then
    returns `transform(X)` of the object that `fit` returned.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # y is only forwarded when given, so fit methods that take X alone
    # (unsupervised transformation) keep working
    positional = (X,) if y is None else (X, y)
    fitted = self.fit(*positional, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # record warnings raised while reading the attribute;
            # a missing attribute reads as None
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # drops the first global filter entry -- presumably the one
            # added by simplefilter above (runs on `continue` as well)
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # expose nested parameters as "<key>__<nested name>"
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    Plain names are assigned as attributes on this object. Names of the
    form ``<component>__<parameter>`` are grouped by component and handed
    to that component's own ``set_params``.

    Returns
    -------
    self
    """
    if not params:
        # Nothing to do; skips the (slow) parameter introspection.
        return self

    known = self.get_params(deep=True)
    per_component = defaultdict(dict)  # component name -> its sub-parameters

    for full_name, value in params.items():
        name, separator, remainder = full_name.partition('__')
        if name not in known:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (name, self))
        if not separator:
            setattr(self, name, value)
            continue
        per_component[name][remainder] = value

    # Nested parameters are applied after all direct ones were assigned.
    for name, sub_params in per_component.items():
        known[name].set_params(**sub_params)

    return self
def transform(
self, X, y=None)
def transform(self, X, y=None):
    """Turn every sequence in ``X`` into an array with an extra axis at position 1.

    Each element of ``X`` is materialised as a list, converted to a numpy
    array, and given a new axis after the first one (a flat sequence of
    length n becomes an (n, 1) array). ``y`` is accepted but not used.

    Returns
    -------
    list of numpy arrays, one per element of ``X``.
    """
    columns = []
    for sequence in X:
        values = np.array(list(sequence))
        columns.append(values[:, np.newaxis])
    return columns
class SequenceEstimator
This generic estimator class can be used to label every element in a sequence using underlying subsequence estimator. One example would be labeling which parts of sensory data correspond to what kind of activity of the user.
Consider the following example:
X = [[1,2,3]] y = [[0,0,1]]
fit() will train the estimator to classify properly the following data:
X = [[1], [1,2], [1,2,3]] y = [[0, 0, 1]]
predict() on X will return labels for every element in a sequence.
Parameters
estimator: BaseEstimator, model which is used to do estimations on subsequences.
step: int, step with which the subsequences are taken for training of the internal estimator.
class SequenceEstimator(BaseEstimator):
    """
    This generic estimator class can be used to label every element in a
    sequence using underlying subsequence estimator. One example would be
    labeling which parts of sensory data correspond to what kind of
    activity of the user.

    Consider the following example:

    X = [[1,2,3]]
    y = [[0,0,1]]

    fit() will train the estimator to classify properly the following data:

    X = [[1], [1,2], [1,2,3]]
    y = [0, 0, 1]

    predict() on X will return labels for every element in a sequence.

    Parameters
    ----------
    estimator: BaseEstimator, model which is used to do estimations on
        subsequences.

    step: int, step with which the subsequences are taken for training of
        the internal estimator.

    max_subsequence: int or None, forwarded to ``Subsequensor`` to bound
        the size of the generated subsequences; None means no bound.
    """

    def __init__(self, estimator, step=1, max_subsequence=None):
        self.estimator = estimator
        self.step = step
        self.max_subsequence = max_subsequence
        self.subsequencer = None  # class instance that is responsible for getting views into the sequence

    def set_params(self, **params):
        """Set ``step`` from ``<lowercase class name>__step`` and forward
        every other parameter to the wrapped estimator.

        Returns
        -------
        self
        """
        step_name = self.__class__.__name__.lower() + "__step"
        if step_name in params:
            params = dict(params)  # do not mutate the caller's mapping
            self.step = params.pop(step_name)
        self.estimator.set_params(**params)
        return self

    def _flat_subsequences(self, X, y):
        """Return all (subsequence, label) pairs of all sequences as two flat lists."""
        subs, labels = Subsequensor(
            step=self.step, max_subsequence=self.max_subsequence
        ).transform(X, y)
        # Linear-time flattening; sum(list_of_lists, []) is quadratic.
        return ([s for seq in subs for s in seq],
                [v for seq in labels for v in seq])

    def fit(self, X, y):
        """Fit the wrapped estimator on the subsequences of every sequence in X.

        Returns
        -------
        self
        """
        X, y = self._flat_subsequences(X, y)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Predict a label for every element of every sequence in X.

        Subsequences are taken with step 1 and with the same
        ``max_subsequence`` bound that ``fit`` uses, so the estimator is
        queried on the same kind of windows it was trained on.

        Returns
        -------
        list with one entry per sequence, each being the wrapped estimator's
        predictions for the subsequences of that sequence.
        """
        X = Subsequensor(
            step=1, max_subsequence=self.max_subsequence
        ).transform(X)
        return [self.estimator.predict(x) for x in X]

    def score(self, X, y):
        """Return the wrapped estimator's score on the flattened subsequences."""
        X, y = self._flat_subsequences(X, y)
        return self.estimator.score(X, y)
Ancestors (in MRO)
- SequenceEstimator
- sklearn.base.BaseEstimator
- builtins.object
Static methods
def __init__(
self, estimator, step=1, max_subsequence=None)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, estimator, step=1, max_subsequence=None):
    """Store the constructor arguments on the instance.

    Parameters
    ----------
    estimator : model used to do estimations on subsequences.

    step : int, step with which the subsequences are taken.

    max_subsequence : int or None, bound on the subsequence size.
    """
    # Placeholder for the object that is responsible for getting views
    # into the sequence; nothing is created at construction time.
    self.subsequencer = None

    self.max_subsequence = max_subsequence
    self.step = step
    self.estimator = estimator
def fit(
self, X, y)
def fit(self, X, y):
    """Fit the wrapped estimator on the subsequences of every sequence in X.

    Parameters
    ----------
    X : list of sequences.

    y : list of label sequences, one label per element of the matching
        sequence in X.

    Returns
    -------
    self
    """
    X, y = Subsequensor(step=self.step, max_subsequence=self.max_subsequence).transform(X, y)
    # Concatenate the per-sequence lists in linear time; the previous
    # sum(X, []) re-copied the accumulated list for every sequence.
    X = [sub for seq in X for sub in seq]
    y = [label for seq in y for label in seq]
    self.estimator.fit(X, y)
    return self
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values. With ``deep=True`` the
        parameters of nested objects appear under ``<name>__<sub_name>``.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # Record (instead of emit) whatever reading the attribute warns.
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # Drop the first filter entry again -- presumably the one added
            # by simplefilter above -- even when the loop body `continue`s.
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # Flatten the nested object's parameters as "<key>__<name>".
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def predict(
self, X)
def predict(self, X):
    """Predict a label for every element of every sequence in X.

    Subsequences are taken with step 1, so every element gets a
    prediction, and with the same ``max_subsequence`` bound that is used
    for training. Previously the bound was dropped here, so with
    ``max_subsequence`` set the estimator was queried on longer windows
    than it had been fitted on.

    Returns
    -------
    list with one entry per sequence, each being the wrapped estimator's
    predictions for the subsequences of that sequence.
    """
    X = Subsequensor(step=1, max_subsequence=self.max_subsequence).transform(X)
    return [self.estimator.predict(x) for x in X]
def score(
self, X, y)
def score(self, X, y):
    """Score the wrapped estimator on the subsequences of every sequence in X.

    The subsequences are generated with the same ``step`` and
    ``max_subsequence`` as in training, flattened into one list, and
    passed to the wrapped estimator's ``score``.

    Returns
    -------
    Whatever the wrapped estimator's ``score`` returns.
    """
    X, y = Subsequensor(step=self.step, max_subsequence=self.max_subsequence).transform(X, y)
    # Concatenate the per-sequence lists in linear time; the previous
    # sum(X, []) re-copied the accumulated list for every sequence.
    X = [sub for seq in X for sub in seq]
    y = [label for seq in y for label in seq]
    return self.estimator.score(X, y)
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set ``step`` on this object and forward the rest to the estimator.

    The key ``<lowercase class name>__step`` updates ``self.step``; every
    other key is passed to ``self.estimator.set_params``.

    Returns
    -------
    self
    """
    own_key = "%s__step" % self.__class__.__name__.lower()

    if own_key in params:
        self.step = params[own_key]
        # Forward a filtered copy so the estimator never sees our own key.
        forwarded = {name: value for name, value in params.items()
                     if name != own_key}
    else:
        forwarded = params

    self.estimator.set_params(**forwarded)
    return self
Instance variables
var estimator
var max_subsequence
var step
var subsequencer
class SequenceTransformer
Base class for all estimators in scikit-learn
Notes
All estimators should specify all the parameters that can be set
at the class level in their __init__
as explicit keyword
arguments (no *args
or **kwargs
).
class SequenceTransformer(BaseEstimator, TransformerMixin):
    """Applies a transformer to every element in the input sequences.

    Parameters
    ----------
    transformer : TransformerMixin
        Template transformer; a clone of it is fitted in ``fit``.

    mode : str
        How to preprocess sequences for transformer fitting.
        'stack' (default): stack all sequences into one huge sequence so
        that it looks like a normal 2d training set. Any other value
        passes X and y to the transformer unchanged.
    """

    def __init__(self, transformer, mode='stack'):
        self.transformer = transformer
        self.mode = mode
        self.transformer_ = None  # fitted clone, created by fit()

    def fit(self, X, y=None):
        """
        Fit base transformer to the set of sequences.

        X: iterable of shape [n_samples, ...]
        y: iterable of shape [n_samples, ...]
        """
        self.transformer_ = clone(self.transformer)

        if self.mode == 'stack':
            # stack all the elements into one huge dataset; np.vstack needs
            # a real sequence, a generator is rejected by NumPy
            X_conc = np.vstack([x for x in X])
            y_conc = None
            if y is not None:
                # repeat the label of a sequence for each of its rows
                y_conc = np.concatenate([[v] * len(x) for x, v in zip(X, y)])
        else:
            X_conc = X
            y_conc = y

        if y is None:
            self.transformer_.fit(X_conc)
        else:
            self.transformer_.fit(X_conc, y_conc)

        return self

    def transform(self, X, y=None):
        """Apply the fitted transformer to every sequence in X.

        When y is given, the label of a sequence is repeated once per
        element and passed as second argument to the transformer.

        Returns
        -------
        numpy array built from the list of per-sequence results.
        """
        if y is None:
            result = [self.transformer_.transform(xx) for xx in X]
        else:
            result = [self.transformer_.transform(xx, [yy] * len(xx)) for xx, yy in zip(X, y)]

        result = np.array(result)
        return result

    def set_params(self, **params):
        """Forward all parameters to the wrapped ``transformer``.

        Returns
        -------
        self
        """
        # The attribute is called `transformer`; the previous code used the
        # non-existent `base_transformer` and always raised AttributeError.
        self.transformer.set_params(**params)
        return self
Ancestors (in MRO)
- SequenceTransformer
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Static methods
def __init__(
self, transformer, mode='stack')
Applies transformer to every element in input sequence. transformer: TransformerMixin mode: How to preprocess sequences for transformer fitting. default: stack all sequences into one huge sequence so that then it looks like a normal 2d training set
def __init__(self, transformer, mode='stack'):
    """
    Applies transformer to every element in input sequence.

    transformer: TransformerMixin

    mode: How to preprocess sequences for transformer fitting.
        default: stack all sequences into one huge sequence
        so that then it looks like a normal 2d training set
    """
    # Slot for the fitted transformer; empty until one is assigned.
    self.transformer_ = None

    self.mode = mode
    self.transformer = transformer
def fit(
self, X, y=None)
Fit base transformer to the set of sequences.
X: iterable of shape [n_samples, ...] y: iterable of shape [n_samples, ...]
def fit(self, X, y=None):
    """
    Fit base transformer to the set of sequences.

    A fresh clone of ``self.transformer`` is stored in ``self.transformer_``
    and fitted. In 'stack' mode all sequences are stacked row-wise into one
    dataset and the label of each sequence is repeated for each of its
    rows; in any other mode X and y are passed through unchanged.

    X: iterable of shape [n_samples, ...]
    y: iterable of shape [n_samples, ...]

    Returns
    -------
    self
    """
    self.transformer_ = clone(self.transformer)

    if self.mode == 'stack':
        # stack all the elements into one huge dataset; np.vstack needs a
        # real sequence, a generator is rejected by NumPy
        X_conc = np.vstack([x for x in X])
        y_conc = None
        if y is not None:
            y_conc = np.concatenate([[v] * len(x) for x, v in zip(X, y)])
    else:
        X_conc = X
        y_conc = y

    if y is None:
        self.transformer_.fit(X_conc)
    else:
        self.transformer_.fit(X_conc, y_conc)

    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit the transformer on ``X`` and return the transformed ``X``.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    **fit_params
        Extra keyword arguments forwarded unchanged to ``fit``.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # ``y`` is handed to ``fit`` only when it was supplied, so that
    # unsupervised transformers (``fit`` of arity 1) keep working.
    fit_args = (X,) if y is None else (X, y)
    fitted = self.fit(*fit_args, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values. With ``deep=True`` the
        parameters of nested objects appear under ``<name>__<sub_name>``.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # Record (instead of emit) whatever reading the attribute warns.
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # Drop the first filter entry again -- presumably the one added
            # by simplefilter above -- even when the loop body `continue`s.
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # Flatten the nested object's parameters as "<key>__<name>".
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Forward all parameters to the wrapped ``transformer``.

    Parameters
    ----------
    **params
        Keyword arguments passed unchanged to ``self.transformer.set_params``.

    Returns
    -------
    self
    """
    # The constructor stores the wrapped object as `transformer`; the
    # previous code read the non-existent `base_transformer` attribute and
    # therefore always raised AttributeError.
    self.transformer.set_params(**params)
    return self
def transform(
self, X, y=None)
def transform(self, X, y=None):
    """Apply the fitted transformer to every sequence in ``X``.

    When ``y`` is given, the label of a sequence is repeated once per
    element of that sequence and passed as second argument to the
    transformer's ``transform``.

    Returns
    -------
    numpy array built from the list of per-sequence results.
    """
    outputs = []
    if y is not None:
        for sequence, label in zip(X, y):
            repeated = [label] * len(sequence)
            outputs.append(self.transformer_.transform(sequence, repeated))
    else:
        for sequence in X:
            outputs.append(self.transformer_.transform(sequence))
    return np.array(outputs)
Instance variables
var mode
var transformer
var transformer_
class Subsequensor
Creates views into all subsequences of a numpy sequence.
Parameters
step: int, step with which the subsequences are taken.
max_subsequence: int or None, maximum subsequence size that is used in order to predict a certain output value.
class Subsequensor(BaseEstimator, TransformerMixin):
    """
    Creates views into all subsequences of a numpy sequence.

    Parameters
    ----------
    step: int, step with which the subsequences are taken.

    max_subsequence: int or None, maximum subsequence size that is used
        in order to predict a certain output value.
    """

    def __init__(self, step, max_subsequence=None):
        self.step = step
        self.max_subsequence = max_subsequence

    def fit(self, X, Y=None):
        """Fit the transformer according to the given training data.

        Nothing is learned; the method exists for API compatibility. ``Y``
        is optional so that the inherited ``fit_transform(X)`` can be
        called without labels.

        Parameters
        ----------
        X : list of numpy arrays
            List of sequences, where every sequence is a 2d numpy array,
            where the first dimension corresponds to time, and last for
            features.

        Y : list of object, optional
            List of labels assigned to corresponding sequences in X.

        Returns
        -------
        self : object
            Returns self.
        """
        return self

    def transform(self, X, Y=None):
        """Transform the input data.

        Parameters
        ----------
        X : list of numpy arrays
            List of sequences, where every sequence is a 2d numpy array,
            where the first dimension corresponds to time, and last for
            features.

        Y : list of object
            List of labels assigned to corresponding sequences in X.

        Returns
        -------
        X : list
            Returns list of views into the sequences. When Y is given, a
            tuple (X, Y) with the matching lists of labels is returned.
        """
        test_time = Y is None
        if test_time:
            # make_subsequences always takes labels; use placeholders
            Y = [[None] * len(x) for x in X]

        # only override make_subsequences' default max_len when a bound is set
        if self.max_subsequence is None:
            args = (self.step, )
        else:
            args = (self.step, self.max_subsequence)

        XY = [make_subsequences(x, y, *args) for x, y in zip(X, Y)]
        X = [z[0] for z in XY]
        if test_time:
            return X
        return X, [z[1] for z in XY]
Ancestors (in MRO)
- Subsequensor
- sklearn.base.BaseEstimator
- sklearn.base.TransformerMixin
- builtins.object
Static methods
def __init__(
self, step, max_subsequence=None)
Initialize self. See help(type(self)) for accurate signature.
def __init__(self, step, max_subsequence=None):
    """Store the constructor arguments on the instance.

    Parameters
    ----------
    step : int, step with which the subsequences are taken.

    max_subsequence : int or None, maximum subsequence size.
    """
    self.max_subsequence = max_subsequence
    self.step = step
def fit(
self, X, Y)
Fit the transformer according to the given training data.
Parameters
X : list of numpy arrays List of sequences, where every sequence is a 2d numpy array, where the first dimension corresponds to time, and last for features.
Y : list of object List of labels assigned to corresponding sequences in X. Returns
self : object Returns self.
def fit(self, X, Y=None):
    """Fit the transformer according to the given training data.

    Nothing is learned; the method only returns ``self``. ``Y`` is optional
    so that a label-free call such as ``fit(X)`` (as issued by a
    ``fit_transform(X)`` without labels) does not raise TypeError.

    Parameters
    ----------
    X : list of numpy arrays
        List of sequences, where every sequence is a 2d numpy array,
        where the first dimension corresponds to time, and last for
        features.

    Y : list of object, optional
        List of labels assigned to corresponding sequences in X.

    Returns
    -------
    self : object
        Returns self.
    """
    return self
def fit_transform(
self, X, y=None, **fit_params)
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X.
Parameters
X : numpy array of shape [n_samples, n_features] Training set.
y : numpy array of shape [n_samples] Target values.
Returns
X_new : numpy array of shape [n_samples, n_features_new] Transformed array.
def fit_transform(self, X, y=None, **fit_params):
    """Fit the transformer on ``X`` and return the transformed ``X``.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples]
        Target values.

    **fit_params
        Extra keyword arguments forwarded unchanged to ``fit``.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.
    """
    # ``y`` is handed to ``fit`` only when it was supplied, so that
    # unsupervised transformers (``fit`` of arity 1) keep working.
    fit_args = (X,) if y is None else (X, y)
    fitted = self.fit(*fit_args, **fit_params)
    return fitted.transform(X)
def get_params(
self, deep=True)
Get parameters for this estimator.
Parameters
deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators.
Returns
params : mapping of string to any Parameter names mapped to their values.
def get_params(self, deep=True):
    """Get parameters for this estimator.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and
        contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values. With ``deep=True`` the
        parameters of nested objects appear under ``<name>__<sub_name>``.
    """
    out = dict()
    for key in self._get_param_names():
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # This is set in utils/__init__.py but it gets overwritten
        # when running under python3 somehow.
        warnings.simplefilter("always", DeprecationWarning)
        try:
            # Record (instead of emit) whatever reading the attribute warns.
            with warnings.catch_warnings(record=True) as w:
                value = getattr(self, key, None)
            if len(w) and w[0].category == DeprecationWarning:
                # if the parameter is deprecated, don't show it
                continue
        finally:
            # Drop the first filter entry again -- presumably the one added
            # by simplefilter above -- even when the loop body `continue`s.
            warnings.filters.pop(0)

        # XXX: should we rather test if instance of estimator?
        if deep and hasattr(value, 'get_params'):
            # Flatten the nested object's parameters as "<key>__<name>".
            deep_items = value.get_params().items()
            out.update((key + '__' + k, val) for k, val in deep_items)
        out[key] = value
    return out
def set_params(
self, **params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects
(such as pipelines). The latter have parameters of the form
<component>__<parameter>
so that it's possible to update each
component of a nested object.
Returns
self
def set_params(self, **params):
    """Set the parameters of this estimator.

    Plain names are assigned as attributes on this object. Names of the
    form ``<component>__<parameter>`` are grouped by component and handed
    to that component's own ``set_params``.

    Returns
    -------
    self
    """
    if not params:
        # Nothing to do; skips the (slow) parameter introspection.
        return self

    known = self.get_params(deep=True)
    per_component = defaultdict(dict)  # component name -> its sub-parameters

    for full_name, value in params.items():
        name, separator, remainder = full_name.partition('__')
        if name not in known:
            raise ValueError('Invalid parameter %s for estimator %s. '
                             'Check the list of available parameters '
                             'with `estimator.get_params().keys()`.' %
                             (name, self))
        if not separator:
            setattr(self, name, value)
            continue
        per_component[name][remainder] = value

    # Nested parameters are applied after all direct ones were assigned.
    for name, sub_params in per_component.items():
        known[name].set_params(**sub_params)

    return self
def transform(
self, X, Y=None)
Transform the input data.
Parameters
X : list of numpy arrays List of sequences, where every sequence is a 2d numpy array, where the first dimension corresponds to time, and last for features.
Y : list of object List of labels assigned to corresponding sequences in X.
Returns
X : list Returns list of views into the sequences.
def transform(self, X, Y=None):
    """Split every sequence into its subsequences.

    Parameters
    ----------
    X : list of numpy arrays
        List of sequences, where every sequence is a 2d numpy array,
        where the first dimension corresponds to time, and last for
        features.

    Y : list of object
        List of labels assigned to corresponding sequences in X.

    Returns
    -------
    X : list
        List of views into the sequences, one list per input sequence.
        When Y is given, a tuple (X, Y) is returned whose second item
        holds the matching lists of labels.
    """
    has_labels = Y is not None
    if not has_labels:
        # make_subsequences always takes labels; feed it placeholders.
        Y = [[None] * len(x) for x in X]

    # Pass max_len only when a bound is configured, otherwise the default
    # of make_subsequences applies.
    extra = (self.step, )
    if self.max_subsequence is not None:
        extra = (self.step, self.max_subsequence)

    pairs = [make_subsequences(x, y, *extra) for x, y in zip(X, Y)]

    views = [pair[0] for pair in pairs]
    if has_labels:
        return views, [pair[1] for pair in pairs]
    return views
Instance variables
var max_subsequence
var step