import math
import numpy as np
import traitlets
import warnings
import vaex.dataframe
from vaex.ml import generate
from vaex.ml.state import HasState
from vaex.serialize import register
from vaex.utils import _ensure_strings_from_expressions
sklearn = vaex.utils.optional_import("sklearn", modules=[
"sklearn.decomposition",
"sklearn.random_projection"
])
help_features = 'List of features to transform.'
help_prefix = 'Prefix for the names of the transformed features.'
@register
class StateTransfer(HasState):
state = traitlets.Dict()
def transform(self, df):
copy = df.copy()
self.state = dict(self.state, active_range=[copy._index_start, copy._index_end])
copy.state_set(self.state)
return copy
class Transformer(HasState):
''' Parent class for all of the transformers.
'''
features = traitlets.List(traitlets.Unicode(), help=help_features).tag(ui='SelectMultiple')
def fit_transform(self, df):
'''Fit and apply the transformer to the supplied DataFrame.
:param df: A vaex DataFrame.
:returns copy: A shallow copy of the DataFrame that includes the transformations.
'''
self.fit(df=df)
return self.transform(df=df)
@register
@generate.register
class PCA(Transformer):
'''Transform a set of features using a Principal Component Analysis.
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10])
>>> df
# x y
0 2 -2
1 5 3
2 7 0
3 2 0
4 15 10
>>> pca = vaex.ml.PCA(n_components=2, features=['x', 'y'])
>>> pca.fit_transform(df)
# x y PCA_0 PCA_1
0 2 -2 5.92532 0.413011
1 5 3 0.380494 -1.39112
2 7 0 0.840049 2.18502
3 2 0 4.61287 -1.09612
4 15 10 -11.7587 -0.110794
'''
# title = traitlets.Unicode(default_value='PCA', read_only=True).tag(ui='HTML')
n_components = traitlets.Int(default_value=None, allow_none=True, help='Number of components to retain. If None, all the components will be retained.').tag(ui='IntText')
prefix = traitlets.Unicode(default_value="PCA_", help=help_prefix)
    whiten = traitlets.Bool(default_value=False, allow_none=False, help='If True, perform whitening, i.e. remove the relative variance scale of the transformed components.')
# progress = traitlets.Any(default_value=False, help='If True, display a progressbar of the PCA fitting process.').tag(ui='Checkbox')
    eigen_vectors_ = traitlets.List(traitlets.List(traitlets.CFloat()), help='The eigen vectors; column j corresponds to principal component j.').tag(output=True)
    eigen_values_ = traitlets.List(traitlets.CFloat(), help='The eigen values corresponding to each principal component.').tag(output=True)
    means_ = traitlets.List(traitlets.CFloat(), help='The mean of each feature.').tag(output=True)
explained_variance_ = traitlets.List(traitlets.CFloat(), help='Variance explained by each of the components. Same as the eigen values.').tag(output=True)
explained_variance_ratio_ = traitlets.List(traitlets.CFloat(), help='Percentage of variance explained by each of the selected components.').tag(output=True)
    def fit(self, df, progress=None):
'''Fit the PCA model to the DataFrame.
:param df: A vaex DataFrame.
:param progress: If True or 'widget', display a progressbar of the fitting process.
'''
self.n_components = self.n_components or len(self.features)
        assert self.n_components >= 2, 'At least two components are required.'
        assert self.n_components <= len(self.features), 'Cannot have more components than features.'
C = df.cov(self.features, progress=progress)
eigen_values, eigen_vectors = np.linalg.eigh(C)
indices = np.argsort(eigen_values)[::-1]
self.means_ = df.mean(self.features, progress=progress).tolist()
self.eigen_vectors_ = eigen_vectors[:, indices].tolist()
self.eigen_values_ = eigen_values[indices].tolist()
self.explained_variance_ = self.eigen_values_
self.explained_variance_ratio_ = (eigen_values[indices] / np.sum(eigen_values)).tolist()
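        # Illustrative only: the state stored above is enough to reproduce the
        # projection in plain NumPy (up to the sign convention of np.linalg.eigh):
        #   X_centered = X - np.array(self.means_)   # X: (n_samples, n_features)
        #   components = X_centered @ np.array(self.eigen_vectors_)
        # where column j of eigen_vectors_ yields the virtual column PCA_j.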
@register
@generate.register
class PCAIncremental(PCA):
'''Transform a set of features using the "sklearn.decomposition.IncrementalPCA" algorithm.
Note that you need to have scikit-learn installed to fit this Transformer, but not
for transformations using an already fitted Transformer.
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10])
>>> df
# x y
0 2 -2
1 5 3
2 7 0
3 2 0
4 15 10
>>> pca = vaex.ml.PCAIncremental(n_components=2, features=['x', 'y'], batch_size=3)
>>> pca.fit_transform(df)
# x y PCA_0 PCA_1
0 2 -2 -5.92532 -0.413011
1 5 3 -0.380494 1.39112
2 7 0 -0.840049 -2.18502
3 2 0 -4.61287 1.09612
4 15 10 11.7587 0.110794
'''
snake_name = 'pca_incremental'
    batch_size = traitlets.Int(default_value=1000, help='Number of samples to be sent to the transformer in each batch.')
noise_variance_ = traitlets.CFloat(default_value=0, help='The estimated noise covariance following the Probabilistic PCA model from Tipping and Bishop 1999.').tag(output=True)
n_samples_seen_ = traitlets.CInt(default_value=0, help='The number of samples processed by the transformer.').tag(output=True)
def fit(self, df, progress=None):
'''Fit the PCAIncremental model to the DataFrame.
:param df: A vaex DataFrame.
:param progress: If True or 'widget', display a progressbar of the fitting process.
'''
self.n_components = self.n_components or len(self.features)
n_samples = len(df)
progressbar = vaex.utils.progressbars(progress, title="fit(PCA)")
pca = sklearn.decomposition.IncrementalPCA(n_components=self.n_components,
batch_size=self.batch_size,
whiten=self.whiten)
for i1, i2, chunk in df.evaluate_iterator(self.features, chunk_size=self.batch_size, array_type='numpy'):
progressbar(i1 / n_samples)
chunk = np.array(chunk).T.astype(np.float64)
pca.partial_fit(X=chunk, check_input=False)
progressbar(1.0)
self.singular_values_ = pca.singular_values_.tolist()
self.eigen_vectors_ = pca.components_.T.tolist()
self.eigen_values_ = pca.explained_variance_.tolist()
self.explained_variance_ = pca.explained_variance_.tolist()
self.explained_variance_ratio_ = pca.explained_variance_ratio_.tolist()
self.means_ = pca.mean_.tolist()
self.noise_variance_ = pca.noise_variance_
self.n_samples_seen_ = pca.n_samples_seen_
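        # Note: fitting streams the data through sklearn's partial_fit, so each
        # chunk materializes roughly batch_size * len(self.features) float64
        # values at a time. Transforming reuses the means_/eigen_vectors_ state
        # inherited from PCA, which is why scikit-learn is only needed for fitting.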
@register
@generate.register
class RandomProjections(Transformer):
'''Reduce dimensionality through a random matrix projection.
The random projections method is based on the Johnson-Lindenstrauss lemma.
    For more details see https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma
    Note that you need scikit-learn to fit this Transformer, but not for transformations using an already fitted Transformer.
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10], z=[2, -10, 2, 3, 0])
>>> df
# x y z
0 2 -2 2
1 5 3 -10
2 7 0 2
3 2 0 3
4 15 10 0
>>> rand_proj = vaex.ml.RandomProjections(features=['x', 'y', 'z'], n_components=2)
>>> rand_proj.fit_transform(df)
# x y z random_projection_0 random_projection_1
0 2 -2 2 1.73363 -0.0700273
1 5 3 -10 -17.8742 -14.0226
2 7 0 2 -3.32911 -8.50181
3 2 0 3 2.04843 -1.27538
4 15 10 0 -17.0289 -28.6562
'''
snake_name = 'random_projections'
n_components = traitlets.CInt(default_value=None, allow_none=True, help='Number of components to retain. If None (default) the number will be set via the Johnson-Lindenstrauss formula. See https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.johnson_lindenstrauss_min_dim.html for more details.')
eps = traitlets.Float(default_value=0.1, allow_none=True, help='Parameter to control the quality of the embedding according to the Johnson-Lindenstrauss lemma when `n_components` is set to None. The value must be positive.')
matrix_type = traitlets.Enum(values=['gaussian', 'sparse'], default_value='gaussian', help='The type of random matrix to create. The values can be "gaussian" and "sparse".')
density = traitlets.Float(default_value=None, allow_none=True, help='Ratio in the range (0, 1] of non-zero component in the random projection matrix. Only valid if `matrix_type` is "sparse". If density is None, the value is set to the minimum density as recommended by Ping Li et al.: 1 / sqrt(n_features).')
prefix = traitlets.Unicode(default_value="random_projection_", help=help_prefix)
random_state = traitlets.Int(default_value=None, allow_none=True, help='Controls the pseudo random number generator used to generate the projection matrix at fit time. Used to get reproducible results.')
random_matrix_ = traitlets.List(traitlets.List(traitlets.CFloat()), help='The random matrix.').tag(output=True)
@traitlets.validate('eps')
def _valid_eps(self, proposal):
        if 0 < proposal['value'] < 1:
return proposal['value']
else:
raise traitlets.TraitError('`eps` must be between 0 and 1.')
@traitlets.validate('density')
def _valid_density(self, proposal):
        if proposal['value'] is None or 0 < proposal['value'] <= 1:
return proposal['value']
else:
raise traitlets.TraitError('`density` must be 0 < density <= 1.')
def fit(self, df):
'''Fit the RandomProjections to the DataFrame.
:param df: A vaex DataFrame.
'''
n_samples = len(df)
n_features = len(self.features)
if self.n_components is None:
self.n_components = sklearn.random_projection.johnson_lindenstrauss_min_dim(n_samples=n_samples,
eps=self.eps)
if self.matrix_type == 'gaussian':
self.random_matrix_ = sklearn.random_projection._gaussian_random_matrix(n_components=self.n_components,
n_features=n_features,
random_state=self.random_state).tolist()
else:
density = self.density or 'auto'
self.random_matrix_ = sklearn.random_projection._sparse_random_matrix(n_components=self.n_components,
n_features=n_features,
density=density,
random_state=self.random_state).toarray().tolist()
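        # Illustrative only: when n_components is None, the target dimension
        # comes from sklearn.random_projection.johnson_lindenstrauss_min_dim,
        # which scales as 4 * log(n_samples) / (eps**2/2 - eps**3/3); halving
        # eps therefore roughly quadruples the number of components needed.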
def transform(self, df):
'''Apply the RandomProjection transformation to the DataFrame.
:param df: A vaex DataFrame
:return copy: A shallow copy of the DataFrame that includes the RandomProjection components.
:rtype: DataFrame
'''
copy = df.copy()
random_matrix = np.array(self.random_matrix_)
name_prefix_offset = 0
while self.prefix + str(name_prefix_offset) in copy.get_column_names(virtual=True, strings=True):
name_prefix_offset += 1
for component in range(self.n_components):
vector = random_matrix[component]
feature_expressions = [copy[feat] for feat in self.features]
expr = copy.func.dot_product(feature_expressions, vector)
name = self.prefix + str(component + name_prefix_offset)
copy[name] = expr
return copy
@register
@generate.register
class LabelEncoder(Transformer):
'''Encode categorical columns with integer values between 0 and num_classes-1.
Example:
>>> import vaex
>>> df = vaex.from_arrays(color=['red', 'green', 'green', 'blue', 'red'])
>>> df
# color
0 red
1 green
2 green
3 blue
4 red
>>> encoder = vaex.ml.LabelEncoder(features=['color'])
>>> encoder.fit_transform(df)
# color label_encoded_color
0 red 2
1 green 1
2 green 1
3 blue 0
4 red 2
'''
# title = traitlets.Unicode(default_value='Label Encoder', read_only=True).tag(ui='HTML')
prefix = traitlets.Unicode(default_value='label_encoded_', help=help_prefix).tag(ui='Text')
labels_ = traitlets.Dict(default_value={}, allow_none=True, help='The encoded labels of each feature.').tag(output=True)
allow_unseen = traitlets.Bool(default_value=False, allow_none=False, help='If True, unseen values will be \
encoded with -1, otherwise an error is raised').tag(ui='Checkbox')
    def fit(self, df):
'''Fit LabelEncoder to the DataFrame.
:param df: A vaex DataFrame.
'''
for feature in self.features:
labels = vaex.array_types.tolist(df[feature].unique())
self.labels_[feature] = dict(zip(labels, np.arange(len(labels))))
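        # Illustrative only: for the docstring example this stores
        #   self.labels_ == {'color': {'blue': 0, 'green': 1, 'red': 2}}
        # (the integer assigned to each label follows the order in which
        # df[feature].unique() returns the values). Per allow_unseen above,
        # unseen values are later encoded as -1, or raise an error.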
@register
@generate.register
class OneHotEncoder(Transformer):
    '''Encode categorical columns according to the One-Hot scheme.
Example:
>>> import vaex
>>> df = vaex.from_arrays(color=['red', 'green', 'green', 'blue', 'red'])
>>> df
    # color
0 red
1 green
2 green
3 blue
4 red
>>> encoder = vaex.ml.OneHotEncoder(features=['color'])
>>> encoder.fit_transform(df)
# color color_blue color_green color_red
0 red 0 0 1
1 green 0 1 0
2 green 0 1 0
3 blue 1 0 0
4 red 0 0 1
'''
# title = Unicode(default_value='One-Hot Encoder', read_only=True).tag(ui='HTML')
prefix = traitlets.Unicode(default_value='', help=help_prefix).tag(ui='Text')
one = traitlets.Any(1, help='Value to encode when a category is present.')
zero = traitlets.Any(0, help='Value to encode when category is absent.')
uniques_ = traitlets.List(traitlets.List(), help='The unique elements found in each feature.').tag(output=True)
    def fit(self, df):
'''Fit OneHotEncoder to the DataFrame.
:param df: A vaex DataFrame.
'''
uniques = []
for i in self.features:
expression = _ensure_strings_from_expressions(i)
unique_values = vaex.array_types.tolist(df.unique(expression))
if None in unique_values:
unique_values.remove(None)
unique_values.sort()
unique_values.insert(0, None) # This is done in place
else:
unique_values.sort()
uniques.append(unique_values)
self.uniques_ = uniques
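        # Illustrative only, assuming the transform step (not shown here):
        # each value stored in uniques_ is expected to become a virtual column
        # named '<prefix><feature>_<value>', holding `one` where the feature
        # matches that value and `zero` elsewhere, e.g. color_blue,
        # color_green and color_red in the docstring example.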
# detect ability to downcast to uint8
@register
@generate.register
class MultiHotEncoder(Transformer):
'''Encode categorical columns according to a binary multi-hot scheme.
With Multi-Hot Encoder (sometimes called Binary Encoder), the categorical variables are first
ordinal encoded, and those encodings are converted to a binary number. Each digit of that binary number
    is a separate column, containing either a "0" or a "1". This can be considered an improvement
    over the One-Hot encoder, as it guards against generating too many new columns when the cardinality of the
    categorical column is high, while effectively removing the ordinality that an Ordinal Encoder would introduce.
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(color=['red', 'green', 'green', 'blue', 'red'])
>>> df
# color
0 red
1 green
2 green
3 blue
4 red
>>> encoder = vaex.ml.MultiHotEncoder(features=['color'])
>>> encoder.fit_transform(df)
# color color_0 color_1 color_2
0 red 0 1 1
1 green 0 1 0
2 green 0 1 0
3 blue 0 0 1
4 red 0 1 1
'''
prefix = traitlets.Unicode(default_value='', help=help_prefix).tag(ui='Text')
labels_ = traitlets.Dict(default_value={}, allow_none=True, help='The ordinal-encoded labels of each feature.').tag(output=True)
    def fit(self, df):
'''Fit MultiHotEncoder to the DataFrame.
:param df: A vaex DataFrame.
'''
for feature in self.features:
# Get unique labels
labels = vaex.array_types.tolist(df[feature].unique())
n_labels = len(labels)
if None in labels:
labels.remove(None)
labels.sort()
labels.insert(0, None) # This is done in place
else:
labels.sort()
labels_dict = dict(zip(labels, np.arange(1, n_labels+1)))
self.labels_[feature] = labels_dict
def _get_n_dims(self, n_labels):
'''Get the number of dimensions for the multi-hot vector, based on the number of unique labels.'''
return math.floor(math.log2(n_labels)) + 1 + np.mod(n_labels, 2)
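        # Illustrative only: in the docstring example the 3 unique labels are
        # ordinal-encoded as 1..3 by fit(), and
        #   _get_n_dims(3) == math.floor(math.log2(3)) + 1 + (3 % 2) == 1 + 1 + 1 == 3
        # so each category is spread over three binary columns (color_0..color_2).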
@register
@generate.register
class FrequencyEncoder(Transformer):
'''Encode categorical columns by the frequency of their respective samples.
Example:
>>> import vaex
>>> df = vaex.from_arrays(color=['red', 'green', 'green', 'blue', 'red', 'green'])
>>> df
# color
0 red
1 green
2 green
3 blue
    4 red
    5 green
>>> encoder = vaex.ml.FrequencyEncoder(features=['color'])
>>> encoder.fit_transform(df)
# color frequency_encoded_color
0 red 0.333333
1 green 0.5
2 green 0.5
3 blue 0.166667
4 red 0.333333
5 green 0.5
'''
prefix = traitlets.Unicode(default_value='frequency_encoded_', help=help_prefix).tag(ui='Text')
unseen = traitlets.Enum(values=['zero', 'nan'], default_value='nan', help='Strategy to deal with unseen values.')
mappings_ = traitlets.Dict()
    def fit(self, df):
'''Fit FrequencyEncoder to the DataFrame.
:param df: A vaex DataFrame.
'''
# number of samples
nsamples = len(df)
# Encoding
for feature in self.features:
self.mappings_[feature] = dict(df[feature].value_counts() / nsamples)
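        # Illustrative only: with the docstring data (6 samples: 3x 'green',
        # 2x 'red', 1x 'blue') this stores
        #   self.mappings_['color'] == {'green': 3/6, 'red': 2/6, 'blue': 1/6}
        # which is exactly the frequency_encoded_color column in the example.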
@register
@generate.register
class StandardScaler(Transformer):
    '''Standardize features by removing their mean and scaling them to unit variance.
Example:
>>> import vaex
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10])
>>> df
# x y
0 2 -2
1 5 3
2 7 0
3 2 0
4 15 10
>>> scaler = vaex.ml.StandardScaler(features=['x', 'y'])
>>> scaler.fit_transform(df)
# x y standard_scaled_x standard_scaled_y
0 2 -2 -0.876523 -0.996616
1 5 3 -0.250435 0.189832
2 7 0 0.166957 -0.522037
3 2 0 -0.876523 -0.522037
4 15 10 1.83652 1.85086
'''
# title = Unicode(default_value='Standard Scaler', read_only=True).tag(ui='HTML')
prefix = traitlets.Unicode(default_value="standard_scaled_", help=help_prefix).tag(ui='Text')
with_mean = traitlets.CBool(default_value=True, help='If True, remove the mean from each feature.').tag(ui='Checkbox')
with_std = traitlets.CBool(default_value=True, help='If True, scale each feature to unit variance.').tag(ui='Checkbox')
mean_ = traitlets.List(traitlets.CFloat(), help='The mean of each feature').tag(output=True)
std_ = traitlets.List(traitlets.CFloat(), help='The standard deviation of each feature.').tag(output=True)
    def fit(self, df):
'''
Fit StandardScaler to the DataFrame.
:param df: A vaex DataFrame.
'''
mean = df.mean(self.features, delay=True)
std = df.std(self.features, delay=True)
@vaex.delayed
def assign(mean, std):
self.mean_ = mean.tolist()
self.std_ = std.tolist()
assign(mean, std)
df.execute()
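        # Illustrative only, with the docstring data: x = [2, 5, 7, 2, 15] has
        # mean 6.2 and (population) standard deviation ~4.7917, so the first
        # sample is scaled to (2 - 6.2) / 4.7917 ~ -0.8765, matching the example.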
@register
@generate.register
class MinMaxScaler(Transformer):
    '''Scale a set of features to a given range.
Example:
>>> import vaex
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10])
>>> df
# x y
0 2 -2
1 5 3
2 7 0
3 2 0
4 15 10
>>> scaler = vaex.ml.MinMaxScaler(features=['x', 'y'])
>>> scaler.fit_transform(df)
# x y minmax_scaled_x minmax_scaled_y
0 2 -2 0 0
1 5 3 0.230769 0.416667
2 7 0 0.384615 0.166667
3 2 0 0 0.166667
4 15 10 1 1
'''
snake_name = 'minmax_scaler'
# title = Unicode(default_value='MinMax Scaler', read_only=True).tag(ui='HTML')
    feature_range = traitlets.Tuple(default_value=(0, 1), help='The range the features are scaled to.').tag(ui='FloatRangeSlider')
prefix = traitlets.Unicode(default_value="minmax_scaled_", help=help_prefix).tag(ui='Text')
    fmax_ = traitlets.List(traitlets.CFloat(), help='The maximum value of a feature.').tag(output=True)
    fmin_ = traitlets.List(traitlets.CFloat(), help='The minimum value of a feature.').tag(output=True)
    def fit(self, df):
'''
Fit MinMaxScaler to the DataFrame.
:param df: A vaex DataFrame.
'''
minmax = []
for feat in self.features:
minmax.append(df.minmax(feat, delay=True))
@vaex.delayed
def assign(minmax):
self.fmin_ = [elem[0] for elem in minmax]
self.fmax_ = [elem[1] for elem in minmax]
assign(minmax)
df.execute()
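        # Illustrative only, with the docstring data: x = [2, 5, 7, 2, 15]
        # gives fmin_ = 2 and fmax_ = 15, so for the default feature_range
        # (0, 1) the second sample is scaled to (5 - 2) / (15 - 2) ~ 0.2308,
        # matching the example.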
@register
@generate.register
class MaxAbsScaler(Transformer):
''' Scale features by their maximum absolute value.
Example:
>>> import vaex
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10])
>>> df
# x y
0 2 -2
1 5 3
2 7 0
3 2 0
4 15 10
>>> scaler = vaex.ml.MaxAbsScaler(features=['x', 'y'])
>>> scaler.fit_transform(df)
# x y absmax_scaled_x absmax_scaled_y
0 2 -2 0.133333 -0.2
1 5 3 0.333333 0.3
2 7 0 0.466667 0
3 2 0 0.133333 0
4 15 10 1 1
'''
prefix = traitlets.Unicode(default_value="absmax_scaled_", help=help_prefix).tag(ui='Text')
    absmax_ = traitlets.List(traitlets.CFloat(), help='The maximum absolute value of a feature.').tag(output=True)
    def fit(self, df):
'''
        Fit MaxAbsScaler to the DataFrame.
:param df: A vaex DataFrame.
'''
absmax = df.max(['abs(%s)' % k for k in self.features]).tolist()
# Check if the absmax_ value is 0, in which case replace with 1
self.absmax_ = [value if value != 0 else 1 for value in absmax]
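        # Illustrative only, with the docstring data: absmax_ for x is
        # max(|2|, |5|, |7|, |2|, |15|) = 15, so the first sample is scaled to
        # 2 / 15 ~ 0.1333; for y (absmax 10), -2 becomes -0.2, as in the example.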
@register
@generate.register
class RobustScaler(Transformer):
''' The RobustScaler removes the median and scales the data according to a
given percentile range. By default, the scaling is done between the 25th and
the 75th percentile. Centering and scaling happens independently for each
feature (column).
Example:
>>> import vaex
>>> df = vaex.from_arrays(x=[2,5,7,2,15], y=[-2,3,0,0,10])
>>> df
# x y
0 2 -2
1 5 3
2 7 0
3 2 0
4 15 10
    >>> scaler = vaex.ml.RobustScaler(features=['x', 'y'])
>>> scaler.fit_transform(df)
# x y robust_scaled_x robust_scaled_y
0 2 -2 -0.333686 -0.266302
1 5 3 -0.000596934 0.399453
2 7 0 0.221462 0
3 2 0 -0.333686 0
4 15 10 1.1097 1.33151
'''
with_centering = traitlets.CBool(default_value=True, help='If True, remove the median.').tag(ui='Checkbox')
with_scaling = traitlets.CBool(default_value=True, help='If True, scale each feature between the specified percentile range.').tag(ui='Checkbox')
    percentile_range = traitlets.Tuple(default_value=(25, 75), help='The percentile range to which each feature is scaled.').tag(ui='FloatRangeSlider')
prefix = traitlets.Unicode(default_value="robust_scaled_", help=help_prefix).tag(ui='Text')
center_ = traitlets.List(traitlets.CFloat(), default_value=None, help='The median of each feature.').tag(output=True)
    scale_ = traitlets.List(traitlets.CFloat(), default_value=None, help='The inter-percentile range (upper minus lower percentile) of each feature.').tag(output=True)
    def fit(self, df):
'''
Fit RobustScaler to the DataFrame.
:param df: A vaex DataFrame.
'''
# check the quantile range
q_min, q_max = self.percentile_range
if not 0 <= q_min <= q_max <= 100:
raise ValueError('Invalid percentile range: %s' % (str(self.percentile_range)))
if self.with_centering:
self.center_ = df.percentile_approx(expression=self.features, percentage=50).tolist()
if self.with_scaling:
self.scale_ = (df.percentile_approx(expression=self.features, percentage=q_max) - df.percentile_approx(expression=self.features, percentage=q_min)).tolist()
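        # Note: percentile_approx returns approximate quantiles, so center_
        # and scale_ (and therefore the docstring output) can deviate from the
        # exact median and the exact 25th-75th percentile range, especially
        # on small data.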
@register
@generate.register
class BayesianTargetEncoder(Transformer):
'''Encode categorical variables with a Bayesian Target Encoder.
The categories are encoded by the mean of their target value,
which is adjusted by the global mean value of the target variable
    using a Bayesian scheme. For a larger `weight` value, the target
encodings are smoothed toward the global mean, while for a
`weight` of 0, the encodings are just the mean target value per
class.
Reference: https://www.wikiwand.com/en/Bayes_estimator#/Practical_example_of_Bayes_estimators
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(x=['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
... y=[1, 1, 1, 0, 0, 0, 0, 1])
>>> target_encoder = vaex.ml.BayesianTargetEncoder(features=['x'], weight=4)
>>> target_encoder.fit_transform(df, 'y')
# x y mean_encoded_x
0 a 1 0.625
1 a 1 0.625
2 a 1 0.625
3 a 0 0.625
4 b 0 0.375
5 b 0 0.375
6 b 0 0.375
7 b 1 0.375
'''
target = traitlets.Unicode(help='The name of the column containing the target variable.')
weight = traitlets.CFloat(default_value=100, allow_none=False, help='Weight to be applied to the mean encodings (smoothing parameter).')
prefix = traitlets.Unicode(default_value='mean_encoded_', help=help_prefix)
unseen = traitlets.Enum(values=['zero', 'nan'], default_value='nan', help='Strategy to deal with unseen values.')
mappings_ = traitlets.Dict()
    def fit(self, df):
'''Fit a BayesianTargetEncoder to the DataFrame.
:param df: A vaex DataFrame
'''
# The global target mean - used for the smoothing
global_target_mean = df[self.target].mean().item()
# TODO: we don't have delayed groupby yet, which could speed up the case with many features (1 pass over the data)
for feature in self.features:
agg = df.groupby(feature, agg={'count': vaex.agg.count(), 'mean': vaex.agg.mean(self.target)})
agg['encoding'] = (agg['count'] * agg['mean'] + self.weight * global_target_mean) / (agg['count'] + self.weight)
self.mappings_[feature] = {value[feature]: value['encoding'] for index, value in agg.iterrows()}
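        # Illustrative only, with the docstring data: category 'a' has
        # count = 4, mean target = 0.75, global mean = 0.5 and weight = 4, so
        #   encoding = (4 * 0.75 + 4 * 0.5) / (4 + 4) = 0.625
        # i.e. the raw category mean smoothed toward the global mean.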
@register
@generate.register
class WeightOfEvidenceEncoder(Transformer):
'''Encode categorical variables with a Weight of Evidence Encoder.
Weight of Evidence measures how well a particular feature supports
the given hypothesis (i.e. the target variable). With this
encoder, each category in a categorical feature is encoded by its
"strength" i.e. Weight of Evidence value. The target feature can be
a boolean or numerical column, where True/1 is seen as 'Good', and
    False/0 is seen as 'Bad'.
Reference: https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(x=['a', 'a', 'b', 'b', 'b', 'c', 'c'],
... y=[1, 1, 0, 0, 1, 1, 0])
>>> woe_encoder = vaex.ml.WeightOfEvidenceEncoder(target='y', features=['x'])
>>> woe_encoder.fit_transform(df)
    # x y woe_encoded_x
0 a 1 13.8155
1 a 1 13.8155
2 b 0 -0.693147
3 b 0 -0.693147
4 b 1 -0.693147
5 c 1 0
6 c 0 0
'''
target = traitlets.Unicode(help='The name of the column containing the target variable.')
prefix = traitlets.Unicode(default_value='woe_encoded_', help=help_prefix)
unseen = traitlets.Enum(values=['zero', 'nan'], default_value='nan', help='Strategy to deal with unseen values.')
    epsilon = traitlets.Float(0.000001, help="Small value taken as the minimum for the negatives, to avoid a division by zero")
mappings_ = traitlets.Dict()
    def fit(self, df):
'''Fit a WeightOfEvidenceEncoder to the DataFrame.
:param df: A vaex DataFrame
'''
values = df[self.target].unique(dropna=True)
        if not (
            (len(values) == 2 and (0 in values and 1 in values)) or
            (len(values) == 1 and (0 in values or 1 in values)) or
            len(values) == 0  # all missing values
        ):
raise ValueError("Target contains values different from True/1 and False/0: %r" % values)
for feature in self.features:
# Instead of counting the goods and bad, we divide by the count
# which reduces to the mean
agg = df.groupby(feature, agg={'positive': vaex.agg.mean(self.target)}, sort=True)
agg['positive'] = agg.func.where(agg['positive'] == 0, self.epsilon, agg['positive'])
agg['negative'] = 1 - agg.positive
agg['negative'] = agg.func.where(agg['negative'] == 0, self.epsilon, agg['negative'])
agg['woe'] = np.log(agg.positive/agg.negative)
self.mappings_[feature] = {value[feature]: value['woe'] for index, value in agg.iterrows()}
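        # Illustrative only, with the docstring data: 'b' has a positive rate
        # of 1/3, so woe = log((1/3) / (2/3)) = log(0.5) ~ -0.6931; 'a' has a
        # negative rate of 0, which is clipped to epsilon = 1e-6, giving
        # woe = log(1 / 1e-6) ~ 13.8155, matching the example output.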
@register
@generate.register
class KBinsDiscretizer(Transformer):
    '''Bin continuous features into discrete bins.
    The transformed columns contain the label of the bin each sample falls into.
    In a way, this transformer Label/Ordinal encodes continuous features.
Example:
>>> import vaex
>>> import vaex.ml
>>> df = vaex.from_arrays(x=[0, 2.5, 5, 7.5, 10, 12.5, 15])
>>> bin_trans = vaex.ml.KBinsDiscretizer(features=['x'], n_bins=3, strategy='uniform')
>>> bin_trans.fit_transform(df)
# x binned_x
0 0 0
1 2.5 0
2 5 1
3 7.5 1
4 10 2
5 12.5 2
6 15 2
'''
snake_name = 'kbins_discretizer'
n_bins = traitlets.Int(allow_none=False, default_value=5, help='Number of bins. Must be greater than 1.')
strategy = traitlets.Enum(values=['uniform', 'quantile', 'kmeans'], default_value='uniform', help='Strategy used to define the widths of the bins. Can be either "uniform", "quantile" or "kmeans".')
prefix = traitlets.Unicode(default_value='binned_', help=help_prefix)
    epsilon = traitlets.Float(default_value=1e-8, allow_none=False, help='Tiny value added to the bin edges, ensuring samples close to the bin edges are binned correctly.')
n_bins_ = traitlets.Dict(help='Number of bins per feature.').tag(output=True)
bin_edges_ = traitlets.Dict(help='The bin edges for each binned feature').tag(output=True)
    def fit(self, df):
'''
Fit KBinsDiscretizer to the DataFrame.
:param df: A vaex DataFrame.
'''
# We need at least two bins to do the transformations
        assert self.n_bins > 1, 'Kwarg `n_bins` must be greater than 1.'
# Find the extent of the features
minmax = []
minmax_promise = []
for feat in self.features:
minmax_promise.append(df.minmax(feat, delay=True))
@vaex.delayed
def assign(minmax_promise):
for elem in minmax_promise:
minmax.append(elem)
assign(minmax_promise)
df.execute()
        # warning: everything is cast to float, which is unavoidable due to the addition of self.epsilon
minmax = np.array(minmax)
minmax[:, 1] = minmax[:, 1] + self.epsilon
        # Determine the bin edges and number of bins depending on the strategy per feature
if self.strategy == 'uniform':
bin_edges = {feat: np.linspace(minmax[i, 0], minmax[i, 1], self.n_bins+1) for i, feat in enumerate(self.features)}
elif self.strategy == 'quantile':
percentiles = np.linspace(0, 100, self.n_bins + 1)
bin_edges = df.percentile_approx(self.features, percentage=percentiles)
bin_edges = {feat: edges for feat, edges in zip(self.features, bin_edges)}
else:
from .cluster import KMeans
bin_edges = {}
for i, feat in enumerate(self.features):
# Deterministic initialization with uniform spacing
uniform_edges = np.linspace(minmax[i, 0], minmax[i, 1], self.n_bins+1)
centers_init = ((uniform_edges[1:] + uniform_edges[:-1]) * 0.5).tolist()
centers_init = [[elem] for elem in centers_init]
# KMeans strategy
km = KMeans(n_clusters=self.n_bins, init=centers_init, n_init=1, features=[feat])
km.fit(df)
# Get and sort the centres of the kmeans clusters
centers = np.sort(np.array(km.cluster_centers).flatten())
# Put the bin edges half way between each center (ignoring the outermost edges)
be = (centers[1:] + centers[:-1]) * 0.5
# The outermost edges are defined by the min/max of each feature
# Quickly build a numpy array by concat individual values (min/max) and arrays (be)
bin_edges[feat] = np.r_[minmax[i, 0], be, minmax[i, 1]]
        # Remove bins whose widths are too small (i.e., <= 1e-8)
n_bins = {} # number of bins per features that are actually used
for feat in self.features:
mask = np.diff(bin_edges[feat], append=np.inf) > 1e-8
be = bin_edges[feat][mask]
if len(be) - 1 != self.n_bins:
                warnings.warn(f'Bins whose widths are too small (i.e., <= 1e-8) in {feat} are removed. '
                              'Consider decreasing the number of bins.')
bin_edges[feat] = be
n_bins[feat] = len(be) - 1
self.bin_edges_ = bin_edges
self.n_bins_ = n_bins
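        # Illustrative only, with the docstring data: strategy='uniform',
        # n_bins=3 and x spanning [0, 15] gives bin edges close to
        # [0, 5, 10, 15 + epsilon], so x = 2.5 falls in bin 0, x = 7.5 in
        # bin 1 and x = 12.5 in bin 2, matching the binned_x column above.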