Source code for vaex.ml.xgboost


import base64
import tempfile
import traitlets

import xgboost
import numpy as np

import vaex
from . import state
from . import generate
import vaex.serialize


@vaex.serialize.register
@generate.register
class XGBoostModel(state.HasState):
    '''The XGBoost algorithm.

    XGBoost is an optimized distributed gradient boosting library designed to
    be highly efficient, flexible and portable. It implements machine learning
    algorithms under the Gradient Boosting framework. XGBoost provides parallel
    tree boosting (also known as GBDT, GBM) that solves many data science
    problems in a fast and accurate way.
    (https://github.com/dmlc/xgboost)

    Example:

    >>> import vaex
    >>> import vaex.ml.xgboost
    >>> df = vaex.datasets.iris()
    >>> features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']
    >>> df_train, df_test = df.ml.train_test_split()
    >>> params = {
    ...     'max_depth': 5,
    ...     'learning_rate': 0.1,
    ...     'objective': 'multi:softmax',
    ...     'num_class': 3,
    ...     'subsample': 0.80,
    ...     'colsample_bytree': 0.80,
    ...     'silent': 1}
    >>> booster = vaex.ml.xgboost.XGBoostModel(features=features, target='class_', num_boost_round=100, params=params)
    >>> booster.fit(df_train)
    >>> df_train = booster.transform(df_train)
    >>> df_train.head(3)
     #    sepal_length    sepal_width    petal_length    petal_width    class_    xgboost_prediction
     0             5.4            3               4.5            1.5         1                     1
     1             4.8            3.4             1.6            0.2         0                     0
     2             6.9            3.1             4.9            1.5         1                     1
    >>> df_test = booster.transform(df_test)
    >>> df_test.head(3)
     #    sepal_length    sepal_width    petal_length    petal_width    class_    xgboost_prediction
     0             5.9            3               4.2            1.5         1                     1
     1             6.1            3               4.6            1.4         1                     1
     2             6.6            2.9             4.6            1.3         1                     1
    '''
    snake_name = 'xgboost_model'

    features = traitlets.List(traitlets.Unicode(), help='List of features to use when fitting the XGBoostModel.')
    target = traitlets.Unicode(allow_none=False, help='The name of the target column.')
    num_boost_round = traitlets.CInt(help='Number of boosting iterations.')
    params = traitlets.Dict(help='A dictionary of parameters to be passed on to the XGBoost model.')
    prediction_name = traitlets.Unicode(default_value='xgboost_prediction', help='The name of the virtual column housing the predictions.')

    def __call__(self, *args):
        data2d = np.stack([np.asarray(arg, np.float64) for arg in args], axis=1)
        dmatrix = xgboost.DMatrix(data2d)
        return self.booster.predict(dmatrix)
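
    # Illustrative sketch: vaex evaluates the virtual column created by
    # `transform` by calling the model like a function, one numpy array per
    # feature. Assuming the fitted `booster` from the class docstring:
    #
    # >>> predictions = booster(df.sepal_width.values, df.petal_length.values,
    # ...                       df.sepal_length.values, df.petal_width.values)
    #
    # The argument order must match `booster.features`.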

    def transform(self, df):
        '''Transform a DataFrame such that it contains the predictions of the
        XGBoostModel in the form of a virtual column.

        :param df: A vaex DataFrame. It should have the same columns as the
            DataFrame used to train the model.
        :return copy: A shallow copy of the DataFrame that includes the
            XGBoostModel prediction as a virtual column.
        :rtype: DataFrame
        '''
        copy = df.copy()
        lazy_function = copy.add_function('xgboost_prediction_function', self, unique=True)
        expression = lazy_function(*self.features)
        copy.add_virtual_column(self.prediction_name, expression, unique=False)
        return copy
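
    # Usage sketch, assuming the fitted `booster` from the class docstring:
    # the prediction column added by `transform` is virtual, so no memory is
    # used until it is evaluated.
    #
    # >>> df_pred = booster.transform(df_test)
    # >>> 'xgboost_prediction' in df_pred.virtual_columns
    # True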

    def fit(self, df, evals=(), early_stopping_rounds=None, evals_result=None, verbose_eval=False, **kwargs):
        '''Fit the XGBoostModel to the given DataFrame.

        This method accepts all keyword arguments of the xgboost.train method.

        :param df: A vaex DataFrame containing the features and target on which
            to train the model.
        :param evals: A list of pairs (DataFrame, string). List of items to be
            evaluated during training, which allows the user to watch
            performance on a validation set.
        :param int early_stopping_rounds: Activates early stopping. The
            validation error needs to decrease at least every
            *early_stopping_rounds* round(s) to continue training. Requires at
            least one item in *evals*. If there is more than one, the last one
            is used. Returns the model from the last iteration (not the best
            one).
        :param dict evals_result: A dictionary storing the evaluation results
            of all the items in *evals*.
        :param bool verbose_eval: Requires at least one item in *evals*. If
            *verbose_eval* is True, the evaluation metric on the validation set
            is printed at each boosting stage.
        '''
        data = df[self.features].values
        target_data = df[self.target].to_numpy()
        dtrain = xgboost.DMatrix(data, target_data)
        if evals is not None:
            # Convert the (DataFrame, name) pairs into (DMatrix, name) pairs
            evals = [list(elem) for elem in evals]
            for item in evals:
                data = item[0][self.features].values
                target_data = item[0][self.target].to_numpy()
                item[0] = xgboost.DMatrix(data, target_data)
        else:
            evals = ()
        # This does the actual training / fitting of the xgboost model
        self.booster = xgboost.train(params=self.params,
                                     dtrain=dtrain,
                                     num_boost_round=self.num_boost_round,
                                     evals=evals,
                                     early_stopping_rounds=early_stopping_rounds,
                                     evals_result=evals_result,
                                     verbose_eval=verbose_eval,
                                     **kwargs)
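
    # Usage sketch for fitting with an evaluation set and early stopping,
    # using only the parameters documented above (the DataFrame names are
    # assumptions):
    #
    # >>> history = {}
    # >>> booster.fit(df_train,
    # ...             evals=[(df_train, 'train'), (df_test, 'test')],
    # ...             early_stopping_rounds=5,
    # ...             evals_result=history,
    # ...             verbose_eval=True)
    # >>> history['test']  # per-round metric values, e.g. {'merror': [...]}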

    def predict(self, df, **kwargs):
        '''Provided a vaex DataFrame, get an in-memory numpy array with the
        predictions from the XGBoost model.
        This method accepts the keyword arguments of the predict method from
        XGBoost.

        :returns: An in-memory numpy array containing the XGBoostModel
            predictions.
        :rtype: numpy.array
        '''
        data = df[self.features].values
        dmatrix = xgboost.DMatrix(data)
        return self.booster.predict(dmatrix, **kwargs)
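
    # Usage sketch: unlike `transform`, which adds a lazy virtual column,
    # `predict` materializes the predictions as an in-memory numpy array:
    #
    # >>> y_pred = booster.predict(df_test)
    # >>> y_pred.shape  # one prediction per row of df_test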

    def state_get(self):
        # Serialize the fitted booster to bytes via a temporary file and
        # base64-encode it, so the model can be stored as part of the state.
        filename = tempfile.mktemp()
        self.booster.save_model(filename)
        with open(filename, 'rb') as f:
            data = f.read()
        return dict(tree_state=base64.encodebytes(data).decode('ascii'),
                    substate=super(XGBoostModel, self).state_get())

    def state_set(self, state, trusted=True):
        # Restore the booster by decoding the base64 payload back into a model
        # file and loading it with xgboost.
        super(XGBoostModel, self).state_set(state['substate'])
        data = base64.decodebytes(state['tree_state'].encode('ascii'))
        filename = tempfile.mktemp()
        with open(filename, 'wb') as f:
            f.write(data)
        self.booster = xgboost.Booster(model_file=filename)
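
# Serialization sketch: because state_get/state_set store the booster as
# base64-encoded text, a fitted model travels with the DataFrame state.
# The file name below is an assumption for illustration:
#
# >>> df_train = booster.transform(df_train)
# >>> df_train.state_write('./model_state.json')
# >>> df_new = vaex.datasets.iris()
# >>> df_new.state_load('./model_state.json')  # re-adds xgboost_prediction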