import numpy as np
import matplotlib.pyplot as plt
04 Gradient Boosting over Decision Trees (GBDT)
Decision Tree
pros
- easy to explain (interpretable and simple)
- categorical variables
- fast
cons
- high variance
- not additive
- in accurate
= plt.rcParams['axes.prop_cycle'].by_key()['color'] COLORS
def poly(xs: np.ndarray):
return xs - 2.0 * xs ** 2 + 1.2 * xs ** 3
= 200 n_points
= np.random.RandomState(42)
rs = rs.normal(scale=0.01, size=n_points) noise
= np.linspace(0, 1, n_points)
xs = poly(xs) + noise ys
=r'$x - 2 x^2 + 1.2 x^3$')
plt.scatter(xs, ys, labelTrue)
plt.grid(
plt.legend()'x')
plt.xlabel('y')
plt.ylabel( plt.show()
Ensembling
The idea is that a combination of weak estimaters performs better than a single one. The resulting estimator is called ensemble (or meta estimator). In other words, we are strive to build a linear combination
\[ a_{meta}(x) = \alpha_0 + \sum_{k=1} \alpha_k a^{(k)}_{weak}(x) \]
with some weights \(\alpha_k\).
In our example we assume \(\alpha_0 = 0\) and all \(\alpha_k\) equal to 1. In general there are multiple ways to find \(\alpha\) (e.g Nesterov momentum or Langevine diffusion for boosting). Also, we assume weak algorthms \(a^{(k)}_{weak}\) to be a decision trees.
We have already seen one of the approaches to build a combination of weak estimators (e.g. RandomForestClassifier
). That time we iteratively built a new decision tree over subset of original dataset with bootstrap (bootstrap aggregation, bagging) or a subset of features. Then we make predictions with each tree out of collection and take an average prediction.
There is other approach to build an ensemble: instead of fitting to original data we could fit a model to the prediction error (residual) of an estimator built on previous step. Let’s try to build an ensemble for regression problem with boosting approach.
Boosting Step by Step
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
Single Tree
We use here extremely sily model.
= DecisionTreeRegressor(max_depth=1, random_state=42)
reg1 None], ys) reg1.fit(xs[:,
= reg.predict(xs[:, None])
ys_pred1 = ys - ys_pred1
ys_err1 = mean_squared_error(ys, ys_pred1)
score print(f'mse is {score:e}')
def plot_tree_perf(axs, xs, ys, ys_pred, ys_err):
= axs[0]
ax =COLORS[0], s=5, label='y')
ax.scatter(xs, ys, c=COLORS[1], label=r'$y_{pred}$')
ax.plot(xs, ys_pred, cTrue)
ax.grid(
ax.legend()'x')
ax.set_xlabel('y')
ax.set_ylabel(
= axs[1]
ax =COLORS[0], s=5, label='y')
ax.scatter(xs, ys_err, cTrue)
ax.grid(
ax.legend()'x')
ax.set_xlabel('y') ax.set_ylabel(
= plt.subplots(1, 2, figsize=(15, 5), layout='constrained')
fig, axs
plot_tree_perf(axs, xs, ys, ys_pred1, ys_err1) plt.show()
Pair of Tree
= DecisionTreeRegressor(max_depth=1, random_state=42)
reg2 None], ys_err1) reg2.fit(xs[:,
= reg2.predict(xs[:, None])
ys_pred2 = ys - (ys_pred1 + ys_pred2)
ys_err2 = mean_squared_error(err, ys_pred2)
score print(f'mse is {score:e}')
= plt.subplots(1, 2, figsize=(15, 5), layout='constrained')
fig, axs
plot_tree_perf(axs, xs, ys_err1, ys_pred2, ys_err2) plt.show()
The first regression model predicts target variable \(y\)
\[ \hat{y}_1 \sim y. \]
But the second on predicts residuals
\[ \hat{y}_2 \sim y - \hat{y}_1. \]
So the ultimate predictions is
\[ y \sim \hat{y}_1 + \hat{y}_2. \]
= ys_pred1 + ys_pred2 ys_pred
=COLORS[0], s=5, label='y')
plt.scatter(xs, ys, c=COLORS[1], label=r'$y_{pred}$')
plt.plot(xs, ys_pred, cTrue)
plt.grid(
plt.legend()'x')
plt.xlabel('y')
plt.ylabel( plt.show()
Multiple Estimators
We can repeat this procedure further.
Random Forest vs Gradient Boosting
from tqdm.notebook import tqdm
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
= [1, 2, 3, 5, 10, 100, 200]
n_estimators_list = len(n_estimators_list) nrows
Random Forest
= plt.subplots(nrows=nrows, ncols=2, figsize=(15, 5 * nrows), layout='constrained')
fig, axs
for i, n_estimators in enumerate(tqdm(n_estimators_list)):
= RandomForestRegressor(n_estimators=n_estimators, max_depth=1)
rf None], ys)
rf.fit(xs[:,
= rf.predict(xs[:, None])
ys_pred = ys - ys_pred
ys_err
plot_tree_perf(axs[i], xs, ys, ys_pred, ys_err)
plt.show()
Gradient Boosting
= plt.subplots(nrows=nrows, ncols=2, figsize=(15, 5 * nrows), layout='constrained')
fig, axs
for i, n_estimators in enumerate(tqdm(n_estimators_list)):
= GradientBoostingRegressor(n_estimators=n_estimators, max_depth=1, learning_rate=1)
gb None], ys)
gb.fit(xs[:,
= gb.predict(xs[:, None])
ys_pred = ys - ys_pred
ys_err
plot_tree_perf(axs[i], xs, ys, ys_pred, ys_err)
plt.show()
Complexoty and Bias-Variance Tradeoff
Now, let’s see how different ensembel methods behave as number of estimator is increased in ensemble.
from functools import partial
from sklearn.model_selection import train_test_split
def fit_parametric(clf, xs, ys, n_estimators_list: list[int]) -> np.ndarray:
= train_test_split(xs, ys, test_size=1/3, random_state=42)
xs_train, xs_test, ys_train, ys_test
= np.empty((len(n_estimators_list), 2))
scores for i, n_estimators in enumerate(tqdm(n_estimators_list)):
= clf.set_params(n_estimators=n_estimators)
rf None], ys_train)
rf.fit(xs_train[:,
= rf.predict(xs_train[:, None])
ys_pred 0] = mean_squared_error(ys_train, ys_pred)
scores[i,
= rf.predict(xs_test[:, None])
ys_pred 1] = mean_squared_error(ys_test, ys_pred)
scores[i,
return scores
= [*range(1, 201)] # Redefine. n_estimators_list
= partial(fit_parametric, xs=xs, ys=ys, n_estimators_list=n_estimators_list) fit
Fit and apply RF/GB with difference size of ensemble to the same data split.
= RandomForestRegressor(max_depth=1, random_state=42)
rf = fit(rf) rf_scores
= GradientBoostingRegressor(max_depth=1, learning_rate=1, random_state=42)
gb = fit(gb) gb_scores
Let’s take a look at how size of ensemble influences the performance of a model.
def plot_complexity(ax, n_estimators_list, scores, title=None):
for i, label in enumerate(['train', 'test']):
=label)
ax.plot(n_estimators_list, scores[:, i], label'#estimators (ensemble size)')
ax.set_xlabel('MSE')
ax.set_ylabel(if title is not None:
'Random Forest')
ax.set_title(True)
ax.grid( ax.legend()
= plt.subplots(nrows=1, ncols=2, figsize=(15, 5), layout='constrained', sharex=True, sharey=True)
fig, axs 0], n_estimators_list, rf_scores, 'Random Forest')
plot_complexity(axs[1], n_estimators_list, gb_scores, 'Gradient Boosting')
plot_complexity(axs[ plt.show()
Production-Level Libraries
Gradient boosting is one of the most used machine learning algorithm in practice for more then two decades. There are plenty of known world-wide companies which are used gradient boosting in their products. They have a billions requests and users and terrabyte-scale datasets. They need efficient and scalable algorithms to train a model and to apply in inference time.
There are three major libraries which suggests its own flavor of grading boosting over decision trees.
- XGBoost (stands for eXtreme Gradient Boosting).
- CatBoost (aka Categorical Boosting) by Yandex.
- LightGBM (LGBM for brevity) by MicroSoft.
!pip install catboost lightgbm xgboost
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
= load_iris()
data = train_test_split(data['data'], data['target'], test_size=0.2) X_train, X_test, y_train, y_test
from xgboost import XGBClassifier
%%time
= XGBClassifier(n_estimators=5, max_depth=2, learning_rate=1, objective='binary:logistic')
bst
bst.fit(X_train, y_train)= bst.predict(X_test)
y_pred = bst.score(X_test, y_test)
score print(f'xgboost accuracy is {score * 100:.2f}')
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
= pd.DataFrame(X_train, columns=data.feature_names)
df 'dummy'] = pd.Categorical(np.random.randint(0, 3, size=X_train.shape[0]))
df[ df.head()
= Pool(data=df.to_dict('split')['data'], label=y_train, cat_features=[4])
pool pool.get_cat_feature_indices()
%%time
= CatBoostClassifier(iterations=5, depth=2, learning_rate=1, loss_function='MultiClass', verbose=True)
cb =True)
cb.fit(X_train, y_train, plot= cb.predict(X_test) preds
from lightgbm import LGBMClassifier, Dataset
= Dataset(data=df, label=y_train, categorical_feature=['dummy'])
ds ds.categorical_feature
%%time
= LGBMClassifier(n_estimators=5, learning_rate=1, max_depth=2)
lgb
lgb.fit(X_train, y_train)= lgb.predict(X_test) y_pred