import numpy as np
import matplotlib.pyplot as plt04 Gradient Boosting over Decision Trees (GBDT)
Decision Tree
pros
- easy to explain (interpretable and simple)
- categorical variables
- fast
cons
- high variance
- not additive
- in accurate
COLORS = plt.rcParams['axes.prop_cycle'].by_key()['color']def poly(xs: np.ndarray):
return xs - 2.0 * xs ** 2 + 1.2 * xs ** 3n_points = 200rs = np.random.RandomState(42)
noise = rs.normal(scale=0.01, size=n_points)xs = np.linspace(0, 1, n_points)
ys = poly(xs) + noiseplt.scatter(xs, ys, label=r'$x - 2 x^2 + 1.2 x^3$')
plt.grid(True)
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
plt.show()Ensembling
The idea is that a combination of weak estimaters performs better than a single one. The resulting estimator is called ensemble (or meta estimator). In other words, we are strive to build a linear combination
\[ a_{meta}(x) = \alpha_0 + \sum_{k=1} \alpha_k a^{(k)}_{weak}(x) \]
with some weights \(\alpha_k\).
In our example we assume \(\alpha_0 = 0\) and all \(\alpha_k\) equal to 1. In general there are multiple ways to find \(\alpha\) (e.g Nesterov momentum or Langevine diffusion for boosting). Also, we assume weak algorthms \(a^{(k)}_{weak}\) to be a decision trees.
We have already seen one of the approaches to build a combination of weak estimators (e.g. RandomForestClassifier). That time we iteratively built a new decision tree over subset of original dataset with bootstrap (bootstrap aggregation, bagging) or a subset of features. Then we make predictions with each tree out of collection and take an average prediction.
There is other approach to build an ensemble: instead of fitting to original data we could fit a model to the prediction error (residual) of an estimator built on previous step. Let’s try to build an ensemble for regression problem with boosting approach.
Boosting Step by Step
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_errorSingle Tree
We use here extremely sily model.
reg1 = DecisionTreeRegressor(max_depth=1, random_state=42)
reg1.fit(xs[:, None], ys)ys_pred1 = reg.predict(xs[:, None])
ys_err1 = ys - ys_pred1
score = mean_squared_error(ys, ys_pred1)
print(f'mse is {score:e}')def plot_tree_perf(axs, xs, ys, ys_pred, ys_err):
ax = axs[0]
ax.scatter(xs, ys, c=COLORS[0], s=5, label='y')
ax.plot(xs, ys_pred, c=COLORS[1], label=r'$y_{pred}$')
ax.grid(True)
ax.legend()
ax.set_xlabel('x')
ax.set_ylabel('y')
ax = axs[1]
ax.scatter(xs, ys_err, c=COLORS[0], s=5, label='y')
ax.grid(True)
ax.legend()
ax.set_xlabel('x')
ax.set_ylabel('y')fig, axs = plt.subplots(1, 2, figsize=(15, 5), layout='constrained')
plot_tree_perf(axs, xs, ys, ys_pred1, ys_err1)
plt.show()Pair of Tree
reg2 = DecisionTreeRegressor(max_depth=1, random_state=42)
reg2.fit(xs[:, None], ys_err1)ys_pred2 = reg2.predict(xs[:, None])
ys_err2 = ys - (ys_pred1 + ys_pred2)
score = mean_squared_error(err, ys_pred2)
print(f'mse is {score:e}')fig, axs = plt.subplots(1, 2, figsize=(15, 5), layout='constrained')
plot_tree_perf(axs, xs, ys_err1, ys_pred2, ys_err2)
plt.show()The first regression model predicts target variable \(y\)
\[ \hat{y}_1 \sim y. \]
But the second on predicts residuals
\[ \hat{y}_2 \sim y - \hat{y}_1. \]
So the ultimate predictions is
\[ y \sim \hat{y}_1 + \hat{y}_2. \]
ys_pred = ys_pred1 + ys_pred2plt.scatter(xs, ys, c=COLORS[0], s=5, label='y')
plt.plot(xs, ys_pred, c=COLORS[1], label=r'$y_{pred}$')
plt.grid(True)
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
plt.show()Multiple Estimators
We can repeat this procedure further.
Random Forest vs Gradient Boosting
from tqdm.notebook import tqdmfrom sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressorn_estimators_list = [1, 2, 3, 5, 10, 100, 200]
nrows = len(n_estimators_list)Random Forest
fig, axs = plt.subplots(nrows=nrows, ncols=2, figsize=(15, 5 * nrows), layout='constrained')
for i, n_estimators in enumerate(tqdm(n_estimators_list)):
rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=1)
rf.fit(xs[:, None], ys)
ys_pred = rf.predict(xs[:, None])
ys_err = ys - ys_pred
plot_tree_perf(axs[i], xs, ys, ys_pred, ys_err)
plt.show()Gradient Boosting
fig, axs = plt.subplots(nrows=nrows, ncols=2, figsize=(15, 5 * nrows), layout='constrained')
for i, n_estimators in enumerate(tqdm(n_estimators_list)):
gb = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=1, learning_rate=1)
gb.fit(xs[:, None], ys)
ys_pred = gb.predict(xs[:, None])
ys_err = ys - ys_pred
plot_tree_perf(axs[i], xs, ys, ys_pred, ys_err)
plt.show()Complexoty and Bias-Variance Tradeoff
Now, let’s see how different ensembel methods behave as number of estimator is increased in ensemble.
from functools import partial
from sklearn.model_selection import train_test_splitdef fit_parametric(clf, xs, ys, n_estimators_list: list[int]) -> np.ndarray:
xs_train, xs_test, ys_train, ys_test = train_test_split(xs, ys, test_size=1/3, random_state=42)
scores = np.empty((len(n_estimators_list), 2))
for i, n_estimators in enumerate(tqdm(n_estimators_list)):
rf = clf.set_params(n_estimators=n_estimators)
rf.fit(xs_train[:, None], ys_train)
ys_pred = rf.predict(xs_train[:, None])
scores[i, 0] = mean_squared_error(ys_train, ys_pred)
ys_pred = rf.predict(xs_test[:, None])
scores[i, 1] = mean_squared_error(ys_test, ys_pred)
return scoresn_estimators_list = [*range(1, 201)] # Redefine.fit = partial(fit_parametric, xs=xs, ys=ys, n_estimators_list=n_estimators_list)Fit and apply RF/GB with difference size of ensemble to the same data split.
rf = RandomForestRegressor(max_depth=1, random_state=42)
rf_scores = fit(rf)gb = GradientBoostingRegressor(max_depth=1, learning_rate=1, random_state=42)
gb_scores = fit(gb)Let’s take a look at how size of ensemble influences the performance of a model.
def plot_complexity(ax, n_estimators_list, scores, title=None):
for i, label in enumerate(['train', 'test']):
ax.plot(n_estimators_list, scores[:, i], label=label)
ax.set_xlabel('#estimators (ensemble size)')
ax.set_ylabel('MSE')
if title is not None:
ax.set_title('Random Forest')
ax.grid(True)
ax.legend()fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), layout='constrained', sharex=True, sharey=True)
plot_complexity(axs[0], n_estimators_list, rf_scores, 'Random Forest')
plot_complexity(axs[1], n_estimators_list, gb_scores, 'Gradient Boosting')
plt.show()Production-Level Libraries
Gradient boosting is one of the most used machine learning algorithm in practice for more then two decades. There are plenty of known world-wide companies which are used gradient boosting in their products. They have a billions requests and users and terrabyte-scale datasets. They need efficient and scalable algorithms to train a model and to apply in inference time.
There are three major libraries which suggests its own flavor of grading boosting over decision trees.
- XGBoost (stands for eXtreme Gradient Boosting).
- CatBoost (aka Categorical Boosting) by Yandex.
- LightGBM (LGBM for brevity) by MicroSoft.
!pip install catboost lightgbm xgboostfrom sklearn.datasets import load_iris
from sklearn.model_selection import train_test_splitdata = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=0.2)from xgboost import XGBClassifier%%time
bst = XGBClassifier(n_estimators=5, max_depth=2, learning_rate=1, objective='binary:logistic')
bst.fit(X_train, y_train)
y_pred = bst.predict(X_test)
score = bst.score(X_test, y_test)
print(f'xgboost accuracy is {score * 100:.2f}')import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pooldf = pd.DataFrame(X_train, columns=data.feature_names)
df['dummy'] = pd.Categorical(np.random.randint(0, 3, size=X_train.shape[0]))
df.head()pool = Pool(data=df.to_dict('split')['data'], label=y_train, cat_features=[4])
pool.get_cat_feature_indices()%%time
cb = CatBoostClassifier(iterations=5, depth=2, learning_rate=1, loss_function='MultiClass', verbose=True)
cb.fit(X_train, y_train, plot=True)
preds = cb.predict(X_test)from lightgbm import LGBMClassifier, Datasetds = Dataset(data=df, label=y_train, categorical_feature=['dummy'])
ds.categorical_feature%%time
lgb = LGBMClassifier(n_estimators=5, learning_rate=1, max_depth=2)
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)