import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm
4 Ensembling
Digits Dataset
X, y = load_digits(n_class=10, return_X_y=True)
X.shape
assert X.shape[:1] == y.shape

ix = np.random.randint(0, y.size)

plt.imshow(X[ix].reshape(8, 8))
plt.title(f'digit {y[ix]}')
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)
1. Random Features
We need many weak (simple) classifiers that make errors in different parts of the feature space.
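As a rough sketch of why this helps (an added toy illustration, not part of the original notebook): if the individual errors were independent, averaging n of them would shrink their spread by roughly a factor of sqrt(n).

# Toy illustration (assumption: independent errors): compare the spread of a
# single noisy "prediction" with the spread of the average of 100 of them.
rng = np.random.RandomState(0)
noise = rng.normal(size=(100, 1000))   # 100 "classifiers", 1000 points
print(noise[0].std())                  # ~1.0 for a single predictor
print(noise.mean(axis=0).std())        # ~0.1 for the average of 100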
n_trees = 5
Make an array to store probability predictions for different runs.
y_probas = np.empty((n_trees, ) + y_test.shape + (10, ))
y_probas.shape
Train n_trees decision tree classifiers and save the class probabilities to y_probas.
for i, state in enumerate(range(n_trees)):
    model = DecisionTreeClassifier(max_features=4, max_depth=2, random_state=state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_probas[i] = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    feat_index, = np.nonzero(model.feature_importances_)
    print(f'[{i:02d}] Test accuracy is {accuracy:.3f}')
    print(f'[{i:02d}] Features used for splitting are {feat_index}')

plt.imshow(X_test[0].reshape(8, 8))
The shape of y_probas is n_trees x test_size x n_classes.
y_probas.shape
y_probas[-1, 0].argmax()
Averaging
Let's average the predictions of n_trees decision trees on the same test set.
y_proba_mean = y_probas.mean(axis=0)
y_proba_mean.shape
y_proba_mean[:3]
Make predictions (argmax over the last n_classes axis).
y_pred_mean = np.argmax(y_proba_mean, axis=1)
print(y_pred_mean[:3])
Finally, evaluate this model (an ensemble of trees) and compare it with the performance of a single tree.
score_mean = accuracy_score(y_test, y_pred_mean)
print(f'Score of averaging across the ensemble is {score_mean * 100 :.2f}')

score = accuracy_score(y_test, y_probas[0].argmax(axis=1))
print(f'Score of a single tree is {score * 100 :.2f}')
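For comparison, here is a quick sketch (an addition, not in the original notebook) of hard (majority) voting over the same y_probas: each tree votes with its argmax class and the most frequent vote wins.

# Hard voting sketch (added for comparison with probability averaging above).
votes = y_probas.argmax(axis=2)                    # shape (n_trees, n_test)
y_pred_vote = np.apply_along_axis(
    lambda v: np.bincount(v, minlength=10).argmax(), 0, votes)
print(f'Score of majority voting is {accuracy_score(y_test, y_pred_vote) * 100 :.2f}')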
2. Random Observations (Samples)
We can introduce randomization into building a tree by sampling objects from the training set with replacement (bootstrap).
def bootstrap_indices(random_state, n_samples):
    """Return random indices with repetition (bootstrap)."""
    return np.random \
        .RandomState(random_state) \
        .randint(low=0, high=n_samples, size=n_samples)

bootstrap_indices(1, 10)
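A side note (an added illustration, using the helper above): a bootstrap sample of size n contains, on average, only about 63.2% (1 - 1/e) of the distinct original objects; the rest are repetitions.

# Added check: fraction of distinct objects in a large bootstrap sample.
ix = bootstrap_indices(0, 100_000)
print(np.unique(ix).size / ix.size)   # expected to be roughly 0.632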
X_train.shape
features = [2, 19, 25, 44]  # fix a set of features
y_probas = np.empty((n_trees, ) + y_test.shape + (10, ))
y_probas.shape
for i, state in enumerate(range(n_trees)):
    ix = bootstrap_indices(state, X_train.shape[0])
    X_train_ = X_train[ix, :][:, features]
    y_train_ = y_train[ix]
    X_test_ = X_test[:, features]

    model = DecisionTreeClassifier(max_features=4, max_depth=2, random_state=2)
    model.fit(X_train_, y_train_)

    y_pred = model.predict(X_test_)
    y_probas[i] = model.predict_proba(X_test_)

    accuracy = accuracy_score(y_test, y_pred)
    feat_index, = np.nonzero(model.feature_importances_)
    print(f'[{i:02d}] Test accuracy is {accuracy:.3f}')
    print(f'[{i:02d}] Features used for splitting are {feat_index}')
Average the probabilities over the n_trees axis again and make predictions.
y_proba_mean = y_probas.mean(axis=0)
y_proba_mean.shape

y_pred_mean = np.argmax(y_proba_mean, axis=1)
print(y_pred_mean[:3])

score_mean = accuracy_score(y_test, y_pred_mean)
print(f'Score of averaging across the ensemble is {score_mean * 100 :.2f}')
3. Combined Approach: Features + Observations
models = []
rs = list(range(5))

for state in rs:
    ind = bootstrap_indices(state, X_train.shape[0])
    X_train_, y_train_ = X_train[ind], y_train[ind]

    model = DecisionTreeClassifier(max_features=6, max_depth=2, random_state=state)
    model.fit(X_train_, y_train_)

    # Save trained model
    models.append(model)
predict_proba_models = []

for model in models:
    y_pred_proba = model.predict_proba(X_test)
    predict_proba_models.append(y_pred_proba)

    y_pred = np.argmax(y_pred_proba, axis=1)
    print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))
predict_proba_models = np.array(predict_proba_models)
mean_predict_proba = predict_proba_models.mean(axis=0)
mean_predict = np.argmax(mean_predict_proba, axis=1)

print('Random Forest Accuracy:', accuracy_score(y_test, mean_predict))
Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=5, max_features=6, max_depth=2, random_state=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
We can reuse random states to build a tree manually.
rs = []
for m in model.estimators_:
    rs.append(m.random_state)

print(rs)
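As a sanity check (an added sketch, relying on scikit-learn's documented behaviour that a forest's predict_proba is the mean of the per-tree class probabilities), we can reproduce the forest's output from its estimators_, mirroring the manual soft voting above.

# Added check: the forest's probabilities equal the average over its trees.
manual_proba = np.mean([t.predict_proba(X_test) for t in model.estimators_], axis=0)
print(np.allclose(manual_proba, model.predict_proba(X_test)))   # expected: True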
def rf_train_test_accuracy(param_name, param_grid, **params):
"""Returns train and test perfomance of a RandomForest for
different values (param_grid) of a hyperparameter (param_name).
"""
= [], []
train_score, test_score = RandomForestClassifier(n_estimators=5, max_features=8, max_depth=6, random_state=1, n_jobs=-1)
clf if params:
**params)
clf.set_params(
for param_value in tqdm(param_grid):
**{param_name: param_value})
clf.set_params(
clf.fit(X_train, y_train)
train_score.append(accuracy_score(y_train, clf.predict(X_train)))
test_score.append(accuracy_score(y_test, clf.predict(X_test)))return train_score, test_score
Random Forest: Number of Trees
train_accuracy, test_accuracy = rf_train_test_accuracy('n_estimators', range(1, 50, 2))

plt.figure(figsize=(10, 5))
plt.plot(list(range(1, 50, 2)), 1 - np.array(train_accuracy), label='Train error')
plt.plot(list(range(1, 50, 2)), 1 - np.array(test_accuracy), label='Test error')
plt.xlabel('Number of trees in the forest')
plt.ylabel('Classification error (1 - accuracy).')
plt.grid(True)
plt.legend()
plt.show()
Random Forest: Tree Depth
train_accuracy, test_accuracy = rf_train_test_accuracy('max_depth', range(1, 30))

plt.figure(figsize=(10, 5))
plt.plot(list(range(1, 30)), 1 - np.array(train_accuracy), label='Train error')
plt.plot(list(range(1, 30)), 1 - np.array(test_accuracy), label='Test error')
plt.xlabel('Tree depth')
plt.ylabel('Classification error (1 - accuracy).')
plt.legend()
plt.grid(True)
plt.show()
Random Forest: Number of Max Features
train_accuracy, test_accuracy = rf_train_test_accuracy('max_features', range(1, 64))

plt.figure(figsize=(10, 5))
plt.plot(list(range(1, 64)), 1 - np.array(train_accuracy), label='Train error')
plt.plot(list(range(1, 64)), 1 - np.array(test_accuracy), label='Test error')
plt.xlabel('Max features to consider for split')
plt.ylabel('Classification error (1 - accuracy).')
plt.legend()
plt.grid(True)
plt.show()
params = {
    'n_estimators': 20,
    'max_depth': 10,
}

train_accuracy, test_accuracy = rf_train_test_accuracy('max_features', range(1, 64), **params)

plt.figure(figsize=(10, 5))
plt.plot(list(range(1, 64)), 1 - np.array(train_accuracy), label='Train error')
plt.plot(list(range(1, 64)), 1 - np.array(test_accuracy), label='Test error')
plt.xlabel('Max features to consider for split')
plt.ylabel('Classification error (1 - accuracy).')
plt.legend()
plt.grid(True)
plt.show()
Typically, something between log2(k) and sqrt(k), where k is the total number of features, works well for max_features.
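For this dataset (k = 64 pixel features) that rule of thumb suggests values around:

# Added worked example of the rule of thumb for the digits data.
print(np.log2(X.shape[1]), np.sqrt(X.shape[1]))   # 6.0 and 8.0 for k = 64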