4 Ensembling

[Figure: the idea of ensembling]
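Why does ensembling help? If individual classifiers are better than chance and make independent mistakes, a majority vote over them is more accurate than any single one. A minimal back-of-the-envelope sketch (illustrative numbers, not from the notebook):

from math import comb

def majority_vote_accuracy(p, n):
    """Probability that a majority of n independent classifiers,
    each correct with probability p, returns the right answer (odd n)."""
    return sum(comb(n, k) * p ** k * (1 - p) ** (n - k)
               for k in range(n // 2 + 1, n + 1))

print(majority_vote_accuracy(0.7, 5))  # ~0.837: five mediocre voters beat one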
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm

Digits Dataset

X, y = load_digits(n_class=10, return_X_y=True)
X.shape
assert X.shape[:1] == y.shape
ix = np.random.randint(0, y.size)

plt.imshow(X[ix].reshape(8, 8))
plt.title(f'digit {y[ix]}')
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42)

Random Features

We need many weak classifiers that make errors in different parts of the feature space, so that their mistakes cancel out when averaged.

n_trees = 5

Make an array to store probability predictions for different runs.

y_probas = np.empty((n_trees, ) + y_test.shape + (10, ))
y_probas.shape

Train n_trees decision tree classifiers and save class probabilities to y_probas.

for state in range(n_trees):
    # Shallow trees with few features per split: each tree is weak,
    # but weak in its own way.
    model = DecisionTreeClassifier(max_features=4, max_depth=2, random_state=state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_probas[state] = model.predict_proba(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    feat_index, = np.nonzero(model.feature_importances_)
    print(f'[{state:02d}] Test accuracy is {accuracy:.3f}')
    print(f'[{state:02d}] Features used for splitting are {feat_index}')
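As a quick check (not in the original) that the trees really do err in different places, compare the hard predictions of the first two trees:

y_preds = y_probas.argmax(axis=-1)  # per-tree predictions, shape n_trees x test_size
print((y_preds[0] != y_preds[1]).mean())  # fraction of test objects where trees 0 and 1 disagree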
plt.imshow(X_test[0].reshape(8, 8))
plt.title(f'digit {y_test[0]}')
plt.show()

Shape of y_probas is n_trees x test_size x n_classes.

y_probas.shape
y_probas[-1, 0].argmax()  # the last tree's predicted class for the first test object

Averaging

Let's average the predictions of the n_trees decision trees on the same test set.

y_proba_mean = y_probas.mean(axis=0)
y_proba_mean.shape
y_proba_mean[:3]

Make predictions by taking the argmax over the last (n_classes) axis.

y_pred_mean = np.argmax(y_proba_mean, axis=1)
print(y_pred_mean[:3])

Finally, evaluate this model (an ensemble of trees) and compare it with the performance of a single tree.

score_mean = accuracy_score(y_test, y_pred_mean)
print(f'Accuracy of the ensemble average is {score_mean * 100:.2f}')
score = accuracy_score(y_test, y_probas[0].argmax(axis=1))
print(f'Accuracy of a single tree is {score * 100:.2f}')
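Averaging probabilities is soft voting. A hard-voting alternative, sketched below, lets every tree vote for its argmax class and takes the majority (not in the original notebook):

y_votes = y_probas.argmax(axis=-1)  # per-tree class votes, shape n_trees x test_size
y_pred_hard = np.apply_along_axis(
    lambda v: np.bincount(v, minlength=10).argmax(), 0, y_votes)
print(f'Accuracy of the majority vote is {accuracy_score(y_test, y_pred_hard) * 100:.2f}')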

Random Observations (Samples)

We can introduce randomization into tree building by sampling objects from the training set with replacement (the bootstrap).

def bootstrap_indices(random_state, n_samples):
    """Return random indices sampled with replacement (bootstrap)."""
    rng = np.random.RandomState(random_state)
    return rng.randint(low=0, high=n_samples, size=n_samples)
bootstrap_indices(1, 10)
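A handy sanity check (not in the original): for large n, a bootstrap sample contains about 1 - 1/e ≈ 63.2% of the distinct objects.

ix = bootstrap_indices(0, 100_000)
print(np.unique(ix).size / ix.size)  # ≈ 0.632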
X_train.shape
features = [2, 19, 25, 44]  # fix a set of features.
y_probas = np.empty((n_trees, ) + y_test.shape + (10, ))
y_probas.shape
for state in range(n_trees):
    ix = bootstrap_indices(state, X_train.shape[0])
    X_train_ = X_train[ix, :][:, features]
    y_train_ = y_train[ix]
    X_test_ = X_test[:, features]

    # The tree's own random_state is fixed: here the only source of
    # diversity between the models is the bootstrap sample.
    model = DecisionTreeClassifier(max_features=4, max_depth=2, random_state=2)
    model.fit(X_train_, y_train_)

    y_pred = model.predict(X_test_)
    y_probas[state] = model.predict_proba(X_test_)

    accuracy = accuracy_score(y_test, y_pred)
    feat_index, = np.nonzero(model.feature_importances_)
    print(f'[{state:02d}] Test accuracy is {accuracy:.3f}')
    print(f'[{state:02d}] Features used for splitting are {feat_index}')

Average the probabilities over the n_trees axis again and make predictions.

y_proba_mean = y_probas.mean(axis=0)
y_proba_mean.shape
y_pred_mean = np.argmax(y_proba_mean, axis=1)
print(y_pred_mean[:3])
score_mean = accuracy_score(y_test, y_pred_mean)
print(f'Accuracy of the ensemble average is {score_mean * 100:.2f}')

Combined Approach: Features + Observations

models = []
rs = list(range(n_trees))

for state in rs:
    ind = bootstrap_indices(state, X_train.shape[0])
    X_train_, y_train_ = X_train[ind], y_train[ind]
    
    model = DecisionTreeClassifier(max_features=6, max_depth=2, random_state=state)

    model.fit(X_train_, y_train_)    
    models.append(model)  # Save trained model
predict_proba_models = []
for model in models:
    y_pred_proba = model.predict_proba(X_test)
    predict_proba_models.append(y_pred_proba)
    y_pred = np.argmax(y_pred_proba, axis=1)
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.3f}')
predict_proba_models = np.array(predict_proba_models)
mean_predict_proba = predict_proba_models.mean(axis=0)
mean_predict = np.argmax(mean_predict_proba, axis=1)
print(f'Random forest (manual) accuracy: {accuracy_score(y_test, mean_predict):.3f}')

Random Forest

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=5, max_features=6, max_depth=2, random_state=1) 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

We can read off the random states of the individual trees if we want to rebuild one of them manually.

rs = [m.random_state for m in model.estimators_]
print(rs)
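As a sanity check (not in the original), RandomForestClassifier.predict_proba is documented to be the mean of the trees' predicted probabilities, so averaging them by hand reproduces the forest exactly:

manual_proba = np.mean([m.predict_proba(X_test) for m in model.estimators_], axis=0)
print(np.allclose(manual_proba, model.predict_proba(X_test)))  # expect True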
def rf_train_test_accuracy(param_name, param_grid, **params):
    """Return train and test accuracy of a RandomForest for
    different values (param_grid) of a hyperparameter (param_name).
    """
    
    train_score, test_score = [], []
    clf = RandomForestClassifier(n_estimators=5, max_features=8, max_depth=6, random_state=1, n_jobs=-1)
    if params:
        clf.set_params(**params)
    
    for param_value in tqdm(param_grid):
        clf.set_params(**{param_name: param_value})
        clf.fit(X_train, y_train)

        train_score.append(accuracy_score(y_train, clf.predict(X_train)))
        test_score.append(accuracy_score(y_test, clf.predict(X_test)))
    return train_score, test_score

Random Forest: Number of Trees

train_accuracy, test_accuracy = rf_train_test_accuracy('n_estimators', range(1, 50, 2))
plt.figure(figsize=(10,5))
plt.plot(list(range(1,50,2)), 1-np.array(train_accuracy), label='Train error')
plt.plot(list(range(1,50,2)), 1-np.array(test_accuracy), label='Test error')
plt.xlabel('Number of trees in the forest')
plt.ylabel('Classification error (1 - accuracy).')
plt.grid(True)
plt.legend()
plt.show()

Random Forest: Tree Depth

train_accuracy, test_accuracy = rf_train_test_accuracy('max_depth', range(1, 30))
plt.figure(figsize=(10,5))
plt.plot(list(range(1,30)), 1-np.array(train_accuracy), label='Train error')
plt.plot(list(range(1,30)), 1-np.array(test_accuracy), label='Test error')
plt.xlabel('Tree depth')
plt.ylabel('Classification error (1 - accuracy).')
plt.legend()
plt.grid(True)
plt.show()

Random Forest: Max Features per Split

train_accuracy, test_accuracy = rf_train_test_accuracy('max_features', range(1, 64))
plt.figure(figsize=(10,5))
plt.plot(list(range(1, 64)), 1-np.array(train_accuracy), label='Train error')
plt.plot(list(range(1, 64)), 1-np.array(test_accuracy), label='Test error')
plt.xlabel('Max features to consider for split')
plt.ylabel('Classification error (1 - accuracy).')
plt.legend()
plt.grid(True)
plt.show()
Now repeat the max_features sweep with a larger and deeper forest:

params = {
    'n_estimators': 20,
    'max_depth': 10
}
train_accuracy, test_accuracy = rf_train_test_accuracy('max_features', range(1, 64), **params)
plt.figure(figsize=(10,5))
plt.plot(list(range(1, 64)), 1-np.array(train_accuracy), label='Train error')
plt.plot(list(range(1, 64)), 1-np.array(test_accuracy), label='Test error')
plt.xlabel('Max features to consider for split')
plt.ylabel('Classification error (1 - accuracy).')
plt.legend()
plt.grid(True)
plt.show()

Typically a value between log2(k) and sqrt(k), where k is the total number of features, works well for max_features.
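For the digits data (k = 64 pixel features) that rule of thumb gives a concrete range (a quick check, not in the original):

k = X.shape[1]  # 64 pixel features in the 8x8 digits
print(np.log2(k), np.sqrt(k))  # 6.0 and 8.0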