import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
03 Linear Models: Linear Regression
Regression models…
def mse(a, b, prec=2):
    return np.round(mean_squared_error(a, b), prec)
Generate data
Generating 1d data.
a = 5
b = 10
n_points = 300
x_min = 0.5
x_max = 4

rs = np.random.RandomState(33)
noise = rs.normal(0, 5, (n_points, 1))

X = np.linspace(x_min, x_max, n_points)[:, None]
y = a + b * X + noise
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.33, random_state=33)
plt.figure(figsize=(9, 7))
plt.scatter(X_test, y_test, c='b', alpha=0.3, label='test points')
plt.scatter(X_train, y_train, c='r', alpha=0.3, label='train points')
plt.plot([0.5, 4], [10, 45], c='black', lw=1.5, label=f'$y = {a}+{b}x$')
plt.grid(True)
plt.legend()
plt.title(f'MSE of an original model on test data = {mse(a+b*X_test, y_test)}')
plt.show()
Linear Regression Model
\[ y = w_0 + w_1X_1 + w_2 X_2 + \ldots + w_k X_k \]
In the case of \(k = 1\), the linear model becomes trivial:
\[ y = w_0 + w_1 x, \quad x = X_1. \]
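Ordinary least squares has a closed-form solution, so the fit below can be cross-checked directly. A minimal sketch (assuming the X and y arrays generated above; np.linalg.lstsq is used here purely for illustration):
# Sketch: solve the 1-D least-squares problem directly.
# Assumes X of shape (n, 1) and y of shape (n, 1) from the data-generation cell.
X_design = np.hstack([np.ones_like(X), X])        # prepend a column of ones for w_0
w, *_ = np.linalg.lstsq(X_design, y, rcond=None)  # w[0] ~ intercept, w[1] ~ slope
print('intercept:', w[0, 0], 'slope:', w[1, 0])   # should be close to a=5, b=10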
from sklearn.linear_model import LinearRegression
Train a model.
reg = LinearRegression()
reg.fit(X_train, y_train)
Make predictions.
y_pred_train = reg.predict(X_train)
y_pred_test = reg.predict(X_test)
Compute the errors.
print('Train MSE:', mse(y_pred_train, y_train),
'Test MSE:', mse(y_pred_test, y_test))
Plot the predicted model.
X_linspace = np.linspace(0.5, 4, 200)[:, None]
y_pred_linspace = reg.predict(X_linspace)
plt.figure(figsize=(12, 8))
plt.plot(X_linspace, y_pred_linspace)
plt.scatter(X_test, y_test, alpha=.5, c='r', label='Test points')
plt.scatter(X_train, y_train, alpha=.3, c='b', label='Train points')
plt.legend()
plt.grid(True)
plt.show()
reg.coef_, reg.intercept_
What if we add more (redundant) polynomial features: could we do better on the training set?
\[ y = w_0 + w_1X_1 + w_2 X_2 + \ldots + w_k X_k \]
Let \(k\) be equal to 15; then \[ X_1 = x,\ X_2 = x^2,\ \ldots,\ X_{15} = x^{15}, \quad k = 15. \]
\[ y = w_0 + w_1x + w_2 x^2 + \ldots + w_{15} x^{15}. \]
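PolynomialFeatures builds these powers automatically; as a rough sketch of what the transform produces for a single input column (assuming the X_train array defined above):
# Sketch: manual equivalent of PolynomialFeatures(degree=15, include_bias=False)
# for one input column; assumes X_train of shape (n, 1).
X_train_powers = np.hstack([X_train**d for d in range(1, 16)])
print(X_train_powers.shape)  # (n_train, 15): columns are x, x^2, ..., x^15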
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=15, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)
reg = LinearRegression(fit_intercept=True)
reg.fit(X_train_poly, y_train)

y_pred_train = reg.predict(X_train_poly)
y_pred_test = reg.predict(X_test_poly)
print('Train MSE:', mse(y_pred_train, y_train),
'Test MSE:', mse(y_pred_test, y_test))
1.1 Prediction curve
X_linspace_poly = poly.fit_transform(np.linspace(0.5, 4, 200).reshape(-1, 1))
y_pred_linspace = reg.predict(X_linspace_poly)
plt.figure(figsize=(12, 8))
plt.plot(np.linspace(0.5, 4, 200), y_pred_linspace, label='Linear regression prediction line')
plt.scatter(X_test, y_test, alpha=.3, c='b', label='Test points')
plt.scatter(X_train, y_train, alpha=.3, c='r', label='Train points')
plt.legend()
plt.grid(True)
plt.show()
reg.coef_
1.2 Error curve
We know that the data was originally produced by a linear law, but we are building the model as a polynomial of degree 15:
\[y = w_0 + w_1x + w_2 x^2 + \ldots + w_{15} x^{15} \]
What if we make some of the coefficients very small (ideally the coefficients corresponding to all degrees higher than 1)? That is exactly what the following regularized models do:
- Lasso
- Ridge
- Elastic Net (combination of Lasso and Ridge)
These models share the same idea: if we want to keep the coefficients small, let's add their norm to the objective function. So instead of the regular MSE:
\[\sum_{i=1}^{n}\left(y_i - (w_0 + w_1 x_i)\right)^2 \rightarrow \min_{\text{w.r.t. } w} \]
let’s optimize the following:
\[\left[\sum_{i=1}^{n}\left(y_i - (w_0 + w_1 x_i)\right)^2 + \alpha\sum_{j=1}^{k} \|w_j\| \right]\rightarrow \min_{\text{w.r.t. } w} \]
In the case of Ridge regression the norm is \(l_2\): \[\|w_j\| = \|w_j\|_{l_2} = w_j^2,\] and in the case of Lasso regression the norm is \(l_1\): \[\|w_j\| = \|w_j\|_{l_1} = |w_j|,\]
and \(\alpha\) is the regularization coefficient. Larger values of \(\alpha\) correspond to stronger regularization (this matches the sklearn convention used in the code below).
Typically you want to vary \(\alpha\) on a log scale, e.g. 0.001, 0.003, 0.01, 0.03, …
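Only Ridge is swept below; as a hedged side-by-side sketch of the three models listed above (the alpha and l1_ratio values here are illustrative, not tuned, and the unscaled degree-15 features may trigger convergence warnings for the coordinate-descent solvers):
# Sketch: fit Ridge, Lasso and Elastic Net on the same degree-15 features.
from sklearn.linear_model import Ridge, Lasso, ElasticNet

for model in [Ridge(alpha=0.1),
              Lasso(alpha=0.1, max_iter=50_000),
              ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=50_000)]:
    model.fit(X_train_poly, y_train)
    print(type(model).__name__,
          '| test MSE:', mse(model.predict(X_test_poly), y_test),
          '| nonzero coefficients:', int(np.sum(model.coef_ != 0)))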
from sklearn.linear_model import Ridge, Lasso
test_error = []
train_error = []
weights = []
logspace = np.logspace(-6, 1, 20)

for alpha in logspace:
    reg = Ridge(fit_intercept=True, alpha=alpha)
    reg.fit(X_train_poly, y_train)

    y_pred_train = reg.predict(X_train_poly)
    y_pred_test = reg.predict(X_test_poly)

    train_error.append(mse(y_pred_train, y_train))
    test_error.append(mse(y_pred_test, y_test))
    weights.append(np.sum(reg.coef_**2))
\[y = w_0 + w_1x + w_2 x^2 + \ldots + w_{15} x^{15} \]
fig, ax = plt.subplots(1, 2, figsize=(16, 5))

ax[0].plot(logspace, train_error, label='Train')
ax[0].plot(logspace, test_error, label='Test')
ax[0].set_xscale('log')
ax[0].legend()
ax[0].arrow(1e-1, 28, -(1e-1 - 1e-5), 0, head_width=0.5, head_length=.000004, alpha=.5)
ax[0].annotate('Model "complexity" increases', [2e-5, 27], size=12, alpha=0.5)
ax[0].set_xlabel('Regularization coefficient')
ax[0].set_ylabel('MSE')

ax[1].plot(logspace, weights, color='r', label=r'$l_2$ norm of the weights: $\sum_{j=1}^{k} w_j^2$')
ax[1].set_xscale('log')
ax[1].legend()
ax[1].set_ylabel('Weights norm')
ax[1].set_xlabel('Regularization coefficient')
np.argmin(test_error), logspace[np.argmin(test_error)]
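Note that picking \(\alpha\) by the minimum of the test error, as above, leaks test information into model selection. A cleaner alternative is to cross-validate on the training set only; a minimal sketch with sklearn's RidgeCV over the same grid:
# Sketch: choose alpha by (leave-one-out) cross-validation on the training data.
from sklearn.linear_model import RidgeCV

reg_cv = RidgeCV(alphas=logspace, fit_intercept=True)
reg_cv.fit(X_train_poly, y_train)
print('alpha chosen by CV:', reg_cv.alpha_,
      '| test MSE:', mse(reg_cv.predict(X_test_poly), y_test))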
poly = PolynomialFeatures(degree=15, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_linspace_poly = poly.fit_transform(np.linspace(0.5, 4, 200).reshape(-1, 1))
reg = Ridge(alpha=logspace[15], fit_intercept=True)
reg.fit(X_train_poly, y_train)

y_pred_linspace = reg.predict(X_linspace_poly)
plt.figure(figsize=(12, 8))
plt.plot(np.linspace(0.5, 4, 200), y_pred_linspace, label='Ridge prediction line')
plt.scatter(X_test, y_test, alpha=.3, c='b', label='Test points')
plt.scatter(X_train, y_train, alpha=.3, c='r', label='Train points')
plt.legend()
plt.grid(True)
plt.xlabel('X')
plt.ylabel('y')
plt.show()
Not a Silver Bullet
We do not get a perfect line, but regularization sometimes helps. Compare with Linear Regression without regularization:
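As a quick numeric check before the plots (a sketch reusing the objects defined above), the test MSE of the regularized and unregularized polynomial fits can be printed side by side:
# Sketch: compare test MSE of the Ridge fit and the plain polynomial fit.
ridge = Ridge(alpha=logspace[15], fit_intercept=True).fit(X_train_poly, y_train)
plain = LinearRegression(fit_intercept=True).fit(X_train_poly, y_train)
print('Ridge test MSE:', mse(ridge.predict(X_test_poly), y_test),
      '| LinearRegression test MSE:', mse(plain.predict(X_test_poly), y_test))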
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

reg = Ridge(alpha=logspace[15], fit_intercept=True)
reg.fit(X_train_poly, y_train)
y_pred_linspace = reg.predict(X_linspace_poly)
w_norm_ridge = np.round(np.sum(reg.coef_**2), 2)

ax[0].plot(np.linspace(0.5, 4, 200), y_pred_linspace, c='r', label='Ridge regression line')
ax[0].scatter(X_test, y_test, alpha=.3, c='b', label='Test points')
ax[0].scatter(X_train, y_train, alpha=.3, c='r', label='Train points')
ax[0].set_title(f'Weights $l_2$ norm is {w_norm_ridge}')
ax[0].legend()

reg = LinearRegression(fit_intercept=False)
reg.fit(X_train_poly, y_train)
y_pred_linspace = reg.predict(X_linspace_poly)
w_norm_lr = np.round(np.sum(reg.coef_**2), 2)

ax[1].plot(np.linspace(0.5, 4, 200), y_pred_linspace, c='r', label='Linear regression line')
ax[1].scatter(X_test, y_test, alpha=.3, c='b', label='Test points')
ax[1].scatter(X_train, y_train, alpha=.3, c='r', label='Train points')
ax[1].set_title(f'Weights $l_2$ norm is {w_norm_lr}')
ax[1].legend()

plt.show()