Data source: https://archive.ics.uci.edu/ml/datasets/BlogFeedback
Task Summary
Train on blogData_train.csv and test on blogData_test-2012.03.31.01_00.csv.
import pandas
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import svd
from scipy.stats import describe
from sklearn.linear_model import Lasso
# read data and compute correlation
df_train = pandas.read_csv('blogData_train.csv', header=None)
df_test = pandas.read_csv('blogData_test-2012.03.31.01_00.csv', header=None)
trainCorrMat = df_train.corr().to_numpy()
# per-attribute mean and std computed on the training set (last column is the target)
x_train_raw = df_train.to_numpy()[:, :-1]
x_train_attr_mean = np.mean(x_train_raw, axis=0)
x_train_attr_std = np.std(x_train_raw, axis=0)
# constant attributes have (near-)zero std; use 1.0 instead to avoid division by zero
x_train_attr_std[np.abs(x_train_attr_std) < 1.0e-4] = 1.0
# standardize the data using the training-set statistics
def get_xy(df):
    npArr = df.to_numpy()
    x = npArr[:, :-1].copy()
    x -= x_train_attr_mean
    x /= x_train_attr_std
    y = npArr[:, -1]
    return x, y
x_train, y_train = get_xy(df_train)
x_test, y_test = get_xy(df_test)
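As a quick sanity check (not part of the pipeline itself), the standardized training attributes should have near-zero means; non-constant columns should also have unit standard deviation:
# illustrative check: column means should be ~0 after standardization
# (constant columns become all zeros, since their std was replaced by 1.0)
print(x_train.shape, x_test.shape)
print(np.abs(np.mean(x_train, axis=0)).max())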
df_train.describe()
plt.pcolor(trainCorrMat, cmap='bwr')
plt.title('Pairwise correlation coefficients between columns')
plt.colorbar()
plt.show()
lastRowCorr = trainCorrMat[-1, :]
plt.title('Correlation coefficients between columns and last column (target variable)')
plt.plot(np.arange(lastRowCorr.size), lastRowCorr, '.')
plt.show()
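To make the correlation plot concrete, we can also list the attributes whose correlation with the target stands out (an illustrative lookup reusing the matrix computed above; constant columns yield NaN correlations, which we treat as zero):
# indices of the ten attributes most correlated (in absolute value) with
# the target; drop the last entry, which is the target's self-correlation
absCorr = np.nan_to_num(np.abs(lastRowCorr[:-1]))
print(np.argsort(absCorr)[::-1][:10])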
The linear model is given by $Y = XW + \epsilon$. We want to minimize the squared error $\lVert \epsilon \rVert^2=\lVert Y-XW\rVert^2$. Matrix calculus gives $W = (X^TX)^{-1}X^TY$. The (thin) SVD allows us to write matrix $X$ as $X=U\Sigma V^T$, where $U$ and $V$ have orthonormal columns and $\Sigma$ is diagonal. This allows us to write
$$ \begin{align*} W &= (X^TX)^{-1}X^TY\\ &= (V\Sigma U^TU\Sigma V^T)^{-1}V\Sigma U^TY\\ &= (V\Sigma^2 V^T)^{-1}V\Sigma U^TY\\ &= V\Sigma^{-2}V^TV\Sigma U^TY\\ &= V\Sigma^{-1}U^TY. \end{align*} $$
Inverting matrix $\Sigma$ is simple, for it is a diagonal matrix: we just take the reciprocal of each sufficiently large singular value.
svd_u, svd_diags, svd_vh = svd(x_train, full_matrices=False, compute_uv=True)
# zero out small singular values and invert the remaining diagonal elements
def invert_diags(diags, eps=1.0e-3):
    newDiags = diags.copy()
    newDiags[np.abs(newDiags) < eps] = 0.0
    with np.errstate(divide='ignore'):  # 1/0 for the zeroed entries is intentional
        newDiags = 1.0 / newDiags
    newDiags[np.invert(np.isfinite(newDiags))] = 0.0  # pseudo-inverse convention: 1/0 -> 0
    return newDiags
svd_v = svd_vh.T
svd_inv_diags = invert_diags(svd_diags)
# compute W for least-squares regression
w_lsr = svd_v @ np.diag(svd_inv_diags) @ svd_u.T @ y_train
# compute the absolute training errors and inspect their statistics
eps_lsr = np.abs(y_train - x_train @ w_lsr)
print(describe(eps_lsr))
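The SVD-based solution can be cross-checked against NumPy's built-in least-squares solver (a sanity check only; lstsq uses a relative machine-precision cutoff rather than our absolute 1.0e-3 threshold, so we only expect the difference to be small):
# np.linalg.lstsq solves the same minimization problem via SVD
w_check = np.linalg.lstsq(x_train, y_train, rcond=None)[0]
print(np.abs(w_lsr - w_check).max())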
In ridge regression, we add a regularization term to the objective function, minimizing $\lVert Y-XW\rVert^2 + \lambda \lVert W \rVert^2$. Again, matrix calculus gives $W=(X^TX + \lambda I)^{-1}X^TY$. Using the SVD (here $V$ is square, so $VV^T = V^TV = I$), we can write
$$ \begin{align*} W&=(X^TX + \lambda I)^{-1}X^TY\\ &= (V\Sigma^2V^T + \lambda VV^T)^{-1}V\Sigma U^TY\\ &= [V(\Sigma^2 + \lambda I)V^T]^{-1}V\Sigma U^TY\\ &= V(\Sigma^2 +\lambda I)^{-1}\Sigma U^TY. \end{align*} $$
def get_w_ridge(l):
    sqrDiags = np.square(svd_diags)
    ones = np.ones(sqrDiags.shape, sqrDiags.dtype)
    invDiags = invert_diags(sqrDiags + l * ones)
    return svd_v @ np.diag(invDiags) @ np.diag(svd_diags) @ svd_u.T @ y_train
def get_w_ridge_sklearn(l):
    from sklearn.linear_model import Ridge
    skRidge = Ridge(alpha=l, solver='svd', fit_intercept=False)
    skRidge.fit(x_train, y_train)
    return skRidge.coef_
w_ridge_test = get_w_ridge(10.0)
eps_ridge_test = np.abs(y_train - x_train @ w_ridge_test)
print(np.median(eps_ridge_test))
print(describe(eps_ridge_test))
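As another sanity check, the closed-form ridge weights should agree with the sklearn solver defined above for the same $\lambda$:
# both minimize ||y - Xw||^2 + lambda * ||w||^2 with no intercept,
# so the maximum coefficient difference should be tiny
print(np.abs(get_w_ridge(10.0) - get_w_ridge_sklearn(10.0)).max())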
def get_w_lasso(l):
    skLasso = Lasso(alpha=l, fit_intercept=False)
    skLasso.fit(x_train, y_train)
    return skLasso.coef_
w_lasso_test = get_w_lasso(3.0)
eps_lasso_test = np.abs(y_train - x_train @ w_lasso_test)
print(describe(eps_lasso_test))
def compute_rmse(y_true, y_pred):
    dist = np.square(y_pred - y_true)
    return np.sqrt(np.mean(dist))
def compute_test_rmse(y_pred):
    return compute_rmse(y_test, y_pred)
# training RMSE of LASSO, followed by test RMSE of all three methods
print(compute_rmse(y_train, x_train @ w_lasso_test))
print(compute_test_rmse(x_test @ w_lsr))
print(compute_test_rmse(x_test @ w_ridge_test))
print(compute_test_rmse(x_test @ w_lasso_test))
lambdaVals = [1.0e-3, 1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0]
lsr_rmse = compute_test_rmse(x_test @ w_lsr)
ridge_rmses=[]
lasso_rmses=[]
for val in lambdaVals:
    w_ridge = get_w_ridge(val)
    w_lasso = get_w_lasso(val)
    ridge_rmse = compute_test_rmse(x_test @ w_ridge)
    lasso_rmse = compute_test_rmse(x_test @ w_lasso)
    ridge_rmses.append(ridge_rmse)
    lasso_rmses.append(lasso_rmse)
fig, ax = plt.subplots(1, 2)
ax[0].set_title('Test RMSE of ridge regression')
ax[0].plot(np.arange(len(ridge_rmses)), ridge_rmses, label='Ridge')
ax[1].set_title('Test RMSE of LASSO')
ax[1].plot(np.arange(len(lasso_rmses)), lasso_rmses, label='LASSO')
for i in range(2):
    ax[i].set_xticks(np.arange(len(lambdaVals)))
    ax[i].set_xticklabels(lambdaVals, rotation=90)
    ax[i].set_xlabel('$\\lambda$')
    ax[i].axhline(lsr_rmse, color='g', label='LSR')
    ax[i].legend()
plt.show()
The RMSE is inflated by a handful of posts with extreme comment counts. The mean absolute error (MAE) is far less sensitive to such outliers and comes out much smaller.
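A quick look at the quantiles of the training target makes the heavy tail visible (illustrative only):
# most posts receive few comments while a handful receive very many,
# which is what inflates the squared-error metric
print(np.percentile(y_train, [50, 90, 99, 100]))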
def compute_mae(y_true, y_pred):
    dist = np.abs(y_pred - y_true)
    return np.mean(dist)
def compute_test_mae(y_pred):
    return compute_mae(y_test, y_pred)
lsr_mae = compute_test_mae(x_test @ w_lsr)
ridge_maes=[]
lasso_maes=[]
for val in lambdaVals:
    w_ridge = get_w_ridge(val)
    w_lasso = get_w_lasso(val)
    ridge_mae = compute_test_mae(x_test @ w_ridge)
    lasso_mae = compute_test_mae(x_test @ w_lasso)
    ridge_maes.append(ridge_mae)
    lasso_maes.append(lasso_mae)
fig, ax = plt.subplots(1, 2)
ax[0].set_title('Test MAE of ridge regression')
ax[0].plot(np.arange(len(ridge_maes)), ridge_maes, label='Ridge')
ax[1].set_title('Test MAE of LASSO')
ax[1].plot(np.arange(len(lasso_maes)), lasso_maes, label='LASSO')
for i in range(2):
    ax[i].set_xticks(np.arange(len(lambdaVals)))
    ax[i].set_xticklabels(lambdaVals, rotation=90)
    ax[i].set_xlabel('$\\lambda$')
    ax[i].axhline(lsr_mae, color='g', label='LSR')
    ax[i].legend()
plt.show()
# compare the sparsity of the ridge and LASSO solutions at representative regularization strengths
w_ridge = get_w_ridge(100.0)
w_lasso = get_w_lasso(10.0)
print('number of (near-)zero coefficients in ridge regression:', np.count_nonzero(np.abs(w_ridge) < 1.0e-3))
print('number of (near-)zero coefficients in LASSO:', np.count_nonzero(np.abs(w_lasso) < 1.0e-3))
print('non-zero coefficient indices of LASSO:', np.where(np.abs(w_lasso) >= 1.0e-3)[0])
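To relate the surviving LASSO features back to the earlier correlation plot, we can look up their correlation with the target (an illustrative follow-up reusing lastRowCorr from above):
# correlation coefficients of the attributes that LASSO keeps
lasso_support = np.where(np.abs(w_lasso) >= 1.0e-3)[0]
print(lastRowCorr[lasso_support])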