Midterm Exam
Import libraries and the dataset
from sklearn.datasets import fetch_california_housing
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# Fetch the California housing dataset and unpack the feature matrix,
# the feature names, and the target vector used by the rest of the analysis.
data = fetch_california_housing()
X, y = data.data, data.target
X_names = data.feature_names
# Labelled copy of the feature matrix for pandas-based exploration.
X_df = pd.DataFrame(X, columns=X_names)
Create DoKFold function
def DoKFold(model, X, y, k, standardize=False, random_state=146):
    """Run shuffled k-fold cross-validation and collect per-fold metrics.

    The same ``model`` object is re-fit on the training part of every fold.
    For each fold the function records ``model.score`` (R^2 for scikit-learn
    regressors) and the mean squared error on both the training and the
    held-out part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. NumPy arrays, DataFrames and nested lists are
        accepted; the input is converted with ``np.asarray``.
    y : array-like of shape (n_samples,)
        Target values; converted with ``np.asarray``.
    k : int
        Number of folds.
    standardize : bool, default False
        If True, a ``StandardScaler`` is fit on each training fold and
        applied to both the training and the test fold.
    random_state : int, default 146
        Seed passed to ``KFold`` so that the shuffled splits are reproducible.

    Returns
    -------
    train_scores, test_scores, train_mse, test_mse : list of float
        Four lists, each with one entry per fold.
    """
    import numpy as np
    from sklearn.model_selection import KFold

    # Convert up front so the positional indexing below behaves the same for
    # ndarrays, DataFrames/Series (whose [] is label-based) and plain lists.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

    if standardize:
        from sklearn.preprocessing import StandardScaler as SS
        ss = SS()

    train_scores = []
    test_scores = []
    train_mse = []
    test_mse = []

    for idxTrain, idxTest in kf.split(X):
        Xtrain = X[idxTrain, :]
        Xtest = X[idxTest, :]
        ytrain = y[idxTrain]
        ytest = y[idxTest]

        if standardize:
            # Fit the scaler on the training fold only, so no statistics
            # from the held-out fold leak into the transformation.
            Xtrain = ss.fit_transform(Xtrain)
            Xtest = ss.transform(Xtest)

        model.fit(Xtrain, ytrain)

        train_scores.append(model.score(Xtrain, ytrain))
        test_scores.append(model.score(Xtest, ytest))

        ytrain_pred = model.predict(Xtrain)
        ytest_pred = model.predict(Xtest)

        train_mse.append(np.mean((ytrain - ytrain_pred) ** 2))
        test_mse.append(np.mean((ytest - ytest_pred) ** 2))

    return train_scores, test_scores, train_mse, test_mse
Question 15:
The feature most strongly correlated with the target (MedHouseVal) is MedInc (median income).
# Attach the target as an extra column so that one correlation matrix
# covers every feature/target pair; X_df itself is left untouched.
df = X_df.assign(MedHouseVal=y)
df.corr()
Question 16:
If the features are standardized, the correlations from the previous question do not change.
# Standardize the features, then rebuild the feature/target frame and
# recompute the correlation matrix on the scaled values.
ss = StandardScaler()
X_scaled = ss.fit_transform(X)
df_scaled = pd.DataFrame(data=X_scaled, columns=X_names).assign(MedHouseVal=y)
df_scaled.corr()
Question 17:
R^2 is 0.47.
# Squared Pearson correlation between MedInc and the target, to 2 decimals.
medinc_r = np.corrcoef(X_df['MedInc'], y)[0, 1]
np.round(medinc_r ** 2, 2)
Question 18:
Mean R^2 value on the test folds for the linear regression is 0.60198.
# 20-fold cross-validation of ordinary least squares on standardized features.
k = 20
lin_reg = LinearRegression()
train_scores, test_scores, train_mse, test_mse = DoKFold(
    lin_reg, X, y, k, standardize=True
)
# First line: fold-averaged train/test score; second line: train/test MSE.
for train_vals, test_vals in ((train_scores, test_scores), (train_mse, test_mse)):
    print(np.mean(train_vals), np.mean(test_vals))
Question 19:
Mean R^2 value on the test folds for the ridge regression is 0.60201.
# Sweep 101 evenly spaced ridge penalties in [20, 30] and record the
# fold-averaged train/test score and MSE for each one.
k = 20
rid_a_range = np.linspace(20, 30, 101)
rid_tr, rid_te, rid_tr_mse, rid_te_mse = [], [], [], []

for a in rid_a_range:
    rid_reg = Ridge(alpha=a)
    train_scores, test_scores, train_mse, test_mse = DoKFold(
        rid_reg, X, y, k, standardize=True
    )
    # Each accumulator receives the mean of its matching per-fold list.
    per_fold = (train_scores, test_scores, train_mse, test_mse)
    for store, fold_vals in zip((rid_tr, rid_te, rid_tr_mse, rid_te_mse), per_fold):
        store.append(np.mean(fold_vals))

# Pick the alpha with the highest average test score and report its metrics.
idx = np.argmax(rid_te)
print(
    rid_a_range[idx],
    rid_tr[idx],
    rid_te[idx],
    rid_tr_mse[idx],
    rid_te_mse[idx],
)

plt.plot(rid_a_range, rid_te, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg $R^2$')
plt.show()
Question 20:
Mean R^2 value on the test folds for the Lasso regression is 0.60213.
# Sweep 101 evenly spaced lasso penalties in [0.001, 0.003] and record the
# fold-averaged train/test score and MSE for each one.
k = 20
las_a_range = np.linspace(0.001, 0.003, 101)
las_tr, las_te, las_tr_mse, las_te_mse = [], [], [], []

for a in las_a_range:
    las_reg = Lasso(alpha=a)
    train_scores, test_scores, train_mse, test_mse = DoKFold(
        las_reg, X, y, k, standardize=True
    )
    # Each accumulator receives the mean of its matching per-fold list.
    per_fold = (train_scores, test_scores, train_mse, test_mse)
    for store, fold_vals in zip((las_tr, las_te, las_tr_mse, las_te_mse), per_fold):
        store.append(np.mean(fold_vals))

# Pick the alpha with the highest average test score and report its metrics.
idx = np.argmax(las_te)
print(
    las_a_range[idx],
    las_tr[idx],
    las_te[idx],
    las_tr_mse[idx],
    las_te_mse[idx],
)

plt.plot(las_a_range, las_te, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg $R^2$')
plt.show()
Question 21:
Among the three models, the Lasso model estimates the smallest coefficient for the feature that is least correlated with the target.
# Fit the three models on the full standardized data set, using the fixed
# ridge and lasso penalties below, and compare the coefficient at index 5.
lin = LinearRegression()
rid = Ridge(alpha=25.8)
las = Lasso(alpha=0.00186)
for est in (lin, rid, las):
    est.fit(X_scaled, y)
print(*(fitted.coef_[5] for fitted in (lin, rid, las)))
Question 22:
Among the three models, the Lasso model estimates the smallest coefficient for the feature that is most correlated with the target.
# Coefficient at index 0 for the linear, ridge and lasso fits, in that order.
print(*(fitted.coef_[0] for fitted in (lin, rid, las)))
Question 23:
When alpha is chosen by minimum average test MSE instead of maximum average test R^2, the optimal value (26.1) differs from the one found in Q19 (25.8).
# Re-select the ridge penalty, this time by the lowest average test MSE.
idx = np.argmin(rid_te_mse)
print(
    rid_a_range[idx],
    rid_tr[idx],
    rid_te[idx],
    rid_tr_mse[idx],
    rid_te_mse[idx],
)

plt.plot(rid_a_range, rid_te_mse, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg MSE')
plt.show()
Question 24:
The optimal alpha value is 0.00186.
# Re-select the lasso penalty, this time by the lowest average test MSE.
idx = np.argmin(las_te_mse)
print(
    las_a_range[idx],
    las_tr[idx],
    las_te[idx],
    las_tr_mse[idx],
    las_te_mse[idx],
)

plt.plot(las_a_range, las_te_mse, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg MSE')
plt.show()