Midterm Exam
Import libraries and the dataset
from sklearn.datasets import fetch_california_housing
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# Load the California housing dataset: feature matrix, target vector,
# feature names, and a labeled DataFrame for correlation work below.
data = fetch_california_housing()
X, y = data.data, data.target
X_names = data.feature_names
X_df = pd.DataFrame(X, columns=X_names)
Create DoKFold function
def DoKFold(model, X, y, k, standardize=False, random_state=146):
    """Run shuffled k-fold cross-validation for `model` on (X, y).

    Returns four lists, one entry per fold:
    train R^2 scores, test R^2 scores, train MSEs, test MSEs.
    When `standardize` is True, the scaler is fit on the training
    fold only, so no information leaks from the test fold.
    """
    import numpy as np
    from sklearn.model_selection import KFold

    folds = KFold(n_splits=k, shuffle=True, random_state=random_state)
    if standardize:
        from sklearn.preprocessing import StandardScaler as SS
        scaler = SS()

    train_scores, test_scores = [], []
    train_mse, test_mse = [], []

    for train_idx, test_idx in folds.split(X):
        Xtr, Xte = X[train_idx, :], X[test_idx, :]
        ytr, yte = y[train_idx], y[test_idx]

        if standardize:
            # Fit on the training fold; only transform the test fold.
            Xtr = scaler.fit_transform(Xtr)
            Xte = scaler.transform(Xte)

        model.fit(Xtr, ytr)

        train_scores.append(model.score(Xtr, ytr))
        test_scores.append(model.score(Xte, yte))

        # MSE computed from predictions on each fold.
        train_mse.append(np.mean((ytr - model.predict(Xtr)) ** 2))
        test_mse.append(np.mean((yte - model.predict(Xte)) ** 2))

    return train_scores, test_scores, train_mse, test_mse
Question 15:
Most strongly correlated feature is the MedInc (median income).
# Append the target as a column so corr() shows feature-vs-target correlations.
df = pd.concat([X_df, pd.Series(y, name='MedHouseVal')], axis=1)
df.corr()
Question 16:
If the features are standardized, the correlations from the previous question do not change.
# Repeat the correlation table with standardized features; Pearson
# correlation is invariant to linear rescaling, so values should match Q15.
X_scaled = StandardScaler().fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=X_names)
df_scaled['MedHouseVal'] = y
df_scaled.corr()
Question 17:
The R^2 between MedInc and the target (the squared correlation) is 0.47.
np.round(np.corrcoef(X_df['MedInc'],y)[0][1]**2, 2)
Question 18:
Mean R^2 value on the test folds for the linear regression is 0.60198.
# 20-fold CV for plain linear regression on standardized features.
k = 20
lin_reg = LinearRegression()
tr_s, te_s, tr_m, te_m = DoKFold(lin_reg, X, y, k, standardize=True)
print(np.mean(tr_s), np.mean(te_s))
print(np.mean(tr_m), np.mean(te_m))
Question 19:
Mean R^2 value on the test folds for the ridge regression is 0.60201.
# Sweep ridge alpha over [20, 30] and record mean CV scores/MSEs per alpha.
rid_a_range = np.linspace(20, 30, 101)
k = 20
rid_tr, rid_te = [], []
rid_tr_mse, rid_te_mse = [], []
for a in rid_a_range:
    tr_s, te_s, tr_m, te_m = DoKFold(Ridge(alpha=a), X, y, k, standardize=True)
    rid_tr.append(np.mean(tr_s))
    rid_te.append(np.mean(te_s))
    rid_tr_mse.append(np.mean(tr_m))
    rid_te_mse.append(np.mean(te_m))

# Best alpha by highest mean test R^2.
idx = np.argmax(rid_te)
print(rid_a_range[idx], rid_tr[idx], rid_te[idx], rid_tr_mse[idx], rid_te_mse[idx])

plt.plot(rid_a_range, rid_te, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg $R^2$')
plt.show()
Question 20:
Mean R^2 value on the test folds for the Lasso regression is 0.60213.
# Sweep lasso alpha over [0.001, 0.003] and record mean CV scores/MSEs per alpha.
las_a_range = np.linspace(0.001, 0.003, 101)
k = 20
las_tr, las_te = [], []
las_tr_mse, las_te_mse = [], []
for a in las_a_range:
    tr_s, te_s, tr_m, te_m = DoKFold(Lasso(alpha=a), X, y, k, standardize=True)
    las_tr.append(np.mean(tr_s))
    las_te.append(np.mean(te_s))
    las_tr_mse.append(np.mean(tr_m))
    las_te_mse.append(np.mean(te_m))

# Best alpha by highest mean test R^2.
idx = np.argmax(las_te)
print(las_a_range[idx], las_tr[idx], las_te[idx], las_tr_mse[idx], las_te_mse[idx])

plt.plot(las_a_range, las_te, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg $R^2$')
plt.show()
Question 21:
Lasso model estimates the smallest coefficient for the variable that is least correlated.
# Fit all three models on the standardized features and compare the
# coefficient at index 5 across them.
lin = LinearRegression()
rid = Ridge(alpha=25.8)
las = Lasso(alpha=0.00186)
for mdl in (lin, rid, las):
    mdl.fit(X_scaled, y)
print(lin.coef_[5], rid.coef_[5], las.coef_[5])
Question 22:
Lasso model estimates the smallest coefficient for the variable that is most correlated.
print(lin.coef_[0],rid.coef_[0],las.coef_[0])
Question 23:
The optimal alpha value (=26.1) is different from that in Q19 (=25.8).
# Re-select the ridge alpha, this time by lowest mean test MSE.
best = np.argmin(rid_te_mse)
print(rid_a_range[best], rid_tr[best], rid_te[best], rid_tr_mse[best], rid_te_mse[best])

plt.plot(rid_a_range, rid_te_mse, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg MSE')
plt.show()
Question 24:
The optimal alpha value is 0.00186.
# Re-select the lasso alpha, this time by lowest mean test MSE.
best = np.argmin(las_te_mse)
print(las_a_range[best], las_tr[best], las_te[best], las_tr_mse[best], las_te_mse[best])

plt.plot(las_a_range, las_te_mse, 'or')
plt.xlabel('$\\alpha$')
plt.ylabel('Avg MSE')
plt.show()