10/13/24, 7:07 PM 2024301006 Tutorial 7.
ipynb - Colab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
# Load the 100-sample dataset and sort by x so that line plots drawn over X
# run left to right instead of zig-zagging.
data = pd.read_csv('linear_regression-100.csv')
data = data.sort_values('x')

# Keep feature and target as single-column (2-D) frames, which is the shape
# the scikit-learn estimators and scalers expect.
X = data[['x']]
y = data[['y']]

# Split BEFORE scaling: the scaler must learn its statistics from the training
# rows only, otherwise information about the test targets leaks into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = RobustScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
# Scaled copy of the full target, used by the plots that show every sample.
y = scaler.transform(y)

plt.scatter(X, y, color='blue', label='Data')
plt.title('Data from CSv')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

# Baseline model: ordinary least-squares line fitted on the training split.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
▾ LinearRegression i ?
LinearRegression()
# Predictions of the fitted line on each split; kept for the error metrics.
y_pred_train_linear = linear_model.predict(X_train)
y_pred_test_linear = linear_model.predict(X_test)

# Draw every sample and overlay the fitted line evaluated on the full X range.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Data')
ax.plot(X, linear_model.predict(X), color='red', label='Linear fit ')
ax.set_title('Linear Regression:')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()
# Per-split evaluation of the linear fit:
#   MSE      - mean squared error between targets and predictions
#   bias     - mean signed residual (prediction minus target)
#   variance - variance of the predictions themselves

# Training split
mse_train = mean_squared_error(y_train, y_pred_train_linear)
bias_train = np.mean(y_pred_train_linear - y_train)
variance_train = np.var(y_pred_train_linear)

# Test split
mse_linear = mean_squared_error(y_test, y_pred_test_linear)
bias_test = np.mean(y_pred_test_linear - y_test)
variance_test = np.var(y_pred_test_linear)

print(f'Mean Squared Error Train (Linear Regression): {mse_train}')
print("Bias (Train):", bias_train)
print("Variance (Train):", variance_train)

print(f'Mean Squared Error Test (Linear Regression): {mse_linear}')
print("Bias (Test):", bias_test)
print("Variance (Test):", variance_test)
Mean Squared Error Train (Linear Regression): 5.141007291223486
Bias (Train): 8.152209066533292e-16
Variance (Train): 4.397649185731088
Mean Squared Error Test (Linear Regression): 6.818406388680551
Bias (Test): -0.023000954542939617
Variance (Test): 3.765210102931741
# Degree-5 polynomial regression on the same train/test split.
# The feature expansion is fitted on the training split and re-used for test.
poly_features = PolynomialFeatures(degree=5)
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)

y_pred_train_poly = poly_model.predict(X_poly_train)
y_pred_test_poly = poly_model.predict(X_poly_test)

mse_train_poly = mean_squared_error(y_train, y_pred_train_poly)
mse_test_poly = mean_squared_error(y_test, y_pred_test_poly)
print("MSE (Train):", mse_train_poly)
print("MSE (Test):", mse_test_poly)

# Bias = mean signed residual, prediction minus target. This is the same sign
# convention the linear-regression metrics in this notebook use, so the two
# models' bias values can be compared directly.
bias_train = np.mean(y_pred_train_poly - y_train)
bias_test = np.mean(y_pred_test_poly - y_test)
print("Bias (Train):", bias_train)
print("Bias (Test):", bias_test)

# Variance of the predictions on each split.
variance_train = np.var(y_pred_train_poly)
variance_test = np.var(y_pred_test_poly)
print("Variance (Train):", variance_train)
print("Variance (Test):", variance_test)

# Overlay the polynomial fit, evaluated on the full (sorted) X, on the data.
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, poly_model.predict(poly_features.transform(X)), color='red', linewidth=1, label='Polynomial fit (degree=5)')
plt.title('Polynomial Regression: Degree 5')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
MSE (Train): 1.23110433047544e-27
MSE (Test): 9.957037959354165e-28
Bias (Train): -3.408731630294426e-16
Bias (Test): -5.0637734746080316e-15
Variance (Train): 9.53865647695457
Variance (Test): 12.939110061091156
Interpretation:
The train and test MSE values are quite close, which suggests that the model generalizes well and is not severely overfitting. Moreover, both
MSE values are extremely low (on the order of 1e-27), indicating the model is capturing the data patterns effectively.
Training Bias is extremely small, nearly zero, meaning the model fits the training data very closely without underfitting.
Test Bias is slightly larger, but still relatively low. This suggests that the model is performing well in terms of the overall fit to the test data, with
only a small deviation from perfect predictions.
Both Train Variance and Test Variance are high, indicating that the model’s predictions are sensitive to small fluctuations in the input data.
Despite the low test MSE and relatively low test bias, the high variance suggests that this degree 5 polynomial model is at risk of overfitting.
The model is very sensitive to variations in the data, and while it performs well on both the training and test sets, its ability to generalize to new,
unseen data could be compromised by this high variance.
# Switch to the 1000-sample dataset; sorting by x keeps later line plots
# monotone along the horizontal axis.
data = pd.read_csv('linear_regression-1000.csv').sort_values('x')

# Feature as an (n, 1) column array, target as a flat (n,) array.
X = data[['x']].to_numpy()
y = data['y'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Quick look at the raw samples.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Data')
ax.set_title('Data from CSv')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()

# Baseline least-squares line fitted on the training split.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
▾ LinearRegression i ?
LinearRegression()
# Predictions of the fitted line on each split; kept for the error metrics.
y_pred_train_linear = linear_model.predict(X_train)
y_pred_test_linear = linear_model.predict(X_test)

# Draw every sample and overlay the fitted line evaluated on the full X range.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Data')
ax.plot(X, linear_model.predict(X), color='red', label='Linear fit ')
ax.set_title('Linear Regression:')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()
# Per-split evaluation of the linear fit:
#   MSE      - mean squared error between targets and predictions
#   bias     - mean signed residual (prediction minus target)
#   variance - variance of the predictions themselves

# Training split
mse_train = mean_squared_error(y_train, y_pred_train_linear)
bias_train = np.mean(y_pred_train_linear - y_train)
variance_train = np.var(y_pred_train_linear)

# Test split
mse_linear = mean_squared_error(y_test, y_pred_test_linear)
bias_test = np.mean(y_pred_test_linear - y_test)
variance_test = np.var(y_pred_test_linear)

print(f'Mean Squared Error Train (Linear Regression): {mse_train}')
print("Bias (Train):", bias_train)
print("Variance (Train):", variance_train)

print(f'Mean Squared Error Test (Linear Regression): {mse_linear}')
print("Bias (Test):", bias_test)
print("Variance (Test):", variance_test)
Mean Squared Error Train (Linear Regression): 1.1489708443865773e+29
Bias (Train): 0.02
Variance (Train): 2.2381775497652546e+29
Mean Squared Error Test (Linear Regression): 1.2635468668932845e+29
Bias (Test): -15368810361049.688
Variance (Test): 2.3309459865999088e+29
https://colab.research.google.com/drive/1HIYJoWVI4prgwPTk2LO9k01MPMpOm4QR#scrollTo=nnyFDPi--q6j&printMode=true 5/7
10/13/24, 7:07 PM 2024301006 Tutorial 7.ipynb - Colab
# Sweep odd polynomial degrees 1, 3, 5, 7, 9 and record, for each degree and
# each split, the MSE, the squared bias and the variance of the predictions.
degrees = range(1, 11, 2)
train_errors, test_errors = [], []
train_bias, test_bias = [], []
train_variance, test_variance = [], []

for degree in degrees:
    # Expand x into polynomial features; fit the expansion on train only.
    poly = PolynomialFeatures(degree)
    X_poly_train = poly.fit_transform(X_train)
    X_poly_test = poly.transform(X_test)

    model = LinearRegression()
    model.fit(X_poly_train, y_train)

    y_train_pred = model.predict(X_poly_train)
    y_test_pred = model.predict(X_poly_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_errors.append(train_mse)
    test_errors.append(test_mse)

    # Squared bias = (mean signed residual)^2. Averaging the squared residuals
    # instead would just reproduce the MSE and make this curve redundant.
    train_bias_sq = np.mean(y_train_pred - y_train) ** 2
    test_bias_sq = np.mean(y_test_pred - y_test) ** 2
    train_bias.append(train_bias_sq)
    test_bias.append(test_bias_sq)

    # Variance of the predictions on each split.
    train_var = np.var(y_train_pred)
    test_var = np.var(y_test_pred)
    train_variance.append(train_var)
    test_variance.append(test_var)
# Two side-by-side panels summarising the degree sweep.
fig, (ax_mse, ax_bv) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: train/test MSE as a function of polynomial degree.
ax_mse.plot(degrees, train_errors, label='Train MSE', marker='o')
ax_mse.plot(degrees, test_errors, label='Test MSE', marker='o')
ax_mse.set_xlabel('Polynomial Degree')
ax_mse.set_ylabel('Mean Squared Error')
ax_mse.set_title('MSE vs. Polynomial Degree')
ax_mse.legend()
ax_mse.grid(True)

# Right panel: the recorded bias^2 and variance series for both splits.
ax_bv.plot(degrees, train_bias, label='Train Bias^2', marker='o')
ax_bv.plot(degrees, test_bias, label='Test Bias^2', marker='o')
ax_bv.plot(degrees, train_variance, label='Train Variance', marker='o')
ax_bv.plot(degrees, test_variance, label='Test Variance', marker='o')
ax_bv.set_xlabel('Polynomial Degree')
ax_bv.set_ylabel('Bias^2 / Variance')
ax_bv.set_title('Bias^2 and Variance vs. Polynomial Degree')
ax_bv.legend()
ax_bv.grid(True)

fig.tight_layout()
plt.show()
https://colab.research.google.com/drive/1HIYJoWVI4prgwPTk2LO9k01MPMpOm4QR#scrollTo=nnyFDPi--q6j&printMode=true 6/7
10/13/24, 7:07 PM 2024301006 Tutorial 7.ipynb - Colab
The MSE vs. Polynomial Degree graph above suggests that the MSE is lowest for a polynomial of degree 5, so a polynomial regression model
of degree 5 would be a good option.
# Degree-5 polynomial regression on the 1000-sample train/test split.
# The feature expansion is fitted on the training split and re-used for test.
poly_features = PolynomialFeatures(degree=5)
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)

y_pred_train_poly = poly_model.predict(X_poly_train)
y_pred_test_poly = poly_model.predict(X_poly_test)

mse_train_poly = mean_squared_error(y_train, y_pred_train_poly)
mse_test_poly = mean_squared_error(y_test, y_pred_test_poly)
print("MSE (Train):", mse_train_poly)
print("MSE (Test):", mse_test_poly)

# Bias = mean signed residual, prediction minus target. This is the same sign
# convention the linear-regression metrics in this notebook use, so the two
# models' bias values can be compared directly.
bias_train = np.mean(y_pred_train_poly - y_train)
bias_test = np.mean(y_pred_test_poly - y_test)
print("Bias (Train):", bias_train)
print("Bias (Test):", bias_test)

# Variance of the predictions on each split.
variance_train = np.var(y_pred_train_poly)
variance_test = np.var(y_pred_test_poly)
print("Variance (Train):", variance_train)
print("Variance (Test):", variance_test)

# Overlay the polynomial fit, evaluated on the full (sorted) X, on the data.
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, poly_model.predict(poly_features.transform(X)), color='red', linewidth=4, label='Polynomial fit (degree=5)')
plt.title('Polynomial Regression: Degree 5')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()
MSE (Train): 1024506.1521971822
MSE (Test): 991863.2572999481
Bias (Train): 0.011173361582545373
Bias (Test): 27.818454353462034
Variance (Train): 3.387148394151828e+29
Variance (Test): 3.9410143898299956e+29
https://colab.research.google.com/drive/1HIYJoWVI4prgwPTk2LO9k01MPMpOm4QR#scrollTo=nnyFDPi--q6j&printMode=true 7/7