import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the housing data and preview the first 20 rows.
# NOTE(review): the CSV file name was lost in extraction ("[Link]");
# "ParisHousing.csv" is assumed from the column schema in the preview —
# confirm against the original notebook.
dataset = pd.read_csv("/content/ParisHousing.csv")
dataset.head(20)
squareMeters numberOfRooms hasYard hasPool floors cityCode \
0 75523 3 0 1 63 9373
1 80771 39 1 1 98 39381
2 55712 58 0 1 19 34457
3 32316 47 0 0 6 27939
4 70429 19 1 1 90 38045
5 39223 36 0 1 17 39489
6 58682 10 1 1 99 6450
7 86929 100 1 0 11 98155
8 51522 3 0 0 61 9047
9 39686 42 0 0 15 71019
10 23563 21 0 1 90 91058
11 96470 74 1 0 21 92029
12 19127 31 1 0 5 7475
13 13087 44 1 0 77 40475
14 79770 3 0 1 69 54812
15 75985 60 1 0 67 6517
16 64169 88 0 1 6 61711
17 99371 31 1 1 16 96297
18 25966 37 1 1 17 22818
19 41792 43 1 1 10 80768
cityPartRange numPrevOwners made isNewBuilt hasStormProtector \
0 3 8 2005 0 1
1 8 6 2015 1 0
2 6 8 2021 0 0
3 10 4 2012 0 1
4 3 7 1990 1 0
5 8 6 2012 0 1
6 10 9 1995 1 1
7 3 4 2003 1 0
8 8 3 2012 1 1
9 5 8 2021 1 1
10 6 8 1993 1 0
11 4 2 2011 1 1
12 2 9 2008 0 0
13 8 4 2004 1 0
14 10 5 2018 0 1
15 6 9 2009 1 1
16 3 9 2011 1 1
17 7 8 2013 1 1
18 3 1 2016 0 0
19 9 5 2017 1 1
basement attic garage hasStorageRoom hasGuestRoom price
0 4313 9005 956 0 7 7559081.5
1 3653 2436 128 1 2 8085989.5
2 2937 8852 135 1 9 5574642.1
3 659 7141 359 0 3 3232561.2
4 8435 2429 292 1 4 7055052.0
5 2009 4552 757 0 1 3926647.2
6 5930 9453 848 0 5 5876376.5
7 6326 4748 654 0 10 8696869.3
8 632 5792 807 1 5 5154055.2
9 5198 5342 591 1 3 3970892.1
10 703 852 684 1 10 2366397.3
11 5414 1172 716 1 9 9652258.1
12 5387 4430 374 0 4 1914688.8
13 1745 724 582 0 0 1320803.4
14 8871 7117 240 0 7 7986665.8
15 4878 281 384 1 5 7607322.9
16 3054 129 726 0 9 6420823.1
17 3258 6296 354 1 8 9944705.3
18 8257 2557 162 0 6 2604486.6
19 2950 9573 572 1 5 4187667.7
dataset.shape  # (rows, columns) of the loaded frame
(10000, 17)
dataset.info()  # per-column non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 squareMeters 10000 non-null int64
1 numberOfRooms 10000 non-null int64
2 hasYard 10000 non-null int64
3 hasPool 10000 non-null int64
4 floors 10000 non-null int64
5 cityCode 10000 non-null int64
6 cityPartRange 10000 non-null int64
7 numPrevOwners 10000 non-null int64
8 made 10000 non-null int64
9 isNewBuilt 10000 non-null int64
10 hasStormProtector 10000 non-null int64
11 basement 10000 non-null int64
12 attic 10000 non-null int64
13 garage 10000 non-null int64
14 hasStorageRoom 10000 non-null int64
15 hasGuestRoom 10000 non-null int64
16 price 10000 non-null float64
dtypes: float64(1), int64(16)
memory usage: 1.3 MB
# Correlation matrix of every column against every other, drawn as an
# annotated heatmap (values printed with two decimals).
corr = dataset.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()
# Absolute correlation of each column with the target; the sign does not
# matter for feature selection.
correlation_with_price = dataset.corr()['price'].abs()
# NOTE: 0.01 is a very permissive cut-off, so "highly correlated" here only
# means "not essentially zero".
threshold = 0.01
# The parentheses-free bracket wrap keeps the expression on one logical line;
# 'price' itself is kept because its self-correlation is 1.
highly_correlated_columns = correlation_with_price[
    correlation_with_price > threshold
].index.tolist()
print("Columns highly correlated with 'Price':")
print(highly_correlated_columns)
Columns highly correlated with 'Price':
['squareMeters', 'numPrevOwners', 'isNewBuilt', 'garage', 'price']
# Working frame restricted to the selected columns (target included).
df = dataset[highly_correlated_columns]
df.head()
squareMeters numPrevOwners isNewBuilt garage price
0 75523 8 0 956 7559081.5
1 80771 6 1 128 8085989.5
2 55712 8 0 135 5574642.1
3 32316 4 0 359 3232561.2
4 70429 7 1 292 7055052.0
Gradient Descent, Learning Rate, Cost Function
# Univariate linear regression (price ~ squareMeters) fitted with batch
# gradient descent on the mean squared error.
X = df['squareMeters'].values
y = df['price'].values
# NOTE: X is used unscaled (squareMeters is in the tens of thousands), so at
# this learning rate every step overshoots and the coefficients blow up — the
# recorded run ends with a slope around -1.6e77. Standardize X first (as the
# multi-feature section does) if convergence is wanted.
learning_rate = 0.01
iterations = 10
# Initialize coefficients (slope and intercept)
b0 = 0  # Intercept
b1 = 0  # Slope
# Lists to store the history of coefficients and cost
b0_history = []
b1_history = []
cost_history = []
# Gradient Descent
for iteration in range(iterations):
    # Calculate predictions
    y_pred = b0 + b1 * X
    # Calculate the cost (mean squared error)
    cost = np.mean((y_pred - y) ** 2)
    # Calculate gradients (the constant factor 2 of the MSE derivative is
    # left out, i.e. folded into the learning rate)
    gradient_b0 = np.mean(y_pred - y)
    gradient_b1 = np.mean((y_pred - y) * X)
    # Update coefficients using gradients and learning rate
    b0 -= learning_rate * gradient_b0
    b1 -= learning_rate * gradient_b1
    # Append coefficients and cost to history lists for visualization;
    # the cost stored is the one measured before this iteration's update
    b0_history.append(b0)
    b1_history.append(b1)
    cost_history.append(cost)
# Plot the cost history
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Cost History')
plt.tight_layout()
plt.show()
# Print the final coefficients and cost
print("Final Intercept (b0):", b0)
print("Final Slope (b1):", b1)
print("Final Cost:", cost_history[-1])
Final Intercept (b0): -2.4127252943307258e+72
Final Slope (b1): -1.6037599090064153e+77
Final Cost: 7.759017749802741e+148
# Same univariate gradient-descent experiment as before, with a ten times
# larger learning rate.
X = df['squareMeters'].values
y = df['price'].values
# NOTE: X is still unscaled (squareMeters is in the tens of thousands), so a
# larger learning rate only makes the overshoot worse — the recorded run ends
# with a slope around -1.6e87. Standardize X first if convergence is wanted.
learning_rate = 0.1
iterations = 10
# Initialize coefficients (slope and intercept)
b0 = 0  # Intercept
b1 = 0  # Slope
# Lists to store the history of coefficients and cost
b0_history = []
b1_history = []
cost_history = []
# Gradient Descent
for iteration in range(iterations):
    # Calculate predictions
    y_pred = b0 + b1 * X
    # Calculate the cost (mean squared error)
    cost = np.mean((y_pred - y) ** 2)
    # Calculate gradients (the constant factor 2 of the MSE derivative is
    # left out, i.e. folded into the learning rate)
    gradient_b0 = np.mean(y_pred - y)
    gradient_b1 = np.mean((y_pred - y) * X)
    # Update coefficients using gradients and learning rate
    b0 -= learning_rate * gradient_b0
    b1 -= learning_rate * gradient_b1
    # Append coefficients and cost to history lists for visualization;
    # the cost stored is the one measured before this iteration's update
    b0_history.append(b0)
    b1_history.append(b1)
    cost_history.append(cost)
# Plot the cost history
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Cost History')
plt.tight_layout()
plt.show()
# Print the final coefficients and cost
print("Final Intercept (b0):", b0)
print("Final Slope (b1):", b1)
print("Final Cost:", cost_history[-1])
Final Intercept (b0): -2.4127259493867817e+82
Final Slope (b1): -1.603760344427987e+87
Final Cost: 7.759021541641718e+166
# Baseline: closed-form univariate fit (price ~ squareMeters) with
# scikit-learn, evaluated on a 25% hold-out split.
from sklearn.model_selection import train_test_split
X = df['squareMeters'].values
y = df['price'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# scikit-learn expects a 2-D feature matrix, so turn the 1-D vector into a
# single column.
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
# Fit the model to the data
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("R-squared:", r2)
print("Mean Squared Error:", mse)
# Plot each independent variable against the dependent variable
# (only one feature here, hence range(1)).
for i in range(1):
    plt.figure(figsize=(6, 4))
    plt.scatter(X_train[:, i], y_train, label='Data')
    # Column 0 of df is 'squareMeters', matching the single feature used.
    plt.xlabel(df.columns[i])
    plt.ylabel('Price')
    plt.title(f'Scatter Plot of {df.columns[i]} vs. Price')
    # Plot the regression line; sort by x so the line is drawn left to right
    sorted_indices = np.argsort(X_test[:, i])
    plt.plot(X_test[:, i][sorted_indices], y_pred[sorted_indices],
             color='red', label='Linear Regression')
    plt.legend()
    plt.show()
R-squared: 0.999998793097589
Mean Squared Error: 10440151.787275104
Gradient Descent, Cost Function, Learning Rate for Multiple Regression
# Predictor columns kept after the correlation filter, and the target.
X = df.loc[:, ['squareMeters', 'numPrevOwners', 'isNewBuilt', 'garage']]
y = df.loc[:, 'price']
def normalize(feature):
    """Standardize the feature using Z-score normalization.

    Subtracts the mean and divides by the (population) standard deviation,
    so the result has zero mean and unit variance.

    Args:
        feature: Numeric array-like of values to standardize.

    Returns:
        The standardized values. A constant feature (standard deviation 0)
        yields all zeros instead of a division-by-zero NaN/inf result.
    """
    centered = feature - np.mean(feature)
    spread = np.std(feature)
    if spread == 0:
        # Constant column: it carries no information, so map it to zeros.
        return np.zeros_like(centered)
    return centered / spread
# Multivariate linear regression by batch gradient descent on standardized
# features and target.
# Hyperparameters
alpha = 0.01
num_iterations = 10
# Initialization
m = len(X['squareMeters'])
X0 = np.ones(m)  # bias column
X1 = normalize(np.array(X['isNewBuilt']))
X2 = normalize(np.array(X['numPrevOwners']))
X3 = normalize(np.array(X['squareMeters']))
X4 = normalize(np.array(X['garage']))
y = normalize(np.array(y))
# NOTE: X is rebound from the DataFrame to the (m, 5) design matrix here, so
# X and y must be re-selected from df before this cell is run again.
# theta order is [bias, isNewBuilt, numPrevOwners, squareMeters, garage],
# which differs from the column order in df.
X = np.array([X0, X1, X2, X3, X4]).T
theta = np.zeros(5)
# Gradient Descent
for _ in range(num_iterations):
    y_pred = np.dot(X, theta)
    gradient = (1/m) * np.dot(X.T, (y_pred - y))
    theta -= alpha * gradient
print("Parameters:", theta)
Parameters: [-8.27782287e-19 -9.70733553e-04 1.51884192e-03
9.56150111e-02
-1.57511062e-03]
# Predictor columns kept after the correlation filter, and the target.
X = df.loc[:, ['squareMeters', 'numPrevOwners', 'isNewBuilt', 'garage']]
y = df.loc[:, 'price']
def normalize(feature):
    """Standardize the feature using Z-score normalization.

    Subtracts the mean and divides by the (population) standard deviation,
    so the result has zero mean and unit variance.

    Args:
        feature: Numeric array-like of values to standardize.

    Returns:
        The standardized values. A constant feature (standard deviation 0)
        yields all zeros instead of a division-by-zero NaN/inf result.
    """
    centered = feature - np.mean(feature)
    spread = np.std(feature)
    if spread == 0:
        # Constant column: it carries no information, so map it to zeros.
        return np.zeros_like(centered)
    return centered / spread
# Multivariate linear regression by batch gradient descent on standardized
# features and target, with a larger step size than the previous run.
# Hyperparameters
alpha = 0.10
num_iterations = 10
# Initialization
m = len(X['squareMeters'])
X0 = np.ones(m)  # bias column
X1 = normalize(np.array(X['isNewBuilt']))
X2 = normalize(np.array(X['numPrevOwners']))
X3 = normalize(np.array(X['squareMeters']))
X4 = normalize(np.array(X['garage']))
y = normalize(np.array(y))
# NOTE: X is rebound from the DataFrame to the (m, 5) design matrix here, so
# X and y must be re-selected from df before this cell is run again.
# theta order is [bias, isNewBuilt, numPrevOwners, squareMeters, garage],
# which differs from the column order in df.
X = np.array([X0, X1, X2, X3, X4]).T
theta = np.zeros(5)
# Gradient Descent
for _ in range(num_iterations):
    y_pred = np.dot(X, theta)
    gradient = (1/m) * np.dot(X.T, (y_pred - y))
    theta -= alpha * gradient
print("Parameters:", theta)
Parameters: [ 2.11741735e-17 -4.05175641e-03 6.47160620e-03
6.51187869e-01
-6.73095587e-03]
# Predictor columns kept after the correlation filter, and the target.
X = df.loc[:, ['squareMeters', 'numPrevOwners', 'isNewBuilt', 'garage']]
y = df.loc[:, 'price']
def normalize(feature):
    """Standardize the feature using Z-score normalization.

    Subtracts the mean and divides by the (population) standard deviation,
    so the result has zero mean and unit variance.

    Args:
        feature: Numeric array-like of values to standardize.

    Returns:
        The standardized values. A constant feature (standard deviation 0)
        yields all zeros instead of a division-by-zero NaN/inf result.
    """
    centered = feature - np.mean(feature)
    spread = np.std(feature)
    if spread == 0:
        # Constant column: it carries no information, so map it to zeros.
        return np.zeros_like(centered)
    return centered / spread
# Multivariate gradient descent on standardized data, recording the MSE
# before every update so the convergence curve can be plotted.
# Hyperparameters
alpha = 0.01
num_iterations = 20
# Initialization
m = len(X['squareMeters'])
X0 = np.ones(m)  # bias column
X1 = normalize(np.array(X['isNewBuilt']))
X2 = normalize(np.array(X['numPrevOwners']))
X3 = normalize(np.array(X['squareMeters']))
X4 = normalize(np.array(X['garage']))
y = normalize(np.array(y))
# NOTE: X is rebound from the DataFrame to the (m, 5) design matrix here, so
# X and y must be re-selected from df before this cell is run again.
X = np.array([X0, X1, X2, X3, X4]).T
theta = np.zeros(5)
# Cost history to store MSE values for each iteration
cost_history = []
# Gradient Descent
for _ in range(num_iterations):
    y_pred = np.dot(X, theta)
    # Vectorized MSE; avoids iterating the array with the builtin sum().
    cost = np.mean((y_pred - y) ** 2)
    cost_history.append(cost)
    gradient = (1/m) * np.dot(X.T, (y_pred - y))
    theta -= alpha * gradient
plt.plot(cost_history)
plt.title('Cost Function Over Iterations (Multiple Regression)')
plt.xlabel('Iterations')
plt.ylabel('Cost (MSE)')
plt.show()
# Predictor columns kept after the correlation filter, and the target.
X = df.loc[:, ['squareMeters', 'numPrevOwners', 'isNewBuilt', 'garage']]
y = df.loc[:, 'price']
def normalize(feature):
    """Standardize the feature using Z-score normalization.

    Subtracts the mean and divides by the (population) standard deviation,
    so the result has zero mean and unit variance.

    Args:
        feature: Numeric array-like of values to standardize.

    Returns:
        The standardized values. A constant feature (standard deviation 0)
        yields all zeros instead of a division-by-zero NaN/inf result.
    """
    centered = feature - np.mean(feature)
    spread = np.std(feature)
    if spread == 0:
        # Constant column: it carries no information, so map it to zeros.
        return np.zeros_like(centered)
    return centered / spread
# Multivariate gradient descent on standardized data with the larger step
# size, recording the MSE before every update for the convergence plot.
# Hyperparameters
alpha = 0.10
num_iterations = 20
# Initialization
m = len(X['squareMeters'])
X0 = np.ones(m)  # bias column
X1 = normalize(np.array(X['isNewBuilt']))
X2 = normalize(np.array(X['numPrevOwners']))
X3 = normalize(np.array(X['squareMeters']))
X4 = normalize(np.array(X['garage']))
y = normalize(np.array(y))
# NOTE: X is rebound from the DataFrame to the (m, 5) design matrix here, so
# X and y must be re-selected from df before this cell is run again.
X = np.array([X0, X1, X2, X3, X4]).T
theta = np.zeros(5)
# Cost history to store MSE values for each iteration
cost_history = []
# Gradient Descent
for _ in range(num_iterations):
    y_pred = np.dot(X, theta)
    # Vectorized MSE; avoids iterating the array with the builtin sum().
    cost = np.mean((y_pred - y) ** 2)
    cost_history.append(cost)
    gradient = (1/m) * np.dot(X.T, (y_pred - y))
    theta -= alpha * gradient
plt.plot(cost_history)
plt.title('Cost Function Over Iterations (Multiple Regression)')
plt.xlabel('Iterations')
plt.ylabel('Cost (MSE)')
plt.show()
# Baseline: multi-feature linear regression with scikit-learn, evaluated on a
# 25% hold-out split.
from sklearn.model_selection import train_test_split
X = df[['squareMeters', 'numPrevOwners', 'isNewBuilt',
        'garage']].values
y = df['price'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Fit the model to the data
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("R-squared:", r2)
print("Mean Squared Error:", mse)
# Plot each independent variable against the dependent variable
# (range(1) restricts the plot to the first feature, squareMeters).
for i in range(1):
    plt.figure(figsize=(6, 4))
    plt.scatter(X_train[:, i], y_train, label='Data')
    # Feature i of X lines up with column i of df for the column order
    # selected above.
    plt.xlabel(df.columns[i])
    plt.ylabel('Price')
    plt.title(f'Scatter Plot of {df.columns[i]} vs. Price')
    # Plot the regression line. The predictions use all four features, so
    # the red curve is the model output ordered by this one feature rather
    # than a pure single-variable line.
    sorted_indices = np.argsort(X_test[:, i])
    plt.plot(X_test[:, i][sorted_indices], y_pred[sorted_indices],
             color='red', label='Linear Regression')
    plt.legend()
    plt.show()
R-squared: 0.9999987942704442
Mean Squared Error: 10430006.155390566