assignment-2
November 17, 2024
[7]: import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
[9]: # Load the housing dataset from Housing.csv (path relative to the working directory).
data = pd.read_csv('Housing.csv')
[11]: # Preview the first five rows to check that the columns parsed as expected.
print(data.head())
price area bedrooms bathrooms stories mainroad guestroom basement \
0 13300000 7420 4 2 3 yes no no
1 12250000 8960 4 4 4 yes no no
2 12250000 9960 3 2 2 yes no yes
3 12215000 7500 4 2 2 yes no yes
4 11410000 7420 4 1 2 yes yes yes
hotwaterheating airconditioning parking prefarea furnishingstatus
0 no yes 2 yes furnished
1 no yes 3 no furnished
2 no no 2 yes semi-furnished
3 no yes 3 yes furnished
4 no yes 2 no furnished
[13]: # Preview the last five rows of the dataset.
print(data.tail())
price area bedrooms bathrooms stories mainroad guestroom basement \
540 1820000 3000 2 1 1 yes no yes
541 1767150 2400 3 1 1 no no no
542 1750000 3620 2 1 1 yes no no
543 1750000 2910 3 1 1 no no no
544 1750000 3850 3 1 2 yes no no
hotwaterheating airconditioning parking prefarea furnishingstatus
540 no no 2 no unfurnished
541 no no 0 no semi-furnished
542 no no 0 no unfurnished
543 no no 0 no furnished
544 no no 0 no unfurnished
1
[17]: # Report the dataset's dimensions as (rows, columns).
print("Data shape:", data.shape)
Data shape: (545, 13)
[19]: print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 545 non-null int64
1 area 545 non-null int64
2 bedrooms 545 non-null int64
3 bathrooms 545 non-null int64
4 stories 545 non-null int64
5 mainroad 545 non-null object
6 guestroom 545 non-null object
7 basement 545 non-null object
8 hotwaterheating 545 non-null object
9 airconditioning 545 non-null object
10 parking 545 non-null int64
11 prefarea 545 non-null object
12 furnishingstatus 545 non-null object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None
[21]: # Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, kept in `ss` and displayed as the cell's result.
ss = data.describe()
ss
[21]: price area bedrooms bathrooms stories \
count 5.450000e+02 545.000000 545.000000 545.000000 545.000000
mean 4.766729e+06 5150.541284 2.965138 1.286239 1.805505
std 1.870440e+06 2170.141023 0.738064 0.502470 0.867492
min 1.750000e+06 1650.000000 1.000000 1.000000 1.000000
25% 3.430000e+06 3600.000000 2.000000 1.000000 1.000000
50% 4.340000e+06 4600.000000 3.000000 1.000000 2.000000
75% 5.740000e+06 6360.000000 3.000000 2.000000 2.000000
max 1.330000e+07 16200.000000 6.000000 4.000000 4.000000
parking
count 545.000000
mean 0.693578
std 0.861586
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 3.000000
[25]: # Display the DataFrame; pandas abbreviates the middle rows in the output.
data
[25]: price area bedrooms bathrooms stories mainroad guestroom basement \
0 13300000 7420 4 2 3 yes no no
1 12250000 8960 4 4 4 yes no no
2 12250000 9960 3 2 2 yes no yes
3 12215000 7500 4 2 2 yes no yes
4 11410000 7420 4 1 2 yes yes yes
.. … … … … … … … …
540 1820000 3000 2 1 1 yes no yes
541 1767150 2400 3 1 1 no no no
542 1750000 3620 2 1 1 yes no no
543 1750000 2910 3 1 1 no no no
544 1750000 3850 3 1 2 yes no no
hotwaterheating airconditioning parking prefarea furnishingstatus
0 no yes 2 yes furnished
1 no yes 3 no furnished
2 no no 2 yes semi-furnished
3 no yes 3 yes furnished
4 no yes 2 no furnished
.. … … … … …
540 no no 2 no unfurnished
541 no no 0 no semi-furnished
542 no no 0 no unfurnished
543 no no 0 no furnished
544 no no 0 no unfurnished
[545 rows x 13 columns]
[29]: # Boolean mask with the same shape as the data: True where a value is missing.
data.isnull()
[29]: price area bedrooms bathrooms stories mainroad guestroom \
0 False False False False False False False
1 False False False False False False False
2 False False False False False False False
3 False False False False False False False
4 False False False False False False False
.. … … … … … … …
540 False False False False False False False
541 False False False False False False False
542 False False False False False False False
543 False False False False False False False
544 False False False False False False False
basement hotwaterheating airconditioning parking prefarea \
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
.. … … … … …
540 False False False False False
541 False False False False False
542 False False False False False
543 False False False False False
544 False False False False False
furnishingstatus
0 False
1 False
2 False
3 False
4 False
.. …
540 False
541 False
542 False
543 False
544 False
[545 rows x 13 columns]
[31]: # Number of missing values in each column.
data.isnull().sum()
[31]: price 0
area 0
bedrooms 0
bathrooms 0
stories 0
mainroad 0
guestroom 0
basement 0
hotwaterheating 0
airconditioning 0
parking 0
prefarea 0
furnishingstatus 0
dtype: int64
4
[33]: # Share of missing values per column, as a percentage (mean of the boolean mask x 100).
data.isna().mean()*100
[33]: price 0.0
area 0.0
bedrooms 0.0
bathrooms 0.0
stories 0.0
mainroad 0.0
guestroom 0.0
basement 0.0
hotwaterheating 0.0
airconditioning 0.0
parking 0.0
prefarea 0.0
furnishingstatus 0.0
dtype: float64
[35]: # Show the first five rows again as a rich cell result (no print wrapper).
data.head()
[35]: price area bedrooms bathrooms stories mainroad guestroom basement \
0 13300000 7420 4 2 3 yes no no
1 12250000 8960 4 4 4 yes no no
2 12250000 9960 3 2 2 yes no yes
3 12215000 7500 4 2 2 yes no yes
4 11410000 7420 4 1 2 yes yes yes
hotwaterheating airconditioning parking prefarea furnishingstatus
0 no yes 2 yes furnished
1 no yes 3 no furnished
2 no no 2 yes semi-furnished
3 no yes 3 yes furnished
4 no yes 2 no furnished
[39]: data.fillna('Disha')
[39]: price area bedrooms bathrooms stories mainroad guestroom basement \
0 13300000 7420 4 2 3 yes no no
1 12250000 8960 4 4 4 yes no no
2 12250000 9960 3 2 2 yes no yes
3 12215000 7500 4 2 2 yes no yes
4 11410000 7420 4 1 2 yes yes yes
.. … … … … … … … …
540 1820000 3000 2 1 1 yes no yes
541 1767150 2400 3 1 1 no no no
542 1750000 3620 2 1 1 yes no no
543 1750000 2910 3 1 1 no no no
544 1750000 3850 3 1 2 yes no no
hotwaterheating airconditioning parking prefarea furnishingstatus
0 no yes 2 yes furnished
1 no yes 3 no furnished
2 no no 2 yes semi-furnished
3 no yes 3 yes furnished
4 no yes 2 no furnished
.. … … … … …
540 no no 2 no unfurnished
541 no no 0 no semi-furnished
542 no no 0 no unfurnished
543 no no 0 no furnished
544 no no 0 no unfurnished
[545 rows x 13 columns]
[41]: # Count rows that are exact duplicates of another row.
data.duplicated().sum()
[41]: 0
[43]: # Display the DataFrame again after the missing-value and duplicate checks.
data
[43]: price area bedrooms bathrooms stories mainroad guestroom basement \
0 13300000 7420 4 2 3 yes no no
1 12250000 8960 4 4 4 yes no no
2 12250000 9960 3 2 2 yes no yes
3 12215000 7500 4 2 2 yes no yes
4 11410000 7420 4 1 2 yes yes yes
.. … … … … … … … …
540 1820000 3000 2 1 1 yes no yes
541 1767150 2400 3 1 1 no no no
542 1750000 3620 2 1 1 yes no no
543 1750000 2910 3 1 1 no no no
544 1750000 3850 3 1 2 yes no no
hotwaterheating airconditioning parking prefarea furnishingstatus
0 no yes 2 yes furnished
1 no yes 3 no furnished
2 no no 2 yes semi-furnished
3 no yes 3 yes furnished
4 no yes 2 no furnished
.. … … … … …
540 no no 2 no unfurnished
541 no no 0 no semi-furnished
542 no no 0 no unfurnished
543 no no 0 no furnished
544 no no 0 no unfurnished
[545 rows x 13 columns]
[45]: # Select the area column as a Series; it is the variable plotted below.
data['area']
[45]: 0 7420
1 8960
2 9960
3 7500
4 7420
…
540 3000
541 2400
542 3620
543 2910
544 3850
Name: area, Length: 545, dtype: int64
[59]: sns.histplot(data['area'],kde=True,color='b',fill=False)
plt.show()
7
[63]: sns.kdeplot(data)
plt.show()
[75]: sns.kdeplot(data['area'], fill=True, color='g')
plt.show()
8
[69]: import warnings
# Suppress every warning from this point on.
# NOTE(review): a blanket 'ignore' also hides deprecation notices from the
# plotting calls in this notebook — consider filtering a specific category.
warnings.filterwarnings('ignore')
[81]: sns.distplot(data['area'], bins=30,kde=True, color='m',hist=True,label='Area')
plt.legend()
plt.show()
9
[87]: sns.boxplot(data['area'])
[87]: <Axes: ylabel='area'>
10
[91]: sns.countplot(x='area', data=data)
plt.xticks(rotation=45) # Rotate labels if needed
plt.show()
11
[117]: sns.scatterplot(x='area', y='price', data=data)
plt.show()
12
[ ]: from scipy.stats import zscore
z_scores = np.abs(zscore(data.select_dtypes(include=[np.number])))
outliers = data[(z_scores > 3).any(axis=1)]
print(outliers)
[ ]: Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
outliers = data[(data['price'] < Q1 - 1.5 * IQR) | (data['price'] > Q3 + 1.5 *␣
↪IQR)]
print(outliers)
[ ]:
[ ]:
[ ]:
[ ]:
13
[ ]:
[ ]:
14