Datascience 2 PDF
Datascience 2 PDF
PROGRAM:
import pandas as pd
import statsmodels.api as sm

# Read pima_diabetes.csv from the working directory into a DataFrame.
# The filename must be delimited by plain ASCII quotes; a typographic
# (curly) opening quote is a SyntaxError in Python.
data = pd.read_csv("pima_diabetes.csv")

# Create the correlation matrix: pairwise correlation between the columns.
# As the last expression of a notebook cell, the result is displayed as a table.
data.corr()
PROGRAM:
# importing libraries
import statsmodels.api as sm
import pandas as pd
PROGRAM:
# Packages used by this program
import numpy as np
import pandas as pd
import statsmodels.api as ssm
from sklearn.linear_model import LinearRegression

# Read pima_diabetes.csv from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='pima_diabetes.csv')
a) NORMAL CURVES
PROGRAM:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Hide FutureWarning messages so they do not clutter the output.
warnings.simplefilter(action="ignore", category=FutureWarning)

df = pd.read_csv("adult.csv")

# Print the column names, non-null counts and dtypes. The OUTPUT section of
# this exercise lists exactly this summary, so the call belongs in the program.
df.info()

# Figure 1: point plot of age for each relationship category, with one series
# per income group. ci=None turns the confidence-interval bars off.
sns.set(font_scale=1.5)
sns.catplot(x="relationship", y="age", data=df,
            kind="point", hue='income', capsize=0.4, ci=None, aspect=2)
# Rotate the category labels by 90 degrees, then show the plot.
# These are two separate statements and must be on separate lines.
plt.xticks(rotation=90)
plt.show()

# Figure 2: line plot of hours-per-week against educational-num, one row of
# panels per income group and one marked, solid line per relationship category.
sns.set(font_scale=1)
sns.relplot(x="educational-num", y="hours-per-week",
            data=df, kind="line", row='income', ci=None,
            hue="relationship", style="relationship", markers=True,
            dashes=False, aspect=2)
# Show plot
plt.show()
OUTPUT:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 48842 non-null int64
1 workclass 48842 non-null object
2 fnlwgt 48842 non-null int64
3 education 48842 non-null object
4 educational-num 48842 non-null int64
5 marital-status 48842 non-null object
6 occupation 48842 non-null object
7 relationship 48842 non-null object
8 race 48842 non-null object
9 gender 48842 non-null object
10 capital-gain 48842 non-null int64
11 capital-loss 48842 non-null int64
12 hours-per-week 48842 non-null int64
13 native-country 48842 non-null object
14 income 48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
b) DENSITY AND CONTOUR PLOTS
PROGRAM:
import numpy as np
import pandas as pd
# seaborn provides `sns`, which the calls below use; without this import the
# program raises NameError when run on a fresh kernel.
import seaborn as sns

df = pd.read_csv("adult.csv")
sns.set_style("white")
# Bivariate density (contour) plot of age against educational-num.
# Map a third variable "income" with a hue semantic to show conditional distributions.
sns.kdeplot(data=df, x="age", y="educational-num", hue="income")
PROGRAM:
import pandas as pd
import seaborn as sns
import warnings

# Hide FutureWarning messages so they do not clutter the output.
warnings.simplefilter(action="ignore", category=FutureWarning)

df = pd.read_csv("adult.csv")
sns.set_style("white")

# adult.csv mixes integer and object (string) columns. Correlation is only
# defined for the numeric ones, so select them explicitly: recent pandas
# versions raise an error if string columns are passed to corr().
cormat = df.select_dtypes(include="number").corr()

# Heatmap of the correlation matrix with each coefficient written in its cell.
# The trailing semicolon suppresses the Axes repr in a notebook.
sns.heatmap(cormat, annot=True);
d) HISTOGRAMS
PROGRAM:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Hide FutureWarning messages so they do not clutter the output.
warnings.simplefilter(action="ignore", category=FutureWarning)

df = pd.read_csv("adult.csv")

# NOTE: run each plot below in its own notebook cell; in a single cell both
# histograms are drawn onto the same axes.

# Histogram of hours-per-week for the first 100 rows, with a KDE curve overlaid.
sns.histplot(data=df[:100], x="hours-per-week", kde=True, color="red")
# Recorded notebook output:
# <AxesSubplot:xlabel='hours-per-week', ylabel='Count'>

# Density-normalised histogram of hours-per-week for all rows, with a KDE curve.
# sns.distplot is deprecated; histplot with stat="density" and kde=True is the
# documented replacement and keeps the 'Density' y-axis.
sns.histplot(data=df, x="hours-per-week", kde=True, stat="density", color="green")
# Recorded notebook output:
# <AxesSubplot:xlabel='hours-per-week', ylabel='Density'>
PROGRAM:
# Packages used by this program
import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib import pyplot as plt

# Read adult.csv from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="adult.csv")
PROGRAM:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
# Read california_cities.csv from the working directory into a DataFrame.
cities = pd.read_csv('california_cities.csv')
# NOTE(review): `m`, `lon`, `lat`, `population` and `area` are never assigned in
# this listing, so the call below raises NameError as written. Presumably the
# lines that create the Basemap instance (`m`) and extract the coordinate,
# population and area values from `cities` were lost -- TODO restore them.
# scatter city data, with color reflecting population and size reflecting area
# (color is log10(population), marker size is `area`, 'Reds' colormap, alpha 0.5)
m.scatter(lon, lat, latlon=True, c=np.log10(population), s=area, cmap='Reds', alpha=0.5)