Exercise for the k-means tutorial
1. Use the iris flower dataset from the sklearn library and try to form clusters of flowers using
the petal width and petal length features. Drop the other two features for simplicity.
2. Figure out whether any preprocessing, such as scaling, would help here.
3. Draw an elbow plot and use it to figure out the optimal value of k.
In [2]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
%matplotlib inline
In [7]:
iris = load_iris()
In [8]:
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df.head()
Out[8]:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
In [9]:
df['flower'] = iris.target
df.head()
Out[9]:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  flower
0                5.1               3.5                1.4               0.2       0
1                4.9               3.0                1.4               0.2       0
2                4.7               3.2                1.3               0.2       0
3                4.6               3.1                1.5               0.2       0
4                5.0               3.6                1.4               0.2       0
In [23]:
df.drop(['sepal length (cm)', 'sepal width (cm)',
'flower'],axis='columns',inplace=True)
In [24]:
df.head(3)
Out[24]:
   petal length (cm)  petal width (cm)
0                1.4              0.2
1                1.4              0.2
2                1.3              0.2
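Exercise step 2 asks whether preprocessing such as scaling would help. Both petal features are measured in cm and lie on similar ranges, so min-max scaling is unlikely to change the clusters much, but it is easy to check with the MinMaxScaler imported above. A minimal optional sketch (df_scaled is an illustrative name, not part of the original notebook):

# Optional check for exercise step 2: scale both petal features to [0, 1]
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)  # illustrative name
df_scaled.head(3)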
In [28]:
km = KMeans(n_clusters=3)
yp = km.fit_predict(df)
yp
Out[28]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
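The fitted model also stores the learned centroids; inspecting them is an optional sanity check (not part of the original exercise):

# Each row is one centroid, in feature order: [petal length (cm), petal width (cm)]
km.cluster_centers_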
In [30]:
df['cluster'] = yp
df.head(2)
Out[30]:
   petal length (cm)  petal width (cm)  cluster
0                1.4              0.2        0
1                1.4              0.2        0
In [31]:
df.cluster.unique()
Out[31]:
array([0, 2, 1], dtype=int64)
In [33]:
df1 = df[df.cluster==0]
df2 = df[df.cluster==1]
df3 = df[df.cluster==2]
In [34]:
plt.scatter(df1['petal length (cm)'],df1['petal width (cm)'],color='blue')
plt.scatter(df2['petal length (cm)'],df2['petal width (cm)'],color='green')
plt.scatter(df3['petal length (cm)'],df3['petal width (cm)'],color='yellow')
Out[34]:
<matplotlib.collections.PathCollection at 0x124cdf8c908>
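Optionally, the centroids stored in km.cluster_centers_ can be overlaid on the same scatter plot. A hedged sketch (the axis labels and the 'centroid' legend entry are additions, not from the original notebook):

plt.scatter(df1['petal length (cm)'],df1['petal width (cm)'],color='blue')
plt.scatter(df2['petal length (cm)'],df2['petal width (cm)'],color='green')
plt.scatter(df3['petal length (cm)'],df3['petal width (cm)'],color='yellow')
# mark the centroids: column 0 is petal length, column 1 is petal width
plt.scatter(km.cluster_centers_[:,0],km.cluster_centers_[:,1],color='purple',marker='*',label='centroid')
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.legend()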
Elbow Plot
In [35]:
sse = []
k_rng = range(1,10)
for k in k_rng:
    km = KMeans(n_clusters=k)
    # fit on the two petal features only, excluding the 'cluster' label column added above
    km.fit(df[['petal length (cm)','petal width (cm)']])
    sse.append(km.inertia_)
In [36]:
plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng,sse)
Out[36]:
[<matplotlib.lines.Line2D at 0x124ce45fc88>]
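For this dataset the SSE typically drops steeply up to k=3 and flattens afterwards, so the elbow suggests k=3, matching the three iris species. A minimal sketch of refitting with that value (best_k and km_final are illustrative names, not part of the original notebook):

best_k = 3  # chosen by inspecting the elbow plot above
km_final = KMeans(n_clusters=best_k)
df['cluster'] = km_final.fit_predict(df[['petal length (cm)','petal width (cm)']])
df.head()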