Employee Data Analysis Report
Employee Data Analysis Report
In [5]:
# QA1 - Display full column to view all attributes
dataset.head()
Research &
1 49 Travel_Frequently 5130 2 20520
Development
Research &
2 37 Travel_Rarely 2090 3 6270
Development
Research &
3 33 Travel_Frequently 2909 3 8727
Development
Research &
4 27 Travel_Rarely 3468 2 10404
Development
5 rows × 24 columns
In [6]:
# QA2a - Dimension and Attributes in the dataset
print('The Dimension this dataset is:',dataset.shape)
print("\n")
print('Which are the following:')
print(list(dataset.columns.values))
In [7]:
# QA2b - Dataset Info to check
print('Dataset info as per below:')
print(dataset.info())
In [8]:
# QA3 - Average value for ‘Age’, ‘Monthly Income’ and ‘Years At Company’
age_avg = np.average(dataset['Age'])
minc_avg = np.average(dataset['MonthlyIncome'])
yrsatco_avg = np.average(dataset['YearsAtCompany'])
In [10]:
# QA4 - Find the minimum and maximum ‘Monthly Income’
minc_min = min(dataset['MonthlyIncome'])
minc_max = max(dataset['MonthlyIncome'])
In [11]:
# QA5 - Histogram of Monthly Income vs Staff No
x = dataset['MonthlyIncome']
plt.hist(x)
plt.hist(x, color = 'pink', ec = 'black')
plt.title('Histogram of Monthly Income')
plt.xlabel('Monthly Income')
plt.ylabel('Staff No')
plt.grid(axis = 'y')
plt.show()
sns.set_style('darkgrid')
sns.distplot(x)
C:\Users\JAMIL KENYA\anaconda3\lib\site-packages\seaborn\distributions.py:2557:
FutureWarning: `distplot` is a deprecated function and will be removed in a futu
re version. Please adapt your code to use either `displot` (a figure-level funct
ion with similar flexibility) or `histplot` (an axes-level function for histogra
ms).
warnings.warn(msg, FutureWarning)
Out[11]: <AxesSubplot:xlabel='MonthlyIncome', ylabel='Density'>
In [12]:
# QA6 - Scatter Plot between ‘Years At Company’ and ‘Monthly Income’
x = dataset['YearsAtCompany']
y = dataset['MonthlyIncome']
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 3/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
plt.scatter(x,y)
plt.scatter(x, y, color = 'hotpink')
plt.title('Years at Company vs. Monthly Income')
plt.xlabel('Years at Company')
plt.ylabel('Monthly Income')
plt.show()
In [13]:
# QA7 - Correlation between ‘Years At Company’ and ‘Monthly Income’
corr =np.corrcoef(x,y)
In [15]:
# QA8a - Range of monthly income at Company A
print( 'The range monthly income is between ', minc_min, 'and', minc_max)
In [16]:
# QA8b - Most and least monthly income values at Company A
import statistics
idx = np.where(y.value_counts() == 1)
print(dataset['MonthlyIncome'].value_counts())
print("\n")
2342 4
6142 3
2610 3
2559 3
6347 3
..
4103 1
2705 1
6796 1
19717 1
10239 1
Name: MonthlyIncome, Length: 1349, dtype: int64
7644, 3646, 4789, 9936, 11849, 13464, 19859, 16032, 11935, 9884, 3506, 3737, 227
5, 4707, 10648, 10368, 3730, 5070, 3211, 16015, 13966, 18061, 7847, 3375, 1706,
4374, 3780, 14026, 3785, 7880, 7879, 2760, 5828, 16064, 3755, 4617, 4490, 7861,
7898, 5811, 5810, 3761, 5772, 1675, 3722, 1081, 4163, 3688, 11878, 2517, 7779, 3
681, 3280, 3673, 19237, 3669, 4327, 5715, 5714, 7314, 3660, 1611, 3690, 3692, 57
68, 2700, 5765, 5762, 6932, 11904, 6334, 2105, 8789, 18041, 9924, 15992, 4420, 2
972, 5770, 5745, 5744, 5743, 1009, 13237, 11245, 1200, 6430, 2379, 6474, 4425, 4
424, 4256, 2373, 17665, 2370, 6465, 2368, 2366, 6384, 10552, 5399, 18740, 4403,
1261, 4434, 2387, 4449, 3697, 2413, 2438, 2408, 4422, 6499, 4450, 2400, 2389, 44
47, 2398, 10883, 4444, 2394, 3691, 16799, 4401, 4400, 2351, 2313, 2321, 10512, 2
319, 2318, 4364, 8740, 6410, 5775, 16422, 2311, 10502, 5878, 2308, 2307, 2306, 2
305, 2322, 2325, 2348, 10527, 2345, 6439, 2341, 2600, 18722, 4538, 5617, 4382, 2
326, 4381, 2691, 14852, 8474, 2329, 2328, 2022, 16756, 19141, 2426, 8639, 4554,
4553, 12742, 2501, 16835, 5209, 2496, 10686, 4558, 10685, 4539, 6586, 4537, 1682
3, 6582, 2800, 4556, 4559, 6578, 2532, 8686, 7484, 2539, 1702, 16872, 2534, 1072
5, 4577, 2514, 6623, 14814, 2523, 4568, 2519, 10880, 8321, 4759, 10673, 6524, 18
824, 3295, 10851, 3500, 6540, 4448, 6538, 8008, 4487, 6549, 12936, 4485, 8578, 2
432, 8161, 4478, 4477, 6545, 3743, 2479, 14756, 2478, 8621, 2476, 4523, 4522, 24
72, 6567, 6782, 2455, 8847, 2461, 18844, 8463, 10650, 4505, 16792, 10496, 8446,
19436, 4345, 4193, 2144, 2143, 2141, 3907, 4187, 6232, 6230, 4908, 10322, 6220,
4171, 4465, 2121, 10312, 2119, 6214, 4194, 9854, 6244, 6261, 2176, 4240, 2174, 4
221, 2168, 4286, 2166, 1790, 4198, 2966, 10932, 2157, 2154, 2153, 4200, 4320, 21
15, 4162, 6209, 2062, 16413, 6172, 2075, 10266, 4115, 2064, 19586, 2061, 2430, 1
0252, 4107, 2058, 4105, 10248, 6151, 6146, 5380, 8224, 5468, 2093, 4157, 2107, 6
201, 4152, 17046, 2099, 2097, 16437, 9756, 2090, 2089, 2088, 2086, 6180, 2083, 1
0274, 6274, 4227, 2180, 16595, 2270, 8412, 2267, 4312, 6392, 16598, 10453, 2679,
2272, 4317, 4001, 4302, 6349, 8396, 4298, 4774, 4319, 11957, 4294, 4335, 2296, 4
342, 5731, 16627, 10482, 6385, 2288, 2285, 2274, 6380, 10475, 18665, 2279, 6323,
4325, 4323, 8392, 2244, 4230, 2201, 6652, 4258, 4257, 10400, 2207, 2206, 8346, 3
201, 2213, 6294, 4244, 2194, 8564, 6288, 4771, 4233, 4260, 4262, 10435, 10422, 2
238, 8381, 8380, 2235, 7005, 2232, 2231, 4581, 2216, 6322, 4272, 2210, 18606, 22
20, 16555, 2218, 2543, 2659, 2546, 10739, 19246, 2506, 2859, 19038, 2074, 2856,
2728, 8998, 6949, 4900, 2851, 4648, 19232, 16704, 8376, 17181, 2844, 2863, 6962,
2867, 3580, 6447, 2899, 2897, 13348, 18172, 2889, 19272, 17007, 1420, 10377, 493
0, 13120, 13116, 2875, 3057, 2871, 2571, 11031, 2838, 2045, 12490, 19190, 19189,
2804, 4851, 2437, 2684, 8943, 2809, 6893, 6306, 2795, 2794, 2793, 2791, 2332, 99
07, 2810, 2837, 8966, 2836, 2835, 6929, 4878, 2297, 2827, 17159, 4869, 4859, 281
9, 19202, 6120, 2133, 2066, 8095, 19197, 4950, 2818, 2904, 2706, 17174, 5063, 92
50, 7104, 5055, 4087, 13245, 7094, 3022, 13402, 2994, 2991, 9241, 16328, 7082, 7
406, 5067, 7119, 2979, 3041, 4591, 10596, 1091, 7143, 5094, 5093, 7140, 5087, 56
89, 5295, 19419, 3034, 3033, 5079, 13269, 5042, 2983, 1569, 5377, 8837, 13964, 4
978, 2929, 19045, 2926, 9069, 2422, 4969, 2933, 3537, 15972, 4963, 15202, 14732,
4960, 12965, 2932, 2935, 2977, 2960, 2976, 2974, 5021, 13212, 8834, 13206, 5010,
4285, 19328, 5006, 13194, 4999, 4998, 19701, 19331, 2080, 2789, 4507, 17123, 466
8, 6725, 10820, 19431, 2625, 18711, 16959, 2622, 2619, 2632, 6712, 4663, 2613, 2
862, 4534, 10798, 6842, 4678, 4682, 19068, 4684, 2662, 2661, 2660, 6755, 2655, 2
654, 10845, 19033, 8237, 10854, 4695, 2645, 2644, 1904, 2642, 6735, 1359, 6696,
19049, 6653, 10761, 12808, 4615, 2566, 18947, 1878, 4084, 10748, 6667, 6651, 255
3, 2552, 4599, 6646, 16885, 6644, 2570, 2572, 4647, 3986, 6694, 11103, 10008, 37
02, 2593, 2592, 11159, 2585, 4621, 13973, 8726, 2580, 2579, 8722, 6673, 2576, 10
855, 2145, 4834, 4649, 2759, 6854, 4805, 2756, 4377, 10938, 10934, 4787, 4809, 6
834, 6833, 5747, 19406, 17068, 6825, 10920, 19144, 4810, 6577, 2774, 2785, 1097
6, 2783, 6877, 5071, 2778, 19161, 4821, 17099, 2440, 17169, 2768, 6632, 2766, 68
61, 9094, 4775, 2725, 4717, 6553, 4739, 10965, 18430, 4736, 4735, 2686, 6781, 26
83, 2695, 4728, 8823, 2678, 10169, 19973, 4721, 2670, 2694, 2696, 2723, 17048, 8
865, 6502, 6931, 4765, 2716, 8858, 2713, 10903, 19081, 19094, 8853, 2707, 4103,
2705, 6796, 19717, 10239] with frequency of 1
In [17]:
# QA8c - Observation on the distribution of monthly income values
In [18]:
# QA8d - is there linear relationship between monthly incompe and employees
In [20]:
# Qb2a - Checking dataset
datasetb.head()
Research &
1 49 Travel_Frequently 5130 2 20520
Development
Research &
2 37 Travel_Rarely 2090 3 6270
Development
Research &
3 33 Travel_Frequently 2909 3 8727
Development
Research &
4 27 Travel_Rarely 3468 2 10404
Development
5 rows × 24 columns
In [21]:
# QB3a - Check input & output columns
datasetb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 1470 non-null int64
1 BusinessTravel 1470 non-null object
2 MonthlyIncome 1470 non-null int64
3 JobSatisfaction 1470 non-null int64
4 Bonus 1470 non-null int64
5 Department 1470 non-null object
6 DistanceFromHome 1470 non-null int64
7 Education 1470 non-null int64
8 EducationField 1470 non-null object
9 EmployeeCount 1470 non-null int64
10 EmployeeNumber 1470 non-null int64
11 EnvironmentSatisfaction 1470 non-null int64
12 Gender 1470 non-null object
13 JobLevel 1470 non-null int64
14 JobRole 1470 non-null object
15 MaritalStatus 1470 non-null object
16 PerformanceRating 1470 non-null int64
17 StockOptionLevel 1470 non-null int64
18 TrainingTimesLastYear 1470 non-null int64
19 WorkLifeBalance 1470 non-null int64
20 YearsAtCompany 1470 non-null int64
21 YearsSinceLastPromotion 1470 non-null int64
22 OverTime 1470 non-null object
23 Attrition 1470 non-null object
dtypes: int64(16), object(8)
memory usage: 275.8+ KB
In [23]:
# QB3b - Allocate & assign attributes to x,y
x=datasetb.iloc[:,[0,1,2,3]].values # column ‘Age’, ‘BusinessTravel’, ‘MonthlyIn
y=datasetb.iloc[:,23].values #column ‘Attrition’
In [24]:
# QB4 - Encoding categorical data for ‘BusinessTravel’
labelencoder_x=LabelEncoder()
x[:,1] = labelencoder_x.fit_transform(x[:,1])
print(x[:5,:])
print('\n')
print(y[0:5])
[[41 2 5993 4]
[49 1 5130 2]
[37 2 2090 3]
[33 1 2909 3]
[27 2 3468 2]]
In [25]:
# QB5 - Splitting the data into training set and test set
In [26]:
# QB6 - Feature scaling
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
In [27]:
# QB7a - Fitting the Naïve Bayes Classifier to the training set
from sklearn.naive_bayes import GaussianNB
classifier=GaussianNB()
classifier.fit(x_train,y_train)
Out[27]: GaussianNB()
In [28]:
# QB7b - Predict the test result for test set
y_pred = classifier.predict(x_test)
print(y_test)
print('\n')
print(y_pred)
['No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No'
'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No'
'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No'
'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes'
'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No'
'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No'
'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes'
'No' 'No']
['No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No']
In [31]:
# QB8a - Create making confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test,y_pred)
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print('Confusion matrix:')
print(cm)
Confusion matrix:
[[241 4]
[ 48 1]]
In [32]:
# QB8b - Calculating the accuracy
accu = (tn+tp)/(tn+fp+fn+tp)
print('The accuracy is', round(accu,4), 'equal to', round(accu*100,2),'%')
In [33]:
# QB8c - Calculating the precision
prec = tp/(tp+fp)
print('The precision is', round(prec,4), 'equal to', round(prec*100,2),'%')
In [34]:
# QB8d - Calculating the recall
recall = tp/(tp+fn)
print('The recall is', round(recall,2), 'equal to', round(recall*100,2),'%')
In [35]:
# QB8e - Calculation the error rate
err = 1 - accu
print('The Error rate is', round(err,4), 'equal to', round(err*100,2),'%')
In [36]:
# Qb8f - Calculation the specificity
specf = tn/(tn+tp)
print('The Specificity is', round(specf,4), 'equal to', round(specf*100,2),'%')
SECTION C
In [87]:
# Step 1 - Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [88]:
# Step 3 - Checking data
datasetc.shape
Out[88]: (300, 3)
In [89]:
# Step 4 - Checking data
datasetc.head()
Out[89]:
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 11/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
Unnamed: 0 A B
0 0 0.329241 0.841783
1 1 1.697407 -0.236075
2 2 -0.831460 0.584743
3 3 1.825271 -0.297894
4 4 1.236577 0.121528
In [90]:
# Step 5 - Checking data
datasetc.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 300 non-null int64
1 A 300 non-null float64
2 B 300 non-null float64
dtypes: float64(2), int64(1)
memory usage: 7.2 KB
In [91]:
# QC1a - Standard Scaler
ss = StandardScaler()
X = datasetc.iloc[:, [1,2]].values
XX = ss.fit_transform(X)
XX
[ 1.00260241, -1.63401717],
[-1.77578325, 0.02797682],
[ 0.14046294, -1.43902102],
[-0.77072388, -0.06999854],
[-1.60386009, 0.19275344],
[ 0.06792247, -1.13251967],
[ 0.15808104, 0.99907498],
[ 0.5918192 , 0.43882649],
[ 1.79951945, -0.05662317],
[-1.74358217, -0.34597621],
[ 0.09472405, -1.29480856],
[-0.47510987, 1.07487746],
[-0.38136245, 1.6657932 ],
[ 0.06522098, -1.43342123],
[-0.03857433, -1.16641556],
[-0.16597062, 1.189908 ],
[ 1.20360375, -1.36750747],
[ 1.0732983 , -0.97594391],
[-0.35115433, 1.54616869],
[ 0.10305029, 1.04602726],
[-1.38351072, 0.76541859],
[ 0.17613919, -1.29609571],
[ 0.31017327, 0.63643759],
[ 1.7472377 , -0.40678156],
[-1.74163436, 0.20972692],
[ 0.57790187, -1.19736609],
[-1.48900544, 0.57690855],
[ 0.26169321, 0.54631588],
[-0.12248317, -1.08681148],
[-0.03213051, 1.2911978 ],
[ 1.67184556, -0.08121523],
[-0.53553067, 1.65924761],
[ 0.55769055, 0.42256247],
[ 1.5615795 , -0.7852702 ],
[ 0.20215809, 0.72963099],
[ 1.61257642, 0.00941248],
[-0.27668152, -0.49653308],
[ 0.37231261, -0.24188354],
[ 1.60851364, -0.54033105],
[ 1.75501302, -0.04436057],
[-1.33577938, 0.73374176],
[ 1.56165535, -1.11191577],
[ 0.60191189, 0.12201247],
[ 0.47411543, 0.33365726],
[ 0.14759729, -1.42036925],
[-0.29500422, -0.03249118],
[ 0.25455501, -1.66088578],
[-1.262038 , 1.19011598],
[-1.12971929, 1.29556783],
[-1.79286216, -0.09227981],
[ 1.42405653, -0.53242634],
[-0.66811575, 1.4945073 ],
[ 1.82849484, 0.27971421],
[ 1.09850151, -1.25822568],
[-1.28405716, 0.81875931],
[-0.9267942 , 1.33770908],
[-1.41225387, 0.58593108],
[ 0.43475579, -0.66017839],
[-0.03291934, -1.36325655],
[-0.19266819, 1.4092657 ],
[ 1.53279677, 0.57282086],
[-0.50949316, -0.06034573],
[-0.60193361, 0.43020802],
[-0.26576888, 1.42841156],
[ 0.73192467, -1.41081643],
[ 1.82840489, 0.15224212],
[ 0.2809764 , 0.96935721],
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 13/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
[-1.72088488, -0.28542723],
[ 0.52330762, -1.21003467],
[ 1.83468027, -0.44822378],
[ 1.314881 , -0.56802268],
[-1.66209104, 0.35636128],
[-0.63424472, 1.21808607],
[ 0.93084606, -1.40424906],
[ 0.23410303, 0.68678045],
[ 0.86567058, -1.5477206 ],
[ 0.77505506, -1.26787726],
[-1.79618431, -0.42671216],
[-0.92977333, 1.14810566],
[ 0.68783268, -0.5075786 ],
[ 0.25684998, -1.33574746],
[-0.36334458, 1.73227863],
[ 1.53608765, -0.83686301],
[ 0.10411161, -1.04890313],
[-0.21126217, 1.35994922],
[-0.4043569 , 1.37117838],
[ 0.24507698, 1.10040827],
[ 0.30398076, -1.30201952],
[ 0.3362653 , 1.33522165],
[-0.61944683, 0.03193701],
[-0.46098453, 0.56321488],
[-0.10946508, -1.17105631],
[ 1.24801402, -1.07040373],
[-0.69955382, -0.02614578],
[-1.74463412, 0.23139312],
[-0.39443825, 1.76467142],
[-0.51812906, -0.8948249 ],
[-1.42711059, 0.87132496],
[ 1.7544594 , 0.56525096],
[-0.15820378, -0.70435453],
[-0.42045782, -0.9372088 ],
[-1.33695195, 1.71943321],
[ 0.03012863, 1.56324384],
[ 0.25671046, -1.17945547],
[ 1.5642675 , -0.98953903],
[ 0.51751434, -0.28969062],
[ 0.72259686, -1.66502852],
[-0.54391448, 0.58996774],
[ 0.89785103, -1.01254764],
[-0.63503605, 1.24586389],
[ 0.91968569, -1.3777468 ],
[ 0.90726208, -1.28503989],
[-0.33971656, -0.79927735],
[-0.95394401, 1.26597389],
[ 1.68563013, -0.41531267],
[ 0.31451382, -1.30629283],
[-0.4700258 , 1.64936299],
[ 0.61369022, -0.49964186],
[ 0.38833491, -1.27413715],
[ 0.00201814, -1.14197994],
[ 1.61162204, -0.07929346],
[-0.79914855, 1.60711339],
[ 1.61648931, -0.14879744],
[-0.28273724, 1.12413693],
[-0.14216885, -1.02490089],
[ 1.24433128, -0.81887872],
[-0.38620936, 1.66726562],
[-1.22532631, 1.29233212],
[ 0.44531696, 0.46471596],
[-0.74504741, 1.59531413],
[-0.4569497 , 1.48406937],
[-1.29883261, 1.34533705],
[ 0.38245435, -1.16390782],
[ 0.34510878, 0.57997883],
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 14/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
[ 0.08237803, 1.11416533],
[ 0.93760658, -1.22306735],
[ 1.16651995, -1.06020357],
[-0.84055223, 1.64059509],
[-0.86961945, 1.16660696],
[ 1.17913898, -0.76476596],
[ 1.55316962, 0.1298981 ],
[-1.45579441, 0.4593996 ],
[-0.35395338, -0.33627981],
[ 0.42196061, -0.46172207],
[-0.44529019, -0.98342201],
[ 1.35782059, -0.79955299],
[ 0.63863183, 0.22850505],
[-1.28238685, 1.21635217],
[-1.45720122, 1.08700534],
[-0.45561127, -0.28518451],
[ 0.52602976, -1.31808183],
[ 1.56803518, 0.07970716],
[-1.16774043, 1.55925867],
[ 0.17101923, 1.300702 ],
[-0.25581562, -0.78403655],
[ 0.00612814, 1.38362092],
[ 0.60712586, 0.01758983],
[-0.5626133 , 0.00457586],
[ 0.00201533, -1.06732082],
[-0.9412958 , 1.30222859],
[-1.03070738, 1.56873984],
[-0.25861567, -0.98460152],
[ 0.21110295, 0.76245271],
[ 0.08503157, -1.18491829],
[-0.39077135, 1.7226643 ],
[-1.53184221, 0.56028519],
[-1.547237 , 0.1143973 ],
[-1.46499486, -0.05173428],
[ 0.5895073 , -1.42893958],
[-1.32141928, 0.74663487],
[-1.16480142, 1.29153694],
[-0.54517959, -0.21567658],
[ 1.23918814, -1.0864646 ],
[-0.77869731, 1.20625553],
[-1.11259579, 1.3044358 ],
[ 1.05840809, -1.56475069],
[-1.10483008, 1.33514873],
[ 0.65193531, -0.21247671],
[-0.32724407, -0.66726066],
[ 0.53404503, -1.58694134],
[ 1.67941162, -0.47296823],
[-1.32334543, 0.56316141],
[ 0.76802274, -1.41344864],
[ 0.26653178, 0.2273098 ],
[-0.73206107, 1.32899158],
[-0.51302132, -0.3465829 ],
[-1.44131322, 0.80018881],
[ 1.23136359, -1.17536451],
[-1.44204561, -0.09416426],
[-0.22407157, 1.39887222],
[-1.49383416, 0.18147278],
[-0.71341691, 0.04382273],
[-1.70243597, -0.21444389],
[ 1.2807792 , -0.89919532],
[-0.41919151, -0.84744412],
[ 0.09943788, 1.55301576],
[-0.28837467, -0.31552567],
[-0.48085421, -0.5568526 ],
[-1.43402414, 0.74490522],
[-1.64868992, -0.5928372 ],
[ 0.03470026, -1.37554518],
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 15/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
[ 0.38585993, -1.65129052],
[ 0.88270035, -1.55267596],
[ 1.29206656, -0.84810823],
[ 0.39813273, 0.51398583],
[-0.41144853, -0.71487106],
[ 0.50072718, -0.12808598],
[ 0.53134497, -0.25330279],
[-0.11081476, -0.87735421],
[-0.24610423, -1.12076236],
[-1.26121627, 1.3762284 ],
[-1.12731422, 1.45503108],
[-0.92217835, 1.71755964],
[ 0.73755508, -1.36141886],
[-0.84267556, 1.75934982],
[ 1.93099153, 0.24788243],
[-1.14245354, 1.24994623],
[-1.748282 , 0.66703768],
[ 1.77943731, -0.01260514],
[-0.20421006, -0.44388164],
[ 0.71797573, 0.16976417],
[ 0.8632482 , -1.53144381],
[-1.53390223, -0.32001251],
[ 1.41173027, -0.78423113],
[-1.81998608, 0.12223338],
[-0.65002923, -0.66285385],
[-1.78097343, -0.12456502],
[ 1.72427759, 0.03221282],
[-1.38943293, 0.64273524],
[ 1.21390253, -0.61812225],
[-0.59140947, 0.04257789],
[ 0.23958252, 0.9535646 ],
[ 1.75729223, -0.23324216],
[-1.74707067, -0.34675177],
[-1.82779921, -0.51068881],
[ 1.68710042, 0.3774761 ],
[-0.58875026, 0.61684293],
[ 1.02725812, -1.5287461 ],
[-0.29832803, 1.36790989],
[ 0.37078913, 0.39839244],
[ 0.61747277, 0.11911065],
[-0.55328597, 0.22107544],
[ 0.09809325, -1.14356788],
[ 0.42445282, 0.36119642],
[-0.38007269, -0.94040173],
[ 0.66716984, 0.3058358 ],
[ 1.76264298, -0.37453882],
[ 0.74480854, -1.23005082],
[-1.59579142, 0.29846316],
[-0.00495696, 1.2208933 ],
[-0.70193379, 1.34987655],
[ 1.06898311, -1.48055058],
[-0.0728789 , 1.27338496],
[-1.04677039, 0.70345592],
[ 0.27355694, 1.05744484],
[-0.28099392, 1.21395035],
[ 0.71617936, -1.69968003],
[ 0.28361203, 0.65107434],
[-0.53228584, 0.23711499],
[-0.41341316, -0.34557826],
[ 0.29026951, 1.06853583],
[-0.620568 , 0.35880452],
[-1.58680505, 0.08473025],
[-0.01818413, -1.41262296],
[-1.39317862, 0.86814676],
[-0.07761094, 1.20302664],
[ 1.32926457, -1.4136665 ],
[ 0.92473153, -1.43847443],
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 16/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
[ 1.35215624, -0.68329688],
[ 0.29314766, -1.53905118],
[ 0.43370526, -0.25442592],
[-0.37514515, 1.38146562],
[ 1.56354889, -0.48179178]])
In [92]:
# QC1b - Wccs assignement using elbow method
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 55)
kmeans.fit(X)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of Cluster')
plt.ylabel('WCSS')
plt.show()
In [93]:
# QC1c - Kmeans and prediction
Out[93]: array([1, 3, 2, 3, 3, 3, 2, 2, 0, 0, 2, 3, 0, 2, 2, 2, 3, 0, 1, 3, 3, 0,
1, 1, 1, 0, 1, 0, 2, 0, 2, 2, 0, 1, 3, 3, 2, 0, 1, 1, 0, 0, 1, 0,
3, 1, 1, 2, 0, 1, 3, 2, 0, 2, 1, 0, 1, 3, 1, 3, 3, 1, 3, 0, 3, 3,
3, 2, 3, 3, 3, 0, 2, 0, 1, 1, 2, 3, 1, 3, 0, 2, 1, 2, 0, 0, 1, 3,
2, 2, 1, 0, 3, 1, 2, 0, 3, 3, 2, 1, 0, 1, 0, 0, 2, 1, 3, 0, 1, 3,
0, 1, 1, 1, 0, 1, 2, 1, 0, 3, 2, 2, 1, 0, 2, 3, 0, 0, 1, 1, 0, 3,
3, 0, 1, 0, 1, 0, 0, 0, 1, 3, 0, 1, 3, 0, 0, 3, 1, 3, 1, 0, 3, 1,
1, 3, 1, 1, 1, 0, 1, 1, 0, 3, 1, 1, 3, 3, 2, 2, 0, 0, 3, 3, 1, 2,
2, 0, 3, 1, 1, 0, 1, 3, 2, 0, 1, 1, 0, 1, 0, 1, 2, 2, 2, 0, 2, 1,
2, 3, 1, 1, 0, 1, 3, 0, 0, 3, 2, 0, 3, 1, 2, 2, 3, 2, 1, 2, 2, 2,
3, 0, 1, 2, 2, 2, 2, 0, 0, 0, 3, 1, 0, 3, 3, 0, 0, 1, 1, 1, 0, 1,
3, 1, 2, 3, 0, 3, 0, 2, 3, 2, 2, 2, 3, 2, 3, 2, 1, 3, 2, 2, 3, 1,
0, 1, 3, 3, 2, 0, 3, 0, 3, 3, 0, 2, 1, 1, 0, 1, 2, 1, 1, 0, 1, 2,
2, 1, 2, 2, 0, 2, 1, 0, 0, 3, 0, 3, 1, 3])
In [94]:
# QC1d - Scatter plot cluster visualizing
In [95]:
# QC1e - Scatter plot score evaluation
-------------------------------------------
Silhouette score : 0.4329118241119466
Silhouette score : 43.29118241119466 %
In [ ]:
In [98]:
# QC2a - Import Libraries
import warnings
warnings.filterwarnings("ignore")
In [99]:
##QC2b - DBScan and knee locator
In [100…
# QC2c DBSCAN fine tunning with knee locator
In [101…
## QC2d - find optimun knee locator
print(distances[knee.knee])
0.18723454491446345
In [102…
## DBSCAN Clustering with knee locator --->>> optimum eps
db = DBSCAN(eps=distances[knee.knee], min_samples=10).fit(X)
labels = db.labels_
In [108…
## QC3 - DBScan vs KMean
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_pread = kmeans.fit_predict(X)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\matplotlib\axes\_axes.py in _parse_scatter_color_a
rgs(c, edgecolors, kwargs, xsize, get_next_color_func)
4290 try: # Is 'c' acceptable as PathCollection facecolors?
-> 4291 colors = mcolors.to_rgba_array(c)
4292 except (TypeError, ValueError) as err:
~\anaconda3\lib\site-packages\matplotlib\colors.py in <listcomp>(.0)
340 else:
--> 341 return np.array([to_rgba(cc, alpha) for cc in c])
342
~\anaconda3\lib\site-packages\matplotlib\colors.py in _to_rgba_no_colorcycle(c,
localhost:8888/nbconvert/html/ADS-Final Exam/ADS_Exam 21C3- Mohd Jamil Talib.ipynb?download=false 20/22
9/19/21, 11:22 AM ADS_Exam 21C3- Mohd Jamil Talib
alpha)
259 return c, c, c, alpha if alpha is not None else 1.
--> 260 raise ValueError(f"Invalid RGBA argument: {orig_c!r}")
261 # tuple color.
The above exception was the direct cause of the following exception:
~\anaconda3\lib\site-packages\matplotlib\cbook\deprecation.py in wrapper(*inner_
args, **inner_kwargs)
409 else deprecation_addendum,
410 **kwargs)
--> 411 return func(*inner_args, **inner_kwargs)
412
413 return wrapper
~\anaconda3\lib\site-packages\matplotlib\axes\_axes.py in scatter(self, x, y, s,
c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnon
finite, **kwargs)
4449
4450 c, colors, edgecolors = \
-> 4451 self._parse_scatter_color_args(
4452 c, edgecolors, kwargs, x.size,
4453 get_next_color_func=self._get_patches_for_fill.get_next_
color)
~\anaconda3\lib\site-packages\matplotlib\axes\_axes.py in _parse_scatter_color_a
rgs(c, edgecolors, kwargs, xsize, get_next_color_func)
4298 # Both the mapping *and* the RGBA conversion failed:
pretty
4299 # severe failure => one may appreciate a verbose fee
dback.
-> 4300 raise ValueError(
4301 f"'c' argument must be a color, a sequence of co
lors, "
4302 f"or a sequence of numbers, not {c}") from err
'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No']
In [ ]:
In [ ]: