Read the Data
import pandas as pd
df = pd.read_csv('/content/Employee.csv')
df.head()
{"summary":"{\n \"name\": \"df\",\n \"rows\": 4653,\n \"fields\":
[\n {\n \"column\": \"Education\",\n \"properties\": {\n
\"dtype\": \"category\",\n \"num_unique_values\": 3,\n
\"samples\": [\n \"Bachelors\",\n \"Masters\",\n
\"PHD\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"JoiningYear\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 1,\n \"min\": 2012,\n
\"max\": 2018,\n \"num_unique_values\": 7,\n
\"samples\": [\n 2017,\n 2013,\n 2012\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"City\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
3,\n \"samples\": [\n \"Bangalore\",\n
\"Pune\",\n \"New Delhi\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"PaymentTier\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 1,\n \"max\": 3,\n
\"num_unique_values\": 3,\n \"samples\": [\n 3,\n
1,\n 2\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 4,\n \"min\": 22,\n \"max\": 41,\n
\"num_unique_values\": 20,\n \"samples\": [\n 34,\n
35,\n 26\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Gender\",\n \"properties\": {\n \"dtype\":
\"category\",\n \"num_unique_values\": 2,\n \"samples\":
[\n \"Female\",\n \"Male\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"EverBenched\",\n
\"properties\": {\n \"dtype\": \"category\",\n
\"num_unique_values\": 2,\n \"samples\": [\n \"Yes\",\
n \"No\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"ExperienceInCurrentDomain\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n
\"max\": 7,\n \"num_unique_values\": 8,\n \"samples\":
[\n 3,\n 4\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n },\n {\n
\"column\": \"LeaveOrNot\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n
\"max\": 1,\n \"num_unique_values\": 2,\n \"samples\":
[\n 1,\n 0\n ],\n \"semantic_type\":
\"\",\n \"description\": \"\"\n }\n }\n ]\
n}","type":"dataframe","variable_name":"df"}
df.describe()
{"summary":"{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n
{\n \"column\": \"JoiningYear\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1251.4547220922507,\n
\"min\": 1.8633768286863546,\n \"max\": 4653.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
2015.0629701267999,\n 2015.0,\n 4653.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"PaymentTier\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
1644.2629844314426,\n \"min\": 0.5614354643364909,\n
\"max\": 4653.0,\n \"num_unique_values\": 5,\n
\"samples\": [\n 2.6982591876208897,\n 3.0,\n
0.5614354643364909\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 1635.8622898051515,\n \"min\": 4.826087009126065,\n
\"max\": 4653.0,\n \"num_unique_values\": 8,\n
\"samples\": [\n 29.393294648613796,\n 28.0,\n
4653.0\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"ExperienceInCurrentDomain\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 1644.051605746371,\n
\"min\": 0.0,\n \"max\": 4653.0,\n
\"num_unique_values\": 8,\n \"samples\": [\n
2.905652267354395,\n 3.0,\n 4653.0\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"LeaveOrNot\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
1644.9416023787396,\n \"min\": 0.0,\n \"max\": 4653.0,\n
\"num_unique_values\": 5,\n \"samples\": [\n
0.3438641736514077,\n 1.0,\n 0.47504747514881046\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n }\n ]\n}","type":"dataframe"}
df.shape
(4653, 9)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64
 4   Age                        4653 non-null   int64
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64
 8   LeaveOrNot                 4653 non-null   int64
dtypes: int64(5), object(4)
memory usage: 327.3+ KB
df.isnull().sum()
Education 0
JoiningYear 0
City 0
PaymentTier 0
Age 0
Gender 0
EverBenched 0
ExperienceInCurrentDomain 0
LeaveOrNot 0
dtype: int64
df.duplicated().sum()
1889
The 1889 duplicated rows are left in place, so all 4653 rows flow into the train/test split below.
import matplotlib.pyplot as plt

# Bar chart of value counts for each categorical column
for col in df.select_dtypes(include=['object']).columns:
    counts = df[col].value_counts()
    plt.bar(counts.index, counts.values)
    plt.xlabel(col)
    plt.ylabel('Value Counts')
    plt.xticks(rotation=30)
    plt.show()
import seaborn as sns

# Box plot for each numeric column
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    sns.boxplot(x=df[col])
    plt.show()
# Handling outliers: use box plots to inspect the numerical features
numerical_features = ['JoiningYear', 'PaymentTier', 'Age',
                      'ExperienceInCurrentDomain']
for feature in numerical_features:
    plt.figure(figsize=(10, 4))
    sns.boxplot(x=df[feature])
    plt.title(f'Boxplot of {feature}')
    plt.show()
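The box plots only show outliers visually. A minimal sketch of counting them with the conventional 1.5 x IQR (Tukey) fences; the threshold is an assumption, not something the original notebook fixes:

# Count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] per numeric feature.
# The 1.5 multiplier is the usual Tukey fence, assumed here.
for feature in numerical_features:
    q1, q3 = df[feature].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((df[feature] < lower) | (df[feature] > upper)).sum()
    print(f'{feature}: {n_out} potential outliers')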
import seaborn as sns
import matplotlib.pyplot as plt
# Visualizing the distribution of the target variable 'LeaveOrNot'
sns.countplot(x='LeaveOrNot', data=df)
plt.title('Distribution of LeaveOrNot')
plt.show()
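To attach a number to the imbalance the countplot shows (roughly 66% stay vs. 34% leave, consistent with the LeaveOrNot mean of 0.34 above):

df['LeaveOrNot'].value_counts(normalize=True)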
Data Cleaning and Preprocessing
The missing-value check above already came back clean (0 nulls in every column), so no imputation is needed; the remaining preprocessing step is encoding the categorical columns.
Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

# Convert each object column to integer codes
obj_cols = df.select_dtypes(object).columns
for col in obj_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
df.dtypes
Education int64
JoiningYear int64
City int64
PaymentTier int64
Age int64
Gender int64
EverBenched int64
ExperienceInCurrentDomain int64
LeaveOrNot int64
dtype: object
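LabelEncoder imposes an arbitrary integer order on nominal columns such as City; the tree ensembles below tolerate that, but the distance- and margin-based models (logistic regression, KNN, SVC) can be misled by it. A sketch of the one-hot alternative, applied to the raw frame rather than what this notebook uses downstream:

# Alternative: one-hot encode the nominal columns on a fresh copy of the data.
raw = pd.read_csv('/content/Employee.csv')
df_onehot = pd.get_dummies(
    raw, columns=['Education', 'City', 'Gender', 'EverBenched'], drop_first=True)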
Split the Data
x = df.drop(columns=['LeaveOrNot']) # Features
y = df['LeaveOrNot'] # Target variable
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=0.8)
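The split above is unseeded, so every rerun draws a different 20% test set and the scores below will wobble. A reproducible, class-stratified variant (the random_state value is arbitrary):

xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, train_size=0.8, stratify=y, random_state=42)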
Model Selection
# Fit a model and report train/test accuracy
def eval_model(model, xtrain, ytrain, xtest, ytest):
    model.fit(xtrain, ytrain)
    train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    print(f'Train Score: {train_score}')
    print(f'Test Score: {test_score}')
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.7063406770553465
Test Score: 0.7175080558539205
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
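The warning is worth heeding: JoiningYear sits near 2015 while every other feature is a single digit, so lbfgs hits its iteration cap. A sketch of the standard fix, standardizing inside a pipeline (the max_iter bump is an extra assumption):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale features before the solver; max_iter raised as a safety margin.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
eval_model(model, xtrain, ytrain, xtest, ytest)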
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.9306824288017195
Test Score: 0.8055853920515574
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.8382590005373455
Test Score: 0.7937701396348013
from sklearn.svm import SVC
model = SVC()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.6534121440085975
Test Score: 0.6670247046186896
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.6775926921010209
Test Score: 0.6595059076262084
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.9306824288017195
Test Score: 0.8281417830290011
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
eval_model(model, xtrain, ytrain, xtest, ytest)
Train Score: 0.8559914024717894
Test Score: 0.8453276047261009
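All of these scores come from a single 80/20 draw. A cross-validated comparison is a more stable basis for model selection; a sketch with an assumed 5 folds:

from sklearn.model_selection import cross_val_score

# Mean accuracy +/- spread across folds for each candidate model
for clf in [LogisticRegression(max_iter=1000), DecisionTreeClassifier(),
            RandomForestClassifier(), GradientBoostingClassifier()]:
    scores = cross_val_score(clf, x, y, cv=5)
    print(f'{clf.__class__.__name__}: {scores.mean():.3f} +/- {scores.std():.3f}')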
Model Training
model = RandomForestClassifier()
model.fit(xtrain, ytrain)
RandomForestClassifier()
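Before evaluating, it can be informative to see which features the fitted forest actually relies on; a minimal sketch via feature_importances_:

# Rank features by the forest's impurity-based importances
importances = pd.Series(model.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False))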
Model Evaluation
trainpred = model.predict(xtrain)
testpred = model.predict(xtest)
from sklearn.metrics import classification_report
print(classification_report(ytrain, trainpred))
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      2432
           1       0.95      0.84      0.89      1290

    accuracy                           0.93      3722
   macro avg       0.94      0.91      0.92      3722
weighted avg       0.93      0.93      0.93      3722
print(classification_report(ytest, testpred))
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       621
           1       0.79      0.70      0.74       310

    accuracy                           0.84       931
   macro avg       0.82      0.80      0.81       931
weighted avg       0.83      0.84      0.83       931
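A confusion matrix makes the test-set trade-off (strong recall on class 0, weaker recall on class 1) easier to see at a glance. A minimal sketch:

from sklearn.metrics import ConfusionMatrixDisplay

# Raw counts of correct and incorrect predictions per class
ConfusionMatrixDisplay.from_predictions(ytest, testpred)
plt.title('Random Forest - Test Set')
plt.show()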