Before Feature Selection
Before Feature Selection
Overview Data
In [1]: import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
data = pd.read_csv('C://Users/ditama/Downloads/Unicauca-dataset-April-June-2019-Network-flows
data.head()
5 rows × 50 columns
In [3]: data.shape
In [4]: data.columns
In [5]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2704839 entries, 0 to 2704838
Data columns (total 50 columns):
# Column Dtype
--- ------ -----
0 flow_key object
1 src_ip_numeric int64
2 src_ip object
3 src_port int64
4 dst_ip object
5 dst_port int64
6 proto int64
7 pktTotalCount int64
8 octetTotalCount int64
9 min_ps int64
10 max_ps int64
11 avg_ps float64
12 std_dev_ps float64
13 flowStart float64
14 flowEnd float64
15 flowDuration float64
16 min_piat float64
17 max_piat float64
18 avg_piat float64
19 std_dev_piat float64
20 f_pktTotalCount int64
21 f_octetTotalCount int64
22 f_min_ps int64
23 f_max_ps int64
24 f_avg_ps float64
25 f_std_dev_ps float64
26 f_flowStart float64
27 f_flowEnd float64
28 f_flowDuration float64
29 f_min_piat float64
30 f_max_piat float64
31 f_avg_piat float64
32 f_std_dev_piat float64
33 b_pktTotalCount int64
34 b_octetTotalCount int64
35 b_min_ps int64
36 b_max_ps int64
37 b_avg_ps float64
38 b_std_dev_ps float64
39 b_flowStart float64
40 b_flowEnd float64
41 b_flowDuration float64
42 b_min_piat float64
43 b_max_piat float64
44 b_avg_piat float64
45 b_std_dev_piat float64
46 flowEndReason int64
47 category object
48 application_protocol object
49 web_service object
dtypes: float64(27), int64(17), object(6)
memory usage: 1.0+ GB
Others : 14.04%
Out[10]: ['min_ps',
'b_max_ps',
'f_std_dev_piat',
'f_avg_piat',
'b_flowDuration',
'flowDuration',
'b_pktTotalCount',
'f_flowEnd',
'b_flowEnd',
'f_flowStart',
'b_avg_piat',
'f_flowDuration',
'b_max_piat',
'octetTotalCount',
'b_min_piat',
'avg_ps',
'src_ip_numeric',
'b_std_dev_piat',
'flowStart',
'f_octetTotalCount',
'pktTotalCount',
'b_avg_ps',
'src_port',
'b_min_ps',
'f_avg_ps',
'b_std_dev_ps',
'max_piat',
'min_piat',
'std_dev_ps',
'flowEnd',
'std_dev_piat',
'f_max_piat',
'f_min_ps',
'f_max_ps',
'avg_piat',
'f_std_dev_ps',
'flowEndReason',
'dst_port',
'f_pktTotalCount',
'proto',
'max_ps',
'b_flowStart',
'b_octetTotalCount',
'f_min_piat']
In [11]: data[num_cols].describe()
8 rows × 44 columns
Out[12]: []
Out[15]: ['min_ps',
'b_max_ps',
'f_std_dev_piat',
'f_avg_piat',
'b_flowDuration',
'flowDuration',
'b_pktTotalCount',
'f_flowEnd',
'b_flowEnd',
'f_flowStart',
'b_avg_piat',
'f_flowDuration',
'b_max_piat',
'octetTotalCount',
'b_min_piat',
'avg_ps',
'src_ip_numeric',
'b_std_dev_piat',
'flowStart',
'f_octetTotalCount',
'pktTotalCount',
'b_avg_ps',
'src_port',
'b_min_ps',
'f_avg_ps',
'b_std_dev_ps',
'max_piat',
'min_piat',
'std_dev_ps',
'flowEnd',
'std_dev_piat',
'f_max_piat',
'f_min_ps',
'f_max_ps',
'avg_piat',
'f_std_dev_ps',
'dst_port',
'f_pktTotalCount',
'max_ps',
'b_flowStart',
'b_octetTotalCount',
'f_min_piat']
Correlation Matrix
In [17]: corr = data[num_cols].corr()
Prepocessing
In [19]: #check null
data.isnull().sum()
Out[19]: flow_key 0
src_ip_numeric 0
src_ip 0
src_port 0
dst_ip 0
dst_port 0
proto 0
pktTotalCount 0
octetTotalCount 0
min_ps 0
max_ps 0
avg_ps 0
std_dev_ps 0
flowStart 0
flowEnd 0
flowDuration 0
min_piat 0
max_piat 0
avg_piat 0
std_dev_piat 0
f_pktTotalCount 0
f_octetTotalCount 0
f_min_ps 0
f_max_ps 0
f_avg_ps 0
f_std_dev_ps 0
f_flowStart 0
f_flowEnd 0
f_flowDuration 0
f_min_piat 0
f_max_piat 0
f_avg_piat 0
f_std_dev_piat 0
b_pktTotalCount 0
b_octetTotalCount 0
b_min_ps 0
b_max_ps 0
b_avg_ps 0
b_std_dev_ps 0
b_flowStart 0
b_flowEnd 0
b_flowDuration 0
b_min_piat 0
b_max_piat 0
b_avg_piat 0
b_std_dev_piat 0
flowEndReason 0
category 0
application_protocol 0
web_service 0
dtype: int64
Feature Selection
In [23]: ipdata.drop(['flow_key','src_ip_numeric','src_ip','dst_ip','category','application_protocol'
Out[24]: []
Final Feature
In [25]: df = ipdata.copy()
In [26]: df.head()
Out[26]: src_port dst_port proto pktTotalCount octetTotalCount min_ps max_ps avg_ps std_dev_ps
4 0 0 1 1 56 56 56 56.000000 0.000000
5 rows × 43 columns
In [27]: df.shape
In [29]: #splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state
In [30]: X_train.shape
In [31]: X_test.shape
In [32]: #normalisasi
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
In [33]: X_train_scaled
In [34]: X_test_scaled
label_encoder = LabelEncoder()
Y_train_encode = label_encoder.fit_transform(y_train)
In [37]: Y_train_encode
In [38]: Y_test_encode
In [39]: label_encoder.classes_
In [40]: label_encoder2.classes_
Out[41]: 0.716117993454667
tree_train_accuracy = clf_gini.score(X_train_scaled,Y_train_encode)
tree_accuracy = clf_gini.score(X_test_scaled,Y_test_encode)
cnt = 1
# split() method generate indices to split data into training and test set.
for train_index, test_index in kf.split(X_train_scaled, Y_train_encode):
print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
cnt+=1
# Note that:
# cross_val_score() parameter 'cv' will by default use StratifiedKFold spliting startergy if
# So you can bypass above step and just specify cv= 5 in cross_val_score() function
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:684:
UserWarning: The least populated class in y has only 1 members, which is less than
n_splits=5.
warnings.warn(
Fold:1, Train set: 1081931, Test set:270483
Fold:2, Train set: 1081931, Test set:270483
Fold:3, Train set: 1081931, Test set:270483
Fold:4, Train set: 1081931, Test set:270483
Fold:5, Train set: 1081932, Test set:270482
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:684:
UserWarning: The least populated class in y has only 1 members, which is less than
n_splits=5.
warnings.warn(
Scores for each fold are: [0.82997822 0.83020375 0.829627 0.83093947 0.82995172]
Average score: 0.83
cnt = 1
# split() method generate indices to split data into training and test set.
for train_index, test_index in kf2.split(X_test_scaled, Y_test_encode):
print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
cnt+=1
# Note that:
# cross_val_score() parameter 'cv' will by default use StratifiedKFold spliting startergy if
# So you can bypass above step and just specify cv= 5 in cross_val_score() function
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:684:
UserWarning: The least populated class in y has only 1 members, which is less than
n_splits=5.
warnings.warn(
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\model_selection\_split.py:684:
UserWarning: The least populated class in y has only 1 members, which is less than
n_splits=5.
warnings.warn(
Scores for each fold are: [0.82922402 0.82960852 0.82986361 0.83021484 0.82977488]
Average score: 0.83
Naive Bayes
In [48]: # train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB
# instantiate the model
gnb = GaussianNB()
# fit the model
gnb.fit(X_train_scaled, Y_train_encode)
Out[48]: ▾ GaussianNB
GaussianNB()
KNN Model
In [50]: #Model Classification KNN using n_neighbors = 3
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_scaled, Y_train_encode)
# store the predicted response values
Out[50]: ▾ KNeighborsClassifier
KNeighborsClassifier(n_neighbors=3)
Out[53]: ▾ MLPClassifier
MLPClassifier(hidden_layer_sizes=(3, 2))
In [54]: mlp.fit(X_train_scaled,Y_train_encode)
Out[54]: ▾ MLPClassifier
MLPClassifier(hidden_layer_sizes=(3, 2))
Random Forest
In [56]: from sklearn.ensemble import RandomForestClassifier
#Menggunakan ensamble algorithm Random Forest Classifier dengan libSklearn
modelRF = RandomForestClassifier(n_estimators=1)
In [57]: modelRF.fit(X_train_scaled,Y_train_encode)
Out[57]: ▾ RandomForestClassifier
RandomForestClassifier(n_estimators=1)
Evaluation With DT
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:133
4: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to
0.0 in labels with no predicted samples. Use `zero_division` parameter to control t
his behavior.
_warn_prf(average, modifier, msg_start, len(result))
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:133
4: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to
0.0 in labels with no predicted samples. Use `zero_division` parameter to control t
his behavior.
_warn_prf(average, modifier, msg_start, len(result))
C:\Users\ditama\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:133
4: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to
0.0 in labels with no predicted samples. Use `zero_division` parameter to control t
his behavior.
_warn_prf(average, modifier, msg_start, len(result))
In [60]: y = label_encoder.inverse_transform([28,17,102,116])
y