# PacketBreeze/assets/Deepfence_ML_flowmeter.py
# Import libraries
from __future__ import division
import pandas as pd
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
)
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    recall_score,
    f1_score,
)
# # Data
# ### File path
folder = "../pkg/flowOutput/"
fname_benign = "benign_2017-05-02_kali-normal22_flow_stats.csv"
fname_malicious = "webgoat_flow_stats.csv"
# ### Malicious: Webgoat
# Malicious flows
pd_malicious = pd.read_csv(folder + fname_malicious)
pd_malicious.drop(pd_malicious.tail(1).index, inplace=True)
pd_malicious["Type"] = "Malicious"
# ### Benign
# Benign flows
pd_benign = pd.read_csv(folder + fname_benign)
# pd_benign.drop(pd_webgoat.tail(1).index, inplace=True)
pd_benign["Type"] = "Benign"
print(pd_benign.shape)
# ### Combined dataframe - Benign + malicious
## Combine malicious and benign dataframes.
pd_comb = pd.concat([pd_malicious, pd_benign])
## Random shuffle of rows
pd_comb = pd_comb.sample(frac=1)
print(pd_comb.shape)
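# Optional: quick look at the class balance of the combined dataset, useful
# context for the class-weight grid further down.
print(pd_comb["Type"].value_counts())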
# ### Added throughput columns.
## Add throughput columns.
colsPerTime = [
    "flowLength",
    "fwdFlowLength",
    "bwdFlowLength",
    "packetSizeTotal",
    "fwdPacketSizeTotal",
    "bwdPacketSizeTotal",
]
for feature in colsPerTime:
    pd_comb[feature + "PerTime"] = pd_comb[feature] / pd_comb["flowDuration"]
    print(feature + "PerTime")
# ## Features
# Feature columns.
feature_cols = [
    "flowDuration",
    "flowLength",
    "fwdFlowLength",
    "bwdFlowLength",
    "packetSizeTotal",
    "packetSizeMean",
    "packetSizeStd",
    "packetSizeMin",
    "packetSizeMax",
    "fwdPacketSizeTotal",
    "bwdPacketSizeTotal",
    "fwdPacketSizeMean",
    "bwdPacketSizeMean",
    "fwdPacketSizeStd",
    "bwdPacketSizeStd",
    "fwdPacketSizeMin",
    "bwdPacketSizeMin",
    "fwdPacketSizeMax",
    "bwdPacketSizeMax",
    "IATMean",
    "IATStd",
    "IATMin",
    "IATMax",
    "fwdIATTotal",
    "bwdIATTotal",
    "fwdIATMean",
    "bwdIATMean",
    "fwdIATStd",
    "bwdIATStd",
    "fwdIATMin",
    "bwdIATMin",
    "fwdIATMax",
    "bwdIATMax",
    "flowLengthPerTime",
    "fwdFlowLengthPerTime",
    "bwdFlowLengthPerTime",
    "packetSizeTotalPerTime",
    "fwdPacketSizeTotalPerTime",
    "bwdPacketSizeTotalPerTime",
    "Type",
]
# ### Dataframe with chosen features
## Select feature columns in datasets.
pd_comb_features = pd_comb[feature_cols]
# # Machine learning - feature importance
# ## Clean dataset
# Drop rows with NaN/inf values and split the frame into features (X) and labels (Y).
def clean_dataset(df):
    df.dropna(inplace=True)
    df_X = df.iloc[:, :-1]
    df_Y = df.iloc[:, -1]
    indices_to_keep = ~df_X.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    df_X_cleaned = df_X[indices_to_keep].astype(np.float64)
    df_Y_cleaned = df_Y[indices_to_keep].values
    return df_X_cleaned, df_Y_cleaned
# ### Get feature and class arrays (X and y).
# Get feature and class arrays (X and y).
pd_comb_features_cp = pd_comb_features.copy(deep=True)
X, y = clean_dataset(pd_comb_features_cp)
# ## Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# ## Scale data
scaler = StandardScaler()  # MinMaxScaler is a possible alternative
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
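# Optional sanity check: after standardization the training features should have
# roughly zero mean and unit standard deviation.
print(np.round(X_train_scale.mean(axis=0), 3))
print(np.round(X_train_scale.std(axis=0), 3))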
# ## Weighted Logistic Regression
# ### Hyperparameter grid search.
# Class weights.
w = [
    {0: 0.10, 1: 99.90},
    {0: 0.25, 1: 99.75},
    {0: 0.50, 1: 99.50},
    {0: 0.75, 1: 99.25},
    {0: 1.00, 1: 99.00},
    {
        0: 100 * np.sum(y == "Malicious") / (np.sum(y == "Benign") + np.sum(y == "Malicious")),
        1: 100 * np.sum(y == "Benign") / (np.sum(y == "Benign") + np.sum(y == "Malicious")),
    },
]
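# Optional: print the proportion-based weight pair for reference; it mirrors the
# observed class frequencies, so the minority class is weighted up.
print(f"Proportion-based class weights: {w[-1]}")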
# Inverse of regularization strength.
crange = np.arange(0.1, 1.0, 0.2)
# Hyperparameter grid.
hyperparam_grid = {
    "class_weight": w,
    "penalty": ["l1", "l2"],
    "C": crange,
    "fit_intercept": [True, False],
}
# ### Prepare samples.
# String to int for classes
y_train2 = np.copy(y_train)
y_train2[np.where(y_train == "Benign")[0]] = 0
y_train2[np.where(y_train == "Malicious")[0]] = 1
# ### Model fitting.
# Logistic regression classifier. Use the liblinear solver so that both the l1
# and l2 penalties in the grid are supported (the default lbfgs handles only l2).
lg = LogisticRegression(random_state=13, solver="liblinear")
# Grid search over the hyperparameters with 10-fold cross-validation, scored by ROC AUC.
grid = GridSearchCV(lg, hyperparam_grid, scoring="roc_auc", cv=10, n_jobs=-1, refit=True)
grid.fit(X_train_scale, y_train2.astype("int32"))
print(f"Best score: {grid.best_score_} with param: {grid.best_params_}")
# ### Test performance.
y_pred_wt = grid.predict(X_test_scale)
y_test2 = np.copy(y_test)
y_test2[np.where(y_test == "Benign")[0]] = 0
y_test2[np.where(y_test == "Malicious")[0]] = 1
# performance
conf_mat = confusion_matrix(y_test2.astype("int32"), y_pred_wt)
print(f"Accuracy Score: {accuracy_score(y_test2.astype('int32'),y_pred_wt)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test2.astype('int32'), y_pred_wt)}")
print(f"Area Under Curve: {roc_auc_score(y_test2.astype('int32'), y_pred_wt)}")
print(
    f"Recall score (Pct of true malicious detected): {100*recall_score(y_test2.astype('int32'), y_pred_wt)}"
)
print(f"Data reduction: {np.round(100.0 * conf_mat.T[1].sum() / conf_mat.sum(), 2)} percent")
print(
    f"Pct malicious in data sent to console: {np.round(100.0 * conf_mat.T[1][1] / conf_mat.T[1].sum(), 2)} percent"
)
print("F1 score: ", f1_score(y_test2.astype("int32"), y_pred_wt, average="weighted"))
# ### Best fit parameters.
# define model
best_fit_model = LogisticRegression(
    class_weight=grid.best_params_["class_weight"],
    penalty=grid.best_params_["penalty"],
    C=grid.best_params_["C"],
    fit_intercept=grid.best_params_["fit_intercept"],
    random_state=13,
    solver="liblinear",
    max_iter=5,
)
# fit it
best_fit_model.fit(X_train_scale, y_train2.astype("int32"))
# ### Save parameters.
np.savetxt("../pkg/ml/parameters/mean.txt", scaler.mean_, delimiter=",")
np.savetxt("../pkg/ml/parameters/std.txt", scaler.scale_, delimiter=",")
np.savetxt("../pkg/ml/parameters/weights.txt", best_fit_model.coef_[0], delimiter=",")
np.savetxt("../pkg/ml/parameters/intercept.txt", best_fit_model.intercept_, delimiter=",")
# ### Feature importance scores
important_features = pd_comb_features_cp.iloc[:, :-1].columns.values[
    np.argsort(-1 * np.abs(best_fit_model.coef_[0]))
]
print(important_features)
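# Optional: rough bar chart of absolute coefficient magnitudes per feature, one
# way to visualize the importance ordering printed above (uses the matplotlib
# import at the top).
coef_abs = np.abs(best_fit_model.coef_[0])
order = np.argsort(coef_abs)
plt.figure(figsize=(8, 10))
plt.barh(np.array(feature_cols[:-1])[order], coef_abs[order])
plt.xlabel("abs(logistic regression coefficient)")
plt.title("Feature importance (standardized features)")
plt.tight_layout()
plt.show()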