# 263 lines
# 6.3 KiB
# Python
# Import libraries
|
|
from __future__ import division
|
|
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt # plotting
|
|
import numpy as np # linear algebra
|
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
from sklearn.model_selection import (
|
|
train_test_split,
|
|
GridSearchCV,
|
|
)
|
|
|
|
from sklearn.metrics import (
|
|
accuracy_score,
|
|
confusion_matrix,
|
|
roc_auc_score,
|
|
recall_score,
|
|
f1_score,
|
|
)
|
|
|
|
# # Data

# ### File path
folder = "../pkg/flowOutput/"
fname_benign = "benign_2017-05-02_kali-normal22_flow_stats.csv"
fname_malicious = "webgoat_flow_stats.csv"

# ### Malicious: Webgoat
# Read the malicious (WebGoat) flow records and tag every row with its class.
pd_malicious = pd.read_csv(folder + fname_malicious)
# Drop the final row -- presumably a trailing summary/partial record from the
# flow exporter. TODO confirm against the CSV format.
pd_malicious = pd_malicious.drop(pd_malicious.index[-1:])
pd_malicious["Type"] = "Malicious"
|
# ### Benign
# Read the benign capture and tag every row with its class.
pd_benign = pd.read_csv(folder + fname_benign)
# NOTE(review): unlike the malicious set, the trailing row is NOT dropped here
# -- confirm that is intentional.
pd_benign["Type"] = "Benign"

print(pd_benign.shape)
|
# ### Combined dataframe - Benign + malicious
## Combine malicious and benign dataframes.
pd_comb = pd.concat([pd_malicious, pd_benign])

## Random shuffle of rows.
# NOTE(review): sample() is unseeded, so the shuffle (and everything downstream
# of it) is not reproducible across runs -- confirm whether a fixed
# random_state is wanted.
pd_comb = pd_comb.sample(frac=1)

# Was a bare `pd_comb.shape` expression (a notebook-export artifact that is a
# no-op in a script); print it, matching the benign-shape print above.
print(pd_comb.shape)
|
# ### Added throughput columns.
## Derive per-second ("PerTime") throughput variants of the size/length columns.
colsPerTime = [
    "flowLength",
    "fwdFlowLength",
    "bwdFlowLength",
    "packetSizeTotal",
    "fwdPacketSizeTotal",
    "bwdPacketSizeTotal",
]

for base_col in colsPerTime:
    # Note: a zero flowDuration yields inf here; such rows are removed later
    # by clean_dataset's finite-value filter.
    new_col = base_col + "PerTime"
    pd_comb[new_col] = pd_comb[base_col] / pd_comb["flowDuration"]
    print(new_col)
|
|
# ## Features
# Feature columns.
# Columns fed to the model; the final entry ("Type") is the class label and is
# split off later by clean_dataset (features = all columns but the last).
# NOTE: column order matters -- the exported weight vector (weights.txt) and
# the scaler statistics are indexed by this order.
feature_cols = [
    "flowDuration",
    "flowLength",
    "fwdFlowLength",
    "bwdFlowLength",
    "packetSizeTotal",
    "packetSizeMean",
    "packetSizeStd",
    "packetSizeMin",
    "packetSizeMax",
    "fwdPacketSizeTotal",
    "bwdPacketSizeTotal",
    "fwdPacketSizeMean",
    "bwdPacketSizeMean",
    "fwdPacketSizeStd",
    "bwdPacketSizeStd",
    "fwdPacketSizeMin",
    "bwdPacketSizeMin",
    "fwdPacketSizeMax",
    "bwdPacketSizeMax",
    "IATMean",
    "IATStd",
    "IATMin",
    "IATMax",
    "fwdIATTotal",
    "bwdIATTotal",
    "fwdIATMean",
    "bwdIATMean",
    "fwdIATStd",
    "bwdIATStd",
    "fwdIATMin",
    "bwdIATMin",
    "fwdIATMax",
    "bwdIATMax",
    "flowLengthPerTime",
    "fwdFlowLengthPerTime",
    "bwdFlowLengthPerTime",
    "packetSizeTotalPerTime",
    "fwdPacketSizeTotalPerTime",
    "bwdPacketSizeTotalPerTime",
    "Type",
]


# ### Dataframe with chosen features
## Select feature columns in datasets.
pd_comb_features = pd_comb[feature_cols]


# # Machine learning - feature importance
|
|
# ## Clean dataset
# Remove spurious entries from dataset.
def clean_dataset(df):
    """Split *df* into a feature matrix and a label vector, dropping bad rows.

    The last column of *df* is the class label; all other columns are
    features. Rows containing NaN anywhere, or +/-inf in any feature
    (e.g. throughput columns divided by a zero flowDuration), are removed.

    Unlike the previous version, the caller's DataFrame is NOT mutated
    (no inplace dropna).

    Returns:
        (X, y): X as a float64 DataFrame of features,
                y as a numpy array of labels.
    """
    # Drop rows with missing values without touching the caller's frame.
    df = df.dropna()

    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]

    # Keep only rows whose features are all finite.
    finite_rows = ~features.isin([np.nan, np.inf, -np.inf]).any(axis=1)

    return features[finite_rows].astype(np.float64), labels[finite_rows].values
|
# ### Get feature and class arrays (X and y.)
# Get feature and class arrays (X and y.)
# Deep copy guards pd_comb_features against any in-place cleaning done inside
# clean_dataset.
pd_comb_features_cp = pd_comb_features.copy(deep=True)

X, y = clean_dataset(pd_comb_features_cp)


# ## Train test split
# Hold out 30% of rows for testing; fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
|
# ## Scale data
# Standardize features to zero mean / unit variance. Fit on the training set
# only, then apply the same transform to test, so no test statistics leak in.
scaler = StandardScaler()  # alternative: MinMaxScaler
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
|
|
# ## Weighted Logistic Regression


# ### Hyperparameter grid search.
# Class weights.
# Candidate {class: weight} maps (0 = Benign, 1 = Malicious). The first five
# strongly up-weight the malicious class; the final entry weights each class
# by the percentage frequency of the *other* class in y (inverse-frequency
# balancing).
w = [
    {0: 0.10, 1: 99.90},
    {0: 0.25, 1: 99.75},
    {0: 0.50, 1: 99.50},
    {0: 0.75, 1: 99.25},
    {0: 1.00, 1: 99.00},
    {
        0: 100 * np.sum(y == "Malicious") / (np.sum(y == "Benign") + np.sum(y == "Malicious")),
        1: 100 * np.sum(y == "Benign") / (np.sum(y == "Benign") + np.sum(y == "Malicious")),
    },
]

# Inverse of regularization strength.
# Roughly C in {0.1, 0.3, 0.5, 0.7, 0.9} (arange endpoint 1.0 is exclusive).
crange = np.arange(0.1, 1.0, 0.2)


# Hyperparameter grid.
# NOTE(review): "l1" requires a solver such as liblinear or saga; with newer
# scikit-learn the default solver is lbfgs and those grid candidates would
# fail -- confirm the scikit-learn version this was written against.
hyperparam_grid = {
    "class_weight": w,
    "penalty": ["l1", "l2"],
    "C": crange,
    "fit_intercept": [True, False],
}
|
|
# ### Prepare samples.
# Encode the string class labels as integers: Benign -> 0, Malicious -> 1.
y_train2 = np.copy(y_train)
y_train2[y_train == "Benign"] = 0
y_train2[y_train == "Malicious"] = 1
|
# ### Model fitting.
# logistic model classifier.
lg = LogisticRegression(random_state=13)

# define evaluation procedure
# 10-fold cross-validated grid search optimizing ROC AUC; n_jobs=-1 uses all
# cores, and refit=True retrains the best candidate on the full training set
# so grid.predict works directly afterwards.
grid = GridSearchCV(lg, hyperparam_grid, scoring="roc_auc", cv=10, n_jobs=-1, refit=True)
grid.fit(X_train_scale, y_train2.astype("int32"))

print(f"Best score: {grid.best_score_} with param: {grid.best_params_}")
|
|
# ### Test performance.
# Predict on the held-out set with the refit best estimator.
y_pred_wt = grid.predict(X_test_scale)

# Encode the test labels exactly like the training labels:
# Benign -> 0, Malicious -> 1.
y_test2 = np.copy(y_test)
y_test2[y_test == "Benign"] = 0
y_test2[y_test == "Malicious"] = 1
|
|
# performance
# Integer-encoded ground truth -- converted once and reused by every metric
# below (the original recomputed the astype, and the confusion matrix, twice).
y_true = y_test2.astype("int32")

conf_mat = confusion_matrix(y_true, y_pred_wt)

print(f"Accuracy Score: {accuracy_score(y_true, y_pred_wt)}")
print(f"Confusion Matrix: \n{conf_mat}")
print(f"Area Under Curve: {roc_auc_score(y_true, y_pred_wt)}")
print(
    f"Recall score (Pct of true malicious detected): {100*recall_score(y_true, y_pred_wt)}"
)
# conf_mat.T[1] is the predicted-malicious column: the share of traffic that
# would be forwarded for inspection.
print(f"Data reduction: { np.round( 100.0 * conf_mat.T[1].sum() / conf_mat.sum() , 2 )} percent")

# conf_mat.T[1][1] = true positives among the flows flagged malicious.
print(
    f"Pct malicious in data sent to console: { np.round( 100.0 * conf_mat.T[1][1] / conf_mat.T[1].sum() , 2 )} percent"
)

print("F1 score: ", f1_score(y_true, y_pred_wt, average="weighted"))
|
# ### Best fit parameters.
# define model
# Rebuild a standalone LogisticRegression with the grid search's winning
# hyperparameters.
best_fit_model = LogisticRegression(
    class_weight=grid.best_params_["class_weight"],
    penalty=grid.best_params_["penalty"],
    C=grid.best_params_["C"],
    fit_intercept=grid.best_params_["fit_intercept"],
    random_state=13,
    # NOTE(review): max_iter=5 is far below sklearn's default (100) and was
    # not used during the grid search -- this refit may stop before
    # convergence. Confirm it is intentional.
    max_iter=5,
)

# fit it
best_fit_model.fit(X_train_scale, y_train2.astype("int32"))


# ### Save parameters.
# Persist the scaler statistics and model coefficients as plain-text vectors.
# The ../pkg/ml path suggests a sibling package consumes these to reapply the
# same standardization and linear model -- TODO confirm the consumer.
np.savetxt("../pkg/ml/parameters/mean.txt", scaler.mean_, delimiter=",")
np.savetxt("../pkg/ml/parameters/std.txt", scaler.scale_, delimiter=",")
np.savetxt("../pkg/ml/parameters/weights.txt", best_fit_model.coef_[0], delimiter=",")
np.savetxt("../pkg/ml/parameters/intercept.txt", best_fit_model.intercept_, delimiter=",")
|
|
# ### Feature importance scores
# Rank feature names by absolute coefficient size, largest magnitude first.
coef_magnitudes = np.abs(best_fit_model.coef_[0])
ranking = np.argsort(-coef_magnitudes)
important_features = pd_comb_features_cp.iloc[:, :-1].columns.values[ranking]


print(important_features)