library(ggplot2)
library(caret)
library(reshape2)
library(FactoMineR)
library(factoextra)
library(gmodels)
library(e1071)
library(dummies)
library(kernlab)
library(ggpubr)
library(PCAmixdata)
# Data-source switch handed to read_data() by the dataset loaders in this file:
# FALSE reads a CSV from local disk, otherwise an AzureML workspace is used.
azure_ml <- FALSE

# Negated membership operator: TRUE for elements of the left-hand side that do
# not occur in the right-hand side.
`%ni%` <- Negate(`%in%`)

# Replacement for exact zeros before taking logarithms (see
# return_log_variable).
epsilon <- 1e-05
#' Load a dataset from a local CSV file or from an AzureML workspace.
#'
#' @param csv File name (local mode) or dataset name (AzureML mode).
#' @param azure_ml Logical flag. When FALSE the file is read from disk with
#'   read.csv(); otherwise the dataset is downloaded through AzureML.
#' @param dir Directory prefix used in local mode. It is pasted directly in
#'   front of `csv`, so it must end with a path separator. Defaults to the
#'   location that used to be hard-coded here.
#' @return The object returned by read.csv() or download.datasets(); strings
#'   are read as factors in both modes.
read_data <- function(csv, azure_ml, dir = "C:\\Anaconda\\Data\\") {
  if (!azure_ml) {
    return(read.csv(paste0(dir, csv), stringsAsFactors = TRUE))
  }
  # Attached only on this branch so local runs do not need the AzureML package.
  library("AzureML")
  ws <- workspace()
  download.datasets(ws, csv, stringsAsFactors = TRUE)
}
#' Split a data frame into predictors and the `class_normal` label.
#'
#' @param df Data frame containing a `class_normal` column.
#' @return A list with `X` (all columns except `class_normal`, always a data
#'   frame) and `y` (a one-column data frame named `class_normal`).
return_X_y <- function(df) {
  if (!("class_normal" %in% names(df))) {
    stop("`df` must contain a 'class_normal' column", call. = FALSE)
  }
  # drop = FALSE: with a single remaining predictor, `[` would otherwise
  # collapse X to a bare vector and break data-frame based callers.
  X <- df[, !(names(df) %in% "class_normal"), drop = FALSE]
  y <- as.data.frame(df$class_normal)
  names(y) <- "class_normal"
  list(X = X, y = y)
}
#' Frequency table of one categorical variable in long format.
#'
#' @param value Vector of category values to tabulate.
#' @param cat Label stored in the `categorie` column of every row.
#' @return Data frame with columns `categorie`, `waarde` (the distinct values)
#'   and `frequentie` (their counts), in that order.
return_df_categorical <- function(value, cat) {
  frequencies <- setNames(as.data.frame(table(value)), c("waarde", "frequentie"))
  frequencies[["categorie"]] <- cat
  frequencies[c("categorie", "waarde", "frequentie")]
}
#' Stacked frequency tables for the categorical traffic variables.
#'
#' Tabulates `protocol_type`, `service` and `flag` from the data frame that is
#' passed in. The earlier version read these columns from a global `df_nid`
#' and ignored its argument.
#'
#' @param X Data frame with columns `protocol_type`, `service` and `flag`.
#' @return Data frame with columns `categorie`, `waarde`, `frequentie`; the
#'   per-variable tables are row-bound in the order listed above.
return_frequency_categoricals <- function(X) {
  cat_variables <- c("protocol_type", "service", "flag")
  per_variable <- lapply(cat_variables, function(variable) {
    return_df_categorical(X[[variable]], variable)
  })
  do.call(rbind, per_variable)
}
#' Six-number summary per numeric column, with Dutch column headers.
#'
#' @param X Data frame whose columns are all numeric.
#' @return Data frame with one row per column of `X` (row names are the
#'   column names) and columns `minimum`, `1st Qu.`, `mediaan`, `gemiddelde`,
#'   `3rd Qu.`, `maximum`.
return_summary <- function(X) {
  stats <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
  if (!all(vapply(X, is.numeric, logical(1)))) {
    stop("return_summary() expects only numeric columns", call. = FALSE)
  }
  # summary() appends an "NA's" entry for columns with missing values, which
  # made sapply() return a ragged list. Picking the six statistics by name and
  # using vapply() keeps the result a fixed 6 x ncol(X) matrix.
  summary_matrix <- vapply(
    X,
    function(column) as.numeric(unclass(summary(column))[stats]),
    setNames(numeric(length(stats)), stats)
  )
  X_summary <- as.data.frame(t(summary_matrix))
  names(X_summary) <- c("minimum", "1st Qu.", "mediaan", "gemiddelde",
                        "3rd Qu.", "maximum")
  X_summary
}
#' Flag highly correlated columns and print a heat map of the strong pairs.
#'
#' @param X Numeric data passed to cor().
#' @param cof Correlation cutoff, used both for findCorrelation() and for
#'   selecting the pairs that are plotted.
#' @return The result of findCorrelation() on the correlation matrix rounded
#'   to two decimals.
return_indices_correlated_variables <- function(X, cof) {
  cor_rounded <- round(cor(X), 2)
  redundant <- findCorrelation(cor_rounded, cutoff = cof)

  # Long format for plotting; keep pairs whose signed, rounded correlation
  # lies in [cof, 1), so entries equal to 1 (e.g. the diagonal) are left out.
  pairs_long <- melt(cor_rounded)
  pairs_long <- pairs_long[pairs_long$value >= cof, ]
  pairs_long <- pairs_long[pairs_long$value < 1.0, ]

  heatmap <- ggplot(data = pairs_long, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    theme_gray(base_size = 20) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  print(heatmap)

  redundant
}
#' Scatter plot of two columns, coloured by traffic class.
#'
#' @param df Data frame with a `class_normal` column (0 is shown as
#'   "Abnormaal", anything else as "Normaal") plus the two plotted columns.
#' @param pc1 Name of the column on the x axis.
#' @param pc2 Name of the column on the y axis.
#' @return The plot object produced by ggpubr::ggscatter().
plot_pca <- function(df, pc1, pc2) {
  df$class_normal <- as.factor(
    ifelse(df$class_normal == 0, "Abnormaal", "Normaal")
  )
  ggpubr::ggscatter(
    df,
    x = pc1,
    y = pc2,
    color = "class_normal",
    palette = "jco",
    # Legend title previously read "Nerwerkverkeer" (typo).
    legend.title = "Netwerkverkeer",
    legend.position = "top",
    ggtheme = theme_gray(base_size = 20)
  )
}
#' Log-transform a numeric vector after replacing exact zeros.
#'
#' Zeros are replaced by `eps`, the natural log is taken, and the absolute
#' value of the smallest log is added to every element. When the smallest log
#' is zero or negative this moves the minimum to 0.
#'
#' @param x Numeric vector.
#' @param eps Replacement for zeros; defaults to the file-level `epsilon`,
#'   which is only looked up when `eps` is not supplied.
#' @return Numeric vector of the same length as `x`.
return_log_variable <- function(x, eps = epsilon) {
  x <- ifelse(x == 0, eps, x)
  x <- log(x)
  x + abs(min(x))
}
#' Log-transform the skewed count/byte columns of a dataset.
#'
#' Splits `df` with return_X_y(), applies a shifted log transform to the
#' columns listed in `variables` (those that are present), and returns the
#' untouched predictors, the transformed columns and the label, in that order.
#'
#' @param df Data frame containing `class_normal`.
#' @param epsilon Replacement for exact zeros before log(). This argument used
#'   to be accepted but ignored; it is now honoured. If omitted, the
#'   file-level `epsilon` is used, as before.
#' @return Data frame: remaining predictors, transformed columns,
#'   `class_normal`.
return_log <- function(df, epsilon) {
  eps <- if (missing(epsilon)) {
    get("epsilon", envir = parent.env(environment()))
  } else {
    epsilon
  }
  variables <- c("duration", "src_bytes", "dst_bytes", "num_root",
                 "num_compromised", "count", "hot", "num_file_creations",
                 "srv_count", "dst_host_count", "dst_host_srv_count")

  # Replace zeros, take logs, then add |min| so the smallest value is 0 when
  # the smallest log is not positive.
  log_shift <- function(x) {
    x <- log(ifelse(x == 0, eps, x))
    x + abs(min(x))
  }

  D <- return_X_y(df)
  X <- D$X
  to_log <- names(X) %in% variables
  # drop = FALSE keeps single-column selections as data frames.
  logged <- X[, to_log, drop = FALSE]
  logged[] <- lapply(logged, log_shift)
  cbind(X[, !to_log, drop = FALSE], logged, D$y)
}
#' Combine source and destination byte counts into a single magnitude.
#'
#' @param df Data frame with `src_bytes` and `dst_bytes` columns.
#' @return `df` with a `bytes` column equal to
#'   sqrt(src_bytes^2 + dst_bytes^2) and without the two input columns.
return_bytes <- function(df) {
  df$bytes <- sqrt(df$src_bytes^2 + df$dst_bytes^2)
  df[!(names(df) %in% c("src_bytes", "dst_bytes"))]
}
#' Load and prepare the validation dataset.
#'
#' Reads the validation CSV via read_data() (using the file-level `azure_ml`
#' flag), removes three columns, turns `class` into a 0/1 `class_normal`
#' label (1 when class is 'normal'), dummy-encodes the categorical columns
#' and removes duplicated rows.
#'
#' @return The prepared validation data frame.
return_validation_dataset <- function() {
  raw <- read_data("NID_network_intrusion_detection_validatie_dataset.csv", azure_ml)
  for (unused in c("diff_level", "is_host_login", "num_outbound_cmds")) {
    raw[[unused]] <- NULL
  }

  raw$class_normal <- ifelse(raw$class == "normal", 1, 0)
  raw$class <- NULL

  encoded <- dummy.data.frame(
    raw,
    names = c("protocol_type", "service", "flag"),
    sep = "."
  )
  encoded[!duplicated(encoded), ]
}
#' Validation dataset with the log and byte-magnitude transformations applied.
#'
#' @return Output of return_bytes() applied to return_log() of the prepared
#'   validation dataset, using the file-level `epsilon`.
return_validation_dataset_transformed <- function() {
  logged <- return_log(return_validation_dataset(), epsilon)
  return_bytes(logged)
}
#' Confusion-matrix report for a model thresholded at 0.5.
#'
#' @param model Fitted model accepted by predict().
#' @param df Data frame with the predictors and the actual `class_normal`.
#' @param type `type` argument forwarded to predict().
#' @return Result of confusionMatrix() on the table of predicted (rows)
#'   versus actual (columns) classes, both with levels 0 and 1.
return_classification_report <- function(model, df, type) {
  scores <- as.data.frame(
    predict(model, newdata = df, family = binomial, type = type)
  )
  names(scores) <- "f_predicted"
  # Scores above 0.5 count as class 1, everything else as class 0.
  df$f_predicted <- ifelse(scores$f_predicted > 0.5, 1, 0)

  binary_levels <- c(0, 1)
  xtab <- table(
    factor(df$f_predicted, levels = binary_levels),
    factor(df$class_normal, levels = binary_levels)
  )
  confusionMatrix(xtab)
}
#' Append thresholded predictions to a data frame.
#'
#' @param model Fitted model accepted by predict().
#' @param df Data frame with the predictors.
#' @param type `type` argument forwarded to predict().
#' @return `df` with two added columns: numeric `f_predicted` (1 when the
#'   score exceeds 0.5, else 0) and, as the last column, a factor named
#'   `class_normal` holding the same prediction with levels 0 and 1.
#'   NOTE(review): when `df` already has a `class_normal` column the result
#'   contains two columns with that name, and `$class_normal` then returns
#'   the first (original) one.
return_predictions <- function(model, df, type) {
  scores <- as.data.frame(
    predict(model, newdata = df, family = binomial, type = type)
  )
  names(scores) <- "f_predicted"
  df$f_predicted <- ifelse(scores$f_predicted > 0.5, 1, 0)

  predicted_class <- setNames(
    as.data.frame(factor(df$f_predicted, levels = c(0, 1))),
    "class_normal"
  )
  cbind(df, predicted_class)
}
#' Plot the first two principal components, coloured by traffic class.
#'
#' @param df Data frame with numeric predictors and `class_normal`.
#' @return The plot produced by plot_pca() for PC1 versus PC2 of a centred and
#'   scaled PCA on the predictors.
return_plot_pca <- function(df) {
  features <- return_X_y(df)$X
  # `scale.` is the full name of the argument that `scale = TRUE` resolved to.
  components <- prcomp(features, retx = TRUE, center = TRUE, scale. = TRUE)
  first_two <- as.data.frame(components$x)[, 1:2]
  labels <- setNames(as.data.frame(df$class_normal), "class_normal")
  plot_pca(cbind(first_two, labels), "PC1", "PC2")
}
#' Untransformed validation rows that the saved model predicts as abnormal.
#'
#' Scores the transformed validation dataset with the stored model, reloads
#' the validation CSV without dummy encoding or transformations, and keeps the
#' rows whose predicted class is 0.
#'
#' @return Data frame of the raw validation columns plus `label` (actual
#'   class, 1 = normal) and `class_normal` (predicted class, always 0 here).
return_abnormal_predicted_traffic <- function()
{
  model <- readRDS("C:\\Anaconda\\LOI Eindproduct\\LOI 772Z4 Opdracht 17 Eindproduct LR Model Transformations.RDS")
  scored <- return_predictions(model, return_validation_dataset_transformed(), "response")

  # NOTE(review): this file name differs from the one read in
  # return_validation_dataset(); the row-wise pairing below assumes both hold
  # the same rows in the same order - confirm.
  df_val <- read_data("LOI 772Z4 Opdracht 17 Eindproduct Validatiedataset.csv", azure_ml)
  df_val$diff_level <- NULL
  df_val$is_host_login <- NULL
  df_val$num_outbound_cmds <- NULL
  df_val$label <- ifelse(df_val$class == "normal", 1, 0)
  df_val$class <- NULL
  df_val <- df_val[!duplicated(df_val), ]

  if (nrow(df_val) != nrow(scored)) {
    stop("raw and scored validation data have different numbers of rows",
         call. = FALSE)
  }
  # return_predictions() appends a second `class_normal` column, so
  # `scored$class_normal` resolves to the first one: the actual label. Use
  # `f_predicted`, which holds the prediction unambiguously.
  df_val$class_normal <- scored$f_predicted
  df_val[df_val$class_normal == 0, ]
}
#' Frequency tables of protocol type and service for predicted-abnormal
#' traffic.
#'
#' @return List with `protocol_types` and `services`, each the table() of the
#'   corresponding column of return_abnormal_predicted_traffic().
return_distributions <- function() {
  df_val <- return_abnormal_predicted_traffic()
  # A stray table(df_val$service) call whose result was discarded has been
  # removed; the returned list is unchanged.
  list(
    protocol_types = table(df_val$protocol_type),
    services = table(df_val$service)
  )
}
#' Scatter plot of source versus destination bytes for predicted-abnormal
#' traffic.
#'
#' @return The plot object produced by ggpubr::ggscatter().
return_plot_bytes <- function() {
  abnormal <- return_abnormal_predicted_traffic()
  ggpubr::ggscatter(
    abnormal,
    x = "src_bytes",
    y = "dst_bytes",
    ggtheme = theme_gray(base_size = 20),
    palette = "jco"
  )
}