Deprecated ml methods and rearrange code.
tamnvhust1 committed Mar 12, 2020
1 parent 2fce88b commit e6b6cab
Showing 15 changed files with 329 additions and 602 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -1,9 +1,11 @@
# VScode
.vscode/
result/
data/
logs/
.env/

# Model files
*.h5
*.pkl

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
2 changes: 0 additions & 2 deletions src/preprocess.py → malnet/create_data.py
@@ -1,5 +1,3 @@
-# preprocess.py
-
"""
- Author: tamnv
- Description: This script will extract raw data from EMBER
Expand Down
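Editor's note: the body of create_data.py is collapsed above; per its docstring it extracts raw data from EMBER. For orientation, a minimal sketch of how the upstream ember package builds and loads the vectorized dataset — the data path is an assumption, and this repository's read_vectorized_features additionally takes a scale argument:

    # Sketch using the upstream ember API -- the data path is hypothetical.
    import ember
    ember.create_vectorized_features('data/')    # writes the vectorized X/y files
    X_train, y_train, X_test, y_test = ember.read_vectorized_features('data/')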
File renamed without changes.
File renamed without changes.
73 changes: 73 additions & 0 deletions malnet/eval.py
@@ -0,0 +1,73 @@
"""Evaluate pretrained model independently. Note that we must have pretrained
model before.
Usage: python eval.py \
--model_path MODEL_PATH \
--scaler_path SCALER_PATH \
--data_dir DATA_DIR \
"""
import argparse
from sys import argv
import pickle as pkl

import keras
import tensorflow as tf
import numpy as np
from keras.models import load_model
from sklearn.metrics import roc_curve, auc

import utils
import ember

# Work around a TensorFlow bug on RTX cards: allow GPU memory growth
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
keras.backend.set_session(tf.Session(config=config))

def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(prog='MalNet')
    parser.add_argument('--data_dir', dest='data_dir', type=str, default='data',
                        help='Path to the data directory containing the test dataset.')
    parser.add_argument('--model_path', dest='model_path', type=str,
                        help='Path to the model file.')
    parser.add_argument('--scaler_path', dest='scaler_path', type=str,
                        help='Path to the scaler object file.')
    parser.add_argument('--scale', dest='scale', type=float, default=1.,
                        help='Fraction of the training/test dataset to use.')
    return parser.parse_args(argv)


# Parse arguments
args = parse_arguments(argv[1:])
model_path = args.model_path
scaler_path = args.scaler_path
num_classes = 2

print('Loading data...')
data_dir = args.data_dir
_, _, X_test, y_test = ember.read_vectorized_features(data_dir, scale=args.scale)
X_test = np.array(X_test)

# Keep only labeled data
# Note that unlabeled samples carry label -1
X_test = X_test[y_test != -1]
y_test = y_test[y_test != -1]

print('Loading model from {}'.format(model_path))
with open(scaler_path, 'rb') as f:
    scaler = pkl.load(f)
model = load_model(model_path)
model.summary()

X_test = scaler.transform(X_test)
X_test = np.expand_dims(X_test, axis=-1)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# ROC curve
y_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(np.argmax(y_test, axis=1), y_pred[:, 1], pos_label=1)
acc = np.mean(np.equal(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
utils.visualize_roc(fpr, tpr, thresholds, acc)
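Editor's note: eval.py relies on utils.visualize_roc, which is not part of this diff. A minimal sketch of what it might look like, inferred from the call sites (four arguments here, five with save_dir in train.py below); the body, default save path, and plot details are assumptions:

    # Hypothetical sketch of utils.visualize_roc -- not the repository's actual code.
    import os
    import matplotlib.pyplot as plt
    from sklearn.metrics import auc

    def visualize_roc(fpr, tpr, thresholds, acc, save_dir='saved_models'):
        """Plot the ROC curve and save it next to the model checkpoints."""
        roc_auc = auc(fpr, tpr)  # area under the sampled ROC points
        plt.figure()
        plt.plot(fpr, tpr, label='ROC (AUC = %.4f)' % roc_auc)
        plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('Accuracy: %.4f' % acc)
        plt.legend(loc='lower right')
        plt.savefig(os.path.join(save_dir, 'roc_auc.png'))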
Binary file added malnet/saved_models/roc_auc.png
Binary file added malnet/saved_models/train.png
42 changes: 21 additions & 21 deletions src/test.py → malnet/test.py
@@ -1,11 +1,7 @@
-# test.py
-"""
-- Author: tamnv
-- Description: Use pretrained model to predict some examples
-in real to demonstrate its performance.
-"""
+"""Use a pretrained model to classify some real examples, demonstrating that it works.
+Usage: python test.py --input_file PE_FILE --model_path MODEL_PATH --scaler_path SCALER_PATH
+"""

import os
import glob
import json
@@ -14,28 +10,37 @@
from sys import argv

import numpy as np
-from keras.models import model_from_json
+import keras
+import tensorflow as tf
+from keras.models import load_model

import ember
-from util import get_paths

+# Work around a TensorFlow bug on RTX cards: allow GPU memory growth
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+keras.backend.set_session(tf.Session(config=config))

def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(prog='MalNet')
-    parser.add_argument('-i', '--input', dest='input', type=str,
+    parser.add_argument('--input_file', dest='input_file', type=str,
                        help='Path to PE file.')
-    parser.add_argument('--model', dest='model', type=str,
+    parser.add_argument('--model_path', dest='model_path', type=str,
+                        help='Path to the model file.')
+    parser.add_argument('--scaler_path', dest='scaler_path', type=str,
+                        help='Path to the scaler object file.')
+    parser.add_argument('--threshold', dest='threshold', type=float, default=0.273,
+                        help='Threshold to distinguish benign from malicious.')
    return parser.parse_args(argv)

# Parse args
args = parse_arguments(argv[1:])
-input_file = args.input
+input_file = args.input_file
+model_path = args.model_path

print('Example: %s' % input_file)
-print('Model dir: %s' % args.model)
+print('Model path: %s' % args.model_path)
+print('Threshold: %f' % args.threshold)

# Extract features from PE file
@@ -45,16 +50,11 @@ def parse_arguments(argv):
feature = np.array(extractor.feature_vector(raw_bytes), dtype=np.float32)

# Load model and predict
-print('Loading model...')
-model_dir = args.model
-path_dict = get_paths(model_dir)
-json_file = open(path_dict['graph'], 'r')
-loaded_model_json = json_file.read()
-json_file.close()
-model = model_from_json(loaded_model_json)
-model.load_weights(path_dict['model'])
-with open(path_dict['scaler'], 'rb') as f:
+print('Loading model from {}'.format(model_path))
+scaler_path = args.scaler_path
+with open(scaler_path, 'rb') as f:
    scaler = pkl.load(f)
+model = load_model(model_path)

features = np.array([feature], dtype=np.float32)
features = scaler.transform(features)
Expand Down
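Editor's note: the tail of test.py is collapsed above. Given the two-class softmax output built in train.py and the --threshold argument, the final prediction step plausibly looks like the sketch below; the expand_dims step and variable names are assumptions, not visible diff content:

    # Hypothetical completion of the collapsed tail of test.py.
    features = np.expand_dims(features, axis=-1)    # model expects 3D input
    score = model.predict(features)[0, 1]           # probability of the malicious class
    label = 'malicious' if score > args.threshold else 'benign'
    print('Score: %f -> %s' % (score, label))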
180 changes: 180 additions & 0 deletions malnet/train.py
@@ -0,0 +1,180 @@
"""Train our network on Ember dataset. Before running this script, you need to
extract features first. See create_data.py for the details.
Usage: python train.py \
--data_dir DATA_DIR \
--lr LEARNING_RATE \
--batch_size BATCH_SIZE \
--epochs EPOCHS
"""
import os
import argparse
import numpy as np
import pickle as pkl
from sys import argv

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding, Activation
from keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from keras.optimizers import SGD, Adam
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, accuracy_score

import ember
import utils


# Work around a TensorFlow bug on RTX cards: allow GPU memory growth
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
keras.backend.set_session(tf.Session(config=config))


def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(prog='MalNet')
    parser.add_argument('--data_dir', dest='data_dir', type=str, default='data',
                        help='Directory that stores our dataset.')
    parser.add_argument('--save_dir', dest='save_dir', type=str, default='saved_models',
                        help='Directory to save the model.')
    parser.add_argument('--batch_size', dest='batch_size', type=int, default=32,
                        help='Number of samples per mini-batch.')
    parser.add_argument('--epochs', dest='epochs', type=int, default=5,
                        help='Number of epochs.')
    parser.add_argument('--split', dest='split', type=float, default=0.1,
                        help='Validation dataset ratio.')
    parser.add_argument('--lr', dest='learning_rate', type=float, default=1e-3,
                        help='Learning rate.')
    parser.add_argument('--scale', dest='scale', type=float, default=1.,
                        help='Fraction of the training/test dataset to use.')
    return parser.parse_args(argv)


# Parse arguments
args = parse_arguments(argv[1:])

# Hyperparameters
batch_size = args.batch_size
epochs = args.epochs
learning_rate = args.learning_rate
weight_decay = 5e-4
save_dir = args.save_dir

# Params
num_classes = 2
split = args.split


# Load the vectorized EMBER features
print('Loading data...')
data_dir = args.data_dir
X_train, y_train, X_test, y_test = \
    ember.read_vectorized_features(data_dir, scale=args.scale)
X_train = np.array(X_train)
X_test = np.array(X_test)


# Keep only labeled data
# Note that unlabeled samples carry label -1
X_train = X_train[y_train != -1]
y_train = y_train[y_train != -1]
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
y_train = y_train[indices]

idx = int((1. - split) * X_train.shape[0])
X_val, y_val = X_train[idx:], y_train[idx:]
X_train, y_train = X_train[:idx], y_train[:idx]


X_test = X_test[y_test != -1]
y_test = y_test[y_test != -1]
print('Train/Val/Test: {}/{}/{}'.format(X_train.shape[0],
                                        X_val.shape[0],
                                        X_test.shape[0]))


# Convert labels to one-hot
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)


# Standardize the data before training.
# Save the scaler for the deployment phase.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
with open(os.path.join(save_dir, 'scaler.pkl'), 'wb') as f:
    pkl.dump(scaler, f)


# The model expects 3D input, so expand the last dim.
X_train = np.expand_dims(X_train, axis=-1)
X_val = np.expand_dims(X_val, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# For convenience
dim = X_train.shape[1]
regularizer = regularizers.l2(weight_decay)

# Build the model
model = Sequential()
model.add(Conv1D(128, 64, strides=64, activation='relu',
                 kernel_regularizer=regularizer, input_shape=(dim, 1)))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, strides=2, kernel_regularizer=regularizer, activation='relu'))
model.add(BatchNormalization())

model.add(Flatten())
model.add(Dense(256, kernel_regularizer=regularizer, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, kernel_regularizer=regularizer, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(num_classes, kernel_regularizer=regularizer, activation='softmax'))
model.summary()

# Print verbose information
print('Batch size: {}'.format(batch_size))
print('Epochs: {}'.format(epochs))
print('Learning rate: {}'.format(learning_rate))
print('Weight decay: {}'.format(weight_decay))

# Define the optimizer and compile model
optimizer = Adam(learning_rate)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# Training
model_name = 'malnet_model.{epoch:03d}.h5'
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
filepath = os.path.join(save_dir, model_name)
checkpoint = ModelCheckpoint(filepath=filepath,
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True)
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(X_val, y_val),
                    callbacks=[checkpoint])

# Visualize the result
utils.visualize_result(history, save_dir)

# ROC curve
y_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(np.argmax(y_test, axis=1), y_pred[:, 1], pos_label=1)
acc = np.mean(np.equal(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
utils.visualize_roc(fpr, tpr, thresholds, acc, save_dir)
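Editor's note: train.py imports auc and accuracy_score but the code shown never calls them. Summarizing the ROC and reading an operating threshold off the curve (perhaps how test.py's 0.273 default was chosen) takes only a few lines; the 1% target FPR below is an illustrative assumption:

    # Hypothetical post-training summary -- not part of this commit.
    roc_auc = auc(fpr, tpr)                  # area under the ROC curve
    print('Test AUC: %.4f' % roc_auc)
    target_fpr = 0.01                        # illustrative operating point
    idx = np.searchsorted(fpr, target_fpr)   # fpr from roc_curve is non-decreasing
    print('Threshold at FPR %.2f: %f' % (target_fpr, thresholds[idx]))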