Commit e6b6cab (1 parent: 2fce88b): "Deprecated ml methods and rearrange code."
15 changed files, with 329 additions and 602 deletions.
preprocess.py (modified):

```diff
@@ -1,5 +1,3 @@
 # preprocess.py
 
 """
-Author: tamnv
-Description: This script will extract raw data from EMBER
```
Two files renamed without changes.
eval.py (new file, +73 lines):

```python
"""Evaluate pretrained model independently. Note that we must have pretrained | ||
model before. | ||
Usage: python eval.py \ | ||
--model_path MODEL_PATH \ | ||
--scaler_path SCALER_PATH \ | ||
--data_dir DATA_DIR \ | ||
""" | ||
import argparse | ||
from sys import argv | ||
import pickle as pkl | ||
|
||
import keras | ||
import tensorflow as tf | ||
import numpy as np | ||
from keras.models import load_model | ||
from sklearn.metrics import roc_curve, auc | ||
|
||
import utils | ||
import ember | ||
|
||
# Fix tensorflow bug on rtx card | ||
config = tf.ConfigProto() | ||
config.gpu_options.allow_growth = True | ||
keras.backend.set_session(tf.Session(config=config)) | ||
|
||
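
# Note: tf.ConfigProto and tf.Session are TensorFlow 1.x APIs. If this script
# were ever ported to TensorFlow 2.x, the rough equivalent (an assumption, not
# part of this commit) would be:
#     for gpu in tf.config.list_physical_devices('GPU'):
#         tf.config.experimental.set_memory_growth(gpu, True)
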
def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(prog='MalNet')
    parser.add_argument('--data_dir', dest='data_dir', type=str, default='data',
                        help='Path to the directory containing the test dataset.')
    parser.add_argument('--model_path', dest='model_path', type=str,
                        help='Path to the saved model file.')
    parser.add_argument('--scaler_path', dest='scaler_path', type=str,
                        help='Path to the pickled scaler object.')
    parser.add_argument('--scale', dest='scale', type=float, default=1.,
                        help='Fraction of the training/test dataset to use.')
    return parser.parse_args(argv)


# Parse arguments
args = parse_arguments(argv[1:])
model_path = args.model_path
scaler_path = args.scaler_path
num_classes = 2

print('Loading data...')
data_dir = args.data_dir
_, _, X_test, y_test = ember.read_vectorized_features(data_dir, scale=args.scale)
X_test = np.array(X_test)

# Keep only supervised samples; unsupervised samples carry the label -1
X_test = X_test[y_test != -1]
y_test = y_test[y_test != -1]

print('Loading model from {}'.format(model_path))
with open(scaler_path, 'rb') as f:
    scaler = pkl.load(f)
model = load_model(model_path)
model.summary()

# Apply the training-time scaler, add a channel dimension, one-hot the labels
X_test = scaler.transform(X_test)
X_test = np.expand_dims(X_test, axis=-1)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# ROC curve
y_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(np.argmax(y_test, axis=1), y_pred[:, 1], pos_label=1)
acc = np.mean(np.equal(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
utils.visualize_roc(fpr, tpr, thresholds, acc)
```
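utils.visualize_roc is project code that is not shown in this commit view. As a rough idea of what such a helper might look like (a hypothetical sketch assuming matplotlib; the real implementation may differ):

```python
import os

import matplotlib.pyplot as plt
from sklearn.metrics import auc


def visualize_roc(fpr, tpr, thresholds, acc, save_dir='.'):
    """Plot a ROC curve and save it to disk (hypothetical sketch).

    `thresholds` is accepted for signature parity but not plotted here.
    """
    plt.figure()
    plt.plot(fpr, tpr, label='ROC (AUC = {:.4f})'.format(auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve (test accuracy = {:.4f})'.format(acc))
    plt.legend(loc='lower right')
    plt.savefig(os.path.join(save_dir, 'roc.png'))
```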
(Two additional changed files could not be displayed in the diff view.)
train.py (new file, +180 lines):

```python
"""Train our network on Ember dataset. Before running this script, you need to | ||
extract features first. See create_data.py for the details. | ||
Usage: python train.py \ | ||
--data_dir DATA_DIR \ | ||
--lr LEARNING_RATE \ | ||
--batch_size BATCH_SIZE \ | ||
--epochs EPOCHS | ||
""" | ||
import os | ||
import argparse | ||
import numpy as np | ||
import pickle as pkl | ||
from sys import argv | ||
|
||
import tensorflow as tf | ||
import keras | ||
from keras.models import Sequential | ||
from keras.layers import Dense, Dropout, Flatten, Embedding, Activation | ||
from keras.layers import Conv1D, MaxPooling1D, BatchNormalization | ||
from keras.optimizers import SGD, Adam | ||
from keras import regularizers | ||
from keras.callbacks import ModelCheckpoint | ||
from sklearn.preprocessing import StandardScaler | ||
from sklearn.metrics import roc_curve, auc, accuracy_score | ||
|
||
import ember | ||
import utils | ||
|
||
|
||
# Fix tensorflow bug on rtx card | ||
config = tf.ConfigProto() | ||
config.gpu_options.allow_growth = True | ||
keras.backend.set_session(tf.Session(config=config)) | ||
|
||
|
||
def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(prog='MalNet')
    parser.add_argument('--data_dir', dest='data_dir', type=str, default='data',
                        help='Directory that stores the dataset.')
    parser.add_argument('--save_dir', dest='save_dir', type=str, default='saved_models',
                        help='Directory in which to save the model.')
    parser.add_argument('--batch_size', dest='batch_size', type=int, default=32,
                        help='Mini-batch size.')
    parser.add_argument('--epochs', dest='epochs', type=int, default=5,
                        help='Number of epochs.')
    parser.add_argument('--split', dest='split', type=float, default=0.1,
                        help='Validation dataset ratio.')
    parser.add_argument('--lr', dest='learning_rate', type=float, default=1e-3,
                        help='Learning rate.')
    parser.add_argument('--scale', dest='scale', type=float, default=1.,
                        help='Fraction of the training/test dataset to use.')
    return parser.parse_args(argv)

# Parse arguments
args = parse_arguments(argv[1:])

# Hyperparameters
batch_size = args.batch_size
epochs = args.epochs
learning_rate = args.learning_rate
weight_decay = 5e-4
save_dir = args.save_dir

# Params
num_classes = 2
split = args.split

# Load the vectorized EMBER features
print('Loading data...')
data_dir = args.data_dir
X_train, y_train, X_test, y_test = \
    ember.read_vectorized_features(data_dir, scale=args.scale)
X_train = np.array(X_train)
X_test = np.array(X_test)
# Keep only supervised samples; unsupervised samples carry the label -1
X_train = X_train[y_train != -1]
y_train = y_train[y_train != -1]
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
X_train = X_train[indices]
y_train = y_train[indices]

# Hold out the last `split` fraction of the shuffled training set for validation
idx = int((1. - split) * X_train.shape[0])
X_val, y_val = X_train[idx:], y_train[idx:]
X_train, y_train = X_train[:idx], y_train[:idx]
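# Worked example of the split above: with split=0.1 and 100,000 labeled
# training rows, idx = int(0.9 * 100000) = 90,000, so the last 10,000
# shuffled rows become the validation set.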

X_test = X_test[y_test != -1]
y_test = y_test[y_test != -1]
print('Train/Val/Test: {}/{}/{}'.format(X_train.shape[0],
                                        X_val.shape[0],
                                        X_test.shape[0]))

# Convert labels to one-hot
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# Standardize the features before training and save the scaler for the
# deployment phase
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
with open(os.path.join(save_dir, 'scaler.pkl'), 'wb') as f:
    pkl.dump(scaler, f)

# The model expects 3D input, so expand the last dimension
X_train = np.expand_dims(X_train, axis=-1)
X_val = np.expand_dims(X_val, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# For convenience
dim = X_train.shape[1]
regularizer = regularizers.l2(weight_decay)

# Build the model
model = Sequential()
model.add(Conv1D(128, 64, strides=64, activation='relu',
                 kernel_regularizer=regularizer, input_shape=(dim, 1)))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, strides=2, kernel_regularizer=regularizer,
                 activation='relu'))
model.add(BatchNormalization())
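
# Shape walkthrough, assuming the 2351-dimensional EMBER v1 feature vectors
# (the actual dim comes from ember.read_vectorized_features; 'valid' padding):
#   input                         -> (2351, 1)
#   Conv1D(128, 64, strides=64)   -> (36, 128)  non-overlapping windows of 64
#   Conv1D(128, 3, strides=2)     -> (17, 128)
# so the first layer reads the flat feature vector in 64-feature chunks.
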
model.add(Flatten())
model.add(Dense(256, kernel_regularizer=regularizer, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, kernel_regularizer=regularizer, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(num_classes, kernel_regularizer=regularizer, activation='softmax'))
model.summary()

# Print verbose information
print('Batch size: {}'.format(batch_size))
print('Epochs: {}'.format(epochs))
print('Learning rate: {}'.format(learning_rate))
print('Weight decay: {}'.format(weight_decay))

# Define the optimizer and compile the model
optimizer = Adam(learning_rate)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# Training: checkpoint the best model by validation accuracy
# (save_dir was already created above when the scaler was saved)
model_name = 'malnet_model.{epoch:03d}.h5'
filepath = os.path.join(save_dir, model_name)
checkpoint = ModelCheckpoint(filepath=filepath,
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True)
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(X_val, y_val),
                    callbacks=[checkpoint])

# Visualize the result
utils.visualize_result(history, save_dir)

# ROC curve on the held-out test set
y_pred = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(np.argmax(y_test, axis=1), y_pred[:, 1], pos_label=1)
acc = np.mean(np.equal(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
utils.visualize_roc(fpr, tpr, thresholds, acc, save_dir)
```
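One portability caveat: the multi-backend Keras this script targets (note the tf.Session usage) logs validation accuracy as val_acc, so monitor='val_acc' works there, but Keras 2.3+ and tf.keras log it as val_accuracy, in which case the checkpoint would warn and never save. A version-aware guard (a sketch, not part of this commit) could look like:

```python
import keras
from keras.callbacks import ModelCheckpoint

# Older multi-backend Keras logs validation accuracy as 'val_acc';
# Keras 2.3+ and tf.keras log it as 'val_accuracy'.
major, minor = (int(v) for v in keras.__version__.split('.')[:2])
monitor_key = 'val_accuracy' if (major, minor) >= (2, 3) else 'val_acc'

checkpoint = ModelCheckpoint(filepath='saved_models/malnet_model.{epoch:03d}.h5',
                             monitor=monitor_key,
                             verbose=1,
                             save_best_only=True)
```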