# main.py

# -*- coding: utf-8 -*-
"""Malware Detection Using ai.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1v2F9C3RzsBouBDrtA37ZH6PrEnct2C6W

# **Phase 1: Dataset Exploration**
"""

# Step 1: Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 2: Load the Dataset
# The CSV is read from the current working directory.
dataset = pd.read_csv('MalwareDataset.csv')

# NOTE: outside a notebook a bare expression such as `dataset.head()` is
# evaluated and discarded, so every result below is printed explicitly.

# Step 3: Display the First Few Rows
print("First 5 Rows of the Dataset:")
print(dataset.head())

# Step 4: Understand the Columns and Identify the Target
print("\nDataset Information:")
dataset.info()  # info() writes its column/dtype overview to stdout itself
print("\nUnique values in the 'legitimate' column:")
print(dataset['legitimate'].value_counts())  # Confirm the target column distribution

# Step 5: Descriptive Statistics
print("\nDescriptive Statistics:")
print(dataset.describe())

# Step 6: Check for Missing Values
print("\nMissing Values in Each Column:")
print(dataset.isnull().sum())

"""**1) How many legitimate and malicious files are in the dataset?**

* **Legitimate files:** 40,918
* **Malicious files:** 96,526

**2) Which features seem to differentiate legitimate files from malware?**

 By analyzing the descriptive statistics:

* **ResourceSize:** ResourceSize has a mean of 247,476 but varies widely across files. Malware may have more extreme values in this feature compared to legitimate files.
* **DllCharacteristics:** Legitimate files might have more specific and consistent characteristics, while malware could show more variation.
* **MajorImageVersion and MajorOperatingSystemVersion:** Many files have a 0 value for these features, potentially hinting at default or missing metadata that might be more common in malware.
* **NumberOfSections:** Malware files may have anomalous values here compared to legitimate ones.
* **AddressOfEntryPoint:** The range and mean values are significantly large, potentially differentiating benign files from malware if the distribution of entry points is noticeably different.

# **Phase 2: Data Preparation**
"""

# Step 0: Build the frame used for modelling.
# `dataset_cleaned` was referenced below without ever being defined, which
# raised a NameError. It is derived here from `dataset` by keeping only numeric
# columns (StandardScaler cannot handle text columns) and dropping rows with
# missing values.
# NOTE(review): the original cleaning step is not visible in this file --
# confirm that this matches the intended preprocessing.
dataset_cleaned = dataset.select_dtypes(include='number').dropna()

# Step 1: Split the dataset into features (X) and target (y)
X = dataset_cleaned.drop(columns=['legitimate'])  # Features
y = dataset_cleaned['legitimate']  # Target

# Step 2: Divide the data into training (80%) and testing (20%) sets
# stratify=y keeps the class ratio of `legitimate` the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Standardize the numerical data
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify scaling (mean ~0 and std ~1 for training data)
print("\nFirst 5 rows of standardized training data:")
print(X_train_scaled[:5])

print("\nFeature means (training set):", X_train_scaled.mean(axis=0))
print("Feature standard deviations (training set):", X_train_scaled.std(axis=0))

"""# **Phase 3: Model Creation and Training**

# **Part A: Supervised Models**
"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Decision Tree -- fit on the standardized training split
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Predict the held-out test split with the fitted tree
dt_predictions = dt_model.predict(X_test_scaled)

# Report accuracy and the per-class precision/recall/F1 table
dt_accuracy = accuracy_score(y_test, dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print(
    "\nDecision Tree Classification Report:",
    classification_report(y_test, dt_predictions),
    sep="\n",
)

# Step 2: Random Forest -- an ensemble of 100 trees, same seed for repeatability
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict the held-out test split with the fitted forest
rf_predictions = rf_model.predict(X_test_scaled)

# Report accuracy and the per-class precision/recall/F1 table
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"\nRandom Forest Accuracy: {rf_accuracy:.4f}")
print(
    "\nRandom Forest Classification Report:",
    classification_report(y_test, rf_predictions),
    sep="\n",
)

"""**Comparing the Performance of Decision Tree and Random Forest**

**Accuracy:**
* Decision Tree: 98.79%
* Random Forest: 99.19%

**Observation:**

Random Forest slightly outperforms the Decision Tree in terms of accuracy. This is expected as Random Forest leverages multiple decision trees to reduce overfitting and improve generalization.

**Classification report**

**Precision:**

* Decision Tree (Class 0 - Malware): 0.99 | Class 1 - Legitimate: 0.98
* Random Forest (Class 0 - Malware): 0.99 | Class 1 - Legitimate: 0.99

**Observation:**

Both models perform well in terms of precision, but Random Forest performs better for identifying legitimate files (Class 1), with a slightly higher precision score.

**Recall:**

* Decision Tree (Class 0 - Malware): 0.99 | Class 1 - Legitimate: 0.98
* Random Forest (Class 0 - Malware): 0.99 | Class 1 - Legitimate: 0.99

**Observation:**

Both models have very high recall for both classes, but Random Forest slightly outperforms the Decision Tree when it comes to identifying legitimate files (Class 1).

**F1-Score:**

* Decision Tree (Class 0 - Malware): 0.99 | Class 1 - Legitimate: 0.98
* Random Forest (Class 0 - Malware): 0.99 | Class 1 - Legitimate: 0.99

**Observation:**

F1-score is essentially the harmonic mean of precision and recall, and both models perform similarly in terms of F1-score. However, Random Forest again shows a slight edge for legitimate files (Class 1) due to its improved recall and precision.

**Confusion Matrix:**
"""

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# pyplot is needed for the titles/show calls below; it was previously imported
# only further down the file, so this section failed with a NameError on `plt`.
import matplotlib.pyplot as plt

# Rows of each matrix are the true classes and columns the predicted classes,
# ordered 0 (Malware) then 1 (Legitimate) to match `display_labels`.

# Decision Tree Confusion Matrix
dt_cm = confusion_matrix(y_test, dt_predictions)
print("\nDecision Tree Confusion Matrix:")
print(dt_cm)
ConfusionMatrixDisplay(confusion_matrix=dt_cm, display_labels=['Malware', 'Legitimate']).plot(cmap='Blues')
plt.title("Decision Tree Confusion Matrix")
plt.show()

# Random Forest Confusion Matrix
rf_cm = confusion_matrix(y_test, rf_predictions)
print("\nRandom Forest Confusion Matrix:")
print(rf_cm)
ConfusionMatrixDisplay(confusion_matrix=rf_cm, display_labels=['Malware', 'Legitimate']).plot(cmap='Greens')
plt.title("Random Forest Confusion Matrix")
plt.show()

"""**True Negatives:**
* Decision Tree: 19120
* Random Forest: 19184

**Observation:**

Random Forest performs slightly better at detecting malware, as it has more True Negatives, correctly identifying more malicious files.

**False Positives:**
* Decision Tree: 185
* Random Forest: 121

**Observation:**

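Random Forest produces fewer false positives. Because the positive class here is Legitimate (1), a false positive is a malware file misclassified as legitimate, i.e. malware that slips through undetected; Random Forest lets fewer of these through than the Decision Tree.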

**False Negatives:**

* Decision Tree: 147
* Random Forest: 102

**Observation:**

Random Forest produces fewer false negatives. With Legitimate (1) as the positive class, a false negative is a legitimate file wrongly flagged as malware (a false alarm), so a lower count improves its accuracy in identifying legitimate files.

**True Positives:**

* Decision Tree: 8037
* Random Forest: 8082

**Observation:**

Random Forest identifies more legitimate files as legitimate, contributing to its overall better performance.

**Overall Comparison**

Random Forest outperforms Decision Tree in every metric, though the differences are minimal. It is better at detecting legitimate files, with a slightly lower false positive and false negative rate compared to the Decision Tree.

# **Part B: Unsupervised Model**
"""

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Step 1: Implement KMeans with 2 clusters
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train_scaled)

# Step 2: Get the predicted cluster labels
cluster_labels = kmeans.labels_

# Step 3: Compare with actual labels (y_train)
# Cluster ids are arbitrary, so cluster 0 need not correspond to class 0.
# The labels are flipped when the mismatches (off-diagonal) outnumber the
# matches (diagonal), and the matrix is then recomputed so that the plot in
# Step 6 describes the same labels as the scatter plot in Step 5.
cm = confusion_matrix(y_train, cluster_labels)
if cm[0, 1] + cm[1, 0] > cm[0, 0] + cm[1, 1]:
    cluster_labels = 1 - cluster_labels  # Swap the cluster labels
    cm = confusion_matrix(y_train, cluster_labels)

# Step 4: Compute Adjusted Rand Index (ARI) for comparison
# ARI does not depend on how the clusters are numbered, so the swap above
# does not change this score.
ari_score = adjusted_rand_score(y_train, cluster_labels)
print(f"Adjusted Rand Index (ARI): {ari_score:.4f}")

# Step 5: Visualize the clusters
plt.figure(figsize=(8, 6))

# Scatter plot of the first two features colored by predicted cluster labels
plt.scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
plt.title("KMeans Clustering of Training Data (2 Clusters)")
plt.xlabel("Feature 1 (scaled)")
plt.ylabel("Feature 2 (scaled)")
plt.colorbar(label='Cluster Label')
plt.show()

# Step 6: Confusion Matrix for Cluster Labels vs Actual Labels
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Malware', 'Legitimate']).plot(cmap='Blues')
plt.title("Confusion Matrix of KMeans Clustering vs Actual Labels")
plt.show()

"""# **Part C: Deep Learning Model**"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from tensorflow.keras.layers import Input

# 1. Create the model
model = Sequential()

# Input layer: Specify the input shape using Input layer
model.add(Input(shape=(X_train_scaled.shape[1],)))  # input shape is (number of features,)

# Hidden layer 1: Dense layer with 64 neurons, ReLU activation
model.add(Dense(64, activation='relu'))

# Hidden layer 2: Dense layer with 32 neurons, ReLU activation
model.add(Dense(32, activation='relu'))

# Output layer: 1 neuron, sigmoid activation function for binary classification
model.add(Dense(1, activation='sigmoid'))

# 2. Compile the model
# The metric name is pinned to 'auc' because the history keys read in step 5
# ('auc' / 'val_auc') are derived from it. An unnamed AUC() can be given a
# suffixed name such as 'auc_1' when another AUC metric already exists in the
# session, which would make those lookups fail with a KeyError.
model.compile(optimizer=Adam(),
              loss=BinaryCrossentropy(),
              metrics=[AUC(name='auc')])  # AUC (Area Under Curve) for evaluation

# 3. Train the model on the training data
# NOTE: the test split doubles as validation data here, so the "validation"
# curve plotted in step 5 is computed on the test set.
history = model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test))

# 4. Evaluate the model on the test set
test_loss, test_auc = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {test_loss:.4f}, Test AUC: {test_auc:.4f}")

# 5. Plot the training and validation AUC
import matplotlib.pyplot as plt

plt.plot(history.history['auc'], label='Train AUC')
plt.plot(history.history['val_auc'], label='Validation AUC')
plt.title('Model AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')
plt.legend()
plt.show()

"""# **Phase 4: Cross-Validation**"""

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 1. Fresh, unfitted estimators for cross-validation
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# 2. 5-fold cross-validated accuracy on the standardized training split
cv_options = dict(cv=5, scoring='accuracy')

# Decision Tree: per-fold scores, then their mean and spread
dt_cv_scores = cross_val_score(decision_tree, X_train_scaled, y_train, **cv_options)
print(
    f"Decision Tree Cross-Validation Scores: {dt_cv_scores}",
    f"Decision Tree Average Accuracy: {dt_cv_scores.mean():.4f}",
    f"Decision Tree Standard Deviation: {dt_cv_scores.std():.4f}",
    sep="\n",
)

# Random Forest: per-fold scores, then their mean and spread
rf_cv_scores = cross_val_score(random_forest, X_train_scaled, y_train, **cv_options)
print(
    f"Random Forest Cross-Validation Scores: {rf_cv_scores}",
    f"Random Forest Average Accuracy: {rf_cv_scores.mean():.4f}",
    f"Random Forest Standard Deviation: {rf_cv_scores.std():.4f}",
    sep="\n",
)

# 3. Name the model with the higher mean accuracy (a tie goes to the Decision Tree)
better_model = "Random Forest" if rf_cv_scores.mean() > dt_cv_scores.mean() else "Decision Tree"
print(f"{better_model} generalizes better on this dataset.")

"""# **Phase 5: Model Evaluation and Comparison**"""

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Refit both tree-based models on the whole training split
for estimator in (decision_tree, random_forest):
    estimator.fit(X_train_scaled, y_train)

# 2. Predict the held-out test split with each fitted model
dt_predictions = decision_tree.predict(X_test_scaled)
rf_predictions = random_forest.predict(X_test_scaled)

# 3. Decision Tree: accuracy, confusion matrix and per-class report
print("Decision Tree Evaluation:")
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_conf_matrix = confusion_matrix(y_test, dt_predictions)
dt_class_report = classification_report(y_test, dt_predictions)
print(
    f"Accuracy: {dt_accuracy:.4f}",
    f"Confusion Matrix:\n{dt_conf_matrix}",
    f"Classification Report:\n{dt_class_report}",
    sep="\n",
)

# 4. Random Forest: the same three metrics
print("\nRandom Forest Evaluation:")
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)
rf_class_report = classification_report(y_test, rf_predictions)
print(
    f"Accuracy: {rf_accuracy:.4f}",
    f"Confusion Matrix:\n{rf_conf_matrix}",
    f"Classification Report:\n{rf_class_report}",
    sep="\n",
)

# 5. KMeans: assign test samples to the fitted clusters and score the
#    agreement with the true labels using the Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score

kmeans_predictions = kmeans.predict(X_test_scaled)
kmeans_adjusted_rand = adjusted_rand_score(y_test, kmeans_predictions)

print(
    "\nKMeans Clustering Evaluation:",
    f"Adjusted Rand Index (ARI): {kmeans_adjusted_rand:.4f}",
    sep="\n",
)

"""# **Comparison:**

**Best Overall Model:**

* The Random Forest achieves the best overall accuracy (99.19%) among the models.

**Consistency of Results Cross-Validation vs. Train-Test Split:**

The Decision Tree and Random Forest both show consistent performance between simple data splits and 5-fold cross-validation.

1. Decision Tree:

* Simple split accuracy: 98.79% (0.9879)
* Cross-validation average accuracy: 98.65% (0.9865)

2. Random Forest:

* Simple split accuracy: 99.19% (0.9919)
* Cross-validation average accuracy: 99.04% (0.9904)

The results are consistent between the simple data split and cross-validation for both models. This confirms their robustness and reliable performance on the dataset.

**KMeans Effectiveness:**

* KMeans does not effectively differentiate between malware and legitimate files compared to supervised models. This is expected because KMeans relies solely on feature distribution, and the data likely has overlapping regions that are better separated by supervised learning.
"""