Commit

Upload RQ1 plotting scripts

danielhuici committed Jan 29, 2025
1 parent c3b1022 commit c641e14
Showing 2 changed files with 172 additions and 0 deletions.
72 changes: 72 additions & 0 deletions plotting/rq1/plot_cumulative.py
@@ -0,0 +1,72 @@
import json
import numpy as np
import matplotlib.pyplot as plt
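
# plot_cumulative.py: for each fuzzy-hashing dataset, plot the cumulative proportion of
# files with at most k detected similar matches, and mark the K = 8 cut-off.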

json_files = ["cumulative_tlsh.json", "cumulative_ssdeep.json", "cumulative_minhash.json"]

datasets = ["TLSH dataset", "ssdeep dataset", "MinHash dataset"]
colors = ["black", "red", "blue"]
markers = ["o", "s", "^"]

data = []
original_counts = [] # To store the original number of files
for file in json_files:
    with open(file, "r") as f:
        counts = json.load(f)
    original_counts.append(len(counts))  # Store the total number of files
    # Filter out files with more than 50 plagiarism cases
    counts = {key: value for key, value in counts.items() if value <= 50}
    data.append(list(counts.values()))

x_max = 50
x_range = np.arange(0, x_max + 1)
cumulative_proportions = []

for dataset in data:
    frequencies = np.zeros(x_max + 1)
    for count in dataset:
        if count <= x_max:
            frequencies[count] += 1
        else:
            frequencies[-1] += 1  # Group counts > x_max
    cumulative_counts = np.cumsum(frequencies)
    cumulative_proportion = cumulative_counts / len(dataset)
    cumulative_proportions.append(cumulative_proportion)
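
# Plot one cumulative curve per dataset; the legend reports the unfiltered file count N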

plt.figure(figsize=(8, 5))
for i, proportions in enumerate(cumulative_proportions):
    plt.plot(
        x_range,
        proportions,
        label=f"{datasets[i]} ($N={original_counts[i]}$ total files)",
        color=colors[i],
        marker=markers[i],
        markersize=5,
        linewidth=1,
    )

k_limit = 8
plt.axvline(x=k_limit, color="blue", linestyle="--", linewidth=1)
plt.text(
    k_limit + 0.5,
    0.1,
    r"$K = 8$ limit",
    color="blue",
    fontsize=10,
    rotation=0,
)

plt.xlabel("Number of Detected Similar Matches per File", fontsize=12)
plt.ylabel("Cumulative Proportion of Files", fontsize=12)
plt.legend(fontsize=10, loc="lower right")
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlim(0, x_max)
plt.ylim(0, 1)
plt.tight_layout()

output_file = "cumulative_plot.pdf"
plt.savefig(output_file, format="pdf", bbox_inches="tight")
plt.close()

print(f"Plot saved to {output_file}")
100 changes: 100 additions & 0 deletions plotting/rq1/plot_performance.py
@@ -0,0 +1,100 @@
import json
import matplotlib.pyplot as plt
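
# plot_performance.py: plot Accuracy, Precision, Recall, and F1-Score against the
# similarity threshold for TLSH, ssdeep, and MinHash LSH, in three side-by-side panels.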

def normalize(number):
    # TLSH metrics files are named by the complementary score (100 - similarity threshold),
    # e.g. a threshold of 95 maps to the file suffix 5.
    return 100 - number

def plot_metrics():
    minhash_thresholds = [70, 75, 80, 85, 90, 95]
    ssdeep_thresholds = [70, 75, 80, 85, 90, 95, 100]
    tlsh_thresholds = [70, 75, 80, 85, 90, 95, 100]

    # Load ssdeep metrics
    ssdeep_metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in ssdeep_thresholds:
        with open(f"/home/ricardo/plagiarism-dataset/metrics/full_metrics_ssdeep_{threshold}.json", "r") as file:
            data = json.load(file)
        for ssdeep_metric in ssdeep_metrics.keys():
            ssdeep_metrics[ssdeep_metric].append(data[ssdeep_metric])

    # Load TLSH metrics (file suffix is 100 - threshold, computed by normalize)
    tlsh_metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in tlsh_thresholds:
        with open(f"/home/ricardo/plagiarism-dataset/metrics/full_metrics_tlsh_{normalize(threshold)}.json", "r") as file:
            data = json.load(file)
        for tlsh_metric in tlsh_metrics.keys():
            tlsh_metrics[tlsh_metric].append(data[tlsh_metric])

    # Load MinHash metrics (values are stored under the "72" key in each file)
    minhash_metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in minhash_thresholds:
        with open(f"metrics2_{threshold / 100:.2f}.json", "r") as file:
            data = json.load(file)
        for metric in minhash_metrics.keys():
            minhash_metrics[metric].append(data["72"][metric])

    # Create a single figure with three horizontally split plots
    fig, axes = plt.subplots(1, 3, figsize=(6, 3), sharey=True, gridspec_kw={'wspace': 0.05})
    axes[0].set_ylim(0.9, 1.0)  # Shared Y-axis limits for all subplots

    # TLSH
    for metric, values in tlsh_metrics.items():
        axes[0].plot(tlsh_thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    axes[0].set_xlabel(r"$\tau_{TLSH}$", fontsize=8)
    axes[0].tick_params(axis='both', labelsize=6, length=1, width=0.5)
    axes[0].set_xticks([70, 75, 80, 85, 90, 95, 100])

    # SSDEEP
    for metric, values in ssdeep_metrics.items():
        axes[1].plot(ssdeep_thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    axes[1].set_xlabel(r"$\tau_{ssdeep}$", fontsize=8)
    axes[1].tick_params(axis='both', labelsize=6, length=1, width=0.5)
    axes[1].set_xticks([70, 75, 80, 85, 90, 95, 100])

    # MinHash
    for metric, values in minhash_metrics.items():
        axes[2].plot(minhash_thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    axes[2].set_xlabel(r"$\tau_{MinHash LSH}$", fontsize=8)
    axes[2].tick_params(axis='both', labelsize=6, length=1, width=0.5)
    axes[2].set_xticks([70, 75, 80, 85, 90, 95, 100])

    for ax in axes:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['bottom'].set_linewidth(0.5)
        ax.spines['left'].set_linewidth(0.5)
        ax.grid(False)

    fig.text(0.04, 0.5, 'Metric Value', va='center', rotation='vertical', fontsize=8)

    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc='lower center',
        ncol=4,
        fontsize=6,
        frameon=False,
        bbox_to_anchor=(0.5, -0.05)
    )

    plt.tight_layout(pad=0.2)
    plt.subplots_adjust(bottom=0.12)
    plt.savefig("performance.pdf", format="pdf", bbox_inches='tight', pad_inches=0, dpi=300)
    plt.show()


if __name__ == "__main__":
    plot_metrics()
