Commit

Upload RQ1 plotting scripts

danielhuici committed Jan 29, 2025
1 parent c3b1022 commit c641e14
Showing 2 changed files with 172 additions and 0 deletions.
72 changes: 72 additions & 0 deletions plotting/rq1/plot_cumulative.py
@@ -0,0 +1,72 @@
import json
import numpy as np
import matplotlib.pyplot as plt
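
# plot_cumulative.py: for each fuzzy-hashing dataset, plot the cumulative proportion of
# files with at most k detected similar matches, and mark the K = 8 cut-off.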

json_files = ["cumulative_tlsh.json", "cumulative_ssdeep.json", "cumulative_minhash.json"]

datasets = ["TLSH dataset", "ssdeep dataset", "MinHash dataset"]
colors = ["black", "red", "blue"]
markers = ["o", "s", "^"]

data = []
original_counts = [] # To store the original number of files
for file in json_files:
    with open(file, "r") as f:
        counts = json.load(f)
    original_counts.append(len(counts))  # Store the total number of files
    # Filter out files with more than 50 plagiarism cases
    counts = {key: value for key, value in counts.items() if value <= 50}
    data.append(list(counts.values()))

x_max = 50
x_range = np.arange(0, x_max + 1)
cumulative_proportions = []

for dataset in data:
    frequencies = np.zeros(x_max + 1)
    for count in dataset:
        if count <= x_max:
            frequencies[count] += 1
        else:
            frequencies[-1] += 1  # Group counts > x_max
    cumulative_counts = np.cumsum(frequencies)
    cumulative_proportion = cumulative_counts / len(dataset)
    cumulative_proportions.append(cumulative_proportion)
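
# Plot one cumulative curve per dataset; the legend reports the unfiltered file count N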

plt.figure(figsize=(8, 5))
for i, proportions in enumerate(cumulative_proportions):
    plt.plot(
        x_range,
        proportions,
        label=f"{datasets[i]} ($N={original_counts[i]}$ total files)",
        color=colors[i],
        marker=markers[i],
        markersize=5,
        linewidth=1,
    )

k_limit = 8
plt.axvline(x=k_limit, color="blue", linestyle="--", linewidth=1)
plt.text(
    k_limit + 0.5,
    0.1,
    r"$K = 8$ limit",
    color="blue",
    fontsize=10,
    rotation=0,
)

plt.xlabel("Number of Detected Similar Matches per File", fontsize=12)
plt.ylabel("Cumulative Proportion of Files", fontsize=12)
plt.legend(fontsize=10, loc="lower right")
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlim(0, x_max)
plt.ylim(0, 1)
plt.tight_layout()

output_file = "cumulative_plot.pdf"
plt.savefig(output_file, format="pdf", bbox_inches="tight")
plt.close()

print(f"Plot saved to {output_file}")
100 changes: 100 additions & 0 deletions plotting/rq1/plot_performance.py
@@ -0,0 +1,100 @@
import json
import matplotlib.pyplot as plt
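
# plot_performance.py: plot Accuracy, Precision, Recall, and F1-Score against the
# similarity threshold for TLSH, ssdeep, and MinHash LSH, in three side-by-side panels.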

def normalize(number):
    # TLSH metrics files are named by the complementary score (100 - similarity threshold),
    # e.g. a threshold of 95 maps to the file suffix 5.
    return 100 - number

def plot_metrics():
    minhash_thresholds = [70, 75, 80, 85, 90, 95]
    ssdeep_thresholds = [70, 75, 80, 85, 90, 95, 100]
    tlsh_thresholds = [70, 75, 80, 85, 90, 95, 100]

    # Load ssdeep metrics
    ssdeep_metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in ssdeep_thresholds:
        with open(f"/home/ricardo/plagiarism-dataset/metrics/full_metrics_ssdeep_{threshold}.json", "r") as file:
            data = json.load(file)
        for ssdeep_metric in ssdeep_metrics.keys():
            ssdeep_metrics[ssdeep_metric].append(data[ssdeep_metric])

    # Load TLSH metrics (file suffix is 100 - threshold, computed by normalize)
    tlsh_metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in tlsh_thresholds:
        with open(f"/home/ricardo/plagiarism-dataset/metrics/full_metrics_tlsh_{normalize(threshold)}.json", "r") as file:
            data = json.load(file)
        for tlsh_metric in tlsh_metrics.keys():
            tlsh_metrics[tlsh_metric].append(data[tlsh_metric])

    # Load MinHash metrics (values are stored under the "72" key in each file)
    minhash_metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in minhash_thresholds:
        with open(f"metrics2_{threshold / 100:.2f}.json", "r") as file:
            data = json.load(file)
        for metric in minhash_metrics.keys():
            minhash_metrics[metric].append(data["72"][metric])

    # Create a single figure with three horizontally split plots
    fig, axes = plt.subplots(1, 3, figsize=(6, 3), sharey=True, gridspec_kw={'wspace': 0.05})
    axes[0].set_ylim(0.9, 1.0)  # Shared Y-axis limits for all subplots

    # TLSH
    for metric, values in tlsh_metrics.items():
        axes[0].plot(tlsh_thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    axes[0].set_xlabel(r"$\tau_{TLSH}$", fontsize=8)
    axes[0].tick_params(axis='both', labelsize=6, length=1, width=0.5)
    axes[0].set_xticks([70, 75, 80, 85, 90, 95, 100])

    # SSDEEP
    for metric, values in ssdeep_metrics.items():
        axes[1].plot(ssdeep_thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    axes[1].set_xlabel(r"$\tau_{ssdeep}$", fontsize=8)
    axes[1].tick_params(axis='both', labelsize=6, length=1, width=0.5)
    axes[1].set_xticks([70, 75, 80, 85, 90, 95, 100])

    # MinHash
    for metric, values in minhash_metrics.items():
        axes[2].plot(minhash_thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    axes[2].set_xlabel(r"$\tau_{MinHash LSH}$", fontsize=8)
    axes[2].tick_params(axis='both', labelsize=6, length=1, width=0.5)
    axes[2].set_xticks([70, 75, 80, 85, 90, 95, 100])

    for ax in axes:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['bottom'].set_linewidth(0.5)
        ax.spines['left'].set_linewidth(0.5)
        ax.grid(False)

    fig.text(0.04, 0.5, 'Metric Value', va='center', rotation='vertical', fontsize=8)

    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc='lower center',
        ncol=4,
        fontsize=6,
        frameon=False,
        bbox_to_anchor=(0.5, -0.05)
    )

    plt.tight_layout(pad=0.2)
    plt.subplots_adjust(bottom=0.12)
    plt.savefig("performance.pdf", format="pdf", bbox_inches='tight', pad_inches=0, dpi=300)
    plt.show()


if __name__ == "__main__":
    plot_metrics()
