-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c3b1022
commit c641e14
Showing
2 changed files
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import json | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
|
||
# Input JSON files, one per fingerprinting technique. Each file maps a
# file identifier to its number of detected plagiarism matches
# (presumably — verify against the producer of these files).
json_files = ["cumulative_tlsh.json", "cumulative_ssdeep.json", "cumulative_minhash.json"]

# Per-dataset plot attributes, index-aligned with json_files.
datasets = ["TLSH dataset", "ssdeep dataset", "MinHash dataset"]
colors = ["black", "red", "blue"]
markers = ["o", "s", "^"]

data = []             # per-dataset lists of match counts, capped at 50
original_counts = []  # total number of files per dataset, before filtering
for path in json_files:
    with open(path, "r") as handle:
        raw_counts = json.load(handle)
    # Remember the unfiltered total for the legend labels.
    original_counts.append(len(raw_counts))
    # Keep only files with at most 50 plagiarism cases.
    data.append([value for value in raw_counts.values() if value <= 50])
|
||
x_max = 50
x_range = np.arange(0, x_max + 1)

# For each dataset, compute the cumulative proportion of files with at
# most x detected matches, for x in [0, x_max].
#
# NOTE(review): the previous version had an `else` branch folding counts
# > x_max into the last histogram bin; it was dead code (the loading step
# already discards files with more than 50 matches) and, had it been
# reachable, would have silently conflated "> 50 matches" with
# "exactly 50 matches". It has been removed.
cumulative_proportions = []
for dataset in data:
    # Histogram of match counts over [0, x_max]; values are guaranteed
    # to be in range by the filtering done at load time.
    frequencies = np.bincount(dataset, minlength=x_max + 1).astype(float)
    cumulative_counts = np.cumsum(frequencies)
    cumulative_proportions.append(cumulative_counts / len(dataset))
|
||
# Draw one cumulative curve per dataset, all on a single set of axes.
plt.figure(figsize=(8, 5))
for proportions, name, total, color, marker in zip(
    cumulative_proportions, datasets, original_counts, colors, markers
):
    plt.plot(
        x_range,
        proportions,
        label=f"{name} ($N={total}$ total files)",
        color=color,
        marker=marker,
        markersize=5,
        linewidth=1,
    )

# Mark the K = 8 cutoff with a vertical reference line and a label.
k_limit = 8
plt.axvline(x=k_limit, color="blue", linestyle="--", linewidth=1)
plt.text(
    k_limit + 0.5,
    0.1,
    r"$K = 8$ limit",
    color="blue",
    fontsize=10,
    rotation=0,
)

# Axis labels, legend, and fixed viewing window.
plt.xlabel("Number of Detected Similar Matches per File", fontsize=12)
plt.ylabel("Cumulative Proportion of Files", fontsize=12)
plt.legend(fontsize=10, loc="lower right")
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlim(0, x_max)
plt.ylim(0, 1)
plt.tight_layout()

# Write the figure to disk as a vector PDF and release the figure.
output_file = "cumulative_plot.pdf"
plt.savefig(output_file, format="pdf", bbox_inches="tight")
plt.close()

print(f"Plot saved to {output_file}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import json | ||
import matplotlib.pyplot as plt | ||
|
||
def normalize(number):
    """Mirror a similarity threshold into its complement out of 100.

    Used to turn the similarity-style thresholds (100 == identical) into
    the values embedded in the TLSH metrics filenames — presumably
    because TLSH expresses thresholds as distances (0 == identical);
    confirm against the script that produced those files.

    Generalized from the original if-chain, which only covered
    {70, 75, 80, 85, 90, 95, 100} and silently returned None for
    anything else; the arithmetic form is identical on those inputs
    and well-defined everywhere.
    """
    return 100 - number
|
||
def _load_metric_series(thresholds, path_for, extract):
    """Load Accuracy/Precision/Recall/F1-Score lists across thresholds.

    thresholds: iterable of threshold values to load, in plot order.
    path_for:   callable mapping a threshold to its JSON metrics file path.
    extract:    callable mapping (parsed JSON, metric name) to the value.
    Returns a dict of metric name -> list of values (one per threshold).
    """
    metrics = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}
    for threshold in thresholds:
        with open(path_for(threshold), "r") as file:
            data = json.load(file)
        for name in metrics:
            metrics[name].append(extract(data, name))
    return metrics


def _configure_axis(ax, metrics, thresholds, xlabel):
    """Plot one metric curve per entry of *metrics* and style the axis."""
    for metric, values in metrics.items():
        ax.plot(thresholds, values, label=metric, marker='o', markersize=2, linewidth=0.8)
    ax.set_xlabel(xlabel, fontsize=8)
    ax.tick_params(axis='both', labelsize=6, length=1, width=0.5)
    ax.set_xticks([70, 75, 80, 85, 90, 95, 100])


def plot_metrics():
    """Build and save the three-panel TLSH/ssdeep/MinHash metrics figure.

    Reads per-threshold JSON metric files from hard-coded locations,
    plots Accuracy/Precision/Recall/F1-Score vs. threshold for each
    technique, and writes the figure to "performance.pdf".
    """
    minhash_thresholds = [70, 75, 80, 85, 90, 95]
    ssdeep_thresholds = [70, 75, 80, 85, 90, 95, 100]
    tlsh_thresholds = [70, 75, 80, 85, 90, 95, 100]

    # Load SSDEEP metrics (threshold appears verbatim in the filename).
    ssdeep_metrics = _load_metric_series(
        ssdeep_thresholds,
        lambda t: f"/home/ricardo/plagiarism-dataset/metrics/full_metrics_ssdeep_{t}.json",
        lambda data, name: data[name],
    )

    # Load TLSH metrics (filename uses the normalized/complement value).
    tlsh_metrics = _load_metric_series(
        tlsh_thresholds,
        lambda t: f"/home/ricardo/plagiarism-dataset/metrics/full_metrics_tlsh_{normalize(t)}.json",
        lambda data, name: data[name],
    )

    # Load MinHash metrics. NOTE(review): values are read from the "72"
    # entry of each file — looks like a fixed shingle/permutation setting;
    # confirm against the metrics producer.
    minhash_metrics = _load_metric_series(
        minhash_thresholds,
        lambda t: f"metrics2_{t / 100:.2f}.json",
        lambda data, name: data["72"][name],
    )

    # Three horizontally split panels sharing one Y axis.
    fig, axes = plt.subplots(1, 3, figsize=(6, 3), sharey=True, gridspec_kw={'wspace': 0.05})
    axes[0].set_ylim(0.9, 1.0)  # Shared Y-axis limits for all subplots

    _configure_axis(axes[0], tlsh_metrics, tlsh_thresholds, r"$\tau_{TLSH}$")
    _configure_axis(axes[1], ssdeep_metrics, ssdeep_thresholds, r"$\tau_{ssdeep}$")
    _configure_axis(axes[2], minhash_metrics, minhash_thresholds, r"$\tau_{MinHash LSH}$")

    # Minimal frame: hide top/right spines, thin the remaining ones.
    for ax in axes:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['bottom'].set_linewidth(0.5)
        ax.spines['left'].set_linewidth(0.5)
        ax.grid(False)

    # Single shared Y label and a single legend below the panels.
    fig.text(0.04, 0.5, 'Metric Value', va='center', rotation='vertical', fontsize=8)

    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc='lower center',
        ncol=4,
        fontsize=6,
        frameon=False,
        bbox_to_anchor=(0.5, -0.05)
    )

    plt.tight_layout(pad=0.2)
    plt.subplots_adjust(bottom=0.12)
    plt.savefig("performance.pdf", format="pdf", bbox_inches='tight', pad_inches=0, dpi=300)
    plt.show()
|
||
# Script entry point: generate and save the performance figure.
if __name__ == "__main__":
    plot_metrics()