Skip to content

Commit

Permalink
formatting and some warnings
Browse files Browse the repository at this point in the history
  • Loading branch information
Wazzabeee committed Nov 27, 2023
1 parent ec26aa5 commit 2229eb2
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 112 deletions.
43 changes: 29 additions & 14 deletions scripts/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,42 @@
from os import getcwd, path, makedirs


def get_real_matching_blocks(
    words_list1: list, words_list2: list, minimum_size: int = 2
) -> list:
    """Return the matching blocks of two word lists that are large enough.

    Args:
        words_list1: First sequence of words.
        words_list2: Second sequence of words.
        minimum_size: Smallest block size (number of words) to keep. A falsy
            or non-positive value falls back to 2.

    Returns:
        The ``difflib`` match blocks whose ``size`` is at least the effective
        minimum. Because that minimum is always positive, the zero-length
        sentinel block appended by ``get_matching_blocks`` is never returned.
    """

    # Resolve the threshold once instead of duplicating the filter per branch
    threshold = minimum_size if minimum_size and minimum_size > 0 else 2

    matching_blocks = difflib.SequenceMatcher(
        a=words_list1, b=words_list2
    ).get_matching_blocks()

    return [block for block in matching_blocks if block.size >= threshold]


def get_ordered_blocks_positions(string: str, matching_blocks: list, string_blocks: list) -> list:
""" Return ordered list of all positions of matching blocks in string """
def get_ordered_blocks_positions(
string: str, matching_blocks: list, string_blocks: list
) -> list:
"""Return ordered list of all positions of matching blocks in string"""

all_blocks_positions = []

for block_ind, _ in enumerate(matching_blocks):
# Find all positions of substring in string
block_positions = [char for char in range(len(string)) if string.startswith(
string_blocks[block_ind], char)]
block_positions = [
char
for char in range(len(string))
if string.startswith(string_blocks[block_ind], char)
]

for position in block_positions:
# We check if there is another block starting at the same position
var = [pos_tuple for pos_tuple in all_blocks_positions if pos_tuple[0] == position]
var = [
pos_tuple
for pos_tuple in all_blocks_positions
if pos_tuple[0] == position
]
if var: # If there is one such block
size = len(string_blocks[var[0][1]]) # get size of block in var
if size < len(string_blocks[block_ind]):
Expand All @@ -48,30 +61,32 @@ def get_ordered_blocks_positions(string: str, matching_blocks: list, string_bloc


def blocks_list_to_strings_list(blocks_list: list, curr_text: list) -> list:
    """Return, for each block, the length of its words joined by single spaces.

    Args:
        blocks_list: Blocks exposing ``a`` (start index into ``curr_text``)
            and ``size`` (number of items covered).
        curr_text: The list of words the blocks index into.

    Returns:
        One integer per block: the character length of the space-joined
        slice ``curr_text[a:a + size]``.
    """

    return [
        len(" ".join(map(str, curr_text[block.a : block.a + block.size])))
        for block in blocks_list
    ]


def writing_results(dir_name: str) -> str:
    """Create (if needed) and return the results directory for one run.

    The directory is ``results/<dir_name>`` located under the *parent* of the
    current working directory.

    Args:
        dir_name: Name of the sub-directory to create inside ``results``.

    Returns:
        The path of the results directory.
    """

    parent_directory = path.dirname(getcwd())
    # Join the components separately so the platform's own separator is used;
    # a hard-coded backslash only forms a nested path on Windows.
    final_directory = path.join(parent_directory, "results", dir_name)
    # exist_ok avoids the check-then-create race of a separate exists() test
    makedirs(final_directory, exist_ok=True)

    return final_directory


def get_color_from_similarity(similarity_score: float) -> str:
""" Return css style according to similarity score """
"""Return css style according to similarity score"""

if float(similarity_score) > 15:
return "#990033; font-weight: bold"
Expand Down
97 changes: 54 additions & 43 deletions scripts/html_writing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,68 +13,76 @@
from bs4 import BeautifulSoup as Bs
from tabulate import tabulate

from html_utils import get_color_from_similarity, get_real_matching_blocks, \
blocks_list_to_strings_list, get_ordered_blocks_positions
from html_utils import (
get_color_from_similarity,
get_real_matching_blocks,
blocks_list_to_strings_list,
get_ordered_blocks_positions,
)
from utils import is_float


def add_links_to_html_table(html_path: str) -> None:
    """Turn the score cells of the results table into links to comparison pages.

    Every ``td`` whose text is accepted by ``is_float`` is wrapped in an ``<a>``
    tag that opens in a new tab and is styled with the value returned by
    ``get_color_from_similarity``. The n-th linked cell (counting from 0) points
    to the file next to ``html_path`` whose name is obtained by replacing
    ``_results`` with ``n`` in the file name. The HTML file is rewritten in place.

    Args:
        html_path: Path of the HTML results table to update.
    """

    with open(html_path, encoding="utf-8") as html:
        soup = Bs(html, "html.parser")

    # Only the file name may be rewritten: replacing "_results" in the whole
    # path would also corrupt a directory name that happens to contain it.
    directory, file_name = path.split(html_path)
    file_ind = 0  # Cursor on file number for the naming of html files

    for td_tag in soup.find_all("td"):  # All data cells of the table
        # Original note: skip cells holding a filename or -1
        # (is_float is defined elsewhere in the project)
        if not is_float(td_tag.text):
            continue

        target = path.join(directory, file_name.replace("_results", str(file_ind)))
        link = soup.new_tag(
            "a",
            href="file:///" + target,
            target="_blank",
            style="color:" + get_color_from_similarity(td_tag.text),
        )

        td_tag.string.wrap(link)  # Wrap the cell text inside the hyperlink
        file_ind += 1

    # Write the updated HTML back; the with-block closes the file on exit
    with open(html_path, "wb") as f_output:
        f_output.write(soup.prettify("utf-8"))
        f_output.flush()
        fsync(f_output.fileno())


def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> list:
""" Return list of spans with colors for HTML rendering """
"""Return list of spans with colors for HTML rendering"""

results = [[], []] # List of spans list

# Get matching blocks with chosen minimum size
matching_blocks = get_real_matching_blocks(text1, text2, block_size)

# Generate one unique color for each matching block
colors = [f'#%06X' % randint(0, 0xFFFFFF) for _ in range(len(matching_blocks))]
colors = [f"#{randint(0, 0xFFFFFF):06X}" for _ in range(len(matching_blocks))]

# Convert blocks from list of list of strings to list of strings
string_blocks = [' '.join(map(str, text1[b.a:b.a + b.size])) for b in matching_blocks]
string_blocks = [
" ".join(map(str, text1[b.a : b.a + b.size])) for b in matching_blocks
]

# Store lengths of blocks in text
strings_len_list = blocks_list_to_strings_list(matching_blocks, text1)

# Convert list of strings to strings
str1, str2 = ' '.join(map(str, text1)), ' '.join(map(str, text2))
str1, str2 = " ".join(map(str, text1)), " ".join(map(str, text2))

global_positions_list = [get_ordered_blocks_positions(str1, matching_blocks,
string_blocks),
get_ordered_blocks_positions(str2, matching_blocks,
string_blocks)]
global_positions_list = [
get_ordered_blocks_positions(str1, matching_blocks, string_blocks),
get_ordered_blocks_positions(str2, matching_blocks, string_blocks),
]

for num, pos_list in enumerate(global_positions_list):
cursor = 0 # Cursor on current string
Expand All @@ -84,13 +92,14 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li

for block in pos_list:
# Span tag for the text before the matching sequence
span = bs_obj.new_tag('span')
span.string = str1[cursor:block[0]]
span = bs_obj.new_tag("span")
span.string = str1[cursor : block[0]]

# Span tag for the text in the matching sequence
blockspan = bs_obj.new_tag('span',
style="color:" + colors[block[1]] + "; font-weight:bold")
blockspan.string = str1[block[0]:block[0] + strings_len_list[block[1]]]
blockspan = bs_obj.new_tag(
"span", style="color:" + colors[block[1]] + "; font-weight:bold"
)
blockspan.string = str1[block[0] : block[0] + strings_len_list[block[1]]]

# Append spans tags to results list
results[num].append(span)
Expand All @@ -100,50 +109,52 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li
cursor = block[0] + strings_len_list[block[1]]

# End of loop, last span tag for the rest of the text
span = bs_obj.new_tag('span')
span = bs_obj.new_tag("span")
span.string = str1[cursor:]
results[num].append(span)

return results


def papers_comparison(
    save_dir: str, ind: int, text1: list, text2: list, filenames: tuple, block_size: int
) -> None:
    """Write an HTML page showing two compared texts with similar blocks highlighted.

    The comparison template is copied to ``<save_dir>/<ind>.html``; each element
    with class ``block`` then receives an ``h3`` holding the matching file name
    followed by the span tags produced by ``get_span_blocks``.

    Args:
        save_dir: Directory in which the comparison page is written.
        ind: Number used as the name of the generated HTML file.
        text1: Words of the first text.
        text2: Words of the second text.
        filenames: Names displayed above each text, in the same order.
        block_size: Minimum size of the matching blocks to highlight.
    """

    comp_path = path.join(save_dir, str(ind) + ".html")
    # Copy the template directly to its final name. Copying into the directory
    # and renaming afterwards fails on Windows when the target already exists
    # and can overwrite an unrelated template.html in save_dir.
    copy(path.join("..", "templates", "template.html"), comp_path)

    with open(comp_path, encoding="utf-8") as html:
        soup = Bs(html, "html.parser")

    res = get_span_blocks(soup, text1, text2, block_size)
    blocks = soup.find_all(attrs={"class": "block"})

    # Append filename tags and span tags to html
    for i, filename in enumerate(filenames):
        temp_tag = soup.new_tag("h3")
        temp_tag.string = filename
        blocks[i].append(temp_tag)
        for tag in res[i]:
            blocks[i].append(tag)

    with open(comp_path, "wb") as f_output:
        f_output.write(soup.prettify("utf-8"))


def results_to_html(scores: list, files_names: list, html_path: str) -> None:
    """Write the similarity scores to ``html_path`` as an HTML table.

    The table has a header row made of an empty corner cell followed by the
    file names, then one row per file: its name followed by its scores.
    The arguments are left untouched; the table is built from copies.

    Args:
        scores: One list of scores per file, in the same order as ``files_names``.
        files_names: Names of the compared files.
        html_path: Path of the HTML file to write.
    """

    # Build fresh rows: inserting into the caller's lists would also add the
    # empty corner cell to files_names itself, since it doubles as the header.
    header = ["", *files_names]
    rows = [[name, *row] for name, row in zip(files_names, scores)]

    with open(html_path, "w", encoding="utf-8") as file:
        file.write(tabulate([header, *rows], tablefmt="html"))
        file.flush()
        fsync(file.fileno())
33 changes: 23 additions & 10 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,36 @@ def main() -> None:
if path.exists(in_dir): # Check if specified path exists
if not path.isabs(in_dir):
in_dir = path.abspath(in_dir)
if len(listdir(in_dir)) > 1: # Check if there are at least 2 files at specified path
if (
len(listdir(in_dir)) > 1
): # Check if there are at least 2 files at specified path
filenames, processed_files = [], []
students_names = get_student_names(in_dir)
for ind, direc in enumerate(listdir(in_dir)):
if path.isdir(path.join(in_dir, direc)):

for file in listdir(path.join(in_dir, direc)):
file_words = file_extension_call(str(path.join(in_dir, direc, file)))
file_words = file_extension_call(
str(path.join(in_dir, direc, file))
)

if file_words: # If all files have supported format
processed_files.append(file_words)
filenames.append(students_names[ind])
else: # At least one file was not supported
print(
"Remove files which are not txt, pdf, docx or odt and run the "
"script again.")
"script again."
)
sys.exit()
if out_dir is not None and path.exists(out_dir):
if not path.isabs(out_dir):
out_dir = path.abspath(out_dir)
results_directory = out_dir
else:
# Create new directory for storing html files
results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))
results_directory = writing_results(
datetime.now().strftime("%Y%m%d_%H%M%S")
)

difflib_scores = [[] for _ in range(len(processed_files))]
file_ind = 0
Expand All @@ -61,13 +67,19 @@ def main() -> None:
difflib_scores[i].append(difflib_overlap(text, text_bis))

# Write text with matching blocks colored in results directory
papers_comparison(results_directory, file_ind, text, text_bis,
(filenames[i], filenames[j]), block_size)
papers_comparison(
results_directory,
file_ind,
text,
text_bis,
(filenames[i], filenames[j]),
block_size,
)
file_ind += 1
else:
difflib_scores[i].append(-1)

results_directory = path.join(results_directory, '_results.html')
results_directory = path.join(results_directory, "_results.html")
print(results_directory)

results_to_html(difflib_scores, filenames, results_directory)
Expand All @@ -80,12 +92,13 @@ def main() -> None:
else:
print(
"Minimum number of files is not present. Please check that there are at least "
"two files to compare.")
"two files to compare."
)
sys.exit()
else:
print("The specified path does not exist : " + in_dir)
sys.exit()


if __name__ == '__main__':
if __name__ == "__main__":
main()
Loading

0 comments on commit 2229eb2

Please sign in to comment.