diff --git a/scripts/html_utils.py b/scripts/html_utils.py
index ede72ab..c5ee2c3 100644
--- a/scripts/html_utils.py
+++ b/scripts/html_utils.py
@@ -10,29 +10,42 @@
from os import getcwd, path, makedirs
-def get_real_matching_blocks(words_list1: list, words_list2: list, minimum_size: int = 2) -> list:
- """ Return list of matching blocks with size greater than n """
-
- matching_blocks = difflib.SequenceMatcher(a=words_list1, b=words_list2).get_matching_blocks()
+def get_real_matching_blocks(
+ words_list1: list, words_list2: list, minimum_size: int = 2
+) -> list:
+ """Return list of matching blocks with size greater than n"""
+
+ matching_blocks = difflib.SequenceMatcher(
+ a=words_list1, b=words_list2
+ ).get_matching_blocks()
if minimum_size and minimum_size > 0:
return [b for b in matching_blocks if b.size >= minimum_size]
else:
return [b for b in matching_blocks if b.size >= 2]
-def get_ordered_blocks_positions(string: str, matching_blocks: list, string_blocks: list) -> list:
- """ Return ordered list of all positions of matching blocks in string """
+def get_ordered_blocks_positions(
+ string: str, matching_blocks: list, string_blocks: list
+) -> list:
+ """Return ordered list of all positions of matching blocks in string"""
all_blocks_positions = []
for block_ind, _ in enumerate(matching_blocks):
# Find all positions of substring in string
- block_positions = [char for char in range(len(string)) if string.startswith(
- string_blocks[block_ind], char)]
+ block_positions = [
+ char
+ for char in range(len(string))
+ if string.startswith(string_blocks[block_ind], char)
+ ]
for position in block_positions:
# We check if there is another block starting at the same position
- var = [pos_tuple for pos_tuple in all_blocks_positions if pos_tuple[0] == position]
+ var = [
+ pos_tuple
+ for pos_tuple in all_blocks_positions
+ if pos_tuple[0] == position
+ ]
if var: # If there is one such block
size = len(string_blocks[var[0][1]]) # get size of block in var
if size < len(string_blocks[block_ind]):
@@ -48,22 +61,24 @@ def get_ordered_blocks_positions(string: str, matching_blocks: list, string_bloc
def blocks_list_to_strings_list(blocks_list: list, curr_text: list) -> list:
- """ Convert blocks list to len of blocks strings """
+ """Convert blocks list to len of blocks strings"""
strings_len_list = []
for block in blocks_list:
# Append size of block in string
- strings_len_list.append(len(' '.join(map(str, curr_text[block.a:block.a + block.size]))))
+ strings_len_list.append(
+ len(" ".join(map(str, curr_text[block.a : block.a + block.size])))
+ )
return strings_len_list
def writing_results(dir_name: str) -> str:
- """ Create new directory for results in current working directory """
+ """Create new directory for results in current working directory"""
curr_directory = path.dirname(getcwd())
- final_directory = path.join(curr_directory, r'results\\' + dir_name)
+ final_directory = path.join(curr_directory, r"results\\" + dir_name)
if not path.exists(final_directory):
makedirs(final_directory)
@@ -71,7 +86,7 @@ def writing_results(dir_name: str) -> str:
def get_color_from_similarity(similarity_score: float) -> str:
- """ Return css style according to similarity score """
+ """Return css style according to similarity score"""
if float(similarity_score) > 15:
return "#990033; font-weight: bold"
diff --git a/scripts/html_writing.py b/scripts/html_writing.py
index 063a1ce..d25517f 100644
--- a/scripts/html_writing.py
+++ b/scripts/html_writing.py
@@ -13,13 +13,17 @@
from bs4 import BeautifulSoup as Bs
from tabulate import tabulate
-from html_utils import get_color_from_similarity, get_real_matching_blocks, \
- blocks_list_to_strings_list, get_ordered_blocks_positions
+from html_utils import (
+ get_color_from_similarity,
+ get_real_matching_blocks,
+ blocks_list_to_strings_list,
+ get_ordered_blocks_positions,
+)
from utils import is_float
def add_links_to_html_table(html_path: str) -> None:
- """ Add links to HTML data cells at specified path
+ """Add links to HTML data cells at specified path
This method will link to all HTML TD tags which contain a float different from - 1 the
corresponding HTML comparison file. The links will be opened in a new tab. The colors of
@@ -27,24 +31,26 @@ def add_links_to_html_table(html_path: str) -> None:
"""
- with open(html_path, encoding='utf-8') as html:
- soup = Bs(html, 'html.parser')
+ with open(html_path, encoding="utf-8") as html:
+ soup = Bs(html, "html.parser")
file_ind = 0 # Cursor on file number for the naming of html files
- for td_tag in soup.findAll('td'): # Retrieve all data celss from html table in path
-
+ for td_tag in soup.findAll(
+ "td"
+    ): # Retrieve all data cells from html table in path
if is_float(td_tag.text): # If td is not filename or -1
-
- tmp = soup.new_tag('a',
- href='file:///' + html_path.replace('_results', str(file_ind)),
- target="_blank",
- style="color:" + get_color_from_similarity(td_tag.text))
+ tmp = soup.new_tag(
+ "a",
+ href="file:///" + html_path.replace("_results", str(file_ind)),
+ target="_blank",
+ style="color:" + get_color_from_similarity(td_tag.text),
+ )
td_tag.string.wrap(tmp) # We wrap the td string between the hyperlink
file_ind += 1
# We update the HTML of the file at path
- with open(html_path, 'wb') as f_output:
+ with open(html_path, "wb") as f_output:
f_output.write(soup.prettify("utf-8"))
f_output.flush()
fsync(f_output.fileno())
@@ -52,7 +58,7 @@ def add_links_to_html_table(html_path: str) -> None:
def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> list:
- """ Return list of spans with colors for HTML rendering """
+ """Return list of spans with colors for HTML rendering"""
results = [[], []] # List of spans list
@@ -60,21 +66,23 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li
matching_blocks = get_real_matching_blocks(text1, text2, block_size)
# Generate one unique color for each matching block
- colors = [f'#%06X' % randint(0, 0xFFFFFF) for _ in range(len(matching_blocks))]
+ colors = [f"#{randint(0, 0xFFFFFF):06X}" for _ in range(len(matching_blocks))]
# Convert blocks from list of list of strings to list of strings
- string_blocks = [' '.join(map(str, text1[b.a:b.a + b.size])) for b in matching_blocks]
+ string_blocks = [
+ " ".join(map(str, text1[b.a : b.a + b.size])) for b in matching_blocks
+ ]
# Store lengths of blocks in text
strings_len_list = blocks_list_to_strings_list(matching_blocks, text1)
# Convert list of strings to strings
- str1, str2 = ' '.join(map(str, text1)), ' '.join(map(str, text2))
+ str1, str2 = " ".join(map(str, text1)), " ".join(map(str, text2))
- global_positions_list = [get_ordered_blocks_positions(str1, matching_blocks,
- string_blocks),
- get_ordered_blocks_positions(str2, matching_blocks,
- string_blocks)]
+ global_positions_list = [
+ get_ordered_blocks_positions(str1, matching_blocks, string_blocks),
+ get_ordered_blocks_positions(str2, matching_blocks, string_blocks),
+ ]
for num, pos_list in enumerate(global_positions_list):
cursor = 0 # Cursor on current string
@@ -84,13 +92,14 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li
for block in pos_list:
# Span tag for the text before the matching sequence
- span = bs_obj.new_tag('span')
- span.string = str1[cursor:block[0]]
+ span = bs_obj.new_tag("span")
+ span.string = str1[cursor : block[0]]
# Span tag for the text in the matching sequence
- blockspan = bs_obj.new_tag('span',
- style="color:" + colors[block[1]] + "; font-weight:bold")
- blockspan.string = str1[block[0]:block[0] + strings_len_list[block[1]]]
+ blockspan = bs_obj.new_tag(
+ "span", style="color:" + colors[block[1]] + "; font-weight:bold"
+ )
+ blockspan.string = str1[block[0] : block[0] + strings_len_list[block[1]]]
# Append spans tags to results list
results[num].append(span)
@@ -100,50 +109,52 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li
cursor = block[0] + strings_len_list[block[1]]
# End of loop, last span tag for the rest of the text
- span = bs_obj.new_tag('span')
+ span = bs_obj.new_tag("span")
span.string = str1[cursor:]
results[num].append(span)
return results
-def papers_comparison(save_dir: str, ind: int, text1: list, text2: list, filenames: tuple,
- block_size: int) -> None:
- """ Write to HTML file texts that have been compared with highlighted similar blocks """
-
- copy(path.join("..", "templates", "template.html"), save_dir) # Copy comparison template to curr dir
- comp_path = path.join(save_dir, str(ind) + '.html')
- rename(path.join(save_dir, 'template.html'), comp_path)
+def papers_comparison(
+ save_dir: str, ind: int, text1: list, text2: list, filenames: tuple, block_size: int
+) -> None:
+ """Write to HTML file texts that have been compared with highlighted similar blocks"""
- with open(comp_path, encoding='utf-8') as html:
+ copy(
+ path.join("..", "templates", "template.html"), save_dir
+ ) # Copy comparison template to curr dir
+ comp_path = path.join(save_dir, str(ind) + ".html")
+ rename(path.join(save_dir, "template.html"), comp_path)
- soup = Bs(html, 'html.parser')
+ with open(comp_path, encoding="utf-8") as html:
+ soup = Bs(html, "html.parser")
res = get_span_blocks(soup, text1, text2, block_size)
- blocks = soup.findAll(attrs={'class': 'block'})
+ blocks = soup.findAll(attrs={"class": "block"})
# Append filename tags and span tags to html
for i, filename in enumerate(filenames):
- temp_tag = soup.new_tag('h3')
+ temp_tag = soup.new_tag("h3")
temp_tag.string = filename
blocks[i].append(temp_tag)
for tag in res[i]:
blocks[i].append(tag)
- with open(comp_path, 'wb') as f_output:
+ with open(comp_path, "wb") as f_output:
f_output.write(soup.prettify("utf-8"))
def results_to_html(scores: list, files_names: list, html_path: str) -> None:
- """ Write similarity results to HTML page """
+ """Write similarity results to HTML page"""
for ind, _ in enumerate(files_names):
scores[ind].insert(0, files_names[ind])
scores.insert(0, files_names)
- scores[0].insert(0, '')
+ scores[0].insert(0, "")
- with open(html_path, 'w', encoding='utf-8') as file:
- file.write(tabulate(scores, tablefmt='html'))
+ with open(html_path, "w", encoding="utf-8") as file:
+ file.write(tabulate(scores, tablefmt="html"))
file.flush()
fsync(file.fileno())
file.close()
diff --git a/scripts/main.py b/scripts/main.py
index 97137a2..f57ada8 100644
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -26,14 +26,17 @@ def main() -> None:
if path.exists(in_dir): # Check if specified path exists
if not path.isabs(in_dir):
in_dir = path.abspath(in_dir)
- if len(listdir(in_dir)) > 1: # Check if there are at least 2 files at specified path
+ if (
+ len(listdir(in_dir)) > 1
+ ): # Check if there are at least 2 files at specified path
filenames, processed_files = [], []
students_names = get_student_names(in_dir)
for ind, direc in enumerate(listdir(in_dir)):
if path.isdir(path.join(in_dir, direc)):
-
for file in listdir(path.join(in_dir, direc)):
- file_words = file_extension_call(str(path.join(in_dir, direc, file)))
+ file_words = file_extension_call(
+ str(path.join(in_dir, direc, file))
+ )
if file_words: # If all files have supported format
processed_files.append(file_words)
@@ -41,7 +44,8 @@ def main() -> None:
else: # At least one file was not supported
print(
"Remove files which are not txt, pdf, docx or odt and run the "
- "script again.")
+ "script again."
+ )
sys.exit()
if out_dir is not None and path.exists(out_dir):
if not path.isabs(out_dir):
@@ -49,7 +53,9 @@ def main() -> None:
results_directory = out_dir
else:
# Create new directory for storing html files
- results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))
+ results_directory = writing_results(
+ datetime.now().strftime("%Y%m%d_%H%M%S")
+ )
difflib_scores = [[] for _ in range(len(processed_files))]
file_ind = 0
@@ -61,13 +67,19 @@ def main() -> None:
difflib_scores[i].append(difflib_overlap(text, text_bis))
# Write text with matching blocks colored in results directory
- papers_comparison(results_directory, file_ind, text, text_bis,
- (filenames[i], filenames[j]), block_size)
+ papers_comparison(
+ results_directory,
+ file_ind,
+ text,
+ text_bis,
+ (filenames[i], filenames[j]),
+ block_size,
+ )
file_ind += 1
else:
difflib_scores[i].append(-1)
- results_directory = path.join(results_directory, '_results.html')
+ results_directory = path.join(results_directory, "_results.html")
print(results_directory)
results_to_html(difflib_scores, filenames, results_directory)
@@ -80,12 +92,13 @@ def main() -> None:
else:
print(
"Minimum number of files is not present. Please check that there are at least "
- "two files to compare.")
+ "two files to compare."
+ )
sys.exit()
else:
print("The specified path does not exist : " + in_dir)
sys.exit()
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/scripts/processing_files.py b/scripts/processing_files.py
index 36339c3..d0e8bae 100644
--- a/scripts/processing_files.py
+++ b/scripts/processing_files.py
@@ -10,28 +10,28 @@
def get_file_extension(filepath: str) -> str:
- """ Return the file extension of the file at the specified path """
+ """Return the file extension of the file at the specified path"""
try:
return path.splitext(filepath)[1]
except (Exception,):
print("File extension error")
- return ''
+ return ""
def file_extension_call(file: str) -> list:
- """ Map file extension to appropriate function """
+ """Map file extension to appropriate function"""
extension = get_file_extension(file)
if extension:
- if extension == '.pdf':
+ if extension == ".pdf":
return get_words_from_pdf_file(file)
- if extension == '.docx':
+ if extension == ".docx":
return get_words_from_docx_file(file)
- if extension == '.odt':
+ if extension == ".odt":
return get_words_from_odt_file(file)
- if extension == '.txt':
+ if extension == ".txt":
return get_words_from_txt_file(file)
print("File format is not supported. Please convert to pdf, docx, odt or txt")
@@ -39,21 +39,20 @@ def file_extension_call(file: str) -> list:
def get_words_from_pdf_file(pdf_path: str) -> list:
- """ Return list of words from pdf file at specified path """
+ """Return list of words from pdf file at specified path"""
- with open(pdf_path, 'rb') as file:
+ with open(pdf_path, "rb") as file:
extracted_text = slate.PDF(file)
nested_lists_length_sum = sum([len(temp) for temp in extracted_text])
- count_line_return = sum([string.count('\n') for string in extracted_text])
+ count_line_return = sum([string.count("\n") for string in extracted_text])
# Check \n ratio compared to length of text
if nested_lists_length_sum / count_line_return > 10:
-
for i, _ in enumerate(extracted_text):
- extracted_text[i] = extracted_text[i].replace('\n', ' ')
- extracted_text[i] = re.sub('<(.|\n)*?>', '', str(extracted_text[i]))
- extracted_text[i] = re.findall(r'\w+', extracted_text[i].lower())
+ extracted_text[i] = extracted_text[i].replace("\n", " ")
+ extracted_text[i] = re.sub("<(.|\n)*?>", "", str(extracted_text[i]))
+ extracted_text[i] = re.findall(r"\w+", extracted_text[i].lower())
return [item for sublist in extracted_text for item in sublist]
@@ -62,48 +61,47 @@ def get_words_from_pdf_file(pdf_path: str) -> list:
def get_words_from_special_pdf(pdf_path: str) -> str:
- """ Return list of words from pdf file when Slate library can't scrape it """
+ """Return list of words from pdf file when Slate library can't scrape it"""
with pdfplumber.open(pdf_path) as file:
- concat_string = ''
+ concat_string = ""
for page in file.pages:
- text_page = page.extract_text()+'\n'
+ text_page = page.extract_text() + "\n"
concat_string += text_page
return " ".join(concat_string.replace("\xa0", " ").strip().split())
def get_words_from_txt_file(txt_path: str) -> list:
- """ Return list of words from txt file at specified path """
+ """Return list of words from txt file at specified path"""
words = []
- with open(txt_path, encoding='utf-8') as file:
-
+ with open(txt_path, encoding="utf-8") as file:
for line in file:
try:
for word in line.split():
words.append(word.lower())
- except (UnicodeError, UnicodeDecodeError) as _:
+ except (UnicodeError, UnicodeDecodeError):
pass
- str_words = ' '.join(map(str, words))
+ str_words = " ".join(map(str, words))
- return re.findall(r'\w+', str_words)
+ return re.findall(r"\w+", str_words)
def get_words_from_docx_file(docx_path: str) -> list:
- """ Return list of words from docx file at specified path """
+ """Return list of words from docx file at specified path"""
with zipfile.ZipFile(docx_path) as docx:
- content = docx.read('word/document.xml').decode('utf-8')
- cleaned = re.sub('<(.|\n)*?>', '', content)
+ content = docx.read("word/document.xml").decode("utf-8")
+ cleaned = re.sub("<(.|\n)*?>", "", content)
- return re.findall(r'\w+', cleaned.lower())
+ return re.findall(r"\w+", cleaned.lower())
def get_words_from_odt_file(odt_path: str) -> list:
- """ Return list of words from odt file at specified path """
+ """Return list of words from odt file at specified path"""
textdoc = load(odt_path)
paragraphs = textdoc.getElementsByType(text.P)
@@ -114,4 +112,4 @@ def get_words_from_odt_file(odt_path: str) -> list:
temp = teletype.extractText(paragraph)
full_text += temp.lower()
- return re.findall(r'\w+', full_text)
+ return re.findall(r"\w+", full_text)
diff --git a/scripts/similarity.py b/scripts/similarity.py
index ddc3725..eca3fb1 100644
--- a/scripts/similarity.py
+++ b/scripts/similarity.py
@@ -13,7 +13,7 @@
def difflib_overlap(word_token1: list, word_token2: list) -> float:
- """ Get similarity percentage from matching sequences between two strings """
+ """Get similarity percentage from matching sequences between two strings"""
seq = difflib.SequenceMatcher(a=word_token1, b=word_token2)
@@ -22,7 +22,7 @@ def difflib_overlap(word_token1: list, word_token2: list) -> float:
def calculate_overlap(word_token1: list, word_token2: list) -> float:
- """ Get similarity percentage from usage of similar words in two strings """
+ """Get similarity percentage from usage of similar words in two strings"""
overlapping_words = []
@@ -36,7 +36,7 @@ def calculate_overlap(word_token1: list, word_token2: list) -> float:
def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float:
- """ Calculates intersection over union and return Jaccard similarity score """
+ """Calculates intersection over union and return Jaccard similarity score"""
list1, list2 = remove_numbers(word_tokens1), remove_numbers(word_tokens2)
list1, list2 = remove_stop_words(list1), remove_stop_words(list2)
diff --git a/scripts/utils.py b/scripts/utils.py
index e9230a6..05db92c 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -17,15 +17,21 @@
def parse_options():
parser = argparse.ArgumentParser()
parser.add_argument("in_dir", type=str, help="input directory for text files")
- parser.add_argument("-o", "--out_dir", type=str, help="output directory for html results files")
- parser.add_argument("-s", "--block_size", type=int, help="minimum number of consecutive and "
- "similar words detected (default=2)")
+ parser.add_argument(
+ "-o", "--out_dir", type=str, help="output directory for html results files"
+ )
+ parser.add_argument(
+ "-s",
+ "--block_size",
+ type=int,
+ help="minimum number of consecutive and " "similar words detected (default=2)",
+ )
return parser.parse_args()
def is_float(value: float) -> bool:
- """ Return true if value is a float and not equal to -1 """
+ """Return true if value is a float and not equal to -1"""
try:
temp = float(value)
@@ -35,14 +41,15 @@ def is_float(value: float) -> bool:
def get_student_names(main_path):
- sub_directories = [name for name in listdir(main_path)
- if path.isdir(path.join(main_path, name))]
+ sub_directories = [
+ name for name in listdir(main_path) if path.isdir(path.join(main_path, name))
+ ]
- return [title.split('_')[0] for title in sub_directories]
+ return [title.split("_")[0] for title in sub_directories]
def pretty_table(scores: list, names: list) -> None:
- """ Print similarity results nicely """
+ """Print similarity results nicely"""
row_format = "{:>15}" * (len(names) + 1)
print(row_format.format("", *names))
@@ -51,7 +58,7 @@ def pretty_table(scores: list, names: list) -> None:
def wait_for_file(file_path: str, timeout: int = 10) -> bool:
- """ Wait for the creation of a specific file.
+ """Wait for the creation of a specific file.
This method checks if the specified file exists and waits for it to
appear during the specified amount of time (by default 10 seconds).
@@ -73,22 +80,22 @@ def wait_for_file(file_path: str, timeout: int = 10) -> bool:
def remove_numbers(words_list: list) -> list:
- """ Remove all numbers from strings list to avoid errors """
+ """Remove all numbers from strings list to avoid errors"""
temp = [w for w in words_list if not isinstance(w, int)]
return [w for w in temp if not isinstance(w, float)]
def remove_stop_words(words_list: list) -> list:
- """ Remove stop words from strings list """
+ """Remove stop words from strings list"""
- en_stop_words = set(stopwords.words('english'))
+ en_stop_words = set(stopwords.words("english"))
return [w for w in words_list if str(w).lower not in en_stop_words]
def lemmatize(words_list: list) -> list:
- """ Return lemmatized words list """
+ """Return lemmatized words list"""
lemmatizer = WordNetLemmatizer()