diff --git a/scripts/html_utils.py b/scripts/html_utils.py index ede72ab..c5ee2c3 100644 --- a/scripts/html_utils.py +++ b/scripts/html_utils.py @@ -10,29 +10,42 @@ from os import getcwd, path, makedirs -def get_real_matching_blocks(words_list1: list, words_list2: list, minimum_size: int = 2) -> list: - """ Return list of matching blocks with size greater than n """ - - matching_blocks = difflib.SequenceMatcher(a=words_list1, b=words_list2).get_matching_blocks() +def get_real_matching_blocks( + words_list1: list, words_list2: list, minimum_size: int = 2 +) -> list: + """Return list of matching blocks with size greater than n""" + + matching_blocks = difflib.SequenceMatcher( + a=words_list1, b=words_list2 + ).get_matching_blocks() if minimum_size and minimum_size > 0: return [b for b in matching_blocks if b.size >= minimum_size] else: return [b for b in matching_blocks if b.size >= 2] -def get_ordered_blocks_positions(string: str, matching_blocks: list, string_blocks: list) -> list: - """ Return ordered list of all positions of matching blocks in string """ +def get_ordered_blocks_positions( + string: str, matching_blocks: list, string_blocks: list +) -> list: + """Return ordered list of all positions of matching blocks in string""" all_blocks_positions = [] for block_ind, _ in enumerate(matching_blocks): # Find all positions of substring in string - block_positions = [char for char in range(len(string)) if string.startswith( - string_blocks[block_ind], char)] + block_positions = [ + char + for char in range(len(string)) + if string.startswith(string_blocks[block_ind], char) + ] for position in block_positions: # We check if there is another block starting at the same position - var = [pos_tuple for pos_tuple in all_blocks_positions if pos_tuple[0] == position] + var = [ + pos_tuple + for pos_tuple in all_blocks_positions + if pos_tuple[0] == position + ] if var: # If there is one such block size = len(string_blocks[var[0][1]]) # get size of block in var if size < len(string_blocks[block_ind]): @@ -48,22 +61,24 @@ def get_ordered_blocks_positions(string: str, matching_blocks: list, string_bloc def blocks_list_to_strings_list(blocks_list: list, curr_text: list) -> list: - """ Convert blocks list to len of blocks strings """ + """Convert blocks list to len of blocks strings""" strings_len_list = [] for block in blocks_list: # Append size of block in string - strings_len_list.append(len(' '.join(map(str, curr_text[block.a:block.a + block.size])))) + strings_len_list.append( + len(" ".join(map(str, curr_text[block.a : block.a + block.size]))) + ) return strings_len_list def writing_results(dir_name: str) -> str: - """ Create new directory for results in current working directory """ + """Create new directory for results in current working directory""" curr_directory = path.dirname(getcwd()) - final_directory = path.join(curr_directory, r'results\\' + dir_name) + final_directory = path.join(curr_directory, r"results\\" + dir_name) if not path.exists(final_directory): makedirs(final_directory) @@ -71,7 +86,7 @@ def writing_results(dir_name: str) -> str: def get_color_from_similarity(similarity_score: float) -> str: - """ Return css style according to similarity score """ + """Return css style according to similarity score""" if float(similarity_score) > 15: return "#990033; font-weight: bold" diff --git a/scripts/html_writing.py b/scripts/html_writing.py index 063a1ce..d25517f 100644 --- a/scripts/html_writing.py +++ b/scripts/html_writing.py @@ -13,13 +13,17 @@ from bs4 import BeautifulSoup as Bs from tabulate import tabulate -from html_utils import get_color_from_similarity, get_real_matching_blocks, \ - blocks_list_to_strings_list, get_ordered_blocks_positions +from html_utils import ( + get_color_from_similarity, + get_real_matching_blocks, + blocks_list_to_strings_list, + get_ordered_blocks_positions, +) from utils import is_float def add_links_to_html_table(html_path: str) -> None: - """ Add links to HTML data cells at specified path + """Add links to HTML data cells at specified path This method will link to all HTML TD tags which contain a float different from - 1 the corresponding HTML comparison file. The links will be opened in a new tab. The colors of @@ -27,24 +31,26 @@ def add_links_to_html_table(html_path: str) -> None: """ - with open(html_path, encoding='utf-8') as html: - soup = Bs(html, 'html.parser') + with open(html_path, encoding="utf-8") as html: + soup = Bs(html, "html.parser") file_ind = 0 # Cursor on file number for the naming of html files - for td_tag in soup.findAll('td'): # Retrieve all data celss from html table in path - + for td_tag in soup.findAll( + "td" + ): # Retrieve all data celss from html table in path if is_float(td_tag.text): # If td is not filename or -1 - - tmp = soup.new_tag('a', - href='file:///' + html_path.replace('_results', str(file_ind)), - target="_blank", - style="color:" + get_color_from_similarity(td_tag.text)) + tmp = soup.new_tag( + "a", + href="file:///" + html_path.replace("_results", str(file_ind)), + target="_blank", + style="color:" + get_color_from_similarity(td_tag.text), + ) td_tag.string.wrap(tmp) # We wrap the td string between the hyperlink file_ind += 1 # We update the HTML of the file at path - with open(html_path, 'wb') as f_output: + with open(html_path, "wb") as f_output: f_output.write(soup.prettify("utf-8")) f_output.flush() fsync(f_output.fileno()) @@ -52,7 +58,7 @@ def add_links_to_html_table(html_path: str) -> None: def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> list: - """ Return list of spans with colors for HTML rendering """ + """Return list of spans with colors for HTML rendering""" results = [[], []] # List of spans list @@ -60,21 +66,23 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li matching_blocks = get_real_matching_blocks(text1, text2, block_size) # Generate one unique color for each matching block - colors = [f'#%06X' % randint(0, 0xFFFFFF) for _ in range(len(matching_blocks))] + colors = [f"#{randint(0, 0xFFFFFF):06X}" for _ in range(len(matching_blocks))] # Convert blocks from list of list of strings to list of strings - string_blocks = [' '.join(map(str, text1[b.a:b.a + b.size])) for b in matching_blocks] + string_blocks = [ + " ".join(map(str, text1[b.a : b.a + b.size])) for b in matching_blocks + ] # Store lengths of blocks in text strings_len_list = blocks_list_to_strings_list(matching_blocks, text1) # Convert list of strings to strings - str1, str2 = ' '.join(map(str, text1)), ' '.join(map(str, text2)) + str1, str2 = " ".join(map(str, text1)), " ".join(map(str, text2)) - global_positions_list = [get_ordered_blocks_positions(str1, matching_blocks, - string_blocks), - get_ordered_blocks_positions(str2, matching_blocks, - string_blocks)] + global_positions_list = [ + get_ordered_blocks_positions(str1, matching_blocks, string_blocks), + get_ordered_blocks_positions(str2, matching_blocks, string_blocks), + ] for num, pos_list in enumerate(global_positions_list): cursor = 0 # Cursor on current string @@ -84,13 +92,14 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li for block in pos_list: # Span tag for the text before the matching sequence - span = bs_obj.new_tag('span') - span.string = str1[cursor:block[0]] + span = bs_obj.new_tag("span") + span.string = str1[cursor : block[0]] # Span tag for the text in the matching sequence - blockspan = bs_obj.new_tag('span', - style="color:" + colors[block[1]] + "; font-weight:bold") - blockspan.string = str1[block[0]:block[0] + strings_len_list[block[1]]] + blockspan = bs_obj.new_tag( + "span", style="color:" + colors[block[1]] + "; font-weight:bold" + ) + blockspan.string = str1[block[0] : block[0] + strings_len_list[block[1]]] # Append spans tags to results list results[num].append(span) @@ -100,50 +109,52 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li cursor = block[0] + strings_len_list[block[1]] # End of loop, last span tag for the rest of the text - span = bs_obj.new_tag('span') + span = bs_obj.new_tag("span") span.string = str1[cursor:] results[num].append(span) return results -def papers_comparison(save_dir: str, ind: int, text1: list, text2: list, filenames: tuple, - block_size: int) -> None: - """ Write to HTML file texts that have been compared with highlighted similar blocks """ - - copy(path.join("..", "templates", "template.html"), save_dir) # Copy comparison template to curr dir - comp_path = path.join(save_dir, str(ind) + '.html') - rename(path.join(save_dir, 'template.html'), comp_path) +def papers_comparison( + save_dir: str, ind: int, text1: list, text2: list, filenames: tuple, block_size: int +) -> None: + """Write to HTML file texts that have been compared with highlighted similar blocks""" - with open(comp_path, encoding='utf-8') as html: + copy( + path.join("..", "templates", "template.html"), save_dir + ) # Copy comparison template to curr dir + comp_path = path.join(save_dir, str(ind) + ".html") + rename(path.join(save_dir, "template.html"), comp_path) - soup = Bs(html, 'html.parser') + with open(comp_path, encoding="utf-8") as html: + soup = Bs(html, "html.parser") res = get_span_blocks(soup, text1, text2, block_size) - blocks = soup.findAll(attrs={'class': 'block'}) + blocks = soup.findAll(attrs={"class": "block"}) # Append filename tags and span tags to html for i, filename in enumerate(filenames): - temp_tag = soup.new_tag('h3') + temp_tag = soup.new_tag("h3") temp_tag.string = filename blocks[i].append(temp_tag) for tag in res[i]: blocks[i].append(tag) - with open(comp_path, 'wb') as f_output: + with open(comp_path, "wb") as f_output: f_output.write(soup.prettify("utf-8")) def results_to_html(scores: list, files_names: list, html_path: str) -> None: - """ Write similarity results to HTML page """ + """Write similarity results to HTML page""" for ind, _ in enumerate(files_names): scores[ind].insert(0, files_names[ind]) scores.insert(0, files_names) - scores[0].insert(0, '') + scores[0].insert(0, "") - with open(html_path, 'w', encoding='utf-8') as file: - file.write(tabulate(scores, tablefmt='html')) + with open(html_path, "w", encoding="utf-8") as file: + file.write(tabulate(scores, tablefmt="html")) file.flush() fsync(file.fileno()) file.close() diff --git a/scripts/main.py b/scripts/main.py index 97137a2..f57ada8 100644 --- a/scripts/main.py +++ b/scripts/main.py @@ -26,14 +26,17 @@ def main() -> None: if path.exists(in_dir): # Check if specified path exists if not path.isabs(in_dir): in_dir = path.abspath(in_dir) - if len(listdir(in_dir)) > 1: # Check if there are at least 2 files at specified path + if ( + len(listdir(in_dir)) > 1 + ): # Check if there are at least 2 files at specified path filenames, processed_files = [], [] students_names = get_student_names(in_dir) for ind, direc in enumerate(listdir(in_dir)): if path.isdir(path.join(in_dir, direc)): - for file in listdir(path.join(in_dir, direc)): - file_words = file_extension_call(str(path.join(in_dir, direc, file))) + file_words = file_extension_call( + str(path.join(in_dir, direc, file)) + ) if file_words: # If all files have supported format processed_files.append(file_words) @@ -41,7 +44,8 @@ def main() -> None: else: # At least one file was not supported print( "Remove files which are not txt, pdf, docx or odt and run the " - "script again.") + "script again." + ) sys.exit() if out_dir is not None and path.exists(out_dir): if not path.isabs(out_dir): @@ -49,7 +53,9 @@ def main() -> None: results_directory = out_dir else: # Create new directory for storing html files - results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S")) + results_directory = writing_results( + datetime.now().strftime("%Y%m%d_%H%M%S") + ) difflib_scores = [[] for _ in range(len(processed_files))] file_ind = 0 @@ -61,13 +67,19 @@ def main() -> None: difflib_scores[i].append(difflib_overlap(text, text_bis)) # Write text with matching blocks colored in results directory - papers_comparison(results_directory, file_ind, text, text_bis, - (filenames[i], filenames[j]), block_size) + papers_comparison( + results_directory, + file_ind, + text, + text_bis, + (filenames[i], filenames[j]), + block_size, + ) file_ind += 1 else: difflib_scores[i].append(-1) - results_directory = path.join(results_directory, '_results.html') + results_directory = path.join(results_directory, "_results.html") print(results_directory) results_to_html(difflib_scores, filenames, results_directory) @@ -80,12 +92,13 @@ def main() -> None: else: print( "Minimum number of files is not present. Please check that there are at least " - "two files to compare.") + "two files to compare." + ) sys.exit() else: print("The specified path does not exist : " + in_dir) sys.exit() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/processing_files.py b/scripts/processing_files.py index 36339c3..d0e8bae 100644 --- a/scripts/processing_files.py +++ b/scripts/processing_files.py @@ -10,28 +10,28 @@ def get_file_extension(filepath: str) -> str: - """ Return the file extension of the file at the specified path """ + """Return the file extension of the file at the specified path""" try: return path.splitext(filepath)[1] except (Exception,): print("File extension error") - return '' + return "" def file_extension_call(file: str) -> list: - """ Map file extension to appropriate function """ + """Map file extension to appropriate function""" extension = get_file_extension(file) if extension: - if extension == '.pdf': + if extension == ".pdf": return get_words_from_pdf_file(file) - if extension == '.docx': + if extension == ".docx": return get_words_from_docx_file(file) - if extension == '.odt': + if extension == ".odt": return get_words_from_odt_file(file) - if extension == '.txt': + if extension == ".txt": return get_words_from_txt_file(file) print("File format is not supported. Please convert to pdf, docx, odt or txt") @@ -39,21 +39,20 @@ def file_extension_call(file: str) -> list: def get_words_from_pdf_file(pdf_path: str) -> list: - """ Return list of words from pdf file at specified path """ + """Return list of words from pdf file at specified path""" - with open(pdf_path, 'rb') as file: + with open(pdf_path, "rb") as file: extracted_text = slate.PDF(file) nested_lists_length_sum = sum([len(temp) for temp in extracted_text]) - count_line_return = sum([string.count('\n') for string in extracted_text]) + count_line_return = sum([string.count("\n") for string in extracted_text]) # Check \n ratio compared to length of text if nested_lists_length_sum / count_line_return > 10: - for i, _ in enumerate(extracted_text): - extracted_text[i] = extracted_text[i].replace('\n', ' ') - extracted_text[i] = re.sub('<(.|\n)*?>', '', str(extracted_text[i])) - extracted_text[i] = re.findall(r'\w+', extracted_text[i].lower()) + extracted_text[i] = extracted_text[i].replace("\n", " ") + extracted_text[i] = re.sub("<(.|\n)*?>", "", str(extracted_text[i])) + extracted_text[i] = re.findall(r"\w+", extracted_text[i].lower()) return [item for sublist in extracted_text for item in sublist] @@ -62,48 +61,47 @@ def get_words_from_pdf_file(pdf_path: str) -> list: def get_words_from_special_pdf(pdf_path: str) -> str: - """ Return list of words from pdf file when Slate library can't scrape it """ + """Return list of words from pdf file when Slate library can't scrape it""" with pdfplumber.open(pdf_path) as file: - concat_string = '' + concat_string = "" for page in file.pages: - text_page = page.extract_text()+'\n' + text_page = page.extract_text() + "\n" concat_string += text_page return " ".join(concat_string.replace("\xa0", " ").strip().split()) def get_words_from_txt_file(txt_path: str) -> list: - """ Return list of words from txt file at specified path """ + """Return list of words from txt file at specified path""" words = [] - with open(txt_path, encoding='utf-8') as file: - + with open(txt_path, encoding="utf-8") as file: for line in file: try: for word in line.split(): words.append(word.lower()) - except (UnicodeError, UnicodeDecodeError) as _: + except (UnicodeError, UnicodeDecodeError): pass - str_words = ' '.join(map(str, words)) + str_words = " ".join(map(str, words)) - return re.findall(r'\w+', str_words) + return re.findall(r"\w+", str_words) def get_words_from_docx_file(docx_path: str) -> list: - """ Return list of words from docx file at specified path """ + """Return list of words from docx file at specified path""" with zipfile.ZipFile(docx_path) as docx: - content = docx.read('word/document.xml').decode('utf-8') - cleaned = re.sub('<(.|\n)*?>', '', content) + content = docx.read("word/document.xml").decode("utf-8") + cleaned = re.sub("<(.|\n)*?>", "", content) - return re.findall(r'\w+', cleaned.lower()) + return re.findall(r"\w+", cleaned.lower()) def get_words_from_odt_file(odt_path: str) -> list: - """ Return list of words from odt file at specified path """ + """Return list of words from odt file at specified path""" textdoc = load(odt_path) paragraphs = textdoc.getElementsByType(text.P) @@ -114,4 +112,4 @@ def get_words_from_odt_file(odt_path: str) -> list: temp = teletype.extractText(paragraph) full_text += temp.lower() - return re.findall(r'\w+', full_text) + return re.findall(r"\w+", full_text) diff --git a/scripts/similarity.py b/scripts/similarity.py index ddc3725..eca3fb1 100644 --- a/scripts/similarity.py +++ b/scripts/similarity.py @@ -13,7 +13,7 @@ def difflib_overlap(word_token1: list, word_token2: list) -> float: - """ Get similarity percentage from matching sequences between two strings """ + """Get similarity percentage from matching sequences between two strings""" seq = difflib.SequenceMatcher(a=word_token1, b=word_token2) @@ -22,7 +22,7 @@ def difflib_overlap(word_token1: list, word_token2: list) -> float: def calculate_overlap(word_token1: list, word_token2: list) -> float: - """ Get similarity percentage from usage of similar words in two strings """ + """Get similarity percentage from usage of similar words in two strings""" overlapping_words = [] @@ -36,7 +36,7 @@ def calculate_overlap(word_token1: list, word_token2: list) -> float: def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float: - """ Calculates intersection over union and return Jaccard similarity score """ + """Calculates intersection over union and return Jaccard similarity score""" list1, list2 = remove_numbers(word_tokens1), remove_numbers(word_tokens2) list1, list2 = remove_stop_words(list1), remove_stop_words(list2) diff --git a/scripts/utils.py b/scripts/utils.py index e9230a6..05db92c 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -17,15 +17,21 @@ def parse_options(): parser = argparse.ArgumentParser() parser.add_argument("in_dir", type=str, help="input directory for text files") - parser.add_argument("-o", "--out_dir", type=str, help="output directory for html results files") - parser.add_argument("-s", "--block_size", type=int, help="minimum number of consecutive and " - "similar words detected (default=2)") + parser.add_argument( + "-o", "--out_dir", type=str, help="output directory for html results files" + ) + parser.add_argument( + "-s", + "--block_size", + type=int, + help="minimum number of consecutive and " "similar words detected (default=2)", + ) return parser.parse_args() def is_float(value: float) -> bool: - """ Return true if value is a float and not equal to -1 """ + """Return true if value is a float and not equal to -1""" try: temp = float(value) @@ -35,14 +41,15 @@ def is_float(value: float) -> bool: def get_student_names(main_path): - sub_directories = [name for name in listdir(main_path) - if path.isdir(path.join(main_path, name))] + sub_directories = [ + name for name in listdir(main_path) if path.isdir(path.join(main_path, name)) + ] - return [title.split('_')[0] for title in sub_directories] + return [title.split("_")[0] for title in sub_directories] def pretty_table(scores: list, names: list) -> None: - """ Print similarity results nicely """ + """Print similarity results nicely""" row_format = "{:>15}" * (len(names) + 1) print(row_format.format("", *names)) @@ -51,7 +58,7 @@ def pretty_table(scores: list, names: list) -> None: def wait_for_file(file_path: str, timeout: int = 10) -> bool: - """ Wait for the creation of a specific file. + """Wait for the creation of a specific file. This method checks if the specified file exists and waits for it to appear during the specified amount of time (by default 10 seconds). @@ -73,22 +80,22 @@ def wait_for_file(file_path: str, timeout: int = 10) -> bool: def remove_numbers(words_list: list) -> list: - """ Remove all numbers from strings list to avoid errors """ + """Remove all numbers from strings list to avoid errors""" temp = [w for w in words_list if not isinstance(w, int)] return [w for w in temp if not isinstance(w, float)] def remove_stop_words(words_list: list) -> list: - """ Remove stop words from strings list """ + """Remove stop words from strings list""" - en_stop_words = set(stopwords.words('english')) + en_stop_words = set(stopwords.words("english")) return [w for w in words_list if str(w).lower not in en_stop_words] def lemmatize(words_list: list) -> list: - """ Return lemmatized words list """ + """Return lemmatized words list""" lemmatizer = WordNetLemmatizer()