formatting and some warnings

Wazzabeee · Nov 27, 2023 · commit 2229eb2
1 parent ec26aa5

Showing 6 changed files with 156 additions and 112 deletions.
diff --git a/scripts/html_utils.py b/scripts/html_utils.py
@@ -10,29 +10,42 @@
 from os import getcwd, path, makedirs
 
 
-def get_real_matching_blocks(words_list1: list, words_list2: list, minimum_size: int = 2) -> list:
-    """ Return list of matching blocks with size greater than n """
-
-    matching_blocks = difflib.SequenceMatcher(a=words_list1, b=words_list2).get_matching_blocks()
+def get_real_matching_blocks(
+    words_list1: list, words_list2: list, minimum_size: int = 2
+) -> list:
+    """Return list of matching blocks with size greater than n"""
+
+    matching_blocks = difflib.SequenceMatcher(
+        a=words_list1, b=words_list2
+    ).get_matching_blocks()
     if minimum_size and minimum_size > 0:
         return [b for b in matching_blocks if b.size >= minimum_size]
     else:
         return [b for b in matching_blocks if b.size >= 2]
 
 
-def get_ordered_blocks_positions(string: str, matching_blocks: list, string_blocks: list) -> list:
-    """ Return ordered list of all positions of matching blocks in string """
+def get_ordered_blocks_positions(
+    string: str, matching_blocks: list, string_blocks: list
+) -> list:
+    """Return ordered list of all positions of matching blocks in string"""
 
     all_blocks_positions = []
 
     for block_ind, _ in enumerate(matching_blocks):
         # Find all positions of substring in string
-        block_positions = [char for char in range(len(string)) if string.startswith(
-            string_blocks[block_ind], char)]
+        block_positions = [
+            char
+            for char in range(len(string))
+            if string.startswith(string_blocks[block_ind], char)
+        ]
 
         for position in block_positions:
             # We check if there is another block starting at the same position
-            var = [pos_tuple for pos_tuple in all_blocks_positions if pos_tuple[0] == position]
+            var = [
+                pos_tuple
+                for pos_tuple in all_blocks_positions
+                if pos_tuple[0] == position
+            ]
             if var:  # If there is one such block
                 size = len(string_blocks[var[0][1]])  # get size of block in var
                 if size < len(string_blocks[block_ind]):
@@ -48,30 +61,32 @@ def get_ordered_blocks_positions(string: str, matching_blocks: list, string_bloc
 
 
 def blocks_list_to_strings_list(blocks_list: list, curr_text: list) -> list:
-    """ Convert blocks list to len of blocks strings """
+    """Convert blocks list to len of blocks strings"""
 
     strings_len_list = []
 
     for block in blocks_list:
         # Append size of block in string
-        strings_len_list.append(len(' '.join(map(str, curr_text[block.a:block.a + block.size]))))
+        strings_len_list.append(
+            len(" ".join(map(str, curr_text[block.a : block.a + block.size])))
+        )
 
     return strings_len_list
 
 
 def writing_results(dir_name: str) -> str:
-    """ Create new directory for results in current working directory """
+    """Create new directory for results in current working directory"""
 
     curr_directory = path.dirname(getcwd())
-    final_directory = path.join(curr_directory, r'results\\' + dir_name)
+    final_directory = path.join(curr_directory, r"results\\" + dir_name)
     if not path.exists(final_directory):
         makedirs(final_directory)
 
     return final_directory
 
 
 def get_color_from_similarity(similarity_score: float) -> str:
-    """ Return css style according to similarity score """
+    """Return css style according to similarity score"""
 
     if float(similarity_score) > 15:
         return "#990033; font-weight: bold"

diff --git a/scripts/html_writing.py b/scripts/html_writing.py
@@ -13,68 +13,76 @@
 from bs4 import BeautifulSoup as Bs
 from tabulate import tabulate
 
-from html_utils import get_color_from_similarity, get_real_matching_blocks, \
-    blocks_list_to_strings_list, get_ordered_blocks_positions
+from html_utils import (
+    get_color_from_similarity,
+    get_real_matching_blocks,
+    blocks_list_to_strings_list,
+    get_ordered_blocks_positions,
+)
 from utils import is_float
 
 
 def add_links_to_html_table(html_path: str) -> None:
-    """ Add links to HTML data cells at specified path
+    """Add links to HTML data cells at specified path
 
     This method will link to all HTML TD tags which contain a float different from - 1 the
     corresponding HTML comparison file. The links will be opened in a new tab. The colors of
     the text in tag will change depending on similarity score.
 
     """
 
-    with open(html_path, encoding='utf-8') as html:
-        soup = Bs(html, 'html.parser')
+    with open(html_path, encoding="utf-8") as html:
+        soup = Bs(html, "html.parser")
         file_ind = 0  # Cursor on file number for the naming of html files
 
-        for td_tag in soup.findAll('td'):  # Retrieve all data celss from html table in path
-
+        for td_tag in soup.findAll(
+            "td"
+        ):  # Retrieve all data cells from html table in path
             if is_float(td_tag.text):  # If td is not filename or -1
-
-                tmp = soup.new_tag('a',
-                                   href='file:///' + html_path.replace('_results', str(file_ind)),
-                                   target="_blank",
-                                   style="color:" + get_color_from_similarity(td_tag.text))
+                tmp = soup.new_tag(
+                    "a",
+                    href="file:///" + html_path.replace("_results", str(file_ind)),
+                    target="_blank",
+                    style="color:" + get_color_from_similarity(td_tag.text),
+                )
 
                 td_tag.string.wrap(tmp)  # We wrap the td string between the hyperlink
                 file_ind += 1
 
         # We update the HTML of the file at path
-        with open(html_path, 'wb') as f_output:
+        with open(html_path, "wb") as f_output:
             f_output.write(soup.prettify("utf-8"))
             f_output.flush()
             fsync(f_output.fileno())
             f_output.close()
 
 
 def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> list:
-    """ Return list of spans with colors for HTML rendering """
+    """Return list of spans with colors for HTML rendering"""
 
     results = [[], []]  # List of spans list
 
     # Get matching blocks with chosen minimum size
     matching_blocks = get_real_matching_blocks(text1, text2, block_size)
 
     # Generate one unique color for each matching block
-    colors = [f'#%06X' % randint(0, 0xFFFFFF) for _ in range(len(matching_blocks))]
+    colors = [f"#{randint(0, 0xFFFFFF):06X}" for _ in range(len(matching_blocks))]
 
     # Convert blocks from list of list of strings to list of strings
-    string_blocks = [' '.join(map(str, text1[b.a:b.a + b.size])) for b in matching_blocks]
+    string_blocks = [
+        " ".join(map(str, text1[b.a : b.a + b.size])) for b in matching_blocks
+    ]
 
     # Store lengths of blocks in text
     strings_len_list = blocks_list_to_strings_list(matching_blocks, text1)
 
     # Convert list of strings to strings
-    str1, str2 = ' '.join(map(str, text1)), ' '.join(map(str, text2))
+    str1, str2 = " ".join(map(str, text1)), " ".join(map(str, text2))
 
-    global_positions_list = [get_ordered_blocks_positions(str1, matching_blocks,
-                                                          string_blocks),
-                             get_ordered_blocks_positions(str2, matching_blocks,
-                                                          string_blocks)]
+    global_positions_list = [
+        get_ordered_blocks_positions(str1, matching_blocks, string_blocks),
+        get_ordered_blocks_positions(str2, matching_blocks, string_blocks),
+    ]
 
     for num, pos_list in enumerate(global_positions_list):
         cursor = 0  # Cursor on current string
@@ -84,13 +92,14 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li
 
         for block in pos_list:
             # Span tag for the text before the matching sequence
-            span = bs_obj.new_tag('span')
-            span.string = str1[cursor:block[0]]
+            span = bs_obj.new_tag("span")
+            span.string = str1[cursor : block[0]]
 
             # Span tag for the text in the matching sequence
-            blockspan = bs_obj.new_tag('span',
-                                       style="color:" + colors[block[1]] + "; font-weight:bold")
-            blockspan.string = str1[block[0]:block[0] + strings_len_list[block[1]]]
+            blockspan = bs_obj.new_tag(
+                "span", style="color:" + colors[block[1]] + "; font-weight:bold"
+            )
+            blockspan.string = str1[block[0] : block[0] + strings_len_list[block[1]]]
 
             # Append spans tags to results list
             results[num].append(span)
@@ -100,50 +109,52 @@ def get_span_blocks(bs_obj: Bs, text1: list, text2: list, block_size: int) -> li
             cursor = block[0] + strings_len_list[block[1]]
 
         # End of loop, last span tag for the rest of the text
-        span = bs_obj.new_tag('span')
+        span = bs_obj.new_tag("span")
         span.string = str1[cursor:]
         results[num].append(span)
 
     return results
 
 
-def papers_comparison(save_dir: str, ind: int, text1: list, text2: list, filenames: tuple,
-                      block_size: int) -> None:
-    """ Write to HTML file texts that have been compared with highlighted similar blocks """
-
-    copy(path.join("..", "templates", "template.html"), save_dir)  # Copy comparison template to curr dir
-    comp_path = path.join(save_dir, str(ind) + '.html')
-    rename(path.join(save_dir, 'template.html'), comp_path)
+def papers_comparison(
+    save_dir: str, ind: int, text1: list, text2: list, filenames: tuple, block_size: int
+) -> None:
+    """Write to HTML file texts that have been compared with highlighted similar blocks"""
 
-    with open(comp_path, encoding='utf-8') as html:
+    copy(
+        path.join("..", "templates", "template.html"), save_dir
+    )  # Copy comparison template to curr dir
+    comp_path = path.join(save_dir, str(ind) + ".html")
+    rename(path.join(save_dir, "template.html"), comp_path)
 
-        soup = Bs(html, 'html.parser')
+    with open(comp_path, encoding="utf-8") as html:
+        soup = Bs(html, "html.parser")
         res = get_span_blocks(soup, text1, text2, block_size)
-        blocks = soup.findAll(attrs={'class': 'block'})
+        blocks = soup.findAll(attrs={"class": "block"})
 
         # Append filename tags and span tags to html
         for i, filename in enumerate(filenames):
-            temp_tag = soup.new_tag('h3')
+            temp_tag = soup.new_tag("h3")
             temp_tag.string = filename
             blocks[i].append(temp_tag)
             for tag in res[i]:
                 blocks[i].append(tag)
 
-    with open(comp_path, 'wb') as f_output:
+    with open(comp_path, "wb") as f_output:
         f_output.write(soup.prettify("utf-8"))
 
 
 def results_to_html(scores: list, files_names: list, html_path: str) -> None:
-    """  Write similarity results to HTML page """
+    """Write similarity results to HTML page"""
 
     for ind, _ in enumerate(files_names):
         scores[ind].insert(0, files_names[ind])
 
     scores.insert(0, files_names)
-    scores[0].insert(0, '')
+    scores[0].insert(0, "")
 
-    with open(html_path, 'w', encoding='utf-8') as file:
-        file.write(tabulate(scores, tablefmt='html'))
+    with open(html_path, "w", encoding="utf-8") as file:
+        file.write(tabulate(scores, tablefmt="html"))
         file.flush()
         fsync(file.fileno())
         file.close()
diff --git a/scripts/main.py b/scripts/main.py
@@ -26,30 +26,36 @@ def main() -> None:
     if path.exists(in_dir):  # Check if specified path exists
         if not path.isabs(in_dir):
             in_dir = path.abspath(in_dir)
-        if len(listdir(in_dir)) > 1:  # Check if there are at least 2 files at specified path
+        if (
+            len(listdir(in_dir)) > 1
+        ):  # Check if there are at least 2 files at specified path
             filenames, processed_files = [], []
             students_names = get_student_names(in_dir)
             for ind, direc in enumerate(listdir(in_dir)):
                 if path.isdir(path.join(in_dir, direc)):
-
                     for file in listdir(path.join(in_dir, direc)):
-                        file_words = file_extension_call(str(path.join(in_dir, direc, file)))
+                        file_words = file_extension_call(
+                            str(path.join(in_dir, direc, file))
+                        )
 
                         if file_words:  # If all files have supported format
                             processed_files.append(file_words)
                             filenames.append(students_names[ind])
                         else:  # At least one file was not supported
                             print(
                                 "Remove files which are not txt, pdf, docx or odt and run the "
-                                "script again.")
+                                "script again."
+                            )
                             sys.exit()
             if out_dir is not None and path.exists(out_dir):
                 if not path.isabs(out_dir):
                     out_dir = path.abspath(out_dir)
                 results_directory = out_dir
             else:
                 # Create new directory for storing html files
-                results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))
+                results_directory = writing_results(
+                    datetime.now().strftime("%Y%m%d_%H%M%S")
+                )
 
             difflib_scores = [[] for _ in range(len(processed_files))]
             file_ind = 0
@@ -61,13 +67,19 @@ def main() -> None:
                         difflib_scores[i].append(difflib_overlap(text, text_bis))
 
                         # Write text with matching blocks colored in results directory
-                        papers_comparison(results_directory, file_ind, text, text_bis,
-                                          (filenames[i], filenames[j]), block_size)
+                        papers_comparison(
+                            results_directory,
+                            file_ind,
+                            text,
+                            text_bis,
+                            (filenames[i], filenames[j]),
+                            block_size,
+                        )
                         file_ind += 1
                     else:
                         difflib_scores[i].append(-1)
 
-            results_directory = path.join(results_directory, '_results.html')
+            results_directory = path.join(results_directory, "_results.html")
             print(results_directory)
 
             results_to_html(difflib_scores, filenames, results_directory)
@@ -80,12 +92,13 @@ def main() -> None:
         else:
             print(
                 "Minimum number of files is not present. Please check that there are at least "
-                "two files to compare.")
+                "two files to compare."
+            )
             sys.exit()
     else:
         print("The specified path does not exist : " + in_dir)
         sys.exit()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()