Skip to content

Commit

Permalink
Avoid keeping all files in memory for find-unusual-characters
Browse files (browse the repository at this point in the history)
Each file can be processed independently, so processing them one at a
time reduces total memory consumption.
  • Loading branch information
apasel422 committed Jun 8, 2024
1 parent 14ecff0 commit fca5bdc
Showing 1 changed file with 28 additions and 32 deletions.
60 changes: 28 additions & 32 deletions se/commands/find_unusual_characters.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,32 +26,6 @@ def find_unusual_characters(plain_output: bool) -> int:
return_code = 0
unusual_characters: Dict[str, int] = {} # key: word; value: count
target_filenames = se.get_target_filenames(args.targets, ".xhtml")
files_xhtml = []

# Read files and cache for later
for filename in target_filenames:
try:
with open(filename, "r", encoding="utf-8") as file:
xhtml = file.read()
dom = se.easy_xml.EasyXmlTree(xhtml)

# Save any `alt` and `title` attributes because we may be interested in their contents
for node in dom.xpath("//*[@alt or @title]"):
for _, value in node.attrs.items():
xhtml = xhtml + f" {value} "

# Strip tags
xhtml = regex.sub(r"<[^>]+?>", " ", xhtml)

files_xhtml.append(xhtml)

except FileNotFoundError:
se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output)
return_code = se.InvalidInputException.code

except se.SeException as ex:
se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output)
return_code = ex.code

# Create a regex for unusual characters.
# The result is a series of Unicode ranges that cover the characters
Expand Down Expand Up @@ -118,12 +92,34 @@ def find_unusual_characters(plain_output: bool) -> int:
unusual_character_set += "\u2e3c-\ufefe"
unusual_character_set += "]"

for xhtml in files_xhtml:
for character in regex.findall(unusual_character_set, xhtml):
if character in unusual_characters:
unusual_characters[character] = unusual_characters[character] + len(character)
else:
unusual_characters[character] = len(character)
# Read files and process one at a time
for filename in target_filenames:
try:
with open(filename, "r", encoding="utf-8") as file:
xhtml = file.read()
dom = se.easy_xml.EasyXmlTree(xhtml)

# Save any `alt` and `title` attributes because we may be interested in their contents
for node in dom.xpath("//*[@alt or @title]"):
for _, value in node.attrs.items():
xhtml = xhtml + f" {value} "

# Strip tags
xhtml = regex.sub(r"<[^>]+?>", " ", xhtml)

for character in regex.findall(unusual_character_set, xhtml):
if character in unusual_characters:
unusual_characters[character] = unusual_characters[character] + len(character)
else:
unusual_characters[character] = len(character)

except FileNotFoundError:
se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output)
return_code = se.InvalidInputException.code

except se.SeException as ex:
se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output)
return_code = ex.code

# Sort and prepare the output
lines = []
Expand Down

0 comments on commit fca5bdc

Please sign in to comment.