forked from davisdude/mbox2html
-
Notifications
You must be signed in to change notification settings - Fork 0
/
redact.py
77 lines (69 loc) · 2.61 KB
/
redact.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import bs4
import os
import re
# Matches US-style (+country-(area)-(xxx)-(xxxx)) or plain-style (no
# separators) phone numbers. Everything before and after the area code is
# captured as a whole, so runs of several separator characters (e.g. " - ")
# are re-emitted verbatim instead of being cut down to their last character.
# TODO: Exclude URLs
_PHONE_FIND = re.compile(
    r"""
    (?P<before>
        (?:\s+|^)       # Don't match in the middle (e.g. URL)
        (?:\+\s*\d+)?   # Country code
        [\s\-./]*       # Separator
        \(?\s*          # Area code open paren
    )
    \d{3}               # Area code (the only part that gets replaced)
    (?P<after>
        \s*\)?          # Area code close paren
        [\s\-./]*       # Separator
        \d{3}           # Next 3 digits
        [\s\-./]*       # Separator
        \d{4}           # Final 4 digits
    )
    """,
    re.X,
)
_PHONE_REPL = r"\g<before>XXX\g<after>"

# Group 1 is the address up to (not including) the last dot-separated label
# of the domain; group 2 is that last label (the TLD), which is dropped.
_EMAIL_FIND = re.compile(
    r"""
    ([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-.]+)(\.[a-zA-Z0-9-]+)
    """,
    re.X,
)
_EMAIL_REPL = r"\1.[redacted]"


# Removes area codes from phone numbers and TLDs from emails
def replace(string):
    """Return *string* with phone area codes and email TLDs redacted.

    Three substitutions are applied, in this order:

    1. every literal ``<br>`` becomes a newline;
    2. the 3-digit area code of each matched phone number becomes ``XXX``
       (country code, parentheses and separators are left as they were);
    3. the final ``.label`` of each matched email address becomes
       ``.[redacted]``.

    Args:
        string: The text to redact.

    Returns:
        The redacted text as a new string.
    """
    string = string.replace("<br>", "\n")
    string = _PHONE_FIND.sub(_PHONE_REPL, string)
    string = _EMAIL_FIND.sub(_EMAIL_REPL, string)
    return string
def recursive_replace(soup):
    """Apply ``replace`` to every string reachable from *soup*.

    Three kinds of input are handled:

    * an exact ``str`` is passed through ``replace`` and the result returned;
    * an object without a ``contents`` attribute (a leaf node) is rebuilt as
      a new instance of its own class from its redacted ``.string``;
    * anything else is treated as a container: each child is swapped for its
      redacted counterpart, a ``mailto:`` ``href`` attribute (if any) is
      redacted, and the same object is returned.
    """
    # Exact type check on purpose: leaf nodes may subclass ``str`` and must
    # take the branch below so their class is preserved.
    if type(soup) is str:
        return replace(soup)

    if not hasattr(soup, "contents"):
        node_type = soup.__class__
        return node_type(replace(soup.string))

    for child in soup.contents:
        child.replace_with(recursive_replace(child))

    # Only worry about mailto (for now, at least)
    if "href" in soup.attrs:
        target = soup.attrs["href"]
        if re.match(r"^mailto:", target, re.I) is not None:
            soup["href"] = recursive_replace(target)

    return soup
if __name__ == "__main__":
    # Walk the top level of "out": redact each HTML file in place, and redact
    # the .vcf files found directly inside each sub-directory.
    for item in os.scandir("out"):
        if item.name.endswith(".html") and item.is_file():
            with open(item.path, "r") as source:
                soup = bs4.BeautifulSoup(source, "html.parser")
            recursive_replace(soup)
            with open(item.path, "w") as target:
                target.write(soup.prettify())
            continue

        if not item.is_dir():
            continue

        for attachment in os.scandir(item.path):
            is_vcard = attachment.name.endswith(".vcf") and attachment.is_file()
            if not is_vcard:
                continue
            with open(attachment.path, "r") as source:
                original = source.read()
            redacted = replace(original)
            with open(attachment.path, "w") as target:
                target.write(redacted)