Skip to content

Commit

Permalink
support fonttbl
Browse files Browse the repository at this point in the history
  • Loading branch information
sannies committed Oct 25, 2024
1 parent e68ead6 commit 1a5c867
Showing 1 changed file with 75 additions and 15 deletions.
90 changes: 75 additions & 15 deletions striprtf/striprtf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import re
import codecs

"""
Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676
and modified for better output of tables.
"""


# fmt: off
# control words which specify a "destination".
destinations = frozenset((
Expand All @@ -16,7 +16,7 @@
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
'ffname','ffstattext','file','filetbl','fldinst','fldtype',
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
'fname','fontemb','fontfile','footer','footerf','footerl','footerr',
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
Expand Down Expand Up @@ -49,15 +49,50 @@
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
'xmlopen',
))
))
# fmt: on

charset_map = {
0: 'cp1252', # Default
42: 'cp1252', # Symbol
77: 'mac_roman', # Mac Roman
78: 'mac_japanese', # Mac Japanese
79: 'mac_chinesetrad', # Mac Traditional Chinese
80: 'mac_korean', # Mac Korean
81: 'mac_arabic', # Mac Arabic
82: 'mac_hebrew', # Mac Hebrew
83: 'mac_greek', # Mac Greek
84: 'mac_cyrillic', # Mac Cyrillic
85: 'mac_chinesesimp', # Mac Simplified Chinese
86: 'mac_rumanian', # Mac Romanian
87: 'mac_ukrainian', # Mac Ukrainian
88: 'mac_thai', # Mac Thai
89: 'mac_ce', # Mac Central European
128: 'cp932', # Japanese
129: 'cp949', # Korean
130: 'cp1361', # Johab (Korean)
134: 'cp936', # Simplified Chinese (GBK)
136: 'cp950', # Traditional Chinese (Big5)
161: 'cp1253', # Greek
162: 'cp1254', # Turkish
163: 'cp1258', # Vietnamese
177: 'cp1255', # Hebrew
178: 'cp1256', # Arabic
186: 'cp1257', # Baltic
204: 'cp1251', # Cyrillic
222: 'cp874', # Thai
238: 'cp1250', # Eastern European
254: 'cp437', # OEM United States
255: 'cp850', # OEM Multilingual Latin 1
}

# Translation of some special characters.
specialchars = {
# and section characters reset formatting
sectionchars = {
"par": "\n",
"sect": "\n\n",
"page": "\n\n",
"page": "\n\n"
}
specialchars = {
"line": "\n",
"tab": "\t",
"emdash": "\u2014",
Expand All @@ -82,7 +117,7 @@
"-": "\xad",
"_": "\u2011"

}
} | sectionchars

PATTERN = re.compile(
r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
Expand All @@ -94,7 +129,8 @@
re.IGNORECASE
)




def rtf_to_text(text, encoding="cp1252", errors="strict"):
""" Converts the rtf text to plain text.
Expand All @@ -103,7 +139,7 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
text : str
The rtf text
encoding : str
Input encoding which is ignored if the rtf file contains an explicit codepage directive,
Input encoding which is ignored if the rtf file contains an explicit codepage directive,
as it is typically the case. Defaults to `cp1252` encoding as it the most commonly used.
errors : str
How to handle encoding errors. Default is "strict", which throws an error. Another
Expand All @@ -114,9 +150,13 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
str
the converted rtf text as a python unicode string
"""
text = re.sub(HYPERLINKS, "\\1(\\2)", text) # captures links like link_text(http://link_dest)
text = re.sub(HYPERLINKS, "\\1(\\2)", text) # captures links like link_text(http://link_dest)
stack = []
fonttbl = {}
default_font = None
current_font = None
ignorable = False # Whether this group (and all inside it) are "ignorable".
suppress_output = False # Whether this group (and all inside it) are "ignorable".
ucskip = 1 # Number of ASCII characters to skip after a unicode character.
curskip = 0 # Number of ASCII characters left to skip
hexes = None
Expand All @@ -125,17 +165,17 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
for match in PATTERN.finditer(text):
word, arg, _hex, char, brace, tchar = match.groups()
if hexes and not _hex:
out += bytes.fromhex(hexes).decode(encoding=encoding, errors=errors)
out += bytes.fromhex(hexes).decode(encoding=fonttbl.get(current_font, {'encoding': encoding}).get('encoding'), errors=errors)
hexes = None
if brace:
curskip = 0
if brace == "{":
# Push state
stack.append((ucskip, ignorable))
stack.append((ucskip, ignorable, suppress_output))
elif brace == "}":
# Pop state
if stack:
ucskip, ignorable = stack.pop()
ucskip, ignorable, suppress_output = stack.pop()
# sample_3.rtf throws an IndexError because of stack being empty.
# don't know right now how this could happen, so for now this is
# a ugly hack to prevent it
Expand All @@ -145,8 +185,10 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
elif char: # \x (not a letter)
curskip = 0
if char in specialchars:
if char in sectionchars:
current_font = default_font
if not ignorable:
out += specialchars[char]
out += specialchars[char]
elif char == "*":
ignorable = True
elif word: # \foo
Expand Down Expand Up @@ -176,6 +218,20 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
c += 0x10000
out += chr(c)
curskip = ucskip
elif word == "f":
current_font = arg
if current_font not in fonttbl:
fonttbl[current_font] = {}
elif word == "fonttbl":
fonttbl = {}
suppress_output = True
elif word == "fcharset":
fonttbl[current_font]['charset'] = arg
fonttbl[current_font]['encoding'] = charset_map.get(int(arg), encoding)
ignorable = True
elif word == "deff":
default_font = arg

elif _hex: # \'xx
if curskip > 0:
curskip -= 1
Expand All @@ -188,6 +244,10 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
elif not ignorable and not suppress_output:
out += tchar
print(fonttbl)



return out

0 comments on commit 1a5c867

Please sign in to comment.