diff --git a/striprtf/striprtf.py b/striprtf/striprtf.py index fe3bf4a..1ea79b8 100644 --- a/striprtf/striprtf.py +++ b/striprtf/striprtf.py @@ -1,11 +1,11 @@ import re import codecs + """ Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676 and modified for better output of tables. """ - # fmt: off # control words which specify a "destination". destinations = frozenset(( @@ -16,7 +16,7 @@ 'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt', 'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl', 'ffname','ffstattext','file','filetbl','fldinst','fldtype', - 'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr', + 'fname','fontemb','fontfile','footer','footerf','footerl','footerr', 'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl', 'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc', 'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers', @@ -49,15 +49,50 @@ 'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform', 'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl', 'xmlopen', - )) +)) # fmt: on - +charset_map = { + 0: 'cp1252', # Default + 42: 'cp1252', # Symbol + 77: 'mac_roman', # Mac Roman + 78: 'mac_japanese', # Mac Japanese + 79: 'mac_chinesetrad', # Mac Traditional Chinese + 80: 'mac_korean', # Mac Korean + 81: 'mac_arabic', # Mac Arabic + 82: 'mac_hebrew', # Mac Hebrew + 83: 'mac_greek', # Mac Greek + 84: 'mac_cyrillic', # Mac Cyrillic + 85: 'mac_chinesesimp', # Mac Simplified Chinese + 86: 'mac_rumanian', # Mac Romanian + 87: 'mac_ukrainian', # Mac Ukrainian + 88: 'mac_thai', # Mac Thai + 89: 'mac_ce', # Mac Central European + 128: 'cp932', # Japanese + 129: 'cp949', # Korean + 130: 'cp1361', # Johab (Korean) + 134: 'cp936', # Simplified Chinese (GBK) + 136: 'cp950', # Traditional Chinese (Big5) + 161: 'cp1253', # Greek + 162: 'cp1254', # Turkish + 163: 'cp1258', # Vietnamese + 177: 'cp1255', # Hebrew + 178: 'cp1256', # Arabic + 186: 'cp1257', # Baltic + 204: 'cp1251', # Cyrillic + 222: 'cp874', # Thai + 238: 'cp1250', # Eastern European + 254: 'cp437', # OEM United States + 255: 'cp850', # OEM Multilingual Latin 1 + } # Translation of some special characters. -specialchars = { +# and section characters reset formatting +sectionchars = { "par": "\n", "sect": "\n\n", - "page": "\n\n", + "page": "\n\n" +} +specialchars = { "line": "\n", "tab": "\t", "emdash": "\u2014", @@ -82,7 +117,7 @@ "-": "\xad", "_": "\u2011" -} +} | sectionchars PATTERN = re.compile( r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", @@ -94,7 +129,8 @@ re.IGNORECASE ) - + + def rtf_to_text(text, encoding="cp1252", errors="strict"): """ Converts the rtf text to plain text. @@ -103,7 +139,7 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"): text : str The rtf text encoding : str - Input encoding which is ignored if the rtf file contains an explicit codepage directive, + Input encoding which is ignored if the rtf file contains an explicit codepage directive, as it is typically the case. Defaults to `cp1252` encoding as it the most commonly used. errors : str How to handle encoding errors. Default is "strict", which throws an error. Another @@ -114,9 +150,13 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"): str the converted rtf text as a python unicode string """ - text = re.sub(HYPERLINKS, "\\1(\\2)", text) # captures links like link_text(http://link_dest) + text = re.sub(HYPERLINKS, "\\1(\\2)", text) # captures links like link_text(http://link_dest) stack = [] + fonttbl = {} + default_font = None + current_font = None ignorable = False # Whether this group (and all inside it) are "ignorable". + suppress_output = False # Whether this group (and all inside it) are "ignorable". ucskip = 1 # Number of ASCII characters to skip after a unicode character. curskip = 0 # Number of ASCII characters left to skip hexes = None @@ -125,17 +165,17 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"): for match in PATTERN.finditer(text): word, arg, _hex, char, brace, tchar = match.groups() if hexes and not _hex: - out += bytes.fromhex(hexes).decode(encoding=encoding, errors=errors) + out += bytes.fromhex(hexes).decode(encoding=fonttbl.get(current_font, {'encoding': encoding}).get('encoding'), errors=errors) hexes = None if brace: curskip = 0 if brace == "{": # Push state - stack.append((ucskip, ignorable)) + stack.append((ucskip, ignorable, suppress_output)) elif brace == "}": # Pop state if stack: - ucskip, ignorable = stack.pop() + ucskip, ignorable, suppress_output = stack.pop() # sample_3.rtf throws an IndexError because of stack being empty. # don't know right now how this could happen, so for now this is # a ugly hack to prevent it @@ -145,8 +185,10 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"): elif char: # \x (not a letter) curskip = 0 if char in specialchars: + if char in sectionchars: + current_font = default_font if not ignorable: - out += specialchars[char] + out += specialchars[char] elif char == "*": ignorable = True elif word: # \foo @@ -176,6 +218,20 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"): c += 0x10000 out += chr(c) curskip = ucskip + elif word == "f": + current_font = arg + if current_font not in fonttbl: + fonttbl[current_font] = {} + elif word == "fonttbl": + fonttbl = {} + suppress_output = True + elif word == "fcharset": + fonttbl[current_font]['charset'] = arg + fonttbl[current_font]['encoding'] = charset_map.get(int(arg), encoding) + ignorable = True + elif word == "deff": + default_font = arg + elif _hex: # \'xx if curskip > 0: curskip -= 1 @@ -188,6 +244,10 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"): elif tchar: if curskip > 0: curskip -= 1 - elif not ignorable: + elif not ignorable and not suppress_output: out += tchar + print(fonttbl) + + + return out