support fonttbl

castlabs · Oct 25, 2024 · 1a5c867 · 1a5c867
1 parent e68ead6
commit 1a5c867
Showing 1 changed file with 75 additions and 15 deletions.
diff --git a/striprtf/striprtf.py b/striprtf/striprtf.py
@@ -1,11 +1,11 @@
 import re
 import codecs
+
 """
 Taken from https://gist.github.com/gilsondev/7c1d2d753ddb522e7bc22511cfb08676
 and modified for better output of tables.
 """
 
-
 # fmt: off
 # control words which specify a "destination".
 destinations = frozenset((
@@ -16,7 +16,7 @@
     'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
     'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
     'ffname','ffstattext','file','filetbl','fldinst','fldtype',
-    'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
+    'fname','fontemb','fontfile','footer','footerf','footerl','footerr',
     'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
     'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
     'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
@@ -49,15 +49,50 @@
     'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
     'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
     'xmlopen',
-    ))
+))
 # fmt: on
-
+charset_map = {
+        0: 'cp1252',  # Default
+        42: 'cp1252',  # Symbol
+        77: 'mac_roman',  # Mac Roman
+        78: 'mac_japanese',  # Mac Japanese
+        79: 'mac_chinesetrad',  # Mac Traditional Chinese
+        80: 'mac_korean',  # Mac Korean
+        81: 'mac_arabic',  # Mac Arabic
+        82: 'mac_hebrew',  # Mac Hebrew
+        83: 'mac_greek',  # Mac Greek
+        84: 'mac_cyrillic',  # Mac Cyrillic
+        85: 'mac_chinesesimp',  # Mac Simplified Chinese
+        86: 'mac_rumanian',  # Mac Romanian
+        87: 'mac_ukrainian',  # Mac Ukrainian
+        88: 'mac_thai',  # Mac Thai
+        89: 'mac_ce',  # Mac Central European
+        128: 'cp932',  # Japanese
+        129: 'cp949',  # Korean
+        130: 'cp1361',  # Johab (Korean)
+        134: 'cp936',  # Simplified Chinese (GBK)
+        136: 'cp950',  # Traditional Chinese (Big5)
+        161: 'cp1253',  # Greek
+        162: 'cp1254',  # Turkish
+        163: 'cp1258',  # Vietnamese
+        177: 'cp1255',  # Hebrew
+        178: 'cp1256',  # Arabic
+        186: 'cp1257',  # Baltic
+        204: 'cp1251',  # Cyrillic
+        222: 'cp874',  # Thai
+        238: 'cp1250',  # Eastern European
+        254: 'cp437',  # OEM United States
+        255: 'cp850',  # OEM Multilingual Latin 1
+    }
 
 # Translation of some special characters.
-specialchars = {
+# Section characters are kept in their own table because they also reset the current font.
+sectionchars = {
     "par": "\n",
     "sect": "\n\n",
-    "page": "\n\n",
+    "page": "\n\n"
+}
+specialchars = {
     "line": "\n",
     "tab": "\t",
     "emdash": "\u2014",
@@ -82,7 +117,7 @@
     "-": "\xad",
     "_": "\u2011"
 
-}
+} | sectionchars
 
 PATTERN = re.compile(
     r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
@@ -94,7 +129,8 @@
     re.IGNORECASE
 )
 
-
+
+
 def rtf_to_text(text, encoding="cp1252", errors="strict"):
     """ Converts the rtf text to plain text.
 
@@ -103,7 +139,7 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
     text : str
         The rtf text
     encoding : str
-        Input encoding which is ignored if the rtf file contains an explicit codepage directive, 
+        Input encoding which is ignored if the rtf file contains an explicit codepage directive,
         as it is typically the case. Defaults to `cp1252` encoding as it the most commonly used.
     errors : str
         How to handle encoding errors. Default is "strict", which throws an error. Another
@@ -114,9 +150,13 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
     str
         the converted rtf text as a python unicode string
     """
-    text = re.sub(HYPERLINKS, "\\1(\\2)", text) # captures links like link_text(http://link_dest)
+    text = re.sub(HYPERLINKS, "\\1(\\2)", text)  # captures links like link_text(http://link_dest)
     stack = []
+    fonttbl = {}
+    default_font = None
+    current_font = None
     ignorable = False  # Whether this group (and all inside it) are "ignorable".
+    suppress_output = False  # Whether plain text in this group (and all inside it) is dropped, e.g. within \fonttbl.
     ucskip = 1  # Number of ASCII characters to skip after a unicode character.
     curskip = 0  # Number of ASCII characters left to skip
     hexes = None
@@ -125,17 +165,17 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
     for match in PATTERN.finditer(text):
         word, arg, _hex, char, brace, tchar = match.groups()
         if hexes and not _hex:
-            out += bytes.fromhex(hexes).decode(encoding=encoding, errors=errors)
+            out += bytes.fromhex(hexes).decode(encoding=fonttbl.get(current_font, {'encoding': encoding}).get('encoding'), errors=errors)
             hexes = None
         if brace:
             curskip = 0
             if brace == "{":
                 # Push state
-                stack.append((ucskip, ignorable))
+                stack.append((ucskip, ignorable, suppress_output))
             elif brace == "}":
                 # Pop state
                 if stack:
-                    ucskip, ignorable = stack.pop()
+                    ucskip, ignorable, suppress_output = stack.pop()
                 # sample_3.rtf throws an IndexError because of stack being empty.
                 # don't know right now how this could happen, so for now this is
                 # a ugly hack to prevent it
@@ -145,8 +185,10 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
         elif char:  # \x (not a letter)
             curskip = 0
             if char in specialchars:
+                if char in sectionchars:
+                    current_font = default_font
                 if not ignorable:
-                   out += specialchars[char]
+                    out += specialchars[char]
             elif char == "*":
                 ignorable = True
         elif word:  # \foo
@@ -176,6 +218,20 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
                         c += 0x10000
                     out += chr(c)
                     curskip = ucskip
+            elif word == "f":
+                current_font = arg
+                if current_font not in fonttbl:
+                    fonttbl[current_font] = {}
+            elif word == "fonttbl":
+                fonttbl = {}
+                suppress_output = True
+            elif word == "fcharset":
+                fonttbl[current_font]['charset'] = arg
+                fonttbl[current_font]['encoding'] = charset_map.get(int(arg), encoding)
+                ignorable = True
+            elif word == "deff":
+                default_font = arg
+
         elif _hex:  # \'xx
             if curskip > 0:
                 curskip -= 1
@@ -188,6 +244,10 @@ def rtf_to_text(text, encoding="cp1252", errors="strict"):
         elif tchar:
             if curskip > 0:
                 curskip -= 1
-            elif not ignorable:
+            elif not ignorable and not suppress_output:
                 out += tchar
+    print(fonttbl)
+
+
+
     return out