|
| 1 | +#!/usr/bin/python3 |
| 2 | + |
| 3 | +# Extracts translatable strings from HTML files in the following forms: |
| 4 | +# |
| 5 | +# <tag translate>String</tag> |
| 6 | +# <tag translate context="value">String</tag> |
| 7 | +# |
# Note that the code which actually uses these strings to translate the HTML
# may not support every string extracted here.
| 10 | + |
| 11 | + |
| 12 | +import argparse |
| 13 | +import collections.abc |
| 14 | +import json |
| 15 | +import pathlib |
| 16 | +from collections import defaultdict |
| 17 | +from html.parser import HTMLParser |
| 18 | + |
# Header entry written verbatim at the top of the generated PO file.  This is
# a raw string, so each "\n" below is written out as a literal backslash
# escape rather than as a newline character.
PO_HEADER = r"""msgid ""
msgstr ""
"Project-Id-Version: PACKAGE_VERSION\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Generator: Cockpit html2po\n"
"""
| 27 | + |
| 28 | + |
| 29 | +class MyHTMLParser(HTMLParser): |
| 30 | + current_attr: tuple[str | None, str] | None = None |
| 31 | + filename: str |
| 32 | + |
| 33 | + def __init__(self, filename: str): |
| 34 | + self.strings = defaultdict[tuple[str | None, str], set[str]](set) |
| 35 | + self.filename = filename |
| 36 | + super().__init__() |
| 37 | + |
| 38 | + def handle_starttag(self, tag, attrs): |
| 39 | + translatable = False |
| 40 | + context = None |
| 41 | + |
| 42 | + # attrs => (key, value) where value needs to be split |
| 43 | + for (attr, value) in attrs: |
| 44 | + if attr == 'translate-context': |
| 45 | + context = value |
| 46 | + elif attr == 'context': |
| 47 | + context = value |
| 48 | + elif attr == 'translate' and (value is None or 'yes' in value.split(' ')): |
| 49 | + translatable = True |
| 50 | + |
| 51 | + if not translatable: |
| 52 | + return |
| 53 | + |
| 54 | + lineno = self.getpos()[0] |
| 55 | + filename = f'{self.filename}:{lineno}' |
| 56 | + self.current_attr = (context, filename) |
| 57 | + |
| 58 | + def handle_data(self, data: str): |
| 59 | + if self.current_attr and data: |
| 60 | + context, filename = self.current_attr |
| 61 | + self.strings[context, data].add(filename) |
| 62 | + self.current_attr = None |
| 63 | + |
| 64 | + |
def main() -> None:
    """Extract translatable strings from HTML files and write a PO template.

    Command line: ``html2po [-d DIRECTORY] -o OUTPUT FILE [FILE ...]``.
    Each FILE is read relative to DIRECTORY when one is given, but the
    ``#:`` source references in the output use FILE as typed.  Identical
    (context, msgid) pairs from different files are merged into one entry
    that lists every location.
    """
    parser = argparse.ArgumentParser(prog='html2po',
                                     description='Extracts translatable strings from HTML files')
    parser.add_argument('-d', '--directory', type=pathlib.Path, help='Base directory for input files')
    # The output path is mandatory: without it there is nowhere to write to.
    parser.add_argument('-o', '--output', required=True, help='Output file')
    parser.add_argument('files', nargs='+', help='One or more input files', type=pathlib.Path, metavar='FILE')

    args = parser.parse_args()
    strings = defaultdict[tuple[str | None, str], set[str]](set)

    files: collections.abc.Iterable[pathlib.Path] = args.files
    for file in files:
        # Qualify the filename if necessary
        full_path = args.directory / file if args.directory else file

        htmlparser = MyHTMLParser(str(file))
        with open(full_path, encoding='utf-8') as fp:
            htmlparser.feed(fp.read())
        for ctx_msgid, locations in htmlparser.strings.items():
            strings[ctx_msgid].update(locations)

    # The PO header declares charset=UTF-8, so write exactly that encoding
    # regardless of the current locale.
    with open(args.output, 'w', encoding='utf-8') as fp:
        fp.write(PO_HEADER)
        for (context, msgid), filenames in strings.items():
            # Sets have no stable order; sort so repeated runs produce
            # byte-identical output.
            fp.write(f"\n#: {' '.join(sorted(filenames))}\n")
            # json.dumps() to escape any quotes in translation text or context.
            # ensure_ascii=False keeps non-ASCII text as UTF-8 instead of
            # \uXXXX escapes, which are not valid in PO string literals.
            if context:
                fp.write(f'msgctxt {json.dumps(context, ensure_ascii=False)}\n')

            fp.write(f'msgid {json.dumps(msgid, ensure_ascii=False)}\n')
            fp.write('msgstr ""\n')
| 96 | + |
| 97 | + |
# Run the extractor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments