"""doc2json.py: convert a .docx file's document XML and core properties to JSON."""

from collections import defaultdict
import json
import zipfile
from lxml import etree

# Define common fonts to ignore
common_fonts = {
    'Times New Roman',
    'Arial',
    'Calibri',
    # Add any other common fonts here
}

# Define elements to ignore
ignored_elements = {
    'proofErr',
    'bookmarkStart',
    'bookmarkEnd',
    'lastRenderedPageBreak',
    'webHidden',
    'numPr',
    'pBdr',
    'ind',
    'spacing',
    'jc',
    'tabs',
    'sectPr',
    'pgMar'
    # Add any other elements to ignore here
}

# Define attributes to ignore
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidP',
    'paraId',
    'textId',
    'rsidR',
    'rsidRPr',
    'rsidDel',
    'rsidP',
    'rsidTr',
    # Add any other attributes to ignore here
}

# Define metadata elements to ignore
ignored_metadata_elements = {
    'application',
    'docSecurity',
    'scaleCrop',
    'linksUpToDate',
    'charactersWithSpaces',
    'hiddenSlides',
    'mmClips',
    'notes',
    'words',
    'characters',
    'pages',
    'lines',
    'paragraphs',
    'company',
    'template',
    # Add any other metadata elements to ignore here
}

def remove_ignored_elements(tree):
    """Remove all ignored elements from the XML tree, except highlights."""
    for elem in tree.xpath(".//*"):
        tag_without_ns = elem.tag.split('}')[-1]
        if tag_without_ns in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr':  # Check for highlights in rPr
            if not any(child.tag.endswith('highlight') for child in elem.getchildren()):
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_without_ns = attr.split('}')[-1]
                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
                    del elem.attrib[attr]
    return tree

def etree_to_dict(t):
    """Convert an lxml etree to a nested dictionary, excluding ignored namespaces and attributes."""
    tag = t.tag.split('}')[-1]  # Remove namespace URI
    if tag in ignored_elements:
        return None

    d = {tag: {} if t.attrib else None}
    children = list(t)
    if children:
        dd = defaultdict(list)
        for dc in filter(None, map(etree_to_dict, children)):
            for k, v in dc.items():
                dd[k].append(v)
        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}

    if t.attrib:
        # Filter out common fonts and ignored attributes
        filtered_attribs = {}
        for k, v in t.attrib.items():
            k = k.split('}')[-1]  # Remove namespace URI
            if k in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
                if v not in common_fonts:
                    filtered_attribs[k] = v
            elif k not in ignored_attributes and not k.startswith('rsid'):
                filtered_attribs[k] = v
        d[tag].update(filtered_attribs)
    
    if t.text:
        text = t.text.strip()
        # Here we ensure that the text encoding is correctly handled
        text = bytes(text, 'utf-8').decode('utf-8', 'ignore')
        if children or t.attrib:
            if text:
                d[tag]['#text'] = text
        else:
            d[tag] = text
    
    if not t.attrib and not children and not t.text:
        return None

    return d

# Additionally, update the 'remove_ignored_elements' function to fix encoding
def remove_ignored_elements(tree):
    """Remove all ignored elements from the XML tree, except highlights."""
    for elem in tree.xpath(".//*"):
        tag_without_ns = elem.tag.split('}')[-1]
        if tag_without_ns in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr':  # Check for highlights in rPr
            if not any(child.tag.endswith('highlight') for child in elem.getchildren()):
                elem.getparent().remove(elem)
        else:
            # Remove ignored attributes
            for attr in list(elem.attrib):
                attr_without_ns = attr.split('}')[-1]
                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
                    del elem.attrib[attr]
    # Decode the text correctly for each XML element
    for elem in tree.xpath(".//text()"):
        elem_text = elem.strip()
        encoded_text = bytes(elem_text, 'utf-8').decode('utf-8', 'ignore')
        parent = elem.getparent()
        if parent is not None:
            parent.text = encoded_text
    return tree

def extract_metadata(docx):
    """Extract metadata from the document properties, ignoring specified elements."""
    metadata = {}
    with docx.open('docProps/core.xml') as core_xml:
        xml_content = core_xml.read()
        core_tree = etree.XML(xml_content)
        for child in core_tree.getchildren():
            tag = child.tag.split('}')[-1]  # Get tag without namespace
            if tag not in ignored_metadata_elements:
                metadata[tag] = child.text
    return metadata

def process_docx(file_path):
    # Load the document with zipfile and lxml
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        with docx.open('word/document.xml') as document_xml:
            xml_content = document_xml.read()
            document_tree = etree.XML(xml_content)

            # Remove the ignored elements
            document_tree = remove_ignored_elements(document_tree)

            # Convert the rest of the XML tree to a dictionary
            document_dict = etree_to_dict(document_tree)
            document_dict['metadata'] = metadata  # Add metadata to the document dictionary

            docx_json = json.dumps(document_dict, ensure_ascii=False, indent=2)

            return docx_json