From fbdff2356b6c4f5891ad3e5550ff67d9e5171c40 Mon Sep 17 00:00:00 2001 From: Aolin Date: Sat, 21 Dec 2024 11:59:26 +0800 Subject: [PATCH] fix: preserve hyperlink references in Word document merge (#3) --- README.md | 4 +++ pdm.lock | 10 ++++---- pyproject.toml | 10 ++++---- src/tidocs/cli.py | 2 +- src/tidocs/docx_handler.py | 45 +++++++++++++++++++++++++-------- src/tidocs/merge/main_marimo.py | 36 +++++++++++++------------- 6 files changed, 67 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 448242b..fa5f9fe 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,10 @@ Use the `tidocs merge` command to access a web interface for combining multiple ## Changelog +### [1.0.6] - 2024-12-21 + +- Fix the issue that hyperlinks become broken after merging Word documents due to incorrect relationship reference handling. ([#2](https://github.com/Oreoxmt/tidocs/issues/2)) + ### [1.0.5] - 2024-12-03 - Fix compatibility issues with Python 3.9. diff --git a/pdm.lock b/pdm.lock index 4597a25..2d07f62 100644 --- a/pdm.lock +++ b/pdm.lock @@ -321,7 +321,7 @@ files = [ [[package]] name = "marimo" -version = "0.9.20" +version = "0.10.6" requires_python = ">=3.9" summary = "A library for making reactive notebooks and apps" groups = ["default"] @@ -335,18 +335,18 @@ dependencies = [ "packaging", "psutil>=5.0", "pygments<3,>=2.13", - "pymdown-extensions<11,>=9.0", + "pymdown-extensions<11,>=10", "pyyaml>=6.0", "ruff", "starlette!=0.36.0,>=0.26.1", "tomlkit>=0.12.0", "typing-extensions>=4.4.0; python_version < \"3.11\"", "uvicorn>=0.22.0", - "websockets<13.0.0,>=10.0.0", + "websockets>=10.0.0", ] files = [ - {file = "marimo-0.9.20-py3-none-any.whl", hash = "sha256:46befa7b64e03faa6a30b0e2960c6fc1fb425c5a0cf1c1cb0698f01c5c748ca2"}, - {file = "marimo-0.9.20.tar.gz", hash = "sha256:c53c692ab46664a2d6d0dcb5a10e8272dd00d69e32f9e683ade0c0b0e9f3cdec"}, + {file = "marimo-0.10.6-py3-none-any.whl", hash = "sha256:278c8cbf37b323ad913b20af5abc17da089d88d4d6bdb484d78fcc2bab21448e"}, + {file = "marimo-0.10.6.tar.gz", hash = "sha256:b3bdc78cf07801d1197b0d3957e845526f82db30f48e2dc91324923579204d66"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index a7e9054..da715cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,11 @@ license = { text = "Apache-2.0" } requires = ["pdm-backend"] build-backend = "pdm.backend" +[dependency-groups] +dev = [ + "pytest>=8.3.4", +] + [tool.pdm] distribution = true @@ -32,8 +37,3 @@ includes = ["src/tidocs/"] [tool.pdm.version] source = "scm" - -[dependency-groups] -dev = [ - "pytest>=8.3.4", -] diff --git a/src/tidocs/cli.py b/src/tidocs/cli.py index 05b6d23..4d79c95 100644 --- a/src/tidocs/cli.py +++ b/src/tidocs/cli.py @@ -38,7 +38,7 @@ def launch_marimo_app(appname: str, host: str, port: int) -> None: @click.command(no_args_is_help=True) -@click.version_option(version="1.0.5") +@click.version_option(version="1.0.6") @click.argument("appname", type=click.Choice(list(APPS.keys())), required=True) @click.option( "--host", diff --git a/src/tidocs/docx_handler.py b/src/tidocs/docx_handler.py index 0a8dd4d..1814359 100644 --- a/src/tidocs/docx_handler.py +++ b/src/tidocs/docx_handler.py @@ -1,6 +1,7 @@ import io - from docx import Document +from docx.oxml import parse_xml +from docx.oxml.shared import qn def merge_word_docs_with_tables( @@ -9,7 +10,7 @@ def merge_word_docs_with_tables( marker_text: str = "TIDOCS_REPLACE_TABLE", ) -> bytes: """ - Merges tables from one Word document into another at specified marker locations. + Merges tables from one Word document into another at specified marker locations, preserving hyperlinks and other document relationships. Args: main_doc_data (bytes): The main document binary data @@ -23,20 +24,43 @@ def merge_word_docs_with_tables( main_doc = Document(io.BytesIO(main_doc_data)) table_doc = Document(io.BytesIO(table_doc_data)) + # Create a mapping of relationship IDs between documents + rel_map = {} + + # Copy hyperlink relationships from table_doc to main_doc + for rel_id, rel in table_doc.part.rels.items(): + if ( + rel.reltype + == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" + ): + new_rel_id = main_doc.part.relate_to( + rel._target, rel.reltype, rel.is_external + ) + rel_map[rel_id] = new_rel_id + # Find all tables in the table document tables_to_insert = {} current_heading = None # Associate tables with their preceding headings for element in table_doc.element.body: - if element.tag.endswith("p"): # It's a paragraph + if element.tag.endswith("p"): paragraph_text = element.text.strip() if paragraph_text: - # print(paragraph_text) current_heading = paragraph_text - elif element.tag.endswith("tbl"): # It's a table + elif element.tag.endswith("tbl"): if current_heading: - tables_to_insert[current_heading] = element + # Deep copy the table element + table_copy = parse_xml(element.xml) + + # Update relationship IDs in the copied table + # Find all hyperlinks using the proper namespace approach + for hyperlink in table_copy.xpath(".//w:hyperlink"): + old_rid = hyperlink.get(qn("r:id")) + if old_rid in rel_map: + hyperlink.set(qn("r:id"), rel_map[old_rid]) + + tables_to_insert[current_heading] = table_copy # Process the main document for paragraph in main_doc.paragraphs: @@ -53,17 +77,16 @@ def merge_word_docs_with_tables( return output.getvalue() -# Usage with your existing code def merge_documents(doc_data: bytes, table_data: bytes) -> bytes: """ - Wrapper function to merge your documents using the existing download objects + Merge two Word documents, inserting table_data into doc_data. Args: - doc_data (bytes): Main document data from first Pandoc conversion - table_data (bytes): Table document data from second Pandoc conversion + doc_data: Main document binary data + table_data: Table document binary data Returns: - bytes: Merged document data + Merged document binary data """ try: merged_data = merge_word_docs_with_tables(doc_data, table_data) diff --git a/src/tidocs/merge/main_marimo.py b/src/tidocs/merge/main_marimo.py index df3103b..cceefc6 100644 --- a/src/tidocs/merge/main_marimo.py +++ b/src/tidocs/merge/main_marimo.py @@ -1,17 +1,17 @@ import marimo -__generated_with = "0.9.20" +__generated_with = "0.10.6" app = marimo.App(app_title="TiDocs - Merge Release Notes") @app.cell -def __(): +def _(): import marimo as mo return (mo,) @app.cell -def __(mo): +def _(mo): mo.md( """ # TiDocs: Merge Release Notes @@ -23,18 +23,18 @@ def __(mo): @app.cell -def __(upload_area): +def _(upload_area): upload_area return @app.cell -def __(is_valid_filename, md_files, mo): +def _(is_valid_filename, md_files, mo): for i in range(len(md_files.value)): mo.stop( not is_valid_filename(md_files.value[i].name), mo.md( - f'#### {mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Invalid format.\n\nPlease upload release notes in `release-x.y.z.md` format.' + f"#### {mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Invalid format.\n\nPlease upload release notes in `release-x.y.z.md` format." ) .center() .callout(kind="danger"), @@ -43,13 +43,13 @@ def __(is_valid_filename, md_files, mo): @app.cell -def __(config_area): +def _(config_area): config_area return @app.cell -def __(merged_doc, mo): +def _(merged_doc, mo): download_area = mo.vstack( [ mo.md(f"""## {mo.icon('fluent:document-one-page-multiple-sparkle-24-regular')} 3. Generate Document @@ -67,7 +67,7 @@ def __(merged_doc, mo): @app.cell -def __( +def _( abstract_input, authors_input, date_input, @@ -86,7 +86,7 @@ def __( @app.cell -def __(mo): +def _(mo): md_files = mo.ui.file( filetypes=[".md"], multiple=True, @@ -106,7 +106,7 @@ def __(mo): @app.cell -def __(mo): +def _(mo): config_area_title = mo.md( f"""## {mo.icon('lucide:edit')} 2. Configure Document Information @@ -165,7 +165,7 @@ def __(mo): @app.cell -def __( +def _( base_url_input, extract_and_mark_html_tables, md_files, @@ -208,7 +208,7 @@ def extract_version(filename): @app.cell -def __(Pandoc, get_reference_doc, md_contents, mo, table_contents): +def _(Pandoc, get_reference_doc, md_contents, mo, table_contents): reference_doc = get_reference_doc() pandoc = Pandoc() @@ -229,7 +229,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents): mo.stop( md_doc_err.decode("utf-8") != "", mo.md( - f'#### {mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Failed to convert to Word.\n\n{md_doc_err.decode("utf-8")}' + f"#### {mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Failed to convert to Word.\n\n{md_doc_err.decode('utf-8')}" ) .center() .callout(kind="danger"), @@ -248,7 +248,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents): mo.stop( table_doc_err.decode("utf-8") != "", mo.md( - f'####{mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Failed to convert to Word.\n\n{table_doc_err.decode("utf-8")}' + f"####{mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Failed to convert to Word.\n\n{table_doc_err.decode('utf-8')}" ) .center() .callout(kind="danger"), @@ -264,7 +264,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents): @app.cell -def __(md_doc_data, merge_documents, mo, table_doc_data): +def _(md_doc_data, merge_documents, mo, table_doc_data): merged_doc_data = merge_documents(md_doc_data, table_doc_data) merged_doc = mo.download( @@ -276,7 +276,7 @@ def __(md_doc_data, merge_documents, mo, table_doc_data): @app.cell -def __(mo): +def _(mo): mo.md(f"""## {mo.icon('icon-park-outline:format')} 4. Post-process Document After generating the Word document, follow these steps to finalize it: @@ -298,7 +298,7 @@ def __(mo): @app.cell -def __(): +def _(): from tidocs.markdown_handler import ( generate_pandoc_metadata, remove_front_matter,