Skip to content

Commit

Permalink
fix: preserve hyperlink references in Word document merge (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
Oreoxmt authored Dec 21, 2024
1 parent 8b80a1a commit fbdff23
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 40 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ Use the `tidocs merge` command to access a web interface for combining multiple

## Changelog

### [1.0.6] - 2024-12-21

- Fix an issue where hyperlinks broke after merging Word documents because relationship references were handled incorrectly. ([#2](https://github.com/Oreoxmt/tidocs/issues/2))

### [1.0.5] - 2024-12-03

- Fix compatibility issues with Python 3.9.
Expand Down
10 changes: 5 additions & 5 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ license = { text = "Apache-2.0" }
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[dependency-groups]
dev = [
"pytest>=8.3.4",
]

[tool.pdm]
distribution = true

Expand All @@ -32,8 +37,3 @@ includes = ["src/tidocs/"]

[tool.pdm.version]
source = "scm"

[dependency-groups]
dev = [
"pytest>=8.3.4",
]
2 changes: 1 addition & 1 deletion src/tidocs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def launch_marimo_app(appname: str, host: str, port: int) -> None:


@click.command(no_args_is_help=True)
@click.version_option(version="1.0.5")
@click.version_option(version="1.0.6")
@click.argument("appname", type=click.Choice(list(APPS.keys())), required=True)
@click.option(
"--host",
Expand Down
45 changes: 34 additions & 11 deletions src/tidocs/docx_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io

from docx import Document
from docx.oxml import parse_xml
from docx.oxml.shared import qn


def merge_word_docs_with_tables(
Expand All @@ -9,7 +10,7 @@ def merge_word_docs_with_tables(
marker_text: str = "TIDOCS_REPLACE_TABLE",
) -> bytes:
"""
Merges tables from one Word document into another at specified marker locations.
Merges tables from one Word document into another at specified marker locations, preserving hyperlinks in the inserted tables by copying their hyperlink relationships into the main document.
Args:
main_doc_data (bytes): The main document binary data
Expand All @@ -23,20 +24,43 @@ def merge_word_docs_with_tables(
main_doc = Document(io.BytesIO(main_doc_data))
table_doc = Document(io.BytesIO(table_doc_data))

# Create a mapping of relationship IDs between documents
rel_map = {}

# Copy hyperlink relationships from table_doc to main_doc
for rel_id, rel in table_doc.part.rels.items():
if (
rel.reltype
== "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
):
new_rel_id = main_doc.part.relate_to(
rel._target, rel.reltype, rel.is_external
)
rel_map[rel_id] = new_rel_id

# Find all tables in the table document
tables_to_insert = {}
current_heading = None

# Associate tables with their preceding headings
for element in table_doc.element.body:
if element.tag.endswith("p"): # It's a paragraph
if element.tag.endswith("p"):
paragraph_text = element.text.strip()
if paragraph_text:
# print(paragraph_text)
current_heading = paragraph_text
elif element.tag.endswith("tbl"): # It's a table
elif element.tag.endswith("tbl"):
if current_heading:
tables_to_insert[current_heading] = element
# Deep copy the table element
table_copy = parse_xml(element.xml)

# Update relationship IDs in the copied table
# Find all hyperlinks using the proper namespace approach
for hyperlink in table_copy.xpath(".//w:hyperlink"):
old_rid = hyperlink.get(qn("r:id"))
if old_rid in rel_map:
hyperlink.set(qn("r:id"), rel_map[old_rid])

tables_to_insert[current_heading] = table_copy

# Process the main document
for paragraph in main_doc.paragraphs:
Expand All @@ -53,17 +77,16 @@ def merge_word_docs_with_tables(
return output.getvalue()


# Usage with your existing code
def merge_documents(doc_data: bytes, table_data: bytes) -> bytes:
"""
Wrapper function to merge your documents using the existing download objects
Merge two Word documents, inserting table_data into doc_data.
Args:
doc_data (bytes): Main document data from first Pandoc conversion
table_data (bytes): Table document data from second Pandoc conversion
doc_data: Main document binary data
table_data: Table document binary data
Returns:
bytes: Merged document data
Merged document binary data
"""
try:
merged_data = merge_word_docs_with_tables(doc_data, table_data)
Expand Down
36 changes: 18 additions & 18 deletions src/tidocs/merge/main_marimo.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import marimo

__generated_with = "0.9.20"
__generated_with = "0.10.6"
app = marimo.App(app_title="TiDocs - Merge Release Notes")


@app.cell
def __():
def _():
import marimo as mo
return (mo,)


@app.cell
def __(mo):
def _(mo):
mo.md(
"""
# TiDocs: Merge Release Notes
Expand All @@ -23,18 +23,18 @@ def __(mo):


@app.cell
def __(upload_area):
def _(upload_area):
upload_area
return


@app.cell
def __(is_valid_filename, md_files, mo):
def _(is_valid_filename, md_files, mo):
for i in range(len(md_files.value)):
mo.stop(
not is_valid_filename(md_files.value[i].name),
mo.md(
f'#### {mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Invalid format.\n\nPlease upload release notes in `release-x.y.z.md` format.'
f"#### {mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Invalid format.\n\nPlease upload release notes in `release-x.y.z.md` format."
)
.center()
.callout(kind="danger"),
Expand All @@ -43,13 +43,13 @@ def __(is_valid_filename, md_files, mo):


@app.cell
def __(config_area):
def _(config_area):
config_area
return


@app.cell
def __(merged_doc, mo):
def _(merged_doc, mo):
download_area = mo.vstack(
[
mo.md(f"""## {mo.icon('fluent:document-one-page-multiple-sparkle-24-regular')} 3. Generate Document
Expand All @@ -67,7 +67,7 @@ def __(merged_doc, mo):


@app.cell
def __(
def _(
abstract_input,
authors_input,
date_input,
Expand All @@ -86,7 +86,7 @@ def __(


@app.cell
def __(mo):
def _(mo):
md_files = mo.ui.file(
filetypes=[".md"],
multiple=True,
Expand All @@ -106,7 +106,7 @@ def __(mo):


@app.cell
def __(mo):
def _(mo):
config_area_title = mo.md(
f"""## {mo.icon('lucide:edit')} 2. Configure Document Information
Expand Down Expand Up @@ -165,7 +165,7 @@ def __(mo):


@app.cell
def __(
def _(
base_url_input,
extract_and_mark_html_tables,
md_files,
Expand Down Expand Up @@ -208,7 +208,7 @@ def extract_version(filename):


@app.cell
def __(Pandoc, get_reference_doc, md_contents, mo, table_contents):
def _(Pandoc, get_reference_doc, md_contents, mo, table_contents):
reference_doc = get_reference_doc()

pandoc = Pandoc()
Expand All @@ -229,7 +229,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents):
mo.stop(
md_doc_err.decode("utf-8") != "",
mo.md(
f'#### {mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Failed to convert to Word.\n\n{md_doc_err.decode("utf-8")}'
f"#### {mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Failed to convert to Word.\n\n{md_doc_err.decode('utf-8')}"
)
.center()
.callout(kind="danger"),
Expand All @@ -248,7 +248,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents):
mo.stop(
table_doc_err.decode("utf-8") != "",
mo.md(
f'####{mo.icon("ic:round-error-outline", color="darkorange", inline=True)} Failed to convert to Word.\n\n{table_doc_err.decode("utf-8")}'
f"####{mo.icon('ic:round-error-outline', color='darkorange', inline=True)} Failed to convert to Word.\n\n{table_doc_err.decode('utf-8')}"
)
.center()
.callout(kind="danger"),
Expand All @@ -264,7 +264,7 @@ def __(Pandoc, get_reference_doc, md_contents, mo, table_contents):


@app.cell
def __(md_doc_data, merge_documents, mo, table_doc_data):
def _(md_doc_data, merge_documents, mo, table_doc_data):
merged_doc_data = merge_documents(md_doc_data, table_doc_data)

merged_doc = mo.download(
Expand All @@ -276,7 +276,7 @@ def __(md_doc_data, merge_documents, mo, table_doc_data):


@app.cell
def __(mo):
def _(mo):
mo.md(f"""## {mo.icon('icon-park-outline:format')} 4. Post-process Document
After generating the Word document, follow these steps to finalize it:
Expand All @@ -298,7 +298,7 @@ def __(mo):


@app.cell
def __():
def _():
from tidocs.markdown_handler import (
generate_pandoc_metadata,
remove_front_matter,
Expand Down

0 comments on commit fbdff23

Please sign in to comment.