Skip to content

Commit

Permalink
Merge pull request #352 from uhh-lt/add-pages-to-pdfs
Browse files: browse the repository at this point in the history
Add pages to pdfs
  • Loading branch information
bigabig authored Feb 11, 2024
2 parents e6b8d04 + 635d914 commit 85161f9
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@


def clean_html_tags_and_attrs(
tags_to_kill: List[str], tags_to_remove: List[str], attrs_to_keep: List[str]
tags_to_kill: List[str],
tags_to_remove: List[str],
attrs_to_keep: List[str],
) -> Callable[[str], str]:
def x(html: str) -> str:
# use cleaner to only include relevant attributes and to remove unwanted tags
Expand Down Expand Up @@ -115,6 +117,7 @@ def x(html_content: str) -> str:
"width",
"height",
"target",
"pagenum",
],
),
string_replace(replace={"\n": "", "<": "❮", ">": "❯"}),
Expand All @@ -137,6 +140,7 @@ def x(html_content: str) -> str:
"width",
"height",
"target",
"pagenum",
],
),
string_replace(replace={"\n": "", "<": "❮", ">": "❯"}),
Expand All @@ -158,7 +162,7 @@ def clean_content_in_html(cargo: PipelineCargo) -> PipelineCargo:

if not has_readability_watermark(content_in_html):
# here, we apply the same cleaning pipeline as in the crawler
logger.debug("Processing HTML with readability!")
logger.info("Processing HTML with readability!")
content_in_html = cleaning_with_readability_pipeline(content_in_html)

logger.info("Cleaning HTML document!")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ def __extract_content_in_html_from_pdf_docs(filepath: Path) -> Tuple[str, List[P
logger.warning(f"File {filepath} is not a Word document!")
return "", []
doc = fitz.open(str(filepath)) # type: ignore
page_soups = []
pages = []
extracted_images: List[Path] = []
for page in doc:
for page_num, page in enumerate(doc):
# extract images and save on disk
extracted_images_in_page = []
if cc.preprocessing.extract_images_from_pdf:
Expand All @@ -82,9 +82,9 @@ def __extract_content_in_html_from_pdf_docs(filepath: Path) -> Tuple[str, List[P
img_tag["src"] = img_name.name
del img_tag["width"]
del img_tag["height"]
page_soups.append(soup)
pages.append(f'<section pagenum="{page_num}">{str(soup)}</section>')

doc_html = "\n".join(map(lambda p: str(p), page_soups))
doc_html = "\n".join(pages)
return f"<html><body>{doc_html}</body></html>", extracted_images


Expand Down
3 changes: 1 addition & 2 deletions frontend/src/features/DocumentRenderer/DocumentRenderer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ function DocumentRenderer({
content = content.substring(0, content.length - 6);
}
content = content.trim();
const regex = /<page num="\d+">|<\/page><page num="\d+">|<\/page>/gm;
const regex = /<section pagenum="\d+">|<\/section><section pagenum="\d+">|<\/section>/gm;
let splitted = content.split(regex);
splitted = splitted.filter((s) => s.length > 0);
return splitted;
Expand All @@ -56,7 +56,6 @@ function DocumentRenderer({
count: numPages,
getScrollElement: () => listRef.current,
estimateSize: () => 155,
overscan: 1,
});

// Order matters. Instructions are processed in
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/views/search/DocumentViewer/DocumentViewer.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ function DocumentViewer({

return (
<Card raised {...props}>
<CardContent>
<Stack spacing={2}>
<CardContent className="h100">
<Stack spacing={2} className="h100">
<div style={{ display: "flex", alignItems: "center" }}>
<EditableDocumentName
sdocId={sdocId}
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/views/search/Search.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,14 @@ function Search() {
</Grid>
)}
{(isSplitView || viewDocument) && (
<Grid item md={isSplitView ? 6 : 12} className="h100" overflow={"auto"}>
<Container sx={{ my: 2, height: "fit-content" }}>
<Grid item md={isSplitView ? 6 : 12} className="h100">
<Container className="h100" sx={{ py: 2 }}>
<DocumentViewer
sdocId={sdocId ? parseInt(sdocId) : undefined}
handleTagClick={handleAddTagFilter}
showEntities={isShowEntities}
isIdleContent={<Typography>Click a document to read it :)</Typography>}
className="h100"
/>
</Container>
</Grid>
Expand Down

0 comments on commit 85161f9

Please sign in to comment.