Merge pull request #352 from uhh-lt/add-pages-to-pdfs

Add pages to pdfs
uhh-lt · Feb 11, 2024 · 85161f9
2 parents e6b8d04 + 635d914
commit 85161f9
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 12 deletions.
diff --git a/backend/src/app/preprocessing/pipeline/steps/text/clean_html.py b/backend/src/app/preprocessing/pipeline/steps/text/clean_html.py
@@ -14,7 +14,9 @@
 
 
 def clean_html_tags_and_attrs(
-    tags_to_kill: List[str], tags_to_remove: List[str], attrs_to_keep: List[str]
+    tags_to_kill: List[str],
+    tags_to_remove: List[str],
+    attrs_to_keep: List[str],
 ) -> Callable[[str], str]:
     def x(html: str) -> str:
         # use cleaner to only include relevant attributes and to remove unwanted tags
@@ -115,6 +117,7 @@ def x(html_content: str) -> str:
                 "width",
                 "height",
                 "target",
+                "pagenum",
             ],
         ),
         string_replace(replace={"\n": "", "&lt;": "❮", "&gt;": "❯"}),
@@ -137,6 +140,7 @@ def x(html_content: str) -> str:
                 "width",
                 "height",
                 "target",
+                "pagenum",
             ],
         ),
         string_replace(replace={"\n": "", "&lt;": "❮", "&gt;": "❯"}),
@@ -158,7 +162,7 @@ def clean_content_in_html(cargo: PipelineCargo) -> PipelineCargo:
 
     if not has_readability_watermark(content_in_html):
         # here, we apply the same cleaning pipeline as in the crawler
-        logger.debug("Processing HTML with readability!")
+        logger.info("Processing HTML with readability!")
         content_in_html = cleaning_with_readability_pipeline(content_in_html)
 
     logger.info("Cleaning HTML document!")

diff --git a/backend/src/app/preprocessing/pipeline/steps/text/extract_content_in_html_from_word_or_pdf_docs.py b/backend/src/app/preprocessing/pipeline/steps/text/extract_content_in_html_from_word_or_pdf_docs.py
@@ -57,9 +57,9 @@ def __extract_content_in_html_from_pdf_docs(filepath: Path) -> Tuple[str, List[P
         logger.warning(f"File {filepath} is not a Word document!")
         return "", []
     doc = fitz.open(str(filepath))  # type: ignore
-    page_soups = []
+    pages = []
     extracted_images: List[Path] = []
-    for page in doc:
+    for page_num, page in enumerate(doc):
         # extract images and save on disk
         extracted_images_in_page = []
         if cc.preprocessing.extract_images_from_pdf:
@@ -82,9 +82,9 @@ def __extract_content_in_html_from_pdf_docs(filepath: Path) -> Tuple[str, List[P
             img_tag["src"] = img_name.name
             del img_tag["width"]
             del img_tag["height"]
-        page_soups.append(soup)
+        pages.append(f'<section pagenum="{page_num}">{str(soup)}</section>')
 
-    doc_html = "\n".join(map(lambda p: str(p), page_soups))
+    doc_html = "\n".join(pages)
     return f"<html><body>{doc_html}</body></html>", extracted_images
 
 

diff --git a/frontend/src/features/DocumentRenderer/DocumentRenderer.tsx b/frontend/src/features/DocumentRenderer/DocumentRenderer.tsx
@@ -43,7 +43,7 @@ function DocumentRenderer({
       content = content.substring(0, content.length - 6);
     }
     content = content.trim();
-    const regex = /<page num="\d+">|<\/page><page num="\d+">|<\/page>/gm;
+    const regex = /<section pagenum="\d+">|<\/section><section pagenum="\d+">|<\/section>/gm;
     let splitted = content.split(regex);
     splitted = splitted.filter((s) => s.length > 0);
     return splitted;
@@ -56,7 +56,6 @@ function DocumentRenderer({
     count: numPages,
     getScrollElement: () => listRef.current,
     estimateSize: () => 155,
-    overscan: 1,
   });
 
   // Order matters. Instructions are processed in

diff --git a/frontend/src/views/search/DocumentViewer/DocumentViewer.tsx b/frontend/src/views/search/DocumentViewer/DocumentViewer.tsx
@@ -49,8 +49,8 @@ function DocumentViewer({
 
   return (
     <Card raised {...props}>
-      <CardContent>
-        <Stack spacing={2}>
+      <CardContent className="h100">
+        <Stack spacing={2} className="h100">
           <div style={{ display: "flex", alignItems: "center" }}>
             <EditableDocumentName
               sdocId={sdocId}

diff --git a/frontend/src/views/search/Search.tsx b/frontend/src/views/search/Search.tsx
@@ -165,13 +165,14 @@ function Search() {
                 </Grid>
               )}
               {(isSplitView || viewDocument) && (
-                <Grid item md={isSplitView ? 6 : 12} className="h100" overflow={"auto"}>
-                  <Container sx={{ my: 2, height: "fit-content" }}>
+                <Grid item md={isSplitView ? 6 : 12} className="h100">
+                  <Container className="h100" sx={{ py: 2 }}>
                     <DocumentViewer
                       sdocId={sdocId ? parseInt(sdocId) : undefined}
                       handleTagClick={handleAddTagFilter}
                       showEntities={isShowEntities}
                       isIdleContent={<Typography>Click a document to read it :)</Typography>}
+                      className="h100"
                     />
                   </Container>
                 </Grid>