Merge pull request #18 from apankowski/bundle-captured-pages-in-a-zip…

…-file Bundle all captured page images in a ZIP file
apankowski · Feb 17, 2024 · af7a9bc · af7a9bc
2 parents 782293f + d611041
commit af7a9bc
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -4,17 +4,19 @@ This is a POC downloader of documents from [doc88.com](https://doc88.com). It sa
 
 Then, having pages saved as images, a searchable PDF can be reconstructed from them.
 
-## Step 1: Save pages as images
+## Step 1: Save pages of a document as images
 
 ### Option A: Bookmark
 
-Create a browser bookmark pasting content of [this file](bookmark.min.js) (exactly as it is) in the URL field.
+Create a browser bookmark, pasting content of [this file](bookmark.min.js) (exactly as it is) in its URL field.
 
-From now on, clicking it on a document page will download all pages as JPEGs.
+From now on, clicking the bookmark on a document page will capture all pages as JPEGs, bundle them in a ZIP archive and download it.
 
-Don't interact with the page during the process until it finishes.
-
-⚠️ In case of Chrome, the first time you download pages you may see a popup stating that "This site is attempting to download multiple files". You have to allow it as each page is downloaded as a separate file.
+> [!IMPORTANT]  
+> Don't interact with the browser during the process.  
+> Be patient, especially with large documents containing hundreds of pages.  
+> You can assess the progress of the process in doc88's page selector (e.g. "17 / 42").  
+> Check that all desired pages were captured correctly.
 
 ### Option B: Manual (finer control over the process)
 
@@ -27,13 +29,17 @@ Don't interact with the page during the process until it finishes.
     ```javascript
     downloadPages()
     ```
-   This will download all the pages.  
-   Pages will be automatically preloaded and saved one by one.
-    * ℹ️ See [options](#options) section below for options.  
-    * ⚠️ In case of Chrome, the first time you download pages you may see a popup stating that "This site is attempting to download multiple files". You have to allow it as each page is downloaded as a separate file.
-7. Don't interact with the page during the process.   
-   Wait until it ends, printing `Finished downloading pages` in the Console.  
-   Make sure all desired pages were downloaded correctly.
+   This will capture and bundle all the pages in a ZIP file.  
+   Pages will be automatically preloaded and captured one by one.
+
+See the [Options](#options) section below for the available options.
+
+> [!IMPORTANT]  
+> Don't interact with the browser during the process.  
+> Be patient, especially with large documents containing hundreds of pages.  
+> Wait until the process finishes; it prints `Finished downloading pages` in the Console when done.  
+> Check that all desired pages were captured correctly.
+
 
 #### Options
 
@@ -50,6 +56,10 @@ Possible options are:
 3. `format` – downloaded image format; string; either `'jpg'` or `'png'`; default is `'jpg'`
 4. `quality` – quality of images; applicable when `format` is `'jpg'`; number between `0` and `1`; default is `0.9`
 5. `imageNamePrefix` – prefix for names of downloaded images; string; default is `'page'` (resulting in downloaded file names e.g.: `page001.jpg`, `page002.jpg`, etc. assuming `format` is `'jpg'`)
+6. `archive` – type of archive to put the captured images in; string; either `'zip'` or `'none'`; default is `'zip'`; `'none'` will result in each image downloaded as a separate file
+
+> [!NOTE]  
+> In Chrome, if you set `archive` to `'none'`, the first time you download pages you may see a popup stating that "This site is attempting to download multiple files". You have to allow it because, with this option, each page is downloaded as a separate file.
 
 ## Step 2: Converting images back to a PDF
 

diff --git a/bookmark.min.js b/bookmark.min.js
diff --git a/downloadPages.js b/downloadPages.js
@@ -96,14 +96,56 @@ function downloadBlob(blob, filename) {
   URL.revokeObjectURL(anchor.href)
 }
 
-function pageImageHandlerFor({}) {
-  return {
-    initialize: async () => {},
-    handlePageImage: async (pageNo, imageBlob, imageFilename) => {
-      downloadBlob(imageBlob, imageFilename)
-      console.log(`Downloaded page #${pageNo}`)
-    },
-    finalize: async () => {},
+function getDocumentTitle() { // Best-effort title of the current page, read from the DOM
+  return document.querySelector('h1')?.title // title property of the first h1 element, when it exists and is truthy
+    || document.querySelector('meta[property="og:title"]')?.content // otherwise the content of the og:title meta tag; undefined if that tag is missing too
+}
+
+async function loadSupportScript(url) { // Load an external script by URL; returns a promise that settles on the element load or error event
+  return new Promise((resolve, reject) => {
+    let script = document.createElement('script') // NOTE(review): a fresh element is created on every call; nothing here checks for an earlier load of the same URL
+    script.type = 'text/javascript'
+    script.src = url
+    script.onload = () => resolve() // fulfils with no value once the script has loaded
+    script.onerror = (event) => reject(new Error(`Failed to load support script ${url}: ${event.type}`)) // rejects with an Error naming the URL and the event type
+    document.head.appendChild(script) // both handlers are attached before the element is inserted into the document head
+  })
+}
+
+function pageImageHandlerFor({ archive = 'zip' }) { // Pick the handler object (initialize / handlePageImage / finalize) for the requested archive type; archive defaults to zip
+  switch (archive) {
+    case 'none': // no archive: every page image is downloaded as its own file
+      return {
+        initialize: async () => {}, // nothing to prepare
+        handlePageImage: async (pageNo, imageBlob, imageFilename) => {
+          downloadBlob(imageBlob, imageFilename)
+          console.log(`Downloaded page #${pageNo}`)
+        },
+        finalize: async () => {}, // nothing to flush
+      }
+    case 'zip': { // braces scope the zip variable to this case
+      let zip // archive under construction; assigned in initialize, cleared in finalize
+      return {
+        initialize: async () => { // load JSZip 3.10.1 from the cdnjs URL below, then start an empty archive
+          await loadSupportScript('https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js')
+          zip = new JSZip() // JSZip is not declared in this block - presumably a global defined by the script loaded on the previous line
+          console.log('Initialized ZIP archive')
+        },
+        handlePageImage: async (pageNo, imageBlob, imageFilename) => {
+          zip.file(imageFilename, imageBlob, { compression: 'DEFLATE' }) // add the image under its filename with per-file DEFLATE; NOTE(review): JPEG/PNG data is already compressed, so the gain is probably small - confirm
+          console.log(`Added page #${pageNo} to ZIP archive`)
+        },
+        finalize: async () => {
+          const zipFilename = (getDocumentTitle() || 'pages') + '.zip' // archive is named after the page title, or pages.zip when no title is found
+          const zipBlob = await zip.generateAsync({ type: 'blob' }) // serialise the whole archive into a single Blob
+          downloadBlob(zipBlob, zipFilename)
+          console.log('Downloaded ZIP archive')
+          zip = null // drop the reference to the archive once it has been handed to downloadBlob
+        },
+      }
+    }
+    default:
+      throw new Error(`Unknown archive type ${archive}`) // any value other than none or zip is rejected
   }
 }
 
@@ -123,7 +165,7 @@ async function downloadPages(options = {}) {
     const imageFilename = imageFilenameFor(pageNo, options, imageFormat)
 
     await preloadPage(pageNo, pageCanvas)
-    let imageBlob = await captureAsImageBlob(pageCanvas, imageFormat)
+    const imageBlob = await captureAsImageBlob(pageCanvas, imageFormat)
 
     await pageImageHandler.handlePageImage(pageNo, imageBlob, imageFilename)
   }