feat: using markdown as main import format, html for images (#34)

* feat: using markdown as main import format, html for images
* feat: update export markdown images flow
* fix: wrong image detection with custom element and hr tags in coda contents

---------

Co-authored-by: Hung Luu <[email protected]>
CoderPush · Feb 27, 2024 · 95458c9 · 95458c9
1 parent e4adc19
commit 95458c9
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 16 deletions.
diff --git a/.changeset/healthy-timers-hear.md b/.changeset/healthy-timers-hear.md
@@ -0,0 +1,5 @@
+---
+"coda-mover": patch
+---
+
+Use markdown as main import format, with images collected from html export
diff --git a/src/modules/simple-mover/Mover.ts b/src/modules/simple-mover/Mover.ts
@@ -137,6 +137,12 @@ export class Mover implements IMover {
 
     this.setStatus(docId, ITEM_STATUS_PENDING)
 
+    // when a doc is marked for listing or re-listing,
+    // its current inner pages should also be marked as stale (pending revalidation)
+    this.getInnerPages(doc).forEach(page => {
+      this.itemStatuses[page.id] = { id: page.id, status: ITEM_STATUS_PENDING }
+    })
+
     this.tasks.add({
       id: `list:${docId}`,
       execute: async () => {

diff --git a/src/modules/simple-mover/events.ts b/src/modules/simple-mover/events.ts
@@ -30,6 +30,10 @@ export const ITEM_STATUS_ARCHIVING = 'archiving'
 export const ITEM_STATUS_RETRYING = 'retrying'
 export const ITEM_STATUS_CANCELLED = 'cancelled'
 
+export const ITEM_STATUS_FETCHING_IMAGES = 'fetching images' // set by CodaExporter right before it requests the html export used to recover images
+export const ITEM_STATUS_DOWNLOADING_IMAGES = 'downloading images' // set by CodaExporter while the html export is being checked and downloaded
+export const ITEM_STATUS_REPLACING_IMAGES = 'replacing images' // set by CodaExporter while image placeholders in the markdown file are rewritten
+
 export const ItemStatuses = [
   ITEM_STATUS_PENDING,
   ITEM_STATUS_LISTING,
@@ -45,4 +49,7 @@ export const ItemStatuses = [
   ITEM_STATUS_CONFIRMING,
   ITEM_STATUS_CANCELLED,
   ITEM_STATUS_RETRYING,
+  ITEM_STATUS_FETCHING_IMAGES,
+  ITEM_STATUS_DOWNLOADING_IMAGES,
+  ITEM_STATUS_REPLACING_IMAGES,
 ] as const
diff --git a/src/modules/simple-mover/transfers/CodaExporter.ts b/src/modules/simple-mover/transfers/CodaExporter.ts
@@ -1,10 +1,24 @@
 import { TaskEmitter, TaskPriority } from '@abxvn/tasks'
 import { isAxiosError } from 'axios'
 import type { ICodaApis, ICodaPage, IMover, IExporter, IStatus } from '../interfaces'
-import { createWriteStream, ensureDir } from 'fs-extra'
+import { createWriteStream, ensureDir, readFile, writeFile } from 'fs-extra'
 import { getCurrentIsoDateTime, getParentDir, trimSlashes } from '../lib'
 import { download } from '../apis'
-import { ITEM_STATUS_DONE, ITEM_STATUS_DOWNLOADING, ITEM_STATUS_ERROR, ITEM_STATUS_EXPORTING, ITEM_STATUS_PENDING, SERVER_SAVE_ITEMS } from '../events'
+import {
+  ITEM_STATUS_DONE,
+  ITEM_STATUS_DOWNLOADING,
+  ITEM_STATUS_ERROR,
+  ITEM_STATUS_EXPORTING,
+  ITEM_STATUS_PENDING,
+  SERVER_SAVE_ITEMS,
+  ITEM_STATUS_FETCHING_IMAGES,
+  ITEM_STATUS_REPLACING_IMAGES,
+  ITEM_STATUS_DOWNLOADING_IMAGES,
+} from '../events'
+import { dirname } from 'path'
+
+const CODA_IMAGE_REPLACEMENT_START_REGEX = /^\n{2}/ // two newlines at the very start of the markdown content
+const CODA_IMAGE_REPLACEMENT_BODY_REGEX = /\n{4}/g // every run of four newlines (three empty lines); NOTE: being global, it keeps `lastIndex` between RegExp#test / exec calls
 
 export class CodaExporter implements IExporter {
   private importChunkCounter = 0
@@ -48,45 +62,158 @@ export class CodaExporter implements IExporter {
     this.tasks.next()
   }
 
-  async exportPage (page: ICodaPage, exportId?: string) {
+  async exportPage (page: ICodaPage, exportId?: string, imageExportId?: string) { // exports a page as markdown, then restores images from an html export; both ids are passed back in when a retry task re-enters
     const docId = trimSlashes(page.treePath).split('/').shift()
     if (!docId) throw Error('invalid page tree path')
 
     const parentDir = getParentDir(page, this.items)
-    const pageFilePath = `${parentDir}/${page.name.replace(/\//g, ' ')}.html`
-
-    if (!exportId) {
-      this.setStatus(page.id, ITEM_STATUS_EXPORTING)
-      const exportRes = await this.apis.exportPage(docId, page.id)
+    const pageFilePath = `${parentDir}/${page.name.replace(/\//g, ' ')}.md`
+
+    if (!exportId) exportId = await this.exportPageAsMarkdown(docId, page)
+    if (!exportId) throw Error('markdown export isn\'t requested')
+    if (!imageExportId) { // an image export id is only passed by the html retry task, i.e. after the markdown file was already written
+      const isMarkdownDownloaded = await this.downloadMarkdownExport(docId, page, pageFilePath, exportId)
+      if (!isMarkdownDownloaded) {
+        return // markdown export not ready yet, a low priority retry task has been queued
+      }
+    }
 
-      exportId = exportRes.id
+    const markdownContent = await readFile(pageFilePath, 'utf8')
+    const shouldAddImages = markdownContent.search(CODA_IMAGE_REPLACEMENT_START_REGEX) !== -1 ||
+      markdownContent.search(CODA_IMAGE_REPLACEMENT_BODY_REGEX) !== -1 // search() ignores `lastIndex`; RegExp#test on the global body regex could start mid-string after an earlier page and miss placeholders
+
+    if (shouldAddImages) {
+      if (!imageExportId) imageExportId = await this.exportPageAsHtml(docId, page)
+      if (!imageExportId) throw Error('html images export isn\'t requested')
+
+      const isImageReplaced = await this.downloadImageExportAndReplaceInMarkdown(
+        docId,
+        page,
+        pageFilePath,
+        markdownContent,
+        exportId,
+        imageExportId,
+      )
+      if (!isImageReplaced) {
+        return // html export not ready yet, a low priority retry task has been queued
+      }
     }
 
-    if (!exportId) throw Error('export isn\'t requested')
+    this.setStatus(page.id, ITEM_STATUS_DONE) // reached when no placeholders were found or once images have been written back
+  }
+
+  private async exportPageAsMarkdown (docId: string, page: ICodaPage) { // requests a markdown export of the page and resolves to the id of that export
+    this.setStatus(page.id, ITEM_STATUS_EXPORTING)
+    const exportRes = await this.apis.exportPage(docId, page.id, 'markdown')
 
-    const pageExport = await this.apis.getPageExport(docId, page.id, exportId)
+    return exportRes.id // callers pass this id on when checking for the download link
+  }
 
-    if (!pageExport.downloadLink) { // retry later at low priority
+  private async downloadMarkdownExport (
+    docId: string,
+    page: ICodaPage,
+    pageFilePath: string,
+    markdownExportId: string,
+  ) { // writes the markdown export to pageFilePath; resolves true when the file was written, false when a retry was queued instead
+    const pageExport = await this.apis.getPageExport(docId, page.id, markdownExportId)
+
+    if (!pageExport.downloadLink) {
+      // retry later at low priority with current markdown export id
       this.tasks.add({
         id: page.id,
-        execute: async () => await this.exportPage(page, exportId),
+        execute: async () => await this.exportPage(page, markdownExportId), // re-enters exportPage without requesting a new export
         priority: TaskPriority.LOW,
       })
 
-      return
+      return false // nothing was downloaded in this attempt
     }
 
     this.setStatus(page.id, ITEM_STATUS_DOWNLOADING)
     this.items[page.id].syncedAt = getCurrentIsoDateTime()
 
-    await ensureDir(parentDir)
+    await ensureDir(dirname(pageFilePath)) // the target folder is derived from the file path itself
     await download(pageExport.downloadLink, createWriteStream(pageFilePath, {
       flags: 'w',
       encoding: 'utf8',
     }))
 
     this.items[page.id].filePath = pageFilePath
-    this.setStatus(page.id, ITEM_STATUS_DONE)
+
+    return true // the done status is left to the caller, which may still have to add images
+  }
+
+  private async exportPageAsHtml (docId: string, page: ICodaPage) { // requests an html export of the page and resolves to the id of that export
+    this.setStatus(page.id, ITEM_STATUS_FETCHING_IMAGES)
+    const { id: htmlExportId } = await this.apis.exportPage(docId, page.id, 'html')
+
+    return htmlExportId
+  }
+
+  private async downloadImageExportAndReplaceInMarkdown (
+    docId: string,
+    page: ICodaPage,
+    pageFilePath: string,
+    markdownContent: string,
+    markdownExportId: string,
+    htmlExportId: string,
+  ) { // downloads the html export and rewrites the markdown file with image links; resolves true when written, false when a retry was queued instead
+    this.setStatus(page.id, ITEM_STATUS_DOWNLOADING_IMAGES)
+    const htmlExport = await this.apis.getPageExport(docId, page.id, htmlExportId)
+    const htmlFilePath = pageFilePath.replace(/\.md$/, '.html') // html copy is stored next to the markdown file
+
+    if (!htmlExport.downloadLink) {
+      // retry later at low priority with current both markdown and html export ids
+      this.tasks.add({
+        id: page.id,
+        execute: async () => await this.exportPage(page, markdownExportId, htmlExportId),
+        priority: TaskPriority.LOW,
+      })
+
+      return false
+    }
+
+    await download(htmlExport.downloadLink, createWriteStream(htmlFilePath, {
+      flags: 'w',
+      encoding: 'utf8',
+    }))
+
+    this.setStatus(page.id, ITEM_STATUS_REPLACING_IMAGES)
+    const htmlContent = await readFile(htmlFilePath, 'utf8')
+    const replacedBlocks: string[] = [] // one entry per img / hr tag, in document order
+    // img and hr tags are rendered in the markdown export as 3 empty lines (2 empty lines at the start of the content)
+    const replacedHtmlTags = htmlContent.match(/<img[^>]+src="[^">]+"[^>]*|<hr/g) // keep the rest of the img tag so attributes written after src (e.g. alt) are captured too
+
+    replacedHtmlTags?.forEach(tag => {
+      if (!tag.startsWith('<img')) { // hr tag: it still occupies a placeholder, so consume it with a bare newline
+        return replacedBlocks.push('\n')
+      }
+
+      const src = tag.match(/src="([^"]*)"/)?.[1]
+      const alt = tag.match(/alt="([^"]*)"/)?.[1]
+
+      replacedBlocks.push(`![${alt ?? ''}](${src})`) // images without alt text get an empty label instead of the literal "undefined"
+    })
+
+    let replacementCount = 0 // index of the next unused entry in replacedBlocks
+    let markdownContentWithImages = markdownContent.replace(CODA_IMAGE_REPLACEMENT_START_REGEX, emptyLines => {
+      return replacedBlocks[replacementCount]
+        ? `${replacedBlocks[replacementCount++]}\n\n`
+        : emptyLines // restore empty lines if replacement not found from html export
+    })
+
+    markdownContentWithImages = markdownContentWithImages.replace(CODA_IMAGE_REPLACEMENT_BODY_REGEX, emptyLines => {
+      return replacedBlocks[replacementCount]
+        ? `\n\n${replacedBlocks[replacementCount++]}\n\n`
+        : emptyLines // restore empty lines if replacement not found from html export
+    })
+
+    if (replacementCount < replacedBlocks.length) { // more tags than placeholders: append the leftovers at the end of the page
+      markdownContentWithImages += replacedBlocks.slice(replacementCount).join('\n\n')
+    }
+
+    await writeFile(pageFilePath, markdownContentWithImages, 'utf8')
+
+    return true
   }
 
   stopPendingExports () {