From 3240838fd5500e6964941db95112a5327810832d Mon Sep 17 00:00:00 2001 From: Tsuni Date: Fri, 12 Jul 2024 12:53:43 -0500 Subject: [PATCH] Fix GitLab parser + include credentials --- src/background/index.chrome.ts | 5 +++-- src/background/index.firefox.ts | 5 +++-- src/background/parser.ts | 3 +-- src/background/services/github.ts | 2 +- src/background/services/gitlab.ts | 30 +++++++++++++++++++++++++++++- src/background/services/index.ts | 8 +++++--- 6 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/background/index.chrome.ts b/src/background/index.chrome.ts index 088365b..4a1e946 100644 --- a/src/background/index.chrome.ts +++ b/src/background/index.chrome.ts @@ -1,6 +1,6 @@ import { DOMParser } from 'linkedom'; import { installedHandler } from './common'; -import { parseAndReply } from './parser'; +import { addBaseElement, parseAndReply } from './parser'; import { resolveURL } from './services'; const scrapeHandler = async ({ url }, res: (response?: any) => void) => { @@ -10,10 +10,11 @@ const scrapeHandler = async ({ url }, res: (response?: any) => void) => { let doc: Document while (oldUrl !== newUrl) { oldUrl = newUrl || oldUrl - const resp = await fetch(oldUrl) + const resp = await fetch(oldUrl, { credentials: 'include' }) const html = await resp.text() // @ts-expect-error - linkedom's document is FAKE and missing lots of properties, but we don't care because we don't use them :) doc = new DOMParser().parseFromString(html, 'text/html'); + addBaseElement(doc, url); newUrl = await resolveURL(doc, oldUrl) || oldUrl } await parseAndReply(doc, newUrl, res) diff --git a/src/background/index.firefox.ts b/src/background/index.firefox.ts index e5374ef..96be5ff 100644 --- a/src/background/index.firefox.ts +++ b/src/background/index.firefox.ts @@ -1,5 +1,5 @@ import { installedHandler } from "./common"; -import { parseAndReply } from "./parser"; +import { addBaseElement, parseAndReply } from "./parser"; import { resolveURL } from "./services"; const 
scrapeHandler = async ({ url }, res: (response?: any) => void) => { @@ -9,9 +9,10 @@ const scrapeHandler = async ({ url }, res: (response?: any) => void) => { let doc: Document while (oldUrl !== newUrl) { oldUrl = newUrl || oldUrl - const resp = await fetch(oldUrl) + const resp = await fetch(oldUrl, { credentials: 'include'}) const html = await resp.text() doc = new DOMParser().parseFromString(html, "text/html") + addBaseElement(doc, url); newUrl = await resolveURL(doc, oldUrl) || oldUrl } await parseAndReply(doc, newUrl, res) diff --git a/src/background/parser.ts b/src/background/parser.ts index 32ac2c9..506281d 100644 --- a/src/background/parser.ts +++ b/src/background/parser.ts @@ -8,7 +8,6 @@ export interface Meta { } const parseHTMLMeta = (doc: Document, url: string) => { - addBaseElement(doc, url); const title = (doc.querySelector('meta[property="og:title"]') as HTMLMetaElement)?.content || doc.querySelector('title')?.textContent; const description = (doc.querySelector('meta[property="og:description"]') as HTMLMetaElement)?.content || @@ -26,7 +25,7 @@ const parseHTMLMeta = (doc: Document, url: string) => { } as Meta } -const addBaseElement = (doc: Document, url: string) => { +export const addBaseElement = (doc: Document, url: string) => { let baseEl = doc.createElement('base'); // https://stackoverflow.com/questions/55232202/optional-baseuri-location-in-domparser baseEl.setAttribute('href', url); doc.head.append(baseEl); diff --git a/src/background/services/github.ts b/src/background/services/github.ts index f063ccd..ccf9ed8 100644 --- a/src/background/services/github.ts +++ b/src/background/services/github.ts @@ -7,7 +7,7 @@ const GithubParser: Parser = { const path = new URL(url).pathname.split("/") // ["", "user", "repo", ...] return path.length >= 3 }, - parse: async (node: Node, url: string) => { + parse: async (doc: Document, url: string) => { const path = new URL(url).pathname.split("/") // ["", "user", "repo", ...] 
const data = await fetch(`https://api.github.com/repos/${path[1]}/${path[2]}/readme`).then(res => res.json()) const decoded = atob(data.content) diff --git a/src/background/services/gitlab.ts b/src/background/services/gitlab.ts index fecefbc..e9b55b9 100644 --- a/src/background/services/gitlab.ts +++ b/src/background/services/gitlab.ts @@ -5,8 +5,36 @@ const GitlabParser: Parser = { const siteName = doc.querySelector("meta[property='og:site_name']")?.getAttribute("content") return siteName === "GitLab" }, + // GitLab doesn't preview documents statically, and I don't think we're executing JS. + // This means we need to navigate to the repo homepage to find the README link + // And then instead of following that, replace /blob/ with /raw/ to get the raw text rewrite: async (doc: Document, url: string) => { - return url.replace("\/-\/.*","") // Link to homepage of repository + if (url.match("README")) { // If we're already at the README, stop + return url + } + if (url.match(/\/-\//)) { // If we aren't at the root of the repo, go to the root + return url.replace(/\/-\/.*/,"") + } + }, + parse: async (doc: Document, url: string) => { + // Find the README link and replace /blob/ with /raw/ + const sidebarLinks = doc.querySelectorAll(".project-page-sidebar-block .nav-item a") as NodeListOf<HTMLAnchorElement> + let readmeUrl = "" + for (const link of sidebarLinks) { + if (link.textContent?.match(/readme/i)) { + readmeUrl = link.href.replace("/blob/", "/raw/") + break + } + } + if (!readmeUrl) { + return {} + } + const resp = await fetch(readmeUrl) + const text = await resp.text() + return { + body: text, + siteName: "GitLab" + } } } diff --git a/src/background/services/index.ts b/src/background/services/index.ts index 6af4b55..278e2e3 100644 --- a/src/background/services/index.ts +++ b/src/background/services/index.ts @@ -1,10 +1,12 @@ import github from './github'; -const parsers = [github]; +import gitlab from './gitlab'; + +const parsers = [github, gitlab]; export interface Parser {
matches: (doc: Document, url: string) => Promise<boolean>, rewrite?: (doc: Document, url: string) => Promise<string>, - parse?: (node: Node, url: string) => Promise<{ title?: string, description?: string, imageUrl?: string, body?: string, siteName?: string }> + parse?: (doc: Document, url: string) => Promise<{ title?: string, description?: string, imageUrl?: string, body?: string, siteName?: string }> } export const resolveURL = async (doc: Document, url: string) => { @@ -24,7 +26,7 @@ export const doCustomParse = async (doc: Document, url: string) => { for (const parser of parsers) { try { if (await parser.matches(doc, url) && parser.parse) { - const documentClone = doc.cloneNode(true); + const documentClone = doc.cloneNode(true) as Document; return await parser.parse(documentClone, url); } } catch (e) {