Skip to content

Commit

Permalink
Fix GitLab parser + include credentials
Browse files Browse the repository at this point in the history
  • Loading branch information
TetraTsunami committed Jul 12, 2024
1 parent e910b37 commit 3240838
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 11 deletions.
5 changes: 3 additions & 2 deletions src/background/index.chrome.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { DOMParser } from 'linkedom';
import { installedHandler } from './common';
import { parseAndReply } from './parser';
import { addBaseElement, parseAndReply } from './parser';
import { resolveURL } from './services';

const scrapeHandler = async ({ url }, res: (response?: any) => void) => {
Expand All @@ -10,10 +10,11 @@ const scrapeHandler = async ({ url }, res: (response?: any) => void) => {
let doc: Document
while (oldUrl !== newUrl) {
oldUrl = newUrl || oldUrl
const resp = await fetch(oldUrl)
const resp = await fetch(oldUrl, { credentials: 'include' })
const html = await resp.text()
// @ts-expect-error - linkedom's document is FAKE and missing lots of properties, but we don't care because we don't use them :)
doc = new DOMParser().parseFromString(html, 'text/html');
addBaseElement(doc, url);
newUrl = await resolveURL(doc, oldUrl) || oldUrl
}
await parseAndReply(doc, newUrl, res)
Expand Down
5 changes: 3 additions & 2 deletions src/background/index.firefox.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { installedHandler } from "./common";
import { parseAndReply } from "./parser";
import { addBaseElement, parseAndReply } from "./parser";
import { resolveURL } from "./services";

const scrapeHandler = async ({ url }, res: (response?: any) => void) => {
Expand All @@ -9,9 +9,10 @@ const scrapeHandler = async ({ url }, res: (response?: any) => void) => {
let doc: Document
while (oldUrl !== newUrl) {
oldUrl = newUrl || oldUrl
const resp = await fetch(oldUrl)
const resp = await fetch(oldUrl, { credentials: 'include'})
const html = await resp.text()
doc = new DOMParser().parseFromString(html, "text/html")
addBaseElement(doc, url);
newUrl = await resolveURL(doc, oldUrl) || oldUrl
}
await parseAndReply(doc, newUrl, res)
Expand Down
3 changes: 1 addition & 2 deletions src/background/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ export interface Meta {
}

const parseHTMLMeta = (doc: Document, url: string) => {
addBaseElement(doc, url);
const title = (doc.querySelector('meta[property="og:title"]') as HTMLMetaElement)?.content ||
doc.querySelector('title')?.textContent;
const description = (doc.querySelector('meta[property="og:description"]') as HTMLMetaElement)?.content ||
Expand All @@ -26,7 +25,7 @@ const parseHTMLMeta = (doc: Document, url: string) => {
} as Meta
}

const addBaseElement = (doc: Document, url: string) => {
export const addBaseElement = (doc: Document, url: string) => {
let baseEl = doc.createElement('base'); // https://stackoverflow.com/questions/55232202/optional-baseuri-location-in-domparser
baseEl.setAttribute('href', url);
doc.head.append(baseEl);
Expand Down
2 changes: 1 addition & 1 deletion src/background/services/github.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const GithubParser: Parser = {
const path = new URL(url).pathname.split("/") // ["", "user", "repo", ...]
return path.length >= 3
},
parse: async (node: Node, url: string) => {
parse: async (doc: Document, url: string) => {
const path = new URL(url).pathname.split("/") // ["", "user", "repo", ...]
const data = await fetch(`https://api.github.com/repos/${path[1]}/${path[2]}/readme`).then(res => res.json())
const decoded = atob(data.content)
Expand Down
30 changes: 29 additions & 1 deletion src/background/services/gitlab.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,36 @@ const GitlabParser: Parser = {
const siteName = doc.querySelector("meta[property='og:site_name']")?.getAttribute("content")
return siteName === "GitLab"
},
// GitLab doesn't preview documents statically, and I don't think we're executing JS.
// This means we need to navigate to the repo homepage to find the README link
// And then instead of following that, replace /blob/ with /raw/ to get the raw text
rewrite: async (doc: Document, url: string) => {
    if (url.match("README")) { // If we're already at the README, stop
        return url
    }
    if (url.match(/\/-\//)) { // If we aren't at the root of the repo, go to the root
        return url.replace(/\/-\/.*/, "")
    }
    // Already at the repo root: return the URL unchanged. (Equivalent to the old
    // implicit undefined at the `resolveURL(...) || oldUrl` call site, but honors
    // the declared Promise<string> signature of Parser.rewrite.)
    return url
},
parse: async (doc: Document, url: string) => {
    // Pull the README link out of the project sidebar, then swap /blob/ for /raw/
    // so we fetch the raw markdown rather than GitLab's HTML viewer page.
    const sidebarAnchors = doc.querySelectorAll(".project-page-sidebar-block .nav-item a") as NodeListOf<HTMLAnchorElement>
    const readmeAnchor = Array.from(sidebarAnchors).find((a) => /readme/i.test(a.textContent ?? ""))
    const readmeUrl = readmeAnchor ? readmeAnchor.href.replace("/blob/", "/raw/") : ""
    if (!readmeUrl) {
        return {}
    }
    const response = await fetch(readmeUrl)
    return {
        body: await response.text(),
        siteName: "GitLab"
    }
}
}

Expand Down
8 changes: 5 additions & 3 deletions src/background/services/index.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import github from './github';
import gitlab from './gitlab';

// Registry of site-specific parsers, tried in order by matches().
// (The stale pre-change `const parsers = [github];` declaration was removed —
// it illegally redeclared the const and dropped the gitlab parser.)
const parsers = [github, gitlab];

/**
 * Contract for a site-specific scraper. Exactly one `parse?` member is declared
 * here — the stale `(node: Node, …)` signature left over from before the rename
 * was removed, since a duplicate member with a conflicting parameter type is a
 * compile error.
 */
export interface Parser {
  /** True when this parser should handle the given document/URL. */
  matches: (doc: Document, url: string) => Promise<boolean>,
  /** Optionally maps the current URL to a better one to fetch next (e.g. repo root). */
  rewrite?: (doc: Document, url: string) => Promise<string>,
  /** Extracts metadata and/or body text from a clone of the document. */
  parse?: (doc: Document, url: string) => Promise<{ title?: string, description?: string, imageUrl?: string, body?: string, siteName?: string }>
}

export const resolveURL = async (doc: Document, url: string) => {
Expand All @@ -24,7 +26,7 @@ export const doCustomParse = async (doc: Document, url: string) => {
for (const parser of parsers) {
try {
if (await parser.matches(doc, url) && parser.parse) {
const documentClone = doc.cloneNode(true);
const documentClone = doc.cloneNode(true) as Document;
return await parser.parse(documentClone, url);
}
} catch (e) {
Expand Down

0 comments on commit 3240838

Please sign in to comment.