diff --git a/.gitignore b/.gitignore
index 924751c..2255aa6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,9 @@ functions.json
 /tools/test.*
 /.env
 *.cmd
-__pycache__
\ No newline at end of file
+__pycache__
+/venv
+node_modules
+package.json
+package-lock.json
+*.lock
\ No newline at end of file
diff --git a/README.md b/README.md
index f50da50..7fa267f 100644
--- a/README.md
+++ b/README.md
@@ -130,7 +130,7 @@ The agent has the following folder structure:
 ```
 └── agents
     └── myagent
-        ├── embeddings/                # Contains RAG files for knownledge
+        ├── embeddings/                # Contains RAG files for knowledge
         ├── functions.json             # Function declarations file (Auto-generated)
         ├── index.yaml                 # Agent definition file
         └── tools.{sh,js,py}           # Agent tools script
diff --git a/scripts/crawler.mjs b/scripts/crawler.mjs
new file mode 100755
index 0000000..4db2e00
--- /dev/null
+++ b/scripts/crawler.mjs
@@ -0,0 +1,215 @@
+#!/usr/bin/env node
+
+/**
+ * Crawl a document website.
+ *
+ * The script can be used in the following scenarios:
+ * 1. Generate knowledge.json for the agent
+ *    > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
+ * 2. To be used as a `recursive_url` document loader for AIChat
+ *    > recursive_url: 'node /scripts/crawler.mjs $1 $2'
+ */
+
+// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent
+
+import { Octokit } from "@octokit/rest";
+import * as cheerio from "cheerio";
+import { URL } from "node:url";
+import { writeFileSync } from "node:fs";
+import { compile } from "html-to-text";
+import fetch from "node-fetch";
+import { HttpsProxyAgent } from "https-proxy-agent";
+
+const compiledConvert = compile({ wordwrap: false, selectors: [{ selector: 'a', options: { ignoreHref: true } }] });
+
+const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH) || 3;
+
+const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT) || 5;
+
+const IGNORE_LINKS = new Set();
+
+const IGNORE_PATHS_ENDING_IN = [
+  "search.html",
+  "search",
+  "changelog",
+  "changelog.html",
+];
+
+let fetchOptions = {
+  headers: { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" },
+};
+
+async function main() {
+  const [startUrlRaw, outfile] = process.argv.slice(2);
+  if (!startUrlRaw || !outfile) {
+    console.log("Usage: ./crawler.mjs <startUrl> <outfile>");
+    process.exit(1);
+  }
+  if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
+    fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
+  }
+  let pages = [];
+  for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
+    pages.push(page);
+  }
+  const output = JSON.stringify(pages, null, 2);
+  writeFileSync(outfile, output);
+}
+
+/**
+ *
+ * @param {String} startUrlRaw
+ * @param {number} maxDepth
+ */
+async function* crawlPage(startUrlRaw, maxDepth = 3) {
+  if (!startUrlRaw.endsWith("/")) {
+    startUrlRaw += "/";
+  }
+  console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);
+  const startUrl = new URL(startUrlRaw);
+  let paths = [{ path: startUrl.pathname, depth: 0 }];
+
+  if (startUrl.hostname === "github.com") {
+    const githubLinks = await crawlGithubRepo(startUrl);
+    paths = githubLinks.map((link) => ({
+      path: link,
+      depth: 1,
+    }));
+  }
+
+  let index = 0;
+  while (index < paths.length) {
+    const batch = paths.slice(index, index + MAX_CONCURRENT);
+
+    const promises = batch.map(({ path, depth }) =>
+      getLinksFromUrl(startUrlRaw, path).then((links) => ({
+        links,
+        path,
+        depth,
+      })),
+    );
+
+    const results = await Promise.all(promises);
+    for (const {
+      links: { markdown, links: linksArray },
+      path,
+      depth,
+    } of results) {
+      if (markdown !== "" && depth <= maxDepth) {
+        yield {
+          path: new URL(path, startUrl).toString(),
+          markdown,
+        };
+      }
+
+      if (depth < maxDepth) {
+        for (let link of linksArray) {
+          if (!paths.some((p) => p.path === link)) {
+            paths.push({ path: link, depth: depth + 1 });
+          }
+        }
+      }
+    }
+
+    index += batch.length;
+  }
+  console.log("Crawl completed");
+}
+
+/**
+ *
+ * @param {URL} startUrl
+ * @returns {Promise<String[]>}
+ */
+async function crawlGithubRepo(startUrl) {
+  const octokit = new Octokit({
+    auth: undefined,
+  });
+
+  const [_, owner, repo, scope, branch, ...pathParts] = startUrl.pathname.split("/");
+  if (scope !== "tree" || !branch) {
+    throw new Error("Invalid GitHub URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>");
+  }
+  const rootPath = pathParts.join("/");
+
+  const tree = await octokit.request(
+    "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
+    {
+      owner,
+      repo,
+      tree_sha: branch,
+      headers: {
+        "X-GitHub-Api-Version": "2022-11-28",
+      },
+      recursive: "true",
+    },
+  );
+
+  const paths = tree.data.tree
+    .filter((file) => file.type === "blob" && file.path?.endsWith(".md") && file.path.startsWith(rootPath))
+    .map(
+      (file) =>
+        `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
+    );
+
+  return paths;
+}
+
+/**
+ *
+ * @param {String} startUrlRaw
+ * @param {String} path
+ * @returns {Promise<{markdown: String, links: String[]}>}
+ */
+async function getLinksFromUrl(startUrlRaw, path) {
+  const location = new URL(path, startUrlRaw).toString();
+
+  console.log(`Crawl ${location}`);
+
+  const response = await fetch(location, fetchOptions);
+  const html = await response.text();
+
+  let links = [];
+
+  if (startUrlRaw.includes("github.com")) {
+    return {
+      markdown: html,
+      links,
+    };
+  }
+
+  const $ = cheerio.load(html);
+
+  IGNORE_LINKS.add(path);
+  if (path.endsWith("/")) {
+    IGNORE_LINKS.add(`${path}index.html`);
+  }
+
+  $("a").each((_, element) => {
+    const href = $(element).attr("href");
+    if (!href) {
+      return;
+    }
+
+    const parsedUrl = new URL(href, startUrlRaw);
+    if (parsedUrl.toString().startsWith(startUrlRaw)) {
+      const link = parsedUrl.pathname;
+      if (
+        !IGNORE_LINKS.has(link) &&
+        !link.includes("#") &&
+        !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
+      ) {
+        links.push(link);
+      }
+    }
+  });
+
+  links = [...new Set(links)];
+
+  return {
+    markdown: compiledConvert(html),
+    links,
+  };
+}
+
+main();
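
Reviewer note: a minimal sketch of consuming the generated file, assuming only the output shape implied by the diff itself (crawlPage() yields { path, markdown } objects and main() writes them as a JSON array). The helper filename and the tmp/knowledge.json path are illustrative, not part of this change.

    // check-knowledge.mjs (hypothetical helper, not part of this diff)
    import { readFileSync } from "node:fs";

    // Load the array written by scripts/crawler.mjs via writeFileSync(outfile, output).
    const pages = JSON.parse(readFileSync("tmp/knowledge.json", "utf8"));

    // Each entry is { path, markdown }, as yielded by crawlPage().
    console.log(`${pages.length} pages crawled; first: ${pages[0]?.path}`);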
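For the second scenario in the script header, the command string is meant to be registered as a recursive_url document loader in AIChat's configuration. A sketch, assuming AIChat's document_loaders config section and reusing the exact command from the header comment (the absolute /scripts/crawler.mjs path would need to point at the actual checkout location):

    # AIChat config.yaml (sketch)
    document_loaders:
      recursive_url: 'node /scripts/crawler.mjs $1 $2'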