feat: add scripts/crawler.mjs #45

Merged
merged 1 commit on Jun 26, 2024
7 changes: 6 additions & 1 deletion .gitignore
@@ -8,4 +8,9 @@ functions.json
/tools/test.*
/.env
*.cmd
__pycache__
__pycache__
/venv
node_modules
package.json
package-lock.json
*.lock
2 changes: 1 addition & 1 deletion README.md
@@ -130,7 +130,7 @@ The agent has the following folder structure:
```
└── agents
└── myagent
├── embeddings/ # Contains RAG files for knownledge
├── embeddings/ # Contains RAG files for knowledge
├── functions.json # Function declarations file (Auto-generated)
├── index.yaml # Agent definition file
└── tools.{sh,js,py} # Agent tools script
215 changes: 215 additions & 0 deletions scripts/crawler.mjs
@@ -0,0 +1,215 @@
#!/usr/bin/env node

/**
* Crawl a documentation website.
*
* The script can be used in the following scenarios:
* 1. Generate knowledge.json for the agent
* > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
* 2. Act as a `recursive_url` document loader for AIChat
* > recursive_url: 'node <path-to-llm-functions>/scripts/crawler.mjs $1 $2'
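*
* The generated <outfile> is a JSON array of { path, markdown } objects, one entry per crawled page.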
*/

// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent

import { Octokit } from "@octokit/rest";
import * as cheerio from "cheerio";
import { URL } from "node:url";
import { writeFileSync } from "node:fs";
import { compile } from "html-to-text";
import fetch from "node-fetch";
import { HttpsProxyAgent } from "https-proxy-agent";

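// HTML-to-text converter: no word wrapping, link hrefs dropped.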
const compiledConvert = compile({ wordwrap: false, selectors: [{ selector: 'a', options: { ignoreHref: true } }] });

// Maximum crawl depth (override with the CRAWLER_MAX_DEPTH environment variable).
const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH) || 3;

// Number of pages fetched in parallel per batch (override with CRAWLER_MAX_CONCURRENT).
const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT) || 5;

// Paths that have already been crawled or should never be crawled; links in this set are not queued.
const IGNORE_LINKS = new Set();

// Skip search and changelog pages.
const IGNORE_PATHS_ENDING_IN = [
"search.html",
"search",
"changelog",
"changelog.html",
];

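// Shared fetch options: a desktop-browser User-Agent header; main() adds an HTTPS proxy agent when HTTPS_PROXY is set.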
let fetchOptions = {
headers: { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" },
};

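// Entry point: parse <url> and <outfile> from argv, crawl from the start URL, and write the collected pages to <outfile> as JSON.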
async function main() {
const [startUrlRaw, outfile] = process.argv.slice(2);
if (!startUrlRaw || !outfile) {
console.log("Usage: ./crawler.mjs <url> <outfile>");
process.exit(1);
}
if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
}
let pages = [];
for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
pages.push(page);
}
const output = JSON.stringify(pages, null, 2);
writeFileSync(outfile, output);
}

/**
* Breadth-first crawl starting from startUrlRaw, yielding { path, markdown } for each crawled page.
*
* @param {String} startUrlRaw
* @param {number} maxDepth
*/
async function* crawlPage(startUrlRaw, maxDepth = 3) {
if (!startUrlRaw.endsWith("/")) {
startUrlRaw += "/"
}
console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);
const startUrl = new URL(startUrlRaw);
let paths = [{ path: startUrl.pathname, depth: 0 }];

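// GitHub tree URLs are not crawled as HTML; instead, seed the queue with raw .md file URLs from the repository tree.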
if (startUrl.hostname === "github.com") {
const githubLinks = await crawlGithubRepo(startUrl);
paths = githubLinks.map((link) => ({
path: link,
depth: 1,
}));
}

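// Breadth-first crawl: fetch queued paths in batches of MAX_CONCURRENT, yield page content, and queue newly discovered links.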
let index = 0;
while (index < paths.length) {
const batch = paths.slice(index, index + MAX_CONCURRENT);

const promises = batch.map(({ path, depth }) =>
getLinksFromUrl(startUrlRaw, path).then((links) => ({
links,
path,
depth,
})),
);

const results = await Promise.all(promises);
for (const {
links: { markdown, links: linksArray },
path,
depth,
} of results) {
if (markdown !== "" && depth <= maxDepth) {
yield {
path: new URL(path, startUrl).toString(),
markdown,
};
}

if (depth < maxDepth) {
for (let link of linksArray) {
if (!paths.some((p) => p.path === link)) {
paths.push({ path: link, depth: depth + 1 });
}
}
}
}

index += batch.length;
}
console.log("Crawl completed");
}

/**
* List the Markdown files under a GitHub tree URL.
*
* @param {URL} startUrl
* @returns {Promise<String[]>} raw.githubusercontent.com URLs of the matching .md files
*/
async function crawlGithubRepo(startUrl) {
const octokit = new Octokit({
auth: undefined,
});

const [_, owner, repo, scope, branch, ...pathParts] = startUrl.pathname.split("/");
if (scope !== "tree" || !branch) {
throw new Error("Invalid GitHub URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>");
}
const rootPath = pathParts.join("/");

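// Fetch the whole branch tree in one recursive request, then keep only Markdown blobs under rootPath.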
const tree = await octokit.request(
"GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
{
owner,
repo,
tree_sha: branch,
headers: {
"X-GitHub-Api-Version": "2022-11-28",
},
recursive: "true",
},
);

const paths = tree.data.tree
.filter((file) => file.type === "blob" && file.path?.endsWith(".md") && file.path.startsWith(rootPath))
.map(
(file) =>
`https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
);

return paths;
}

/**
* Fetch a page and extract its same-site links.
*
* @param {String} startUrlRaw
* @param {String} path
* @returns {Promise<{markdown: String, links: String[]}>}
*/
async function getLinksFromUrl(startUrlRaw, path) {
const location = new URL(path, startUrlRaw).toString();

console.log(`Crawl ${location}`)

const response = await fetch(location, fetchOptions);
const html = await response.text();

let links = [];

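// For GitHub crawls the fetched body is already raw Markdown; return it as-is with no link extraction.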
if (startUrlRaw.includes("github.com")) {
return {
markdown: html,
links,
};
}

const $ = cheerio.load(html);

IGNORE_LINKS.add(path);
if (path.endsWith("/")) {
IGNORE_LINKS.add(`${path}index.html`);
}

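// Collect same-site links, skipping anchors, already-visited paths, and paths with ignored endings.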
$("a").each((_, element) => {
const href = $(element).attr("href");
if (!href) {
return;
}

const parsedUrl = new URL(href, startUrlRaw);
if (parsedUrl.toString().startsWith(startUrlRaw)) {
const link = parsedUrl.pathname;
if (
!IGNORE_LINKS.has(link) &&
!link.includes("#") &&
!IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
) {
links.push(link);
}
}
});

links = [...new Set(links)];

return {
markdown: compiledConvert(html),
links,
};
}

main();
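
As a quick reference, here is a minimal sketch of consuming the crawler's output from Node. It assumes the tmp/knowledge.json path from the header comment's example (any outfile works); the file is a JSON array of { path, markdown } entries.

```js
// Hypothetical consumer: print a short summary of each crawled page.
import { readFileSync } from "node:fs";

const pages = JSON.parse(readFileSync("tmp/knowledge.json", "utf8"));
for (const { path, markdown } of pages) {
  console.log(`${path} (${markdown.length} chars)`);
}
```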