From 0308566b4c95933bf307a676b5c44bf34cdcbeb1 Mon Sep 17 00:00:00 2001 From: xxcdd <42600601+xxcdd@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:31:31 +0800 Subject: [PATCH] getCoreContentText for any websites using https://github.com/mozilla/readability --- package-lock.json | 9 +++++ package.json | 1 + src/utils/get-core-content-text.mjs | 57 ++++------------------------- 3 files changed, 17 insertions(+), 50 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3872d5a6..22ac4c04 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,6 +6,7 @@ "": { "name": "chatgptbox", "dependencies": { + "@mozilla/readability": "^0.5.0", "@nem035/gpt-3-encoder": "^1.1.7", "@picocss/pico": "^1.5.9", "@primer/octicons-react": "^18.3.0", @@ -2077,6 +2078,14 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@mozilla/readability": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz", + "integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@nem035/gpt-3-encoder": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz", diff --git a/package.json b/package.json index 52212e15..9956b537 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "lint" ], "dependencies": { + "@mozilla/readability": "^0.5.0", "@nem035/gpt-3-encoder": "^1.1.7", "@picocss/pico": "^1.5.9", "@primer/octicons-react": "^18.3.0", diff --git a/src/utils/get-core-content-text.mjs b/src/utils/get-core-content-text.mjs index bef4dc8f..dc008698 100644 --- a/src/utils/get-core-content-text.mjs +++ b/src/utils/get-core-content-text.mjs @@ -1,9 +1,5 @@ import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs' - -function getArea(e) { - const rect = e.getBoundingClientRect() - return rect.width * rect.height -} +import { Readability } from "@mozilla/readability" const adapters = { 'scholar.google': ['#gs_res_ccl_mid'], @@ -17,31 +13,6 @@ const adapters = { 'new.qq.com': ['.content-article'], } -function findLargestElement(e) { - if (!e) { - return null - } - let maxArea = 0 - let largestElement = null - const limitedArea = 0.8 * getArea(e) - - function traverseDOM(node) { - if (node.nodeType === Node.ELEMENT_NODE) { - const area = getArea(node) - - if (area > maxArea && area < limitedArea) { - maxArea = area - largestElement = node - } - - Array.from(node.children).forEach(traverseDOM) - } - } - - traverseDOM(e) - return largestElement -} - export function getCoreContentText() { function getTextFrom(e) { return e.innerText || e.textContent @@ -60,24 +31,10 @@ export function getCoreContentText() { return getTextFrom(element) } - const largestElement = findLargestElement(document.body) - const secondLargestElement = findLargestElement(largestElement) - console.log(largestElement) - console.log(secondLargestElement) - - let ret - if (!largestElement) { - ret = getTextFrom(document.body) - console.log('use document.body') - } else if ( - secondLargestElement && - getArea(secondLargestElement) > 0.5 * getArea(largestElement) - ) { - ret = getTextFrom(secondLargestElement) - console.log('use second') - } else { - ret = getTextFrom(largestElement) - console.log('use first') - } - return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '') + let article = new Readability(document.cloneNode(true), { + keepClasses: true + }).parse() + let content = article.textContent.trim().replaceAll(' ', '').replaceAll('\t', '').replaceAll('\n\n', '').replaceAll(',,', '') + console.log(content) + return content }