Skip to content

Commit

Permalink
getCoreContentText for any websites using https://github.com/mozilla/…
Browse files Browse the repository at this point in the history
  • Loading branch information
xxcdd committed Mar 6, 2024
1 parent c4a537f commit 0308566
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 50 deletions.
9 changes: 9 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"lint"
],
"dependencies": {
"@mozilla/readability": "^0.5.0",
"@nem035/gpt-3-encoder": "^1.1.7",
"@picocss/pico": "^1.5.9",
"@primer/octicons-react": "^18.3.0",
Expand Down
57 changes: 7 additions & 50 deletions src/utils/get-core-content-text.mjs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'

/**
 * Compute the area of an element's bounding rectangle.
 *
 * @param {Element} e - Any object exposing `getBoundingClientRect()`.
 * @returns {number} The rectangle's `width` multiplied by its `height`.
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}
import { Readability } from "@mozilla/readability"

const adapters = {
'scholar.google': ['#gs_res_ccl_mid'],
Expand All @@ -17,31 +13,6 @@ const adapters = {
'new.qq.com': ['.content-article'],
}

/**
 * Find the element with the greatest area inside the subtree rooted at `e`
 * (the root itself included), considering only elements whose area is
 * strictly smaller than 80% of the root's area.
 *
 * Nodes are visited in pre-order (parent first, then children left to
 * right). Because the comparison is strict, the first node visited wins
 * when several candidates share the same area.
 *
 * @param {Element|null|undefined} e - Root of the subtree to search.
 * @returns {Element|null} The best candidate, or `null` when `e` is falsy
 *   or no element has an area in the open range (0, 0.8 * area(e)).
 */
function findLargestElement(e) {
  if (!e) return null

  // Candidates must stay strictly below this fraction of the root's area.
  const areaCap = 0.8 * getArea(e)
  let winner = null
  let winnerArea = 0

  // Explicit stack instead of recursion; children are pushed in reverse so
  // they are popped (and therefore visited) in their original order.
  const pending = [e]
  while (pending.length > 0) {
    const current = pending.pop()
    if (current.nodeType !== Node.ELEMENT_NODE) continue

    const size = getArea(current)
    if (size > winnerArea && size < areaCap) {
      winnerArea = size
      winner = current
    }

    const kids = Array.from(current.children)
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      pending.push(kids[i])
    }
  }

  return winner
}

export function getCoreContentText() {
function getTextFrom(e) {
return e.innerText || e.textContent
Expand All @@ -60,24 +31,10 @@ export function getCoreContentText() {
return getTextFrom(element)
}

const largestElement = findLargestElement(document.body)
const secondLargestElement = findLargestElement(largestElement)
console.log(largestElement)
console.log(secondLargestElement)

let ret
if (!largestElement) {
ret = getTextFrom(document.body)
console.log('use document.body')
} else if (
secondLargestElement &&
getArea(secondLargestElement) > 0.5 * getArea(largestElement)
) {
ret = getTextFrom(secondLargestElement)
console.log('use second')
} else {
ret = getTextFrom(largestElement)
console.log('use first')
}
return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '')
let article = new Readability(document.cloneNode(true), {
keepClasses: true
}).parse()
let content = article.textContent.trim().replaceAll(' ', '').replaceAll('\t', '').replaceAll('\n\n', '').replaceAll(',,', '')
console.log(content)
return content
}

0 comments on commit 0308566

Please sign in to comment.