From b24f3ab6b1e600cf4dc07a46c074450229b9aa3d Mon Sep 17 00:00:00 2001 From: KirianCaumes Date: Sat, 18 Nov 2023 17:56:21 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Improve=20performance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 11 ++++ .vscode/tasks.json | 4 ++ README.MD | 14 ----- package-lock.json | 150 -------------------------------------------- package.json | 1 - src/marketplace.ts | 130 +++++++++++++++++++++++--------------- tsconfig.json | 2 +- 7 files changed, 94 insertions(+), 218 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 .vscode/tasks.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..304acbd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,11 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "🪲 Debug", + "type": "node-terminal", + "request": "launch", + "command": "npm run start:dev" + } + ] +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..513d537 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,4 @@ +{ + "version": "2.0.0", + "tasks": [] +} diff --git a/README.MD b/README.MD index 14a7eb6..a9aa2ce 100644 --- a/README.MD +++ b/README.MD @@ -302,18 +302,4 @@ sudo npx playwright install-deps chromium More information [here](https://playwright.dev/docs/browsers#install-system-dependencies). -### Linkedom - -If you are using this library on a Typescript project, you might encounter issues with `linkedom`. - -To fix it, use this in command line: `--skipLibCheck` or add that in you `tsconfig.json`: - -```json -{ - "compilerOptions": { - "skipLibCheck": true - } -} -``` - If you found another problem, feel free to open an issue. 
diff --git a/package-lock.json b/package-lock.json index e6e10b8..807862c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,6 @@ "version": "1.8.0", "license": "MIT", "dependencies": { - "linkedom": "^0.16.4", "playwright-chromium": "^1.40.0", "user-agents": "^1.1.24" }, @@ -2514,11 +2513,6 @@ "node": ">=8" } }, - "node_modules/boolbase": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", - "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" - }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -3383,37 +3377,6 @@ "url": "https://github.com/chalk/strip-ansi?sponsor=1" } }, - "node_modules/css-select": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", - "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", - "dependencies": { - "boolbase": "^1.0.0", - "css-what": "^6.1.0", - "domhandler": "^5.0.2", - "domutils": "^3.0.1", - "nth-check": "^2.0.1" - }, - "funding": { - "url": "https://github.com/sponsors/fb55" - } - }, - "node_modules/css-what": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", - "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", - "engines": { - "node": ">= 6" - }, - "funding": { - "url": "https://github.com/sponsors/fb55" - } - }, - "node_modules/cssom": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz", - "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==" - }, "node_modules/debug": { "version": "4.3.4", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", @@ -3527,57 +3490,6 @@ "node": ">=6.0.0" } }, - 
"node_modules/dom-serializer": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", - "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", - "dependencies": { - "domelementtype": "^2.3.0", - "domhandler": "^5.0.2", - "entities": "^4.2.0" - }, - "funding": { - "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" - } - }, - "node_modules/domelementtype": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", - "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/fb55" - } - ] - }, - "node_modules/domhandler": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", - "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", - "dependencies": { - "domelementtype": "^2.3.0" - }, - "engines": { - "node": ">= 4" - }, - "funding": { - "url": "https://github.com/fb55/domhandler?sponsor=1" - } - }, - "node_modules/domutils": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz", - "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==", - "dependencies": { - "dom-serializer": "^2.0.0", - "domelementtype": "^2.3.0", - "domhandler": "^5.0.3" - }, - "funding": { - "url": "https://github.com/fb55/domutils?sponsor=1" - } - }, "node_modules/dot-prop": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/dot-prop/-/dot-prop-6.0.1.tgz", @@ -3636,17 +3548,6 @@ "node": ">=10.13.0" } }, - "node_modules/entities": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", - "integrity": 
"sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", - "engines": { - "node": ">=0.12" - }, - "funding": { - "url": "https://github.com/fb55/entities?sponsor=1" - } - }, "node_modules/error-ex": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", @@ -4752,24 +4653,6 @@ "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", "dev": true }, - "node_modules/htmlparser2": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.0.0.tgz", - "integrity": "sha512-uxbSI98wmFT/G4P2zXx4OVx04qWUmyFPrD2/CNepa2Zo3GPNaCaaxElDgwUrwYWkK1nr9fft0Ya8dws8coDLLQ==", - "funding": [ - "https://github.com/fb55/htmlparser2?sponsor=1", - { - "type": "github", - "url": "https://github.com/sponsors/fb55" - } - ], - "dependencies": { - "domelementtype": "^2.3.0", - "domhandler": "^5.0.3", - "domutils": "^3.1.0", - "entities": "^4.5.0" - } - }, "node_modules/human-signals": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz", @@ -6006,23 +5889,6 @@ "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", "dev": true }, - "node_modules/linkedom": { - "version": "0.16.4", - "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.16.4.tgz", - "integrity": "sha512-SykvDVh/jAnaO+WiPqH5vX3QpZrIRImuppzYhIHons3RXPhDwqN2dOyfopOVaHleqWtoS+3vWCqen+m8M3HToQ==", - "dependencies": { - "css-select": "^5.1.0", - "cssom": "^0.5.0", - "html-escaper": "^3.0.3", - "htmlparser2": "^9.0.0", - "uhyphen": "^0.2.0" - } - }, - "node_modules/linkedom/node_modules/html-escaper": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz", - "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==" - }, "node_modules/lint-staged": { 
"version": "15.1.0", "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-15.1.0.tgz", @@ -6482,17 +6348,6 @@ "node": ">=8" } }, - "node_modules/nth-check": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", - "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", - "dependencies": { - "boolbase": "^1.0.0" - }, - "funding": { - "url": "https://github.com/fb55/nth-check?sponsor=1" - } - }, "node_modules/object-inspect": { "version": "1.12.3", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.12.3.tgz", @@ -7868,11 +7723,6 @@ "node": ">=14.17" } }, - "node_modules/uhyphen": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz", - "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==" - }, "node_modules/unbox-primitive": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz", diff --git a/package.json b/package.json index f211f8b..1476cb2 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,6 @@ "node": ">=14.0.0" }, "dependencies": { - "linkedom": "^0.16.4", "playwright-chromium": "^1.40.0", "user-agents": "^1.1.24" }, diff --git a/src/marketplace.ts b/src/marketplace.ts index 4b8bcfb..4045e43 100644 --- a/src/marketplace.ts +++ b/src/marketplace.ts @@ -1,7 +1,6 @@ import UserAgent from 'user-agents' -import { parseHTML } from 'linkedom' import { chromium as playwright } from 'playwright-chromium' -import { CURRENCIES, COUNTRIES } from 'data' +import { CURRENCIES as CURRENCIES_DATA, COUNTRIES as COUNTRIES_DATA } from 'data' import type { InputInterface, OutputErrorInterface, OutputSuccessInterface } from 'interfaces' /** @@ -34,7 +33,9 @@ export default abstract class Marketplace { }: InputInterface): Promise { try { /** Init browser */ - const browser = await 
playwright.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] }) + const browser = await playwright.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-gl-drawing-for-tests'], + }) /** Init context */ const browserContext = await browser.newContext({ @@ -42,6 +43,7 @@ export default abstract class Marketplace { extraHTTPHeaders: { 'X-PJAX': 'true', }, + javaScriptEnabled: false, }) /** Init page */ @@ -93,27 +95,44 @@ export default abstract class Marketplace { /** Init page */ const response = await browserPage.goto(url, { waitUntil: 'domcontentloaded' }) - /** Get HTML */ - const bodyHTML = await browserPage.content() - - /** Close browser */ - await browser.close() + /** Status code from the page */ + const status = response?.status() || 500 /** If error, reject */ - if ((response?.status() ?? 0) >= 400) { + if (status >= 400) { // eslint-disable-next-line @typescript-eslint/no-throw-literal throw { - message: parseHTML(bodyHTML).document?.querySelector('h1 + p')?.innerHTML?.trim() ?? 'An error occurred', - code: response?.status() || 500, + message: + (await browserPage.evaluate(() => document?.querySelector('h1 + p')?.innerHTML?.trim())) ?? 'An error occurred', + code: status, } as OutputErrorInterface } - return this.getData(parseHTML(bodyHTML).document, browserPage.url(), { - searchType, - searchValue, - limit, - page, - }) + /** Url generated by browser */ + const urlGenerated = browserPage.url() + + /** + * Result from scraping. + * Scraping inside Playwright is faster (x2) compared to exporting HTML outside and parsing it with another library. 
+ */ + const result = await browserPage.evaluate(this.getResult, { + input: { + page, + limit, + searchValue, + searchType, + }, + dependencies: { + urlGenerated, + CURRENCIES: CURRENCIES_DATA, + COUNTRIES: COUNTRIES_DATA, + }, + } as Parameters[0]) + + /** Close browser */ + await browser.close() + + return result } catch (error: unknown) { if ((error as OutputErrorInterface).message && (error as OutputErrorInterface).code) { // Rethrow error @@ -177,42 +196,51 @@ export default abstract class Marketplace { } /** - * Function to convert currency to ISO name - * @param value String to clean - * @returns Cleanup string + * Parse HTML to clean result + * @returns Items found */ - private static convertCurrency(value: string): string { - if (!value) { - return value + private static getResult({ + input: { page, limit, searchValue, searchType }, + dependencies: { urlGenerated, CURRENCIES, COUNTRIES }, + }: { + /** Input */ + input: InputInterface + /** Dependencies */ + dependencies: { + /** UrlGenerated */ + urlGenerated: string + /** Currencies */ + CURRENCIES: typeof CURRENCIES_DATA + /** Countries */ + COUNTRIES: typeof COUNTRIES_DATA } + }): OutputSuccessInterface { + /** + * Function to convert currency to ISO name + * @param value String to clean + * @returns Cleanup string + */ + const convertCurrency = (value: string): string => { + if (!value) { + return value + } - const currencyFound = Object.keys(CURRENCIES).find(key => value.includes(key)) ?? '' + const currencyFound = Object.keys(CURRENCIES).find(key => value.includes(key)) ?? '' - if (!currencyFound) { - return value - } + if (!currencyFound) { + return value + } - const currencyClean = CURRENCIES[currencyFound as keyof typeof CURRENCIES] + const currencyClean = CURRENCIES[currencyFound as keyof typeof CURRENCIES] - const amount = value - .replace(currencyClean !== 'JPY' ? 
/[.](?=.*[.])/g : /\./g, '') // Remove all dot but last, except if JPY - .replace(currencyFound, '') // Remove original currency - .replace(/\s/g, '') // Remove spaces + const amount = value + .replace(currencyClean !== 'JPY' ? /[.](?=.*[.])/g : /\./g, '') // Remove all dot but last, except if JPY + .replace(currencyFound, '') // Remove original currency + .replace(/\s/g, '') // Remove spaces - return `${amount} ${currencyClean}`.replace(/\s\s+/g, ' ') // Remove useless spaces - } + return `${amount} ${currencyClean}`.replace(/\s\s+/g, ' ') // Remove useless spaces + } - /** - * Parse HTML to clean result - * @param document Document to parse - * @param urlGenerated Url generated - * @returns Items found - */ - private static getData( - document: Document, - urlGenerated: string, - { searchType = 'q', searchValue = undefined, limit = 25, page = 1 }: Partial, - ): OutputSuccessInterface { const totalItems = parseFloat( document .querySelector('.pagination_total') @@ -226,7 +254,7 @@ export default abstract class Marketplace { items: [...document.querySelectorAll('table.table_block tbody tr')]?.map(el => { // eslint-disable-next-line @typescript-eslint/no-explicit-any - const shipping: any = this.convertCurrency( + const shipping: any = convertCurrency( el .querySelector('.item_shipping') ?.childNodes?.[0]?.textContent?.replace(/(\s+|\+)/g, ' ') @@ -321,9 +349,7 @@ export default abstract class Marketplace { notes: Number.isNaN(notes) ? 0 : notes, }, price: { - base: this.convertCurrency( - el.querySelector('.price')?.textContent?.replace(/\s+/g, ' ')?.replace(/,/, '.') ?? '', - ), + base: convertCurrency(el.querySelector('.price')?.textContent?.replace(/\s+/g, ' ')?.replace(/,/, '.') ?? ''), shipping: Number.isNaN(parseFloat(shipping)) ? 
null : shipping, }, country: { @@ -341,12 +367,12 @@ export default abstract class Marketplace { } }) || [], page: { - current: page, - total: Math.ceil(totalItems / limit), + current: page!, + total: Math.ceil(totalItems / limit!), }, result: { total: totalItems, - perPage: limit, + perPage: limit!, }, search: { value: searchValue ?? '', diff --git a/tsconfig.json b/tsconfig.json index 81562fa..5a39892 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,6 +1,6 @@ { "compilerOptions": { - "target": "es6", + "target": "ES2017", "module": "commonjs", "declaration": true, "outDir": "./dist",