From 626e14341bb9068709735c5177aba0546575fffb Mon Sep 17 00:00:00 2001 From: KiraLT Date: Tue, 23 Jan 2024 16:27:14 +0200 Subject: [PATCH] fix: date parsing improvements --- .devcontainer/devcontainer.json | 4 ++-- .github/workflows/coverage.yml | 2 +- .github/workflows/publish-docs.yml | 4 ++-- .github/workflows/release.yml | 4 ++-- CHANGELOG.md | 6 ++---- package-lock.json | 8 +++++++- package.json | 3 ++- spec/parser.spec.ts | 14 +++++++++++--- src/extraction.ts | 27 ++++++++++++++++++++++++--- tsconfig.json | 4 ++-- 10 files changed, 55 insertions(+), 21 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b44429e..7a813f9 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,7 +2,7 @@ "name": "isomorphic-htmlparser", "image": "mcr.microsoft.com/devcontainers/universal:2", "features": { - "ghcr.io/devcontainers/features/node:1": {} + "ghcr.io/devcontainers/features/node:1": {}, }, - "updateContentCommand": "npm install" + "updateContentCommand": "npm install", } diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 03e195e..099ec64 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -11,7 +11,7 @@ jobs: - uses: actions/setup-node@v3 with: - node-version: 18 + node-version: 20 cache: 'npm' - run: npm ci diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 7379e3a..3e8e207 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -34,13 +34,13 @@ jobs: - uses: actions/setup-node@v3 with: - node-version: 18 + node-version: 20 cache: 'npm' - run: npm ci - run: npm run build - + - name: Setup Pages uses: actions/configure-pages@v3 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 28cb58e..fdac378 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -13,7 +13,7 @@ jobs: - uses: actions/setup-node@v3 
with: - node-version: 18 + node-version: 20 cache: 'npm' - run: npm ci @@ -30,7 +30,7 @@ jobs: - uses: actions/setup-node@v3 with: - node-version: 18 + node-version: 20 cache: 'npm' - run: npm ci diff --git a/CHANGELOG.md b/CHANGELOG.md index 20235e2..484e273 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,12 @@ # [1.2.0](https://github.com/KiraLT/isomorphic-htmlparser/compare/v1.1.1...v1.2.0) (2023-11-02) - ### Bug Fixes -* improved extraction filters ([a4ab272](https://github.com/KiraLT/isomorphic-htmlparser/commit/a4ab272e066b62a443e827bbb6259e2ce6a8a3ef)) - +- improved extraction filters ([a4ab272](https://github.com/KiraLT/isomorphic-htmlparser/commit/a4ab272e066b62a443e827bbb6259e2ce6a8a3ef)) ### Features -* number parser extraction filters ([4948811](https://github.com/KiraLT/isomorphic-htmlparser/commit/4948811ea4cc3aca3eb36e879f92a854fa13e0e8)) +- number parser extraction filters ([4948811](https://github.com/KiraLT/isomorphic-htmlparser/commit/4948811ea4cc3aca3eb36e879f92a854fa13e0e8)) ## [1.1.1](https://github.com/KiraLT/isomorphic-htmlparser/compare/v1.1.0...v1.1.1) (2023-03-22) diff --git a/package-lock.json b/package-lock.json index cc6ebf1..fb116ee 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,7 +10,8 @@ "license": "MIT", "dependencies": { "cheerio": "^1.0.0-rc.12", - "common-stuff": "^1.10.3" + "common-stuff": "^1.10.3", + "dayjs": "^1.11.10" }, "devDependencies": { "@types/jest": "^29.5.11", @@ -3672,6 +3673,11 @@ "node": ">=8.0.0" } }, + "node_modules/dayjs": { + "version": "1.11.10", + "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.10.tgz", + "integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ==" + }, "node_modules/debug": { "version": "4.3.4", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", diff --git a/package.json b/package.json index 31aa932..f59edc9 100644 --- a/package.json +++ b/package.json @@ -85,6 +85,7 @@ }, "dependencies": { 
"cheerio": "^1.0.0-rc.12", - "common-stuff": "^1.10.3" + "common-stuff": "^1.10.3", + "dayjs": "^1.11.10" } } diff --git a/spec/parser.spec.ts b/spec/parser.spec.ts index 7b674e5..cd1b99a 100644 --- a/spec/parser.spec.ts +++ b/spec/parser.spec.ts @@ -1,3 +1,4 @@ +import { defaultFilters } from '../src/extraction' import { parseHTML } from '../src/node' const html = ` @@ -7,8 +8,6 @@ const html = `

My First Heading - -

My first paragraph.

@@ -24,6 +23,9 @@ const html = `
2023-11-10 10:30
+
+ Dec. 20th '23 +
` @@ -132,6 +134,12 @@ describe('parseHTML', () => { it('extracts date', () => { const dom = parseHTML(html) - expect(dom.extract('#date @ text | parseDate')).toBe(1699605000000) + // Date parsing depends on the current timezone + expect(dom.extract('#date @ text | parseDate')).toBeGreaterThan( + 1699600000000, + ) + expect( + dom.extract('#date2 @ text | parseDate:"MMM. Do \'YY"'), + ).toBeGreaterThan(1699600000000) }) }) diff --git a/src/extraction.ts b/src/extraction.ts index 05e5658..86bc94c 100644 --- a/src/extraction.ts +++ b/src/extraction.ts @@ -1,4 +1,12 @@ import { ensureArray, getByKey, titleCase, parseSize } from 'common-stuff' +import dayjs from 'dayjs' +import customParseFormat from 'dayjs/plugin/customParseFormat' +import advancedFormat from 'dayjs/plugin/advancedFormat' + +import 'dayjs/locale/en' + +dayjs.extend(customParseFormat) +dayjs.extend(advancedFormat) interface ExtractExpression { selector: string @@ -172,7 +180,7 @@ export const defaultFilters = { * const output = `el` * ``` */ - slice: (value: unknown, start: unknown, end: unknown): string => { + slice: (value: unknown, start?: unknown, end?: unknown): string => { return String(value).slice( parseInt(String(start), 10) || undefined, parseInt(String(end)) || undefined, @@ -229,8 +237,21 @@ export const defaultFilters = { * const output = 1699605000000 * ``` */ - parseDate: (value: unknown): number | undefined => { - const parsed = Date.parse(defaultFilters.trim(value)) + parseDate: ( + value: unknown, + format?: unknown, + locale?: unknown, + ): number | undefined => { + const cleanValue = (v: string) => v.replace(/[.]/g, ' ') + + const parsed = dayjs( + cleanValue(defaultFilters.trim(value)), + format ? cleanValue(String(format)) : undefined, + locale ? String(locale) : undefined, + ) + .toDate() + .getTime() + return isNaN(parsed) ? 
undefined : parsed }, } diff --git a/tsconfig.json b/tsconfig.json index c6ec47d..02c33c0 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -16,7 +16,7 @@ /* Additional Checks */ "noImplicitReturns": true /* Report error when not all code paths in function return a value. */, "noFallthroughCasesInSwitch": true /* Report errors for fallthrough cases in switch statement. */, - "noUncheckedIndexedAccess": true /* Turning on noUncheckedIndexedAccess will add undefined to any un-declared field in the type. */ + "noUncheckedIndexedAccess": true /* Turning on noUncheckedIndexedAccess will add undefined to any un-declared field in the type. */, }, - "include": ["src/**/*"] + "include": ["src/**/*"], }