From e1fd9200ed2b09f26af9ad033249941c83df6bd4 Mon Sep 17 00:00:00 2001 From: ydennisy Date: Sun, 26 Nov 2023 18:50:01 +0000 Subject: [PATCH 1/2] chore: big schema refactor to simplify, and allow simpler evolution of the individual models --- backend/package-lock.json | 381 +++++++++++++++++- backend/package.json | 3 + .../20231119091508_init_tables/migration.sql | 77 ---- .../migration.sql | 2 - .../20231126122313_init_tables/migration.sql | 48 +++ backend/prisma/schema.prisma | 70 ++-- backend/src/app.ts | 112 +++-- backend/src/domain/entities/index.ts | 2 + backend/src/domain/entities/note.ts | 45 +++ backend/src/domain/entities/web-page.ts | 57 +++ backend/src/domain/values/domain.ts | 12 + backend/src/domain/values/id.ts | 58 +++ backend/src/domain/values/index.ts | 2 + backend/src/llm.ts | 10 +- backend/src/parser.ts | 82 +--- backend/src/repo.ts | 179 ++++---- backend/src/scraper.ts | 20 +- backend/tests/api.test.js | 106 ----- backend/tests/api.test.ts | 30 +- backend/tests/fixtures/body.md | 1 - backend/tests/fixtures/front-matter-empty.md | 4 - .../tests/fixtures/front-matter-title-tags.md | 6 - backend/tests/fixtures/front-matter-title.md | 5 - .../fixtures/multiple-links-same-line.md | 1 + backend/tests/fixtures/multiple-links.md | 2 + .../fixtures/note-with-link-same-line.md | 1 + ...h-links.md => note-with-multiple-links.md} | 0 backend/tests/fixtures/single-link.md | 1 + backend/tests/parser.test.js | 56 --- backend/tests/parser.test.ts | 75 ++-- backend/tests/scraper.test.js | 13 - cli/src/add.ts | 10 +- cli/src/search.ts | 15 +- 33 files changed, 891 insertions(+), 595 deletions(-) delete mode 100644 backend/prisma/migrations/20231119091508_init_tables/migration.sql delete mode 100644 backend/prisma/migrations/20231119091949_add_initial_counter/migration.sql create mode 100644 backend/prisma/migrations/20231126122313_init_tables/migration.sql create mode 100644 backend/src/domain/entities/index.ts create mode 100644 backend/src/domain/entities/note.ts create mode 100644 backend/src/domain/entities/web-page.ts create mode 100644 backend/src/domain/values/domain.ts create mode 100644 backend/src/domain/values/id.ts create mode 100644 backend/src/domain/values/index.ts delete mode 100644 backend/tests/api.test.js delete mode 100644 backend/tests/fixtures/body.md delete mode 100644 backend/tests/fixtures/front-matter-empty.md delete mode 100644 backend/tests/fixtures/front-matter-title-tags.md delete mode 100644 backend/tests/fixtures/front-matter-title.md create mode 100644 backend/tests/fixtures/multiple-links-same-line.md create mode 100644 backend/tests/fixtures/multiple-links.md create mode 100644 backend/tests/fixtures/note-with-link-same-line.md rename backend/tests/fixtures/{body-with-links.md => note-with-multiple-links.md} (100%) create mode 100644 backend/tests/fixtures/single-link.md delete mode 100644 backend/tests/parser.test.js delete mode 100644 backend/tests/scraper.test.js diff --git a/backend/package-lock.json b/backend/package-lock.json index a625551..dc3ba50 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -1,23 +1,26 @@ { - "name": "kg1", + "name": "kg1-backend", "version": "0.0.1", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "kg1", + "name": "kg1-backend", "version": "0.0.1", - "license": "ISC", + "license": "MIT", "dependencies": { + "@mozilla/readability": "^0.4.4", "@prisma/client": "^5.6.0", "entities": "^4.5.0", "fastify": "^4.24.3", "got-scraping": "^3.2.15", + "jsdom": "^22.1.0", "node-html-parser": "^6.1.10", "openai": "^4.15.0", "pgvector": "^0.1.5" }, "devDependencies": { + "@types/jsdom": "^21.1.6", "@types/node": "^20.6.3", "nodemon": "^3.0.1", "prisma": "^5.6.0", @@ -191,6 +194,14 @@ "@jridgewell/sourcemap-codec": "^1.4.10" } }, + "node_modules/@mozilla/readability": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.4.4.tgz", + "integrity": "sha512-MCgZyANpJ6msfvVMi6+A0UAsvZj//4OHREYUB9f2087uXHVoU+H+SWhuihvb1beKpM323bReQPRio0WNk2+V6g==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@npmcli/agent": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/@npmcli/agent/-/agent-2.2.0.tgz", @@ -878,6 +889,14 @@ "@tapjs/core": "1.4.5" } }, + "node_modules/@tootallnate/once": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz", + "integrity": "sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A==", + "engines": { + "node": ">= 10" + } + }, "node_modules/@tsconfig/node10": { "version": "1.0.9", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.9.tgz", @@ -966,6 +985,17 @@ "integrity": "sha512-zONci81DZYCZjiLe0r6equvZut0b+dBRPBN5kBDjsONnutYNtJMoWQ9uR2RkL1gLG9NMTzvf+29e5RFfPbeKhQ==", "dev": true }, + "node_modules/@types/jsdom": { + "version": "21.1.6", + "resolved": "https://registry.npmjs.org/@types/jsdom/-/jsdom-21.1.6.tgz", + "integrity": "sha512-/7kkMsC+/kMs7gAYmmBR9P0vGTnOoLhQhyhQJSlXGI5bzTHp6xdo0TtKWQAsz6pmSAeVqKSbqeyP6hytqr9FDw==", + "dev": true, + "dependencies": { + "@types/node": "*", + "@types/tough-cookie": "*", + "parse5": "^7.0.0" + } + }, "node_modules/@types/node": { "version": "20.8.9", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.8.9.tgz", @@ -991,6 +1021,17 @@ "@types/node": "*" } }, + "node_modules/@types/tough-cookie": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", + "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", + "dev": true + }, + "node_modules/abab": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz", + "integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==" + }, "node_modules/abbrev": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", @@ -1943,6 +1984,61 @@ "url": "https://github.com/sponsors/fb55" } }, + "node_modules/cssstyle": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-3.0.0.tgz", + "integrity": "sha512-N4u2ABATi3Qplzf0hWbVCdjenim8F3ojEXpBDF5hBpjzW182MjNGLqfmQ0SkSPeQ+V86ZXgeH8aXj6kayd4jgg==", + "dependencies": { + "rrweb-cssom": "^0.6.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/data-urls": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-4.0.0.tgz", + "integrity": "sha512-/mMTei/JXPqvFqQtfyTowxmJVwr2PVAeCcDxyFf6LhoOu/09TX2OX3kb2wzi4DMXcfj4OItwDOnhl5oziPnT6g==", + "dependencies": { + "abab": "^2.0.6", + "whatwg-mimetype": "^3.0.0", + "whatwg-url": "^12.0.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/data-urls/node_modules/tr46": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz", + "integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==", + "dependencies": { + "punycode": "^2.3.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/data-urls/node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "engines": { + "node": ">=12" + } + }, + "node_modules/data-urls/node_modules/whatwg-url": { + "version": "12.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-12.0.1.tgz", + "integrity": "sha512-Ed/LrqB8EPlGxjS+TrsXcpUond1mhccS3pchLhzSgPCnTimUCKj3IZE75pAs5m6heB2U2TMerKFUXheyHY+VDQ==", + "dependencies": { + "tr46": "^4.1.1", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/debug": { "version": "4.3.4", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", @@ -1959,6 +2055,11 @@ } } }, + "node_modules/decimal.js": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz", + "integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA==" + }, "node_modules/decompress-response": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", @@ -2031,6 +2132,25 @@ } ] }, + "node_modules/domexception": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz", + "integrity": "sha512-A2is4PLG+eeSfoTMA95/s4pvAoSo2mKtiM5jlHkAVewmiO8ISFTFKZjH7UAM1Atli/OT/7JHOrJRJiMKUZKYBw==", + "dependencies": { + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/domexception/node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "engines": { + "node": ">=12" + } + }, "node_modules/domhandler": { "version": "5.0.3", "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", @@ -2629,6 +2749,17 @@ "node": "14 || >=16.14" } }, + "node_modules/html-encoding-sniffer": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz", + "integrity": "sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==", + "dependencies": { + "whatwg-encoding": "^2.0.0" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -2690,7 +2821,6 @@ "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", - "optional": true, "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, @@ -2989,6 +3119,11 @@ "node": ">=0.10.0" } }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==" + }, "node_modules/is-upper-case": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/is-upper-case/-/is-upper-case-2.0.2.tgz", @@ -3088,6 +3223,114 @@ "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", "dev": true }, + "node_modules/jsdom": { + "version": "22.1.0", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-22.1.0.tgz", + "integrity": "sha512-/9AVW7xNbsBv6GfWho4TTNjEo9fe6Zhf9O7s0Fhhr3u+awPwAJMKwAMXnkk5vBxflqLW9hTHX/0cs+P3gW+cQw==", + "dependencies": { + "abab": "^2.0.6", + "cssstyle": "^3.0.0", + "data-urls": "^4.0.0", + "decimal.js": "^10.4.3", + "domexception": "^4.0.0", + "form-data": "^4.0.0", + "html-encoding-sniffer": "^3.0.0", + "http-proxy-agent": "^5.0.0", + "https-proxy-agent": "^5.0.1", + "is-potential-custom-element-name": "^1.0.1", + "nwsapi": "^2.2.4", + "parse5": "^7.1.2", + "rrweb-cssom": "^0.6.0", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^4.1.2", + "w3c-xmlserializer": "^4.0.0", + "webidl-conversions": "^7.0.0", + "whatwg-encoding": "^2.0.0", + "whatwg-mimetype": "^3.0.0", + "whatwg-url": "^12.0.1", + "ws": "^8.13.0", + "xml-name-validator": "^4.0.0" + }, + "engines": { + "node": ">=16" + }, + "peerDependencies": { + "canvas": "^2.5.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/agent-base": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", + "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==", + "dependencies": { + "debug": "4" + }, + "engines": { + "node": ">= 6.0.0" + } + }, + "node_modules/jsdom/node_modules/http-proxy-agent": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz", + "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==", + "dependencies": { + "@tootallnate/once": "2", + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/jsdom/node_modules/https-proxy-agent": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", + "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==", + "dependencies": { + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/jsdom/node_modules/tr46": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz", + "integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==", + "dependencies": { + "punycode": "^2.3.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/jsdom/node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "engines": { + "node": ">=12" + } + }, + "node_modules/jsdom/node_modules/whatwg-url": { + "version": "12.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-12.0.1.tgz", + "integrity": "sha512-Ed/LrqB8EPlGxjS+TrsXcpUond1mhccS3pchLhzSgPCnTimUCKj3IZE75pAs5m6heB2U2TMerKFUXheyHY+VDQ==", + "dependencies": { + "tr46": "^4.1.1", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/json-buffer": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", @@ -3768,6 +4011,11 @@ "url": "https://github.com/fb55/nth-check?sponsor=1" } }, + "node_modules/nwsapi": { + "version": "2.2.7", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.7.tgz", + "integrity": "sha512-ub5E4+FBPKwAZx0UwIQOjYWGHTEq5sPqHQNRN8Z9e4A7u3Tj1weLJsL59yH9vmvqEtBHaOmT6cYQKIZOxp35FQ==" + }, "node_modules/on-exit-leak-free": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/on-exit-leak-free/-/on-exit-leak-free-2.1.2.tgz", @@ -3938,6 +4186,17 @@ "node": "^16.14.0 || >=18.0.0" } }, + "node_modules/parse5": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz", + "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==", + "dependencies": { + "entities": "^4.4.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/patch-console": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/patch-console/-/patch-console-2.0.0.tgz", @@ -4187,6 +4446,11 @@ "node": ">= 0.10" } }, + "node_modules/psl": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", + "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==" + }, "node_modules/pstree.remy": { "version": "1.1.8", "resolved": "https://registry.npmjs.org/pstree.remy/-/pstree.remy-1.1.8.tgz", @@ -4210,6 +4474,11 @@ "node": ">=6" } }, + "node_modules/querystringify": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz", + "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==" + }, "node_modules/quick-format-unescaped": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/quick-format-unescaped/-/quick-format-unescaped-4.0.4.tgz", @@ -4369,6 +4638,11 @@ "node": ">=0.10.0" } }, + "node_modules/requires-port": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", + "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==" + }, "node_modules/resolve-alpn": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/resolve-alpn/-/resolve-alpn-1.2.1.tgz", @@ -4472,6 +4746,11 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/rrweb-cssom": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz", + "integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==" + }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", @@ -4510,8 +4789,18 @@ "node_modules/safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", - "optional": true + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } }, "node_modules/scheduler": { "version": "0.23.0", @@ -4872,6 +5161,11 @@ "node": ">=4" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==" + }, "node_modules/sync-content": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/sync-content/-/sync-content-1.0.2.tgz", @@ -5122,6 +5416,20 @@ "node": "*" } }, + "node_modules/tough-cookie": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz", + "integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==", + "dependencies": { + "psl": "^1.1.33", + "punycode": "^2.1.1", + "universalify": "^0.2.0", + "url-parse": "^1.5.3" + }, + "engines": { + "node": ">=6" + } + }, "node_modules/tr46": { "version": "0.0.3", "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", @@ -5302,6 +5610,14 @@ "node": "^14.17.0 || ^16.13.0 || >=18.0.0" } }, + "node_modules/universalify": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz", + "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==", + "engines": { + "node": ">= 4.0.0" + } + }, "node_modules/update-browserslist-db": { "version": "1.0.13", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.13.tgz", @@ -5339,6 +5655,15 @@ "punycode": "^2.1.0" } }, + "node_modules/url-parse": { + "version": "1.5.10", + "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz", + "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==", + "dependencies": { + "querystringify": "^2.1.1", + "requires-port": "^1.0.0" + } + }, "node_modules/uuid": { "version": "8.3.2", "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", @@ -5408,6 +5733,17 @@ "node": "^14.17.0 || ^16.13.0 || >=18.0.0" } }, + "node_modules/w3c-xmlserializer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-4.0.0.tgz", + "integrity": "sha512-d+BFHzbiCx6zGfz0HyQ6Rg69w9k19nviJspaj4yNscGjrHu94sVP+aRm75yEbCh+r2/yR+7q6hux9LVtbuTGBw==", + "dependencies": { + "xml-name-validator": "^4.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/walk-up-path": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/walk-up-path/-/walk-up-path-3.0.1.tgz", @@ -5427,6 +5763,25 @@ "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" }, + "node_modules/whatwg-encoding": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz", + "integrity": "sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-mimetype": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-3.0.0.tgz", + "integrity": "sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q==", + "engines": { + "node": ">=12" + } + }, "node_modules/whatwg-url": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", @@ -5575,7 +5930,6 @@ "version": "8.14.2", "resolved": "https://registry.npmjs.org/ws/-/ws-8.14.2.tgz", "integrity": "sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==", - "dev": true, "engines": { "node": ">=10.0.0" }, @@ -5592,6 +5946,19 @@ } } }, + "node_modules/xml-name-validator": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-4.0.0.tgz", + "integrity": "sha512-ICP2e+jsHvAj2E2lIHxa5tjXRlKDJo4IdvPvCXbXQGdzSfmSpNVyIKMvoZHjDY9DP0zV17iI85o90vRFXNccRw==", + "engines": { + "node": ">=12" + } + }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/backend/package.json b/backend/package.json index 561fe0d..5b6c237 100644 --- a/backend/package.json +++ b/backend/package.json @@ -15,6 +15,7 @@ "test": "tap run" }, "devDependencies": { + "@types/jsdom": "^21.1.6", "@types/node": "^20.6.3", "nodemon": "^3.0.1", "prisma": "^5.6.0", @@ -23,10 +24,12 @@ "typescript": "^5.2.2" }, "dependencies": { + "@mozilla/readability": "^0.4.4", "@prisma/client": "^5.6.0", "entities": "^4.5.0", "fastify": "^4.24.3", "got-scraping": "^3.2.15", + "jsdom": "^22.1.0", "node-html-parser": "^6.1.10", "openai": "^4.15.0", "pgvector": "^0.1.5" diff --git a/backend/prisma/migrations/20231119091508_init_tables/migration.sql b/backend/prisma/migrations/20231119091508_init_tables/migration.sql deleted file mode 100644 index 8c95522..0000000 --- a/backend/prisma/migrations/20231119091508_init_tables/migration.sql +++ /dev/null @@ -1,77 +0,0 @@ --- CreateEnum -CREATE TYPE "NodeType" AS ENUM ('NOTE', 'WEB_PAGE'); - --- CreateTable -CREATE TABLE "nodes" ( - "id" SERIAL NOT NULL, - "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, - "updated_at" TIMESTAMP(3) NOT NULL, - "raw" TEXT NOT NULL, - "title" TEXT, - "type" "NodeType" NOT NULL, - "embedding" vector(1536), - - CONSTRAINT "nodes_pkey" PRIMARY KEY ("id") -); - --- CreateTable -CREATE TABLE "edges" ( - "id" SERIAL NOT NULL, - "parent_id" INTEGER NOT NULL, - "child_id" INTEGER NOT NULL, - - CONSTRAINT "edges_pkey" PRIMARY KEY ("id") -); - --- CreateTable -CREATE TABLE "web_pages" ( - "id" SERIAL NOT NULL, - "url" TEXT NOT NULL, - "title" TEXT NOT NULL, - "node_id" INTEGER NOT NULL, - - CONSTRAINT "web_pages_pkey" PRIMARY KEY ("id") -); - --- CreateTable -CREATE TABLE "notes" ( - "id" SERIAL NOT NULL, - "title" TEXT NOT NULL, - "body" TEXT NOT NULL, - "node_id" INTEGER NOT NULL, - - CONSTRAINT "notes_pkey" PRIMARY KEY ("id") -); - --- CreateTable -CREATE TABLE "counters" ( - "id" SERIAL NOT NULL, - "name" TEXT NOT NULL, - "value" INTEGER NOT NULL, - - CONSTRAINT "counters_pkey" PRIMARY KEY ("id") -); - --- CreateIndex -CREATE INDEX "nodes_id_idx" ON "nodes"("id"); - --- CreateIndex -CREATE UNIQUE INDEX "edges_parent_id_child_id_key" ON "edges"("parent_id", "child_id"); - --- CreateIndex -CREATE UNIQUE INDEX "web_pages_node_id_key" ON "web_pages"("node_id"); - --- CreateIndex -CREATE UNIQUE INDEX "notes_node_id_key" ON "notes"("node_id"); - --- AddForeignKey -ALTER TABLE "edges" ADD CONSTRAINT "edges_parent_id_fkey" FOREIGN KEY ("parent_id") REFERENCES "nodes"("id") ON DELETE RESTRICT ON UPDATE CASCADE; - --- AddForeignKey -ALTER TABLE "edges" ADD CONSTRAINT "edges_child_id_fkey" FOREIGN KEY ("child_id") REFERENCES "nodes"("id") ON DELETE RESTRICT ON UPDATE CASCADE; - --- AddForeignKey -ALTER TABLE "web_pages" ADD CONSTRAINT "web_pages_node_id_fkey" FOREIGN KEY ("node_id") REFERENCES "nodes"("id") ON DELETE RESTRICT ON UPDATE CASCADE; - --- AddForeignKey -ALTER TABLE "notes" ADD CONSTRAINT "notes_node_id_fkey" FOREIGN KEY ("node_id") REFERENCES "nodes"("id") ON DELETE RESTRICT ON UPDATE CASCADE; diff --git a/backend/prisma/migrations/20231119091949_add_initial_counter/migration.sql b/backend/prisma/migrations/20231119091949_add_initial_counter/migration.sql deleted file mode 100644 index 0c283de..0000000 --- a/backend/prisma/migrations/20231119091949_add_initial_counter/migration.sql +++ /dev/null @@ -1,2 +0,0 @@ --- Adding the counter -INSERT INTO "counters" ("name", "value") VALUES ('openai-api-calls', 0); diff --git a/backend/prisma/migrations/20231126122313_init_tables/migration.sql b/backend/prisma/migrations/20231126122313_init_tables/migration.sql new file mode 100644 index 0000000..a00a3ec --- /dev/null +++ b/backend/prisma/migrations/20231126122313_init_tables/migration.sql @@ -0,0 +1,48 @@ +-- CreateTable +CREATE TABLE "web_pages" ( + "id" TEXT NOT NULL, + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updated_at" TIMESTAMP(3) NOT NULL, + "url" TEXT NOT NULL, + "title" TEXT NOT NULL, + "domain" TEXT NOT NULL, + "content" TEXT NOT NULL, + "embedding" vector(1536), + + CONSTRAINT "web_pages_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "notes" ( + "id" TEXT NOT NULL, + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updated_at" TIMESTAMP(3) NOT NULL, + "title" TEXT NOT NULL, + "content" TEXT NOT NULL, + "embedding" vector(1536), + + CONSTRAINT "notes_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "papers" ( + "id" TEXT NOT NULL, + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updated_at" TIMESTAMP(3) NOT NULL, + "title" TEXT NOT NULL, + "author" TEXT NOT NULL, + "url" TEXT NOT NULL, + "content" TEXT NOT NULL, + "embedding" vector(1536), + + CONSTRAINT "papers_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "counters" ( + "id" SERIAL NOT NULL, + "name" TEXT NOT NULL, + "value" INTEGER NOT NULL, + + CONSTRAINT "counters_pkey" PRIMARY KEY ("id") +); diff --git a/backend/prisma/schema.prisma b/backend/prisma/schema.prisma index fa1fbb5..34905fd 100644 --- a/backend/prisma/schema.prisma +++ b/backend/prisma/schema.prisma @@ -8,54 +8,43 @@ datasource db { directUrl = env("DATABASE_URL_DIRECT") } -model Node { - id Int @id @default(autoincrement()) - createdAt DateTime @default(now()) @map("created_at") - updatedAt DateTime @updatedAt @map("updated_at") - raw String - title String? - type NodeType - embedding Unsupported("vector(1536)")? - nodeParents Edge[] @relation("child_to_parent") - nodeChildren Edge[] @relation("parent_to_child") - note Note? - webPage WebPage? - - @@index(id) - @@map("nodes") -} - -model Edge { - id Int @id @default(autoincrement()) - parent Node @relation("parent_to_child", fields: [parentId], references: [id]) - parentId Int @map("parent_id") - child Node @relation("child_to_parent", fields: [childId], references: [id]) - childId Int @map("child_id") - - @@unique([parentId, childId]) - @@map("edges") -} - model WebPage { - id Int @id @default(autoincrement()) - url String - title String - node Node @relation(fields: [nodeId], references: [id]) - nodeId Int @unique @map("node_id") + id String @id + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + url String + title String + domain String + content String + embedding Unsupported("vector(1536)")? @@map("web_pages") } model Note { - id Int @id @default(autoincrement()) - title String - body String - node Node @relation(fields: [nodeId], references: [id]) - nodeId Int @unique @map("node_id") + id String @id + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + title String + content String + embedding Unsupported("vector(1536)")? @@map("notes") } +model Paper { + id String @id + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + title String + author String + url String + content String + embedding Unsupported("vector(1536)")? + + @@map("papers") +} + model Counter { id Int @id @default(autoincrement()) name String @@ -63,8 +52,3 @@ model Counter { @@map("counters") } - -enum NodeType { - NOTE - WEB_PAGE -} diff --git a/backend/src/app.ts b/backend/src/app.ts index 8224ecd..c0b3173 100644 --- a/backend/src/app.ts +++ b/backend/src/app.ts @@ -1,15 +1,15 @@ import Fastify from 'fastify'; import { PrismaClient } from '@prisma/client'; -import { Node, NodeFactory } from './node'; -import { PrismaNodeRepo } from './repo'; +import { WebPage, Note } from './domain/entities'; +import { PrismaRepo } from './repo'; import { parse } from './parser'; import { embed } from './embedder'; import { scrape } from './scraper'; import { generateTitle, summariseOrAnswerFromDocuments } from './llm'; -interface GetNodeParams { +/* interface GetNodeParams { id: number; -} +} */ interface GetSearchParams { q: string; @@ -27,94 +27,76 @@ const OPENAI_API_CALL_LIMIT = Number(process.env.OPENAI_API_CALL_LIMIT); const NODE_COUNT_LIMIT = Number(process.env.NODE_COUNT_LIMIT); const prisma = new PrismaClient(); -const nodeRepo = new PrismaNodeRepo(prisma); +const repo = new PrismaRepo(prisma); app.get('/health', async () => { return { status: 'OK' }; }); app.post<{ Body: PostNodeBody }>('/nodes', async (req, res) => { - const nodesInDb = await nodeRepo.getNodesCount(); - if (nodesInDb >= NODE_COUNT_LIMIT) { - return res.status(429).send({ - message: `The Node limit of ${NODE_COUNT_LIMIT} has been exhausted.`, - }); - } - - const { raw } = req.body; - if (!raw && typeof raw === 'string' && raw.length > 5) { - throw new Error('400 - Raw must be present as a string of > 5 chars.'); - } - const parserResult = parse(raw); - const nodes = NodeFactory.create(parserResult); + try { + /* const nodesInDb = await repo.getNodesCount(); + if (nodesInDb >= NODE_COUNT_LIMIT) { + return res.status(429).send({ + message: `The Node limit of ${NODE_COUNT_LIMIT} has been exhausted.`, + }); + } */ - const processNode = async (node: Node) => { - if (node.type === 'WEB_PAGE') { - const title = (await scrape(node.raw)).title; - node.title = title; + const { raw } = req.body; + if (!raw && typeof raw === 'string' && raw.length > 5) { + throw new Error('400 - Raw must be present as a string of > 5 chars.'); } - if (node.type === 'NOTE' && !node.title) { - const title = await generateTitle(node.raw); - node.title = title; - } - const embedding = await embed(node.text); - node.embedding = embedding; - - if (node.children && node.children.length > 0) { - for (const childNode of node.children) { - await processNode(childNode); + const parsed = parse(raw); + const created = []; + for (const elem of parsed) { + if (elem.type === 'WEB_PAGE') { + const { raw: url } = elem; + const { title, content } = await scrape(url); + const embedding = await embed(title + ' ' + content); + const webPage = WebPage.create({ url, title, content, embedding }); + created.push((await repo.createWebPage(webPage)).toDTO()); + } else if (elem.type === 'NOTE') { + const { raw: content, title } = elem; + const titleOrGeneratedTitle = title ?? (await generateTitle(content)); + const embedding = await embed(titleOrGeneratedTitle + ' ' + content); + const note = Note.create({ + title: titleOrGeneratedTitle, + content, + embedding, + }); + created.push((await repo.createNote(note)).toDTO()); + } else { + throw new Error(`Unsupported input type: ${elem.type}`); } } - }; - - for (const node of nodes) { - await processNode(node); + return created; + } catch (err) { + app.log.error(err); + res.status(500).send(); } - - const persistedNodes = await nodeRepo.createMany(nodes); - // TODO: add proper error handling - const isNode = (value: any): value is Node => { - return value instanceof Node; - }; - const validNodes = persistedNodes.filter(isNode); - return validNodes.map((node) => node.toDTO()); -}); - -app.get<{ Params: GetNodeParams }>('/nodes/:id', async (req, res) => { - const { id } = req.params; - // TODO: adding fastify checks on inputs I assume - // would convert the ID to a number. - const node = await nodeRepo.findbyId(Number(id)); - if (!node) return res.status(404).send(); - return { ...node.toDTO() }; -}); - -app.get('/nodes', async () => { - const nodes = await nodeRepo.findAll(); - return nodes.map((node) => ({ ...node.toDTO() })); }); app.get<{ Querystring: GetSearchParams }>('/search', async (req, res) => { try { - const counter = await nodeRepo.incrementOpenAiCounter(); + /* const counter = await repo.incrementOpenAiCounter(); if (counter >= OPENAI_API_CALL_LIMIT) { return res.status(429).send({ message: `The API call limit of ${OPENAI_API_CALL_LIMIT} has been exhausted.`, }); - } + } */ const { q } = req.query; const queryEmbedding = await embed(q); - const results = await nodeRepo.search(queryEmbedding); - return results.map((node) => ({ ...node.toDTO() })); + const results = await repo.search(queryEmbedding); + return results; } catch (err) { app.log.error(err); - res.status(500); + res.status(500).send(); } }); app.get<{ Querystring: GetSearchParams }>('/chat', async (req, res) => { try { - const counter = await nodeRepo.incrementOpenAiCounter(); + const counter = await repo.incrementOpenAiCounter(); if (counter >= OPENAI_API_CALL_LIMIT) { return res.status(429).send({ message: `The API call limit of ${OPENAI_API_CALL_LIMIT} has been exhausted.`, @@ -122,7 +104,7 @@ app.get<{ Querystring: GetSearchParams }>('/chat', async (req, res) => { } const { q } = req.query; const queryEmbedding = await embed(q); - const results = await nodeRepo.search(queryEmbedding); + const results = await repo.search(queryEmbedding); const stream = await summariseOrAnswerFromDocuments(results, q); res.raw.writeHead(200, { 'Content-Type': 'text/plain' }); for await (const part of stream) { diff --git a/backend/src/domain/entities/index.ts b/backend/src/domain/entities/index.ts new file mode 100644 index 0000000..a7fd9a9 --- /dev/null +++ b/backend/src/domain/entities/index.ts @@ -0,0 +1,2 @@ +export { Note } from './note'; +export { WebPage } from './web-page'; diff --git a/backend/src/domain/entities/note.ts b/backend/src/domain/entities/note.ts new file mode 100644 index 0000000..9fdbc21 --- /dev/null +++ b/backend/src/domain/entities/note.ts @@ -0,0 +1,45 @@ +import { ID } from '../values'; + +interface NoteCreateInput { + id?: string; + title: string; + content: string; + // TODO: Prisma does not allow setting embedding to be required? + embedding?: number[]; +} + +interface NoteProps { + id: ID; + title: string; + content: string; + embedding?: number[]; +} + +class Note { + props: NoteProps; + + private constructor({ id, title, content, embedding }: NoteCreateInput) { + this.props = { + id: id ? ID.fromExisting(id) : ID.create(), + title, + content, + embedding, + }; + } + + public static create(input: NoteCreateInput) { + return new Note(input); + } + + public toDTO() { + const { id, title } = this.props; + return { id: id.value, title }; + } + + public toPersistence() { + const { id, ...rest } = this.props; + return { id: id.value, ...rest }; + } +} + +export { Note }; diff --git a/backend/src/domain/entities/web-page.ts b/backend/src/domain/entities/web-page.ts new file mode 100644 index 0000000..694956c --- /dev/null +++ b/backend/src/domain/entities/web-page.ts @@ -0,0 +1,57 @@ +import { ID, Domain } from '../values'; + +interface WebPageCreateInput { + id?: string; + domain?: string; + url: string; + title: string; + content: string; + // TODO: Prisma does not allow setting embedding to be required? + embedding?: number[]; +} + +interface WebPageProps { + id: ID; + domain: Domain; + url: string; + title: string; + content: string; + embedding?: number[]; +} + +class WebPage { + props: WebPageProps; + + private constructor({ + id, + url, + title, + content, + embedding, + }: WebPageCreateInput) { + this.props = { + id: id ? ID.fromExisting(id) : ID.create(), + domain: Domain.fromUrl(url), + url, + title, + content, + embedding, + }; + } + + public static create(input: WebPageCreateInput) { + return new WebPage(input); + } + + public toDTO() { + const { id, url, title } = this.props; + return { id: id.value, url, title }; + } + + public toPersistence() { + const { id, domain, ...rest } = this.props; + return { id: id.value, domain: domain.value, ...rest }; + } +} + +export { WebPage }; diff --git a/backend/src/domain/values/domain.ts b/backend/src/domain/values/domain.ts new file mode 100644 index 0000000..88e3b33 --- /dev/null +++ b/backend/src/domain/values/domain.ts @@ -0,0 +1,12 @@ +class Domain { + value: string; + constructor(url: string) { + // TODO: actually create the domain! + this.value = url; + } + public static fromUrl(url: string) { + return new Domain(url); + } +} + +export { Domain }; diff --git a/backend/src/domain/values/id.ts b/backend/src/domain/values/id.ts new file mode 100644 index 0000000..529585f --- /dev/null +++ b/backend/src/domain/values/id.ts @@ -0,0 +1,58 @@ +import { randomFillSync } from 'crypto'; + +const UNIX_TS_MS_BITS = 48; +const VER_DIGIT = '7'; +const SEQ_BITS = 12; +const VAR = 0b10; +const VAR_BITS = 2; +const RAND_BITS = 62; + +class ID { + value: string; + + private constructor(id?: string) { + this.value = id ?? this.generateId(); + } + + public static create() { + return new ID(); + } + + public static fromExisting(id: string) { + return new ID(id); + } + + private generateId() { + let prevTimestamp = -1; + let seq = 0; + + const timestamp = Math.max(Date.now(), prevTimestamp); + seq = timestamp === prevTimestamp ? seq + 1 : 0; + prevTimestamp = timestamp; + + const var_rand = new Uint32Array(2); + randomFillSync(var_rand); + var_rand[0] = (VAR << (32 - VAR_BITS)) | (var_rand[0]! >>> VAR_BITS); + + const digits = + timestamp.toString(16).padStart(UNIX_TS_MS_BITS / 4, '0') + + VER_DIGIT + + seq.toString(16).padStart(SEQ_BITS / 4, '0') + + var_rand[0]!.toString(16).padStart((VAR_BITS + RAND_BITS) / 2 / 4, '0') + + var_rand[1]!.toString(16).padStart((VAR_BITS + RAND_BITS) / 2 / 4, '0'); + + return ( + digits.slice(0, 8) + + '-' + + digits.slice(8, 12) + + '-' + + digits.slice(12, 16) + + '-' + + digits.slice(16, 20) + + '-' + + digits.slice(20) + ); + } +} + +export { ID }; diff --git a/backend/src/domain/values/index.ts b/backend/src/domain/values/index.ts new file mode 100644 index 0000000..528e913 --- /dev/null +++ b/backend/src/domain/values/index.ts @@ -0,0 +1,2 @@ +export { ID } from './id'; +export { Domain } from './domain'; diff --git a/backend/src/llm.ts b/backend/src/llm.ts index fb7caa0..f658ab2 100644 --- a/backend/src/llm.ts +++ b/backend/src/llm.ts @@ -1,5 +1,6 @@ import OpenAI from 'openai'; -import { Node } from './node'; +import { SearchResultRow } from './repo'; + const openai = new OpenAI(); const generateTitle = async (text: string): Promise => { @@ -44,8 +45,11 @@ const generateTitle = async (text: string): Promise => { return title; }; -const summariseOrAnswerFromDocuments = async (nodes: Node[], query: string) => { - const titles = nodes.map((node) => `- ${node.toDTO().title}`).join('\n'); +const summariseOrAnswerFromDocuments = async ( + rows: SearchResultRow[], + query: string, +) => { + const titles = rows.map((row) => `- ${row.title}`).join('\n'); const input = ` Given the following list of documents, and query, you must answer the question, or summarise the docs depending on the tone. diff --git a/backend/src/parser.ts b/backend/src/parser.ts index 4ae1795..9c33b62 100644 --- a/backend/src/parser.ts +++ b/backend/src/parser.ts @@ -1,49 +1,11 @@ +type NodeType = 'WEB_PAGE' | 'NOTE' | 'PAPER'; + export interface ParserResult { raw: string; - title: string | null; - body: string | null; - tags: string[]; - links: string[]; + title?: string; + type: NodeType; } -// Function to extract YAML front matter -const extractFrontMatter = ( - raw: string, -): { frontMatter: string; bodyStartIndex: number } => { - const pattern = /---(.*?)---/gs; - const matches = raw.match(pattern); - let bodyStartIndex = 0; - let frontMatter = ''; - - if (matches) { - const match = matches[0]; - bodyStartIndex = raw.indexOf(match) + match.length; - frontMatter = match.replace(/---/g, '').trim(); - } - return { frontMatter, bodyStartIndex }; -}; - -// Function to parse YAML front matter -const parseFrontMatter = ( - frontMatter: string, -): { title: string | null; tags: string[] } => { - let title = null; - let tags: string[] = []; - - if (frontMatter !== '') { - const items = frontMatter.split('\n'); - items.forEach((item) => { - if (item.startsWith('title:')) title = item.replace('title:', '').trim(); - if (item.startsWith('tags:')) - tags = item - .replace('tags:', '') - .split(',') - .map((t) => t.trim()); - }); - } - return { title, tags }; -}; - const extractLinks = (body: string): string[] => { const links = []; const urlPattern = @@ -53,34 +15,24 @@ const extractLinks = (body: string): string[] => { return links; }; -const bodyWithoutLinks = (body: string, links: string[]): string | null => { - let bodyWithoutLinks = body; +const isNote = (raw: string, links: string[]): boolean => { + let bodyWithoutLinks = raw; links.forEach((link) => { bodyWithoutLinks = bodyWithoutLinks.replace(link, '').trim(); }); - return bodyWithoutLinks.length > 0 ? bodyWithoutLinks : null; + return bodyWithoutLinks.length > 0; }; -// Main parse function -const parse = (raw: string): ParserResult => { - const result: ParserResult = { - raw, - title: null, - body: null, - tags: [], - links: [], - }; - - const { frontMatter, bodyStartIndex } = extractFrontMatter(raw); - const { title, tags } = parseFrontMatter(frontMatter); - const body = raw.substring(bodyStartIndex).trim(); - - result.title = title; - result.tags = tags; - result.links = extractLinks(body); - result.body = bodyWithoutLinks(body, result.links); - - return result; +const parse = (raw: string): ParserResult[] => { + const nodesToCreate: ParserResult[] = []; + const links = extractLinks(raw); + if (isNote(raw, links)) { + nodesToCreate.push({ type: 'NOTE', raw }); + } + for (const link of links) { + nodesToCreate.push({ type: 'WEB_PAGE', raw: link }); + } + return nodesToCreate; }; export { parse }; diff --git a/backend/src/repo.ts b/backend/src/repo.ts index 1ffb831..9f4aa4d 100644 --- a/backend/src/repo.ts +++ b/backend/src/repo.ts @@ -1,125 +1,128 @@ -import { PrismaClient } from '@prisma/client'; import pgvector from 'pgvector/utils'; -import { Node } from './node'; +import { PrismaClient } from '@prisma/client'; +import { Note, WebPage } from './domain/entities'; + +export type SearchResultRow = { + id: string; + title: string; + similarity: number; + data: { [key: string]: string }; +}; -export interface NodeRepo { - create(node: Node): Promise; - createMany(nodes: Node[]): Promise<(Node | Error)[]>; - findbyId(id: number): Promise; - findAll(): Promise; +export interface Repo { + search(embedding: number[]): Promise; + createNote(input: Note): Promise; + createWebPage(input: WebPage): Promise; + //createMany(nodes: Node[]): Promise<(Node | Error)[]>; + //findbyId(id: number): Promise; + //findAll(): Promise; } -export class PrismaNodeRepo implements NodeRepo { +export class PrismaRepo implements Repo { private prisma: PrismaClient; constructor(prisma: PrismaClient) { this.prisma = prisma; } - public async create(node: Node): Promise { + public async createNote(note: Note): Promise { try { - const { children, ...rest } = node.toPersistence(); - const parentResult = await this.prisma.node.create({ data: { ...rest } }); - await this.setEmbedding(parentResult.id, node.embedding); - const childResults = []; - if (children) { - for (const child of children) { - const { children: _, ...rest } = child.toPersistence(); - const childResult = await this.prisma.node.create({ - data: { ...rest }, - }); - await this.setEmbedding(childResult.id, child.embedding); - childResults.push(childResult); - await this.prisma.edge.create({ - data: { - parent: { - connect: { - id: parentResult.id, - }, - }, - child: { - connect: { - id: childResult.id, - }, - }, - }, - }); - } - } - - return Node.create({ - ...parentResult, - children: childResults.map((node) => Node.create(node)), + const { embedding, ...rest } = note.toPersistence(); + const result = await this.prisma.note.create({ + data: { ...rest }, }); + await this.setEmbedding(result.id, 'notes', embedding); + return Note.create(result); } catch (err) { console.error(err); - throw new Error('Failed to persist node to database'); + throw new Error('Failed to persist Note to the database.'); } } - public async createMany(nodes: Node[]): Promise<(Node | Error)[]> { - const creationPromises = nodes.map((node) => - this.create(node) - .then((result) => result) - .catch((error) => error), - ); - - const results = await Promise.allSettled(creationPromises); - - return results.map((result) => { - if (result.status === 'fulfilled') { - return result.value; - } else { - return new Error(`Failed to create node: ${result.reason}`); - } - }); + public async createWebPage(webPage: WebPage): Promise { + try { + const { embedding, ...rest } = webPage.toPersistence(); + const result = await this.prisma.webPage.create({ + data: { ...rest }, + }); + await this.setEmbedding(result.id, 'web_pages', embedding); + return WebPage.create(result); + } catch (err) { + console.error(err); + throw new Error('Failed to persist WebPage to the database.'); + } } - public async findbyId(id: number): Promise { - const result = await this.prisma.node.findUnique({ - where: { id }, - }); + public async search(queryEmbedding: number[]): Promise { + const queryEmbeddingSql = pgvector.toSql(queryEmbedding); + return await this.prisma.$queryRaw` + SELECT * FROM ( + SELECT + id, + 'web_page' as type, + jsonb_build_object('title', title, 'url', url) as data, + 1 - (embedding <=> ${queryEmbeddingSql}::vector) AS similarity + FROM web_pages + WHERE embedding IS NOT NULL + + UNION - return result ? Node.create(result) : null; - } + SELECT + id, + 'note' as type, + jsonb_build_object('title', title, 'content', content) as data, + 1 - (embedding <=> ${queryEmbeddingSql}::vector) AS similarity + FROM notes + WHERE embedding IS NOT NULL - public async findAll(): Promise { - const results = await this.prisma.node.findMany(); - return results.map((node) => Node.create(node)); - } + UNION - public async search(queryEmbedding: number[]): Promise { - const queryEmbeddingSql = pgvector.toSql(queryEmbedding); - const results = await this.prisma.$queryRaw` - SELECT id, type, title, 1 - (embedding <=> ${queryEmbeddingSql}::vector) AS similarity - FROM nodes + SELECT + id, + 'paper' as type, + jsonb_build_object('title', title, 'content', content) as data, + 1 - (embedding <=> ${queryEmbeddingSql}::vector) AS similarity + FROM papers WHERE embedding IS NOT NULL - ORDER BY similarity DESC LIMIT 5`; - //@ts-ignore - return results.map((node) => Node.create(node)); + ) AS combined_results + ORDER BY similarity DESC LIMIT 10`; } public async incrementOpenAiCounter(): Promise { - const updatedCounter = await this.prisma.counter.update({ - where: { - id: 1, - }, - data: { value: { increment: 1 } }, - }); - return updatedCounter.value; + try { + const updatedCounter = await this.prisma.counter.update({ + where: { + id: 1, + }, + data: { value: { increment: 1 } }, + }); + return updatedCounter.value; + } catch (err) { + const counter = await this.prisma.counter.create({ + data: { name: 'open-ai-api-call', value: 1 }, + }); + return counter.value; + } } - public async getNodesCount(): Promise { + /* public async getNodesCount(): Promise { const count = await this.prisma.node.count(); return count; - } + } */ - private async setEmbedding(id: number, embedding: number[]): Promise { + private async setEmbedding( + id: string, + table: string, + embedding?: number[], + ): Promise { + // TODO: move this check higher up in the call chain. + // Prisma forces us to even consider this to be undefined... if (!embedding) { throw new Error('When setting embedding it cannot be undefined.'); } const embeddingSql = pgvector.toSql(embedding); - await this.prisma - .$executeRaw`UPDATE nodes SET embedding = ${embeddingSql}::vector WHERE id = ${id}`; + // TODO: check table is one of allowed tables. + const query = `UPDATE ${table} SET embedding = $1::vector WHERE id = $2`; + await this.prisma.$executeRawUnsafe(query, embeddingSql, id); } } diff --git a/backend/src/scraper.ts b/backend/src/scraper.ts index f2c0431..21f3e1c 100644 --- a/backend/src/scraper.ts +++ b/backend/src/scraper.ts @@ -1,9 +1,12 @@ import { decodeHTML } from 'entities'; import { gotScraping as got } from 'got-scraping'; import { parse } from 'node-html-parser'; +import { Readability } from '@mozilla/readability'; +import { JSDOM } from 'jsdom'; interface ScraperResult { title: string; + content: string; } // NOTE: need some time to decide on a good node lib for parsing @@ -11,12 +14,19 @@ interface ScraperResult { const scrape = async (url: string): Promise => { const { body } = await got.get(url); - const root = parse(body); - const title = root.getElementsByTagName('title')[0].innerText; - if (!title) { - throw new Error(`Failed to scrape and get title from: ${url}`); + const dom = new JSDOM(body, { url }); + const parsed = new Readability(dom.window.document).parse(); + if (!parsed) { + throw new Error('Failed to parse HTML from ${}'); } - return { title: decodeHTML(title) }; + const { title, content } = parsed; + return { title, content }; + //const root = parse(body); + //const title = root.getElementsByTagName('title')[0].innerText; + //if (!title) { + // throw new Error(`Failed to scrape and get title from: ${url}`); + //} + //return { title: decodeHTML(title) }; }; export { scrape }; diff --git a/backend/tests/api.test.js b/backend/tests/api.test.js deleted file mode 100644 index 2a3f652..0000000 --- a/backend/tests/api.test.js +++ /dev/null @@ -1,106 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const tap_1 = __importDefault(require("tap")); -const app_1 = require("../src/app"); -tap_1.default.test('Create a NOTE type node', async (t) => { - const res = await app_1.app.inject({ - method: 'POST', - url: '/nodes', - body: { raw: 'hello world!' }, - }); - const { statusCode } = res; - const nodes = res.json(); - console.log(nodes); - t.equal(statusCode, 200); - t.equal(nodes.length, 1); - t.equal(nodes[0].raw, 'hello world!'); - t.equal(nodes[0].type, 'NOTE'); - t.end(); -}); -tap_1.default.test('Create a WEB_PAGE type node', async (t) => { - const raw = 'https://example.com/article'; - const res = await app_1.app.inject({ - method: 'POST', - url: '/nodes', - body: { raw }, - }); - const { statusCode } = res; - const nodes = res.json(); - console.log(nodes); - t.equal(statusCode, 200); - t.equal(nodes.length, 1); - t.equal(nodes[0].raw, raw); - t.equal(nodes[0].type, 'WEB_PAGE'); - t.end(); -}); -tap_1.default.test('Create a 2x WEB_PAGE type nodes', async (t) => { - const articleA = 'https://example.com/article/A'; - const articleB = 'https://example.com/article/B'; - const raw = `${articleA} ${articleB}`; - const res = await app_1.app.inject({ - method: 'POST', - url: '/nodes', - body: { raw }, - }); - const { statusCode } = res; - const nodes = res.json(); - console.log(nodes); - t.equal(statusCode, 200); - t.equal(nodes.length, 2); - t.equal(nodes[0].raw, articleA); - t.equal(nodes[0].type, 'WEB_PAGE'); - t.equal(nodes[1].raw, articleB); - t.equal(nodes[1].type, 'WEB_PAGE'); - t.end(); -}); -tap_1.default.test('Create a NOTE type node with a WEB_PAGE inside', async (t) => { - const site = 'https://example.com/article'; - const raw = `A great article about dogs ${site}`; - const res = await app_1.app.inject({ - method: 'POST', - url: '/nodes', - body: { raw }, - }); - const { statusCode } = res; - const nodes = res.json(); - console.log(nodes); - t.equal(statusCode, 200); - t.equal(nodes.length, 1); - t.equal(nodes[0].raw, raw); - t.equal(nodes[0].type, 'NOTE'); - t.type(nodes[0].title, 'string'); - t.equal(nodes[0].children.length, 1); - t.equal(nodes[0].children[0].type, 'WEB_PAGE'); - t.equal(nodes[0].children[0].raw, site); - t.type(nodes[0].children[0].title, 'string'); - t.end(); -}); -tap_1.default.test('Create a NOTE type node with a 2x WEB_PAGE nodes inside', async (t) => { - const articleA = 'https://example.com/article/A'; - const articleB = 'https://example.com/article/B'; - const raw = `A great article about dogs ${articleA} ${articleB}`; - const res = await app_1.app.inject({ - method: 'POST', - url: '/nodes', - body: { raw }, - }); - const { statusCode } = res; - const nodes = res.json(); - console.log(nodes); - t.equal(statusCode, 200); - t.equal(nodes.length, 1); - t.equal(nodes[0].raw, raw); - t.equal(nodes[0].type, 'NOTE'); - t.equal(nodes[0].children.length, 2); - t.equal(nodes[0].children[0].type, 'WEB_PAGE'); - t.equal(nodes[0].children[0].raw, articleA); - t.equal(nodes[0].children[1].type, 'WEB_PAGE'); - t.equal(nodes[0].children[1].raw, articleB); - t.end(); -}); -tap_1.default.teardown(async () => { - await app_1.app.close(); -}); diff --git a/backend/tests/api.test.ts b/backend/tests/api.test.ts index 698eb1d..2de2310 100644 --- a/backend/tests/api.test.ts +++ b/backend/tests/api.test.ts @@ -1,42 +1,39 @@ import t from 'tap'; import { app } from '../src/app'; -t.test('Create a NOTE type node', async (t) => { +t.test('Succesfully create a NOTE type node', async (t) => { const res = await app.inject({ method: 'POST', url: '/nodes', body: { raw: 'hello world!' }, }); - const { statusCode } = res; const nodes = res.json(); console.log(nodes); t.equal(statusCode, 200); t.equal(nodes.length, 1); - t.equal(nodes[0].raw, 'hello world!'); - t.equal(nodes[0].type, 'NOTE'); + t.ok(typeof nodes[0].id === 'string'); t.end(); }); -t.test('Create a WEB_PAGE type node', async (t) => { +t.test('Succesfully create a WEB_PAGE type node', async (t) => { const raw = 'https://example.com/article'; const res = await app.inject({ method: 'POST', url: '/nodes', body: { raw }, }); - const { statusCode } = res; const nodes = res.json(); - console.log(nodes); + t.equal(statusCode, 200); t.equal(nodes.length, 1); - t.equal(nodes[0].raw, raw); - t.equal(nodes[0].type, 'WEB_PAGE'); + t.equal(nodes[0].url, raw); + t.ok(typeof nodes[0].id === 'string'); t.end(); }); -t.test('Create a 2x WEB_PAGE type nodes', async (t) => { +t.test('Succesfully create 2 WEB_PAGE type nodes', async (t) => { const articleA = 'https://example.com/article/A'; const articleB = 'https://example.com/article/B'; const raw = `${articleA} ${articleB}`; @@ -45,19 +42,16 @@ t.test('Create a 2x WEB_PAGE type nodes', async (t) => { url: '/nodes', body: { raw }, }); - const { statusCode } = res; const nodes = res.json(); - console.log(nodes); + t.equal(statusCode, 200); t.equal(nodes.length, 2); - t.equal(nodes[0].raw, articleA); - t.equal(nodes[0].type, 'WEB_PAGE'); - t.equal(nodes[1].raw, articleB); - t.equal(nodes[1].type, 'WEB_PAGE'); + t.equal(nodes[0].url, articleA); + t.equal(nodes[1].url, articleB); t.end(); }); - +/* t.test('Create a NOTE type node with a WEB_PAGE inside', async (t) => { const site = 'https://example.com/article'; const raw = `A great article about dogs ${site}`; @@ -108,7 +102,7 @@ t.test('Create a NOTE type node with a 2x WEB_PAGE nodes inside', async (t) => { t.equal(nodes[0].children[1].raw, articleB); t.end(); }); - + */ t.teardown(async () => { await app.close(); }); diff --git a/backend/tests/fixtures/body.md b/backend/tests/fixtures/body.md deleted file mode 100644 index 2637ae2..0000000 --- a/backend/tests/fixtures/body.md +++ /dev/null @@ -1 +0,0 @@ -This is an article about sport! diff --git a/backend/tests/fixtures/front-matter-empty.md b/backend/tests/fixtures/front-matter-empty.md deleted file mode 100644 index 7384b4f..0000000 --- a/backend/tests/fixtures/front-matter-empty.md +++ /dev/null @@ -1,4 +0,0 @@ ---- ---- - -This is an article about sport! diff --git a/backend/tests/fixtures/front-matter-title-tags.md b/backend/tests/fixtures/front-matter-title-tags.md deleted file mode 100644 index 63b6a99..0000000 --- a/backend/tests/fixtures/front-matter-title-tags.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: Sport is good -tags: sport,football, liverpool fc ---- - -This is an article about sport! diff --git a/backend/tests/fixtures/front-matter-title.md b/backend/tests/fixtures/front-matter-title.md deleted file mode 100644 index 6d9359f..0000000 --- a/backend/tests/fixtures/front-matter-title.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Sport is good ---- - -This is an article about sport! diff --git a/backend/tests/fixtures/multiple-links-same-line.md b/backend/tests/fixtures/multiple-links-same-line.md new file mode 100644 index 0000000..7c79f2b --- /dev/null +++ b/backend/tests/fixtures/multiple-links-same-line.md @@ -0,0 +1 @@ +https://github.com/mozilla/readability https://takelessons.com/blog/derivative-in-math-an-introduction-with-examples diff --git a/backend/tests/fixtures/multiple-links.md b/backend/tests/fixtures/multiple-links.md new file mode 100644 index 0000000..faec3fc --- /dev/null +++ b/backend/tests/fixtures/multiple-links.md @@ -0,0 +1,2 @@ +https://github.com/mozilla/readability +https://takelessons.com/blog/derivative-in-math-an-introduction-with-examples diff --git a/backend/tests/fixtures/note-with-link-same-line.md b/backend/tests/fixtures/note-with-link-same-line.md new file mode 100644 index 0000000..0aa7564 --- /dev/null +++ b/backend/tests/fixtures/note-with-link-same-line.md @@ -0,0 +1 @@ +This is an article about sport! http://www.express.co.uk/sport/football/1816094/Man-Utd-news-Ten-Hag-Burnley-ref-VAR diff --git a/backend/tests/fixtures/body-with-links.md b/backend/tests/fixtures/note-with-multiple-links.md similarity index 100% rename from backend/tests/fixtures/body-with-links.md rename to backend/tests/fixtures/note-with-multiple-links.md diff --git a/backend/tests/fixtures/single-link.md b/backend/tests/fixtures/single-link.md new file mode 100644 index 0000000..87fc31d --- /dev/null +++ b/backend/tests/fixtures/single-link.md @@ -0,0 +1 @@ +https://github.com/mozilla/readability diff --git a/backend/tests/parser.test.js b/backend/tests/parser.test.js deleted file mode 100644 index 40bdc52..0000000 --- a/backend/tests/parser.test.js +++ /dev/null @@ -1,56 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const promises_1 = __importDefault(require("fs/promises")); -const path_1 = __importDefault(require("path")); -const tap_1 = __importDefault(require("tap")); -const parser_1 = require("../src/parser"); -tap_1.default.test('file contains empty front matter', async (t) => { - const filePath = path_1.default.join(__dirname, 'fixtures/front-matter-empty.md'); - const fileContents = await promises_1.default.readFile(filePath, 'utf8'); - const parsed = (0, parser_1.parse)(fileContents); - t.equal(parsed.title, null); - t.equal(parsed.body, 'This is an article about sport!'); - t.end(); -}); -tap_1.default.test('file contains front matter with a title', async (t) => { - const filePath = path_1.default.join(__dirname, 'fixtures/front-matter-title.md'); - const fileContents = await promises_1.default.readFile(filePath, 'utf8'); - const parsed = (0, parser_1.parse)(fileContents); - t.equal(parsed.title, 'Sport is good'); - t.equal(parsed.body, 'This is an article about sport!'); - t.end(); -}); -tap_1.default.test('file contains front matter with a title & tags', async (t) => { - const filePath = path_1.default.join(__dirname, 'fixtures/front-matter-title-tags.md'); - const fileContents = await promises_1.default.readFile(filePath, 'utf8'); - const parsed = (0, parser_1.parse)(fileContents); - t.equal(parsed.title, 'Sport is good'); - t.has(parsed.tags, ['sport', 'football', 'liverpool fc']); - t.equal(parsed.body, 'This is an article about sport!'); - t.end(); -}); -tap_1.default.test('file contains no front matter, but does have a body', async (t) => { - const filePath = path_1.default.join(__dirname, 'fixtures/body.md'); - const fileContents = await promises_1.default.readFile(filePath, 'utf8'); - const parsed = (0, parser_1.parse)(fileContents); - t.equal(parsed.title, null); - t.equal(parsed.body, 'This is an article about sport!'); - t.end(); -}); -tap_1.default.test('file contains no front matter, but a body with 3 links', async (t) => { - const filePath = path_1.default.join(__dirname, 'fixtures/body-with-links.md'); - const fileContents = await promises_1.default.readFile(filePath, 'utf8'); - const parsed = (0, parser_1.parse)(fileContents); - t.equal(parsed.title, null); - t.has(parsed.tags, []); - t.ok(parsed.body?.includes('This is an article about sport!')); - t.has(parsed.links, [ - 'http://www.express.co.uk/sport/football/1816094/Man-Utd-news-Ten-Hag-Burnley-ref-VAR', - 'https://www.express.co.uk/sport/football/1816037/Man-Utd-transfer-news-Liverpool-Arsenal-Ivan-Toney-Brentford', - 'https://www.express.co.uk/sport/football/1816044/Tottenham-transfer-news-Ange-Postecoglou-Celtic-Ivan-Perisic', - ]); - t.end(); -}); diff --git a/backend/tests/parser.test.ts b/backend/tests/parser.test.ts index 2b92889..3023c10 100644 --- a/backend/tests/parser.test.ts +++ b/backend/tests/parser.test.ts @@ -3,59 +3,82 @@ import path from 'path'; import t from 'tap'; import { parse } from '../src/parser'; -t.test('file contains empty front matter', async (t) => { - const filePath = path.join(__dirname, 'fixtures/front-matter-empty.md'); +t.test('File with a single link', async (t) => { + const filePath = path.join(__dirname, 'fixtures/single-link.md'); const fileContents = await fs.readFile(filePath, 'utf8'); - const parsed = parse(fileContents); - t.equal(parsed.title, null); - t.equal(parsed.body, 'This is an article about sport!'); + + t.equal(parsed.length, 1); + t.equal(parsed[0].raw, 'https://github.com/mozilla/readability'); t.end(); }); -t.test('file contains front matter with a title', async (t) => { - const filePath = path.join(__dirname, 'fixtures/front-matter-title.md'); +t.test('File with 2 links, each on a new line', async (t) => { + const filePath = path.join(__dirname, 'fixtures/multiple-links.md'); const fileContents = await fs.readFile(filePath, 'utf8'); const parsed = parse(fileContents); - t.equal(parsed.title, 'Sport is good'); - t.equal(parsed.body, 'This is an article about sport!'); + t.equal(parsed.length, 2); + t.equal(parsed[0].raw, 'https://github.com/mozilla/readability'); + t.equal( + parsed[1].raw, + 'https://takelessons.com/blog/derivative-in-math-an-introduction-with-examples', + ); t.end(); }); -t.test('file contains front matter with a title & tags', async (t) => { - const filePath = path.join(__dirname, 'fixtures/front-matter-title-tags.md'); +t.test('File with 2 links, on the same line', async (t) => { + const filePath = path.join(__dirname, 'fixtures/multiple-links-same-line.md'); const fileContents = await fs.readFile(filePath, 'utf8'); const parsed = parse(fileContents); - t.equal(parsed.title, 'Sport is good'); - t.has(parsed.tags, ['sport', 'football', 'liverpool fc']); - t.equal(parsed.body, 'This is an article about sport!'); + t.equal(parsed.length, 2); + t.equal(parsed[0].raw, 'https://github.com/mozilla/readability'); + t.equal( + parsed[1].raw, + 'https://takelessons.com/blog/derivative-in-math-an-introduction-with-examples', + ); t.end(); }); -t.test('file contains no front matter, but does have a body', async (t) => { - const filePath = path.join(__dirname, 'fixtures/body.md'); +t.test('File with a note and 1 link, on the same line', async (t) => { + const filePath = path.join(__dirname, 'fixtures/note-with-link-same-line.md'); const fileContents = await fs.readFile(filePath, 'utf8'); - const parsed = parse(fileContents); - t.equal(parsed.title, null); - t.equal(parsed.body, 'This is an article about sport!'); + + t.equal(parsed.length, 2); + t.equal(parsed[0].type, 'NOTE'); + t.equal(parsed[0].raw, fileContents); + t.equal(parsed[1].type, 'WEB_PAGE'); + t.equal( + parsed[1].raw, + 'http://www.express.co.uk/sport/football/1816094/Man-Utd-news-Ten-Hag-Burnley-ref-VAR', + ); t.end(); }); -t.test('file contains no front matter, but a body with 3 links', async (t) => { - const filePath = path.join(__dirname, 'fixtures/body-with-links.md'); +t.test('File with a note and 3 links, on multiple lines', async (t) => { + const filePath = path.join(__dirname, 'fixtures/note-with-multiple-links.md'); const fileContents = await fs.readFile(filePath, 'utf8'); const parsed = parse(fileContents); - t.equal(parsed.title, null); - t.has(parsed.tags, []); - t.ok(parsed.body?.includes('This is an article about sport!')); - t.has(parsed.links, [ + t.equal(parsed.length, 4); + t.equal(parsed[0].type, 'NOTE'); + t.equal(parsed[0].raw, fileContents); + t.equal(parsed[1].type, 'WEB_PAGE'); + t.equal( + parsed[1].raw, 'http://www.express.co.uk/sport/football/1816094/Man-Utd-news-Ten-Hag-Burnley-ref-VAR', + ); + t.equal(parsed[2].type, 'WEB_PAGE'); + t.equal( + parsed[2].raw, 'https://www.express.co.uk/sport/football/1816037/Man-Utd-transfer-news-Liverpool-Arsenal-Ivan-Toney-Brentford', + ); + t.equal(parsed[3].type, 'WEB_PAGE'); + t.equal( + parsed[3].raw, 'https://www.express.co.uk/sport/football/1816044/Tottenham-transfer-news-Ange-Postecoglou-Celtic-Ivan-Perisic', - ]); + ); t.end(); }); diff --git a/backend/tests/scraper.test.js b/backend/tests/scraper.test.js deleted file mode 100644 index 4737731..0000000 --- a/backend/tests/scraper.test.js +++ /dev/null @@ -1,13 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const tap_1 = __importDefault(require("tap")); -const scraper_1 = require("../src/scraper"); -tap_1.default.test('scrapes a site and fetches title', async (t) => { - const URL = 'https://apify.com'; - const scraped = await (0, scraper_1.scrape)(URL); - t.equal(scraped.title, 'Apify: Full-stack web scraping and data extraction platform'); - t.end(); -}); diff --git a/cli/src/add.ts b/cli/src/add.ts index 37d6332..4a9f23a 100644 --- a/cli/src/add.ts +++ b/cli/src/add.ts @@ -2,9 +2,9 @@ import axios from 'axios'; import chalk from 'chalk'; interface Node { - id: number; + id: string; title: string; - type: 'NOTE' | 'WEB_PAGE'; + type: 'NOTE' | 'WEB_PAGE' | 'PAPER'; } const BASE_URL = @@ -15,10 +15,14 @@ const add = async (raw: string): Promise => { const { data } = await axios.post(`${BASE_URL}/nodes`, { raw, }); + const nodes = data.reduce((acc, { id, ...x }) => { + acc[id] = x; + return acc; + }, {} as Record); console.log(`✅ ${chalk.green('Succesfully added node(s).')}`); console.log(); - console.table(data); + console.table(nodes); } catch (err: any) { if (err.response && err.response.status) { console.error( diff --git a/cli/src/search.ts b/cli/src/search.ts index 83d69e6..0d7d008 100644 --- a/cli/src/search.ts +++ b/cli/src/search.ts @@ -1,19 +1,30 @@ import axios from 'axios'; import chalk from 'chalk'; +interface SearchResultRow { + id: string; + title: string; + type: 'NOTE' | 'WEB_PAGE' | 'PAPER'; +} + const BASE_URL = process.env.BASE_URL || 'https://kg1-backend-j5dxapaafq-ew.a.run.app'; const search = async (q: string): Promise => { try { - const { data } = await axios.get(`${BASE_URL}/search`, { + const { data } = await axios.get(`${BASE_URL}/search`, { params: { q, }, }); + const nodes = data.reduce((acc, { id, ...x }) => { + acc[id] = x; + return acc; + }, {} as Record); + console.log(`✅ ${chalk.green(`Succesfully searched for: "${q}"`)}`); console.log(); - console.table(data); + console.table(nodes); } catch (err: any) { if (err.response && err.response.status === 429) { console.error( From 26076ef89d6a27f81c382ab2edd7399876aabd47 Mon Sep 17 00:00:00 2001 From: ydennisy Date: Sun, 26 Nov 2023 19:06:35 +0000 Subject: [PATCH 2/2] chore: start to tidy up the CLI --- backend/src/llm.ts | 5 ++++- backend/src/repo.ts | 4 ++-- cli/src/search.ts | 10 ++++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/backend/src/llm.ts b/backend/src/llm.ts index f658ab2..0843210 100644 --- a/backend/src/llm.ts +++ b/backend/src/llm.ts @@ -49,7 +49,9 @@ const summariseOrAnswerFromDocuments = async ( rows: SearchResultRow[], query: string, ) => { - const titles = rows.map((row) => `- ${row.title}`).join('\n'); + const titles = rows + .map((row) => `- ${row.title}: ${row.data?.content ?? ''}`) + .join('\n'); const input = ` Given the following list of documents, and query, you must answer the question, or summarise the docs depending on the tone. @@ -60,6 +62,7 @@ const summariseOrAnswerFromDocuments = async ( QUERY: ${query} `; + console.log(input); const stream = await openai.chat.completions.create({ model: 'gpt-3.5-turbo', messages: [{ role: 'user', content: input }], diff --git a/backend/src/repo.ts b/backend/src/repo.ts index 9f4aa4d..a4cf92e 100644 --- a/backend/src/repo.ts +++ b/backend/src/repo.ts @@ -60,7 +60,7 @@ export class PrismaRepo implements Repo { SELECT id, 'web_page' as type, - jsonb_build_object('title', title, 'url', url) as data, + jsonb_build_object('title', title, 'url', url, 'content', content) as data, 1 - (embedding <=> ${queryEmbeddingSql}::vector) AS similarity FROM web_pages WHERE embedding IS NOT NULL @@ -80,7 +80,7 @@ export class PrismaRepo implements Repo { SELECT id, 'paper' as type, - jsonb_build_object('title', title, 'content', content) as data, + jsonb_build_object('title', title, 'url', url, 'author', author, 'content', content) as data, 1 - (embedding <=> ${queryEmbeddingSql}::vector) AS similarity FROM papers WHERE embedding IS NOT NULL diff --git a/cli/src/search.ts b/cli/src/search.ts index 0d7d008..e2df4d4 100644 --- a/cli/src/search.ts +++ b/cli/src/search.ts @@ -4,7 +4,9 @@ import chalk from 'chalk'; interface SearchResultRow { id: string; title: string; + similarity: number; type: 'NOTE' | 'WEB_PAGE' | 'PAPER'; + data: {}; } const BASE_URL = @@ -17,8 +19,12 @@ const search = async (q: string): Promise => { q, }, }); - const nodes = data.reduce((acc, { id, ...x }) => { - acc[id] = x; + const nodes = data.reduce((acc, { id, data, ...rest }) => { + // const {content} = data; + acc[id] = { + ...rest, + //...data, + }; return acc; }, {} as Record);