diff --git a/packages/components/nodes/documentloaders/Json/Json.ts b/packages/components/nodes/documentloaders/Json/Json.ts
index c413e5feec8..ee07f1369be 100644
--- a/packages/components/nodes/documentloaders/Json/Json.ts
+++ b/packages/components/nodes/documentloaders/Json/Json.ts
@@ -1,8 +1,36 @@
 import { omit } from 'lodash'
 import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
 import { TextSplitter } from 'langchain/text_splitter'
-import { JSONLoader } from 'langchain/document_loaders/fs/json'
 import { getFileFromStorage, handleEscapeCharacters, INodeOutputsValue } from '../../../src'
+import { Document } from '@langchain/core/documents'
+import jsonpointer from 'jsonpointer'
+import type { readFile as ReadFileT } from 'node:fs/promises'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+
+const howToUseCode = `
+You can add metadata dynamically from the document:
+
+For example, if the JSON document is:
+\`\`\`json
+[
+    {
+        "url": "https://www.google.com",
+        "body": "This is body 1"
+    },
+    {
+        "url": "https://www.yahoo.com",
+        "body": "This is body 2"
+    }
+]
+
+\`\`\`
+
+You can have the "url" value as metadata by returning the following:
+\`\`\`json
+{
+    "url": "/url"
+}
+\`\`\``
 
 class Json_DocumentLoaders implements INode {
     label: string
@@ -19,7 +47,7 @@ class Json_DocumentLoaders implements INode {
     constructor() {
         this.label = 'Json File'
         this.name = 'jsonFile'
-        this.version = 2.0
+        this.version = 3.0
         this.type = 'Document'
         this.icon = 'json.svg'
         this.category = 'Document Loaders'
@@ -42,15 +70,21 @@
             {
                 label: 'Pointers Extraction (separated by commas)',
                 name: 'pointersName',
                 type: 'string',
-                description: 'Extracting multiple pointers',
-                placeholder: 'Enter pointers name',
+                description:
+                    'Ex: { "key": "value" }, Pointer Extraction = "key", "value" will be extracted as pageContent of the chunk. Use comma to separate multiple pointers',
+                placeholder: 'key1, key2',
                 optional: true
             },
             {
                 label: 'Additional Metadata',
                 name: 'metadata',
                 type: 'json',
-                description: 'Additional metadata to be added to the extracted documents',
+                description:
+                    'Additional metadata to be added to the extracted documents. You can add metadata dynamically from the document. Ex: { "key": "value", "source": "www.example.com" }. Metadata: { "page": "/source" } will extract the value of the key "source" from the document and add it to the metadata with the key "page"',
+                hint: {
+                    label: 'How to use',
+                    value: howToUseCode
+                },
                 optional: true,
                 additionalParams: true
            },
@@ -118,7 +152,7 @@ class Json_DocumentLoaders implements INode {
                 if (!file) continue
                 const fileData = await getFileFromStorage(file, chatflowid)
                 const blob = new Blob([fileData])
-                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
+                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata)
 
                 if (textSplitter) {
                     let splittedDocs = await loader.load()
@@ -141,7 +175,7 @@
                 splitDataURI.pop()
                 const bf = Buffer.from(splitDataURI.pop() || '', 'base64')
                 const blob = new Blob([bf])
-                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined)
+                const loader = new JSONLoader(blob, pointers.length != 0 ? pointers : undefined, metadata)
 
                 if (textSplitter) {
                     let splittedDocs = await loader.load()
@@ -154,7 +188,8 @@
         }
 
         if (metadata) {
-            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
+            let parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
+            parsedMetadata = removeValuesStartingWithSlash(parsedMetadata)
             docs = docs.map((doc) => ({
                 ...doc,
                 metadata:
@@ -197,4 +232,210 @@
     }
 }
 
+const removeValuesStartingWithSlash = (obj: Record<string, any>): Record<string, any> => {
+    const result: Record<string, any> = {}
+
+    for (const key in obj) {
+        const value = obj[key]
+        if (typeof value === 'string' && value.startsWith('/')) {
+            continue
+        }
+        result[key] = value
+    }
+
+    return result
+}
+
+class TextLoader extends BaseDocumentLoader {
+    constructor(public filePathOrBlob: string | Blob) {
+        super()
+    }
+
+    protected async parse(raw: string): Promise<{ pageContent: string; metadata: ICommonObject }[]> {
+        return [{ pageContent: raw, metadata: {} }]
+    }
+
+    public async load(): Promise<Document[]> {
+        let text: string
+        let metadata: Record<string, string>
+        if (typeof this.filePathOrBlob === 'string') {
+            const { readFile } = await TextLoader.imports()
+            text = await readFile(this.filePathOrBlob, 'utf8')
+            metadata = { source: this.filePathOrBlob }
+        } else {
+            text = await this.filePathOrBlob.text()
+            metadata = { source: 'blob', blobType: this.filePathOrBlob.type }
+        }
+        const parsed = await this.parse(text)
+        parsed.forEach((parsedData, i) => {
+            const { pageContent } = parsedData
+            if (typeof pageContent !== 'string') {
+                throw new Error(`Expected string, at position ${i} got ${typeof pageContent}`)
+            }
+        })
+        return parsed.map((parsedData, i) => {
+            const { pageContent, metadata: additionalMetadata } = parsedData
+            return new Document({
+                pageContent,
+                metadata:
+                    parsed.length === 1
+                        ? { ...metadata, ...additionalMetadata }
+                        : {
+                              ...metadata,
+                              line: i + 1,
+                              ...additionalMetadata
+                          }
+            })
+        })
+    }
+
+    static async imports(): Promise<{
+        readFile: typeof ReadFileT
+    }> {
+        try {
+            const { readFile } = await import('node:fs/promises')
+            return { readFile }
+        } catch (e) {
+            console.error(e)
+            throw new Error(`Failed to load fs/promises. Make sure you are running in a Node.js environment.`)
+        }
+    }
+}
+
+class JSONLoader extends TextLoader {
+    public pointers: string[]
+    private metadataMapping: Record<string, any>
+
+    constructor(filePathOrBlob: string | Blob, pointers: string | string[] = [], metadataMapping: Record<string, any> = {}) {
+        super(filePathOrBlob)
+        this.pointers = Array.isArray(pointers) ? pointers : [pointers]
+        if (metadataMapping) {
+            this.metadataMapping = typeof metadataMapping === 'object' ? metadataMapping : JSON.parse(metadataMapping)
+        }
+    }
+
+    protected async parse(raw: string): Promise<Document[]> {
+        const json = JSON.parse(raw.trim())
+        const documents: Document[] = []
+
+        // Handle both single object and array of objects
+        const jsonArray = Array.isArray(json) ? json : [json]
+
+        for (const item of jsonArray) {
+            const content = this.extractContent(item)
+            const metadata = this.extractMetadata(item)
+
+            for (const pageContent of content) {
+                documents.push({
+                    pageContent,
+                    metadata
+                })
+            }
+        }
+
+        return documents
+    }
+
+    /**
+     * Extracts content based on specified pointers or all strings if no pointers
+     */
+    private extractContent(json: any): string[] {
+        const compiledPointers = this.pointers.map((pointer) => jsonpointer.compile(pointer))
+
+        return this.extractArrayStringsFromObject(json, compiledPointers, !(this.pointers.length > 0))
+    }
+
+    /**
+     * Extracts metadata based on the mapping configuration
+     */
+    private extractMetadata(json: any): Record<string, any> {
+        let metadata: Record<string, any> = {}
+
+        if (this.metadataMapping) {
+            const values = Object.values(this.metadataMapping).filter((value) => typeof value === 'string' && value.startsWith('/'))
+            for (const value of values) {
+                if (value) {
+                    const key = Object.keys(this.metadataMapping).find((key) => this.metadataMapping?.[key] === value)
+                    if (key) {
+                        metadata = {
+                            ...metadata,
+                            [key]: jsonpointer.get(json, value)
+                        }
+                    }
+                }
+            }
+        }
+
+        return metadata
+    }
+
+    /**
+     * If JSON pointers are specified, return all strings below any of them
+     * and exclude all other nodes except if they match a JSON pointer.
+     * If no JSON pointer is specified then return all strings in the object.
+     */
+    private extractArrayStringsFromObject(
+        json: any,
+        pointers: jsonpointer[],
+        extractAllStrings = false,
+        keyHasBeenFound = false
+    ): string[] {
+        if (!json) {
+            return []
+        }
+
+        if (typeof json === 'string' && extractAllStrings) {
+            return [json]
+        }
+
+        if (Array.isArray(json) && extractAllStrings) {
+            let extractedString: string[] = []
+            for (const element of json) {
+                extractedString = extractedString.concat(this.extractArrayStringsFromObject(element, pointers, true))
+            }
+            return extractedString
+        }
+
+        if (typeof json === 'object') {
+            if (extractAllStrings) {
+                return this.extractArrayStringsFromObject(Object.values(json), pointers, true)
+            }
+
+            const targetedEntries = this.getTargetedEntries(json, pointers)
+            const thisLevelEntries = Object.values(json) as object[]
+            const notTargetedEntries = thisLevelEntries.filter((entry: object) => !targetedEntries.includes(entry))
+
+            let extractedStrings: string[] = []
+            if (targetedEntries.length > 0) {
+                for (const oneEntry of targetedEntries) {
+                    extractedStrings = extractedStrings.concat(this.extractArrayStringsFromObject(oneEntry, pointers, true, true))
+                }
+
+                for (const oneEntry of notTargetedEntries) {
+                    extractedStrings = extractedStrings.concat(this.extractArrayStringsFromObject(oneEntry, pointers, false, true))
+                }
+            } else if (extractAllStrings || !keyHasBeenFound) {
+                for (const oneEntry of notTargetedEntries) {
+                    extractedStrings = extractedStrings.concat(this.extractArrayStringsFromObject(oneEntry, pointers, extractAllStrings))
+                }
+            }
+
+            return extractedStrings
+        }
+
+        return []
+    }
+
+    private getTargetedEntries(json: object, pointers: jsonpointer[]): object[] {
+        const targetEntries = []
+        for (const pointer of pointers) {
+            const targetedEntry = pointer.get(json)
+            if (targetedEntry) {
+                targetEntries.push(targetedEntry)
+            }
+        }
+        return targetEntries
+    }
+}
+
 module.exports = { nodeClass: Json_DocumentLoaders }
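
Below is a small usage sketch, not part of the patch, illustrating how the reworked `JSONLoader` is expected to behave when given both a pointer and a metadata mapping. It assumes the class is reachable from test code (in the diff it stays module-private to `Json.ts`), that `Blob` is available as a global (Node 18+, as the node code itself assumes), and that the caller passes full JSON pointers such as `/body` (the node presumably builds these from the comma-separated `pointersName` input in code unchanged by this diff). The sample document mirrors the `howToUseCode` example.

```ts
// Hypothetical harness: JSONLoader is module-private in Json.ts, so this assumes
// it has been exported for testing or that the snippet lives in the same file.
async function demo() {
    const sample = JSON.stringify([
        { url: 'https://www.google.com', body: 'This is body 1' },
        { url: 'https://www.yahoo.com', body: 'This is body 2' }
    ])

    // '/body' selects the pageContent of each chunk; the mapping { url: '/url' }
    // copies each item's "url" value into doc.metadata.url via jsonpointer.get.
    const loader = new JSONLoader(new Blob([sample]), ['/body'], { url: '/url' })
    const docs = await loader.load()

    // Expected, per extractContent/extractMetadata above:
    // docs[0].pageContent === 'This is body 1', docs[0].metadata.url === 'https://www.google.com'
    // docs[1].pageContent === 'This is body 2', docs[1].metadata.url === 'https://www.yahoo.com'
    // Each metadata object also carries the base TextLoader fields
    // (source: 'blob', blobType, and line when more than one document is produced).
}
```

Note also that the node passes its `metadata` input to the loader as the mapping, while `removeValuesStartingWithSlash` strips the pointer-valued entries before the remaining static metadata is merged into the loaded documents, so a value like `"/source"` is resolved against each JSON item rather than copied in verbatim.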