Skip to content

Commit

Permalink
Merge pull request #9411 from weseek/feat/157516-remove-unnecessary-s…
Browse files Browse the repository at this point in the history
…trings-from-markdown-and-save-in-vector-store

imprv(ai): Remove unnecessary strings from markdown when creating VectorStoreFile
  • Loading branch information
mergify[bot] authored Nov 18, 2024
2 parents 0ac74ca + 1feba06 commit 25a1bec
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 1 deletion.
5 changes: 4 additions & 1 deletion apps/app/src/features/openai/server/services/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@ import { createBatchStream } from '~/server/util/batch-stream';
import loggerFactory from '~/utils/logger';

import { OpenaiServiceTypes } from '../../interfaces/ai';
import { sanitizeMarkdown } from '../utils/sanitize-markdown';

import { getClient } from './client-delegator';
// import { splitMarkdownIntoChunks } from './markdown-splitter/markdown-token-splitter';
import { oepnaiApiErrorHandler } from './openai-api-error-handler';


const BATCH_SIZE = 100;

const logger = loggerFactory('growi:service:openai');
Expand Down Expand Up @@ -155,7 +157,8 @@ class OpenaiService implements IOpenaiService {
// }

private async uploadFile(pageId: Types.ObjectId, body: string): Promise<OpenAI.Files.FileObject> {
const file = await toFile(Readable.from(body), `${pageId}.md`);
const sanitizedMarkdown = await sanitizeMarkdown(body);
const file = await toFile(Readable.from(sanitizedMarkdown), `${pageId}.md`);
const uploadedFile = await this.client.uploadFile(file);
return uploadedFile;
}
Expand Down
65 changes: 65 additions & 0 deletions apps/app/src/features/openai/server/utils/sanitize-markdown.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { dynamicImport } from '@cspell/dynamic-import';
import type { Root, Code } from 'mdast';
import type * as RemarkParse from 'remark-parse';
import type * as RemarkStringify from 'remark-stringify';
import type * as Unified from 'unified';
import type * as UnistUtilVisit from 'unist-util-visit';

/**
 * Holder for the modules this file obtains at runtime through `dynamicImport`
 * instead of static imports. Every member is optional because nothing is
 * loaded until the first initialization has completed.
 */
interface ModuleCache {
  /** `unified` named export of the 'unified' package. */
  unified?: typeof Unified.unified;
  /** Default export of the 'remark-parse' package. */
  remarkParse?: typeof RemarkParse.default;
  /** Default export of the 'remark-stringify' package. */
  remarkStringify?: typeof RemarkStringify.default;
  /** `visit` named export of the 'unist-util-visit' package. */
  visit?: typeof UnistUtilVisit.visit;
}

// Starts empty; replaced wholesale once the dynamic imports have resolved.
let moduleCache: ModuleCache = {};

/**
 * Load currently in flight, shared by every caller that arrives before it settles.
 * Cleared once the load settles so that a failed load can be retried by a later call.
 */
let pendingModuleLoad: Promise<void> | undefined;

/**
 * Imports the four markdown-processing modules in parallel and stores them in `moduleCache`.
 */
const loadModules = async(): Promise<void> => {
  const [{ default: remarkParse }, { default: remarkStringify }, { unified }, { visit }] = await Promise.all([
    dynamicImport<typeof RemarkParse>('remark-parse', __dirname),
    dynamicImport<typeof RemarkStringify>('remark-stringify', __dirname),
    dynamicImport<typeof Unified>('unified', __dirname),
    dynamicImport<typeof UnistUtilVisit>('unist-util-visit', __dirname),
  ]);

  moduleCache = {
    remarkParse,
    remarkStringify,
    unified,
    visit,
  };
};

/**
 * Ensures `moduleCache` is populated before the caller reads it.
 *
 * Returns immediately when all four modules are already cached. Otherwise it
 * waits for a single shared load: concurrent callers that arrive while the
 * first load is still running await the same promise instead of each starting
 * their own set of dynamic imports and each overwriting `moduleCache`.
 *
 * @throws Rejects with the import error if any of the dynamic imports fails.
 */
const initializeModules = async(): Promise<void> => {
  if (moduleCache.remarkParse != null && moduleCache.remarkStringify != null && moduleCache.unified != null && moduleCache.visit != null) {
    return;
  }

  if (pendingModuleLoad == null) {
    // Reset on settle: after success the cache check above short-circuits,
    // after failure the next call starts a fresh attempt.
    pendingModuleLoad = loadModules().finally(() => {
      pendingModuleLoad = undefined;
    });
  }

  await pendingModuleLoad;
};

/**
 * Parses the given markdown, replaces the body of every fenced code block
 * whose language is `drawio` with a fixed placeholder comment, and returns
 * the document as re-serialized by remark-stringify.
 *
 * @param markdown Markdown source text to sanitize.
 * @returns The processed markdown as a string.
 * @throws Error when any required module is still missing after initialization.
 */
export const sanitizeMarkdown = async(markdown: string): Promise<string> => {
  await initializeModules();

  const {
    remarkParse: parsePlugin,
    remarkStringify: stringifyPlugin,
    unified: createProcessor,
    visit: walk,
  } = moduleCache;

  if (parsePlugin == null || stringifyPlugin == null || createProcessor == null || walk == null) {
    throw new Error('Failed to initialize required modules');
  }

  // Plugin whose transformer mutates the tree in place: only `code` nodes
  // tagged `drawio` are touched, every other node is left as parsed.
  const replaceDrawioBlocks = () => (tree: Root): void => {
    walk(tree, 'code', (codeNode: Code) => {
      if (codeNode.lang !== 'drawio') {
        return;
      }
      codeNode.value = '<!-- drawio content replaced -->';
    });
  };

  const pipeline = createProcessor()
    .use(parsePlugin)
    .use(replaceDrawioBlocks)
    .use(stringifyPlugin);

  const output = pipeline.processSync(markdown);
  return output.toString();
};

0 comments on commit 25a1bec

Please sign in to comment.