Merge pull request #3690 from udecode/fixai

felixfeng33 · web-flow · commit 61d9e93c6fc6 · 2024-10-30T20:10:39.000+08:00
Fix the copilot issues with mixed Chinese and English text.
diff --git a/.changeset/large-items-remember.md b/.changeset/large-items-remember.md
@@ -0,0 +1,5 @@
+---
+'@udecode/plate-ai': patch
+---
+
+Copilot: fix `getNextWord` to correctly handle mixed Chinese and English text.
diff --git a/packages/ai/src/react/copilot/utils/getNextWord.spec.ts b/packages/ai/src/react/copilot/utils/getNextWord.spec.ts
@@ -0,0 +1,108 @@
+import { getNextWord } from './getNextWord';
+
+describe('getNextWord', () => {
+  describe('English text', () => {
+    it('should get first word with no spaces', () => {
+      expect(getNextWord({ text: 'hello world' })).toEqual({
+        firstWord: 'hello',
+        remainingText: ' world',
+      });
+    });
+
+    it('should handle leading spaces', () => {
+      expect(getNextWord({ text: '   hello world' })).toEqual({
+        firstWord: '   hello',
+        remainingText: ' world',
+      });
+    });
+
+    it('should handle single word', () => {
+      expect(getNextWord({ text: 'hello' })).toEqual({
+        firstWord: 'hello',
+        remainingText: '',
+      });
+    });
+  });
+
+  describe('CJK characters', () => {
+    it('should handle Chinese characters', () => {
+      expect(getNextWord({ text: '你好 世界' })).toEqual({
+        firstWord: '你',
+        remainingText: '好 世界',
+      });
+    });
+
+    it('should handle Chinese character followed by punctuation', () => {
+      expect(getNextWord({ text: '你。好 世界' })).toEqual({
+        firstWord: '你。',
+        remainingText: '好 世界',
+      });
+    });
+
+    it('should handle various CJK punctuation marks', () => {
+      expect(getNextWord({ text: '你、好 世界' })).toEqual({
+        firstWord: '你、',
+        remainingText: '好 世界',
+      });
+
+      expect(getNextWord({ text: '你！世界' })).toEqual({
+        firstWord: '你！',
+        remainingText: '世界',
+      });
+
+      expect(getNextWord({ text: '你？好' })).toEqual({
+        firstWord: '你？',
+        remainingText: '好',
+      });
+
+      expect(getNextWord({ text: 'hello? world' })).toEqual({
+        firstWord: 'hello?',
+        remainingText: ' world',
+      });
+    });
+
+    it('should handle Japanese Hiragana', () => {
+      expect(getNextWord({ text: 'こんにちは 世界' })).toEqual({
+        firstWord: 'こ',
+        remainingText: 'んにちは 世界',
+      });
+    });
+
+    it('should handle Korean characters', () => {
+      expect(getNextWord({ text: '안녕하세요 세계' })).toEqual({
+        firstWord: '안',
+        remainingText: '녕하세요 세계',
+      });
+    });
+
+    it('should handle CJK with leading spaces', () => {
+      expect(getNextWord({ text: '  你好 世界' })).toEqual({
+        firstWord: '  你',
+        remainingText: '好 世界',
+      });
+    });
+  });
+
+  describe('mixed content', () => {
+    it('should handle mix of English and CJK', () => {
+      expect(getNextWord({ text: 'hello 你好' })).toEqual({
+        firstWord: 'hello',
+        remainingText: ' 你好',
+      });
+    });
+
+    it('should handle English words directly adjacent to Chinese characters', () => {
+      expect(getNextWord({ text: 'React是nice框架' })).toEqual({
+        firstWord: 'React',
+        remainingText: '是nice框架',
+      });
+    });
+
+    it('should handle CJK followed by English', () => {
+      expect(getNextWord({ text: '你 hello' })).toEqual({
+        firstWord: '你',
+        remainingText: ' hello',
+      });
+    });
+  });
+});
diff --git a/packages/ai/src/react/copilot/utils/getNextWord.ts b/packages/ai/src/react/copilot/utils/getNextWord.ts
@@ -30,18 +30,36 @@ export const getNextWord: GetNextWord = ({ text }) => {
   let firstWord, remainingText;
 
   if (isCJKChar) {
-    // CJK characters: match leading spaces + first character + trailing spaces
+    // CJK characters: match leading spaces + first character + optional punctuation
     const match =
-      // eslint-disable-next-line regexp/no-unused-capturing-group
-      /^(\s*[\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]\s*)/.exec(
+      /^(\s*)([\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF])([\u3000-\u303F\uFF00-\uFFEF])?/.exec(
         text
       );
-    firstWord = match?.[0] || '';
-    remainingText = text.slice(firstWord.length);
+
+    if (match) {
+      // eslint-disable-next-line @typescript-eslint/no-unused-vars
+      const [fullMatch, spaces = '', char = '', punctuation = ''] = match;
+      firstWord = spaces + char + punctuation;
+      remainingText = text.slice(firstWord.length);
+    } else {
+      firstWord = '';
+      remainingText = text;
+    }
   } else {
-    // Other characters (e.g., English): use space-based word separation
-    firstWord = /^\s*\S+/.exec(text)?.[0] || '';
-    remainingText = text.slice(firstWord.length);
+    // For non-CJK text (including mixed content), match until space or CJK char
+    const match =
+      // eslint-disable-next-line regexp/no-unused-capturing-group
+      /^(\s*\S+?)(?=[\s\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]|$)/.exec(
+        text
+      );
+
+    if (match) {
+      firstWord = match[0];
+      remainingText = text.slice(firstWord.length);
+    } else {
+      firstWord = text;
+      remainingText = '';
+    }
   }
 
   return { firstWord, remainingText };