Skip to content

Commit 61d9e93

Browse files
authored
Merge pull request #3690 from udecode/fixai
Fix the copilot issues with mixed Chinese and English text.
2 parents e17e584 + ac08f2e commit 61d9e93

File tree

3 files changed

+139
-8
lines changed

3 files changed

+139
-8
lines changed

.changeset/large-items-remember.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@udecode/plate-ai': patch
3+
---
4+
5+
Copilot: fix `getNextWord` to correctly handle mixed Chinese and English text.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import { getNextWord } from './getNextWord';
2+
3+
describe('getNextWord', () => {
4+
describe('English text', () => {
5+
it('should get first word with no spaces', () => {
6+
expect(getNextWord({ text: 'hello world' })).toEqual({
7+
firstWord: 'hello',
8+
remainingText: ' world',
9+
});
10+
});
11+
12+
it('should handle leading spaces', () => {
13+
expect(getNextWord({ text: ' hello world' })).toEqual({
14+
firstWord: ' hello',
15+
remainingText: ' world',
16+
});
17+
});
18+
19+
it('should handle single word', () => {
20+
expect(getNextWord({ text: 'hello' })).toEqual({
21+
firstWord: 'hello',
22+
remainingText: '',
23+
});
24+
});
25+
});
26+
27+
describe('CJK characters', () => {
28+
it('should handle Chinese characters', () => {
29+
expect(getNextWord({ text: '你好 世界' })).toEqual({
30+
firstWord: '你',
31+
remainingText: '好 世界',
32+
});
33+
});
34+
35+
it('should handle Chinese character followed by punctuation', () => {
36+
expect(getNextWord({ text: '你。好 世界' })).toEqual({
37+
firstWord: '你。',
38+
remainingText: '好 世界',
39+
});
40+
});
41+
42+
it('should handle various CJK punctuation marks', () => {
43+
expect(getNextWord({ text: '你、好 世界' })).toEqual({
44+
firstWord: '你、',
45+
remainingText: '好 世界',
46+
});
47+
48+
expect(getNextWord({ text: '你!世界' })).toEqual({
49+
firstWord: '你!',
50+
remainingText: '世界',
51+
});
52+
53+
expect(getNextWord({ text: '你?好' })).toEqual({
54+
firstWord: '你?',
55+
remainingText: '好',
56+
});
57+
58+
expect(getNextWord({ text: 'hello? world' })).toEqual({
59+
firstWord: 'hello?',
60+
remainingText: ' world',
61+
});
62+
});
63+
64+
it('should handle Japanese Hiragana', () => {
65+
expect(getNextWord({ text: 'こんにちは 世界' })).toEqual({
66+
firstWord: 'こ',
67+
remainingText: 'んにちは 世界',
68+
});
69+
});
70+
71+
it('should handle Korean characters', () => {
72+
expect(getNextWord({ text: '안녕하세요 세계' })).toEqual({
73+
firstWord: '안',
74+
remainingText: '녕하세요 세계',
75+
});
76+
});
77+
78+
it('should handle CJK with leading spaces', () => {
79+
expect(getNextWord({ text: ' 你好 世界' })).toEqual({
80+
firstWord: ' 你',
81+
remainingText: '好 世界',
82+
});
83+
});
84+
});
85+
86+
describe('mixed content', () => {
87+
it('should handle mix of English and CJK', () => {
88+
expect(getNextWord({ text: 'hello 你好' })).toEqual({
89+
firstWord: 'hello',
90+
remainingText: ' 你好',
91+
});
92+
});
93+
94+
it('should handle English words directly adjacent to Chinese characters', () => {
95+
expect(getNextWord({ text: 'React是nice框架' })).toEqual({
96+
firstWord: 'React',
97+
remainingText: '是nice框架',
98+
});
99+
});
100+
101+
it('should handle CJK followed by English', () => {
102+
expect(getNextWord({ text: '你 hello' })).toEqual({
103+
firstWord: '你',
104+
remainingText: ' hello',
105+
});
106+
});
107+
});
108+
});

packages/ai/src/react/copilot/utils/getNextWord.ts

+26-8
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,36 @@ export const getNextWord: GetNextWord = ({ text }) => {
3030
let firstWord, remainingText;
3131

3232
if (isCJKChar) {
33-
// CJK characters: match leading spaces + first character + trailing spaces
33+
// CJK characters: match leading spaces + first character + optional punctuation
3434
const match =
35-
// eslint-disable-next-line regexp/no-unused-capturing-group
36-
/^(\s*[\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]\s*)/.exec(
35+
/^(\s*)([\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF])([\u3000-\u303F\uFF00-\uFFEF])?/.exec(
3736
text
3837
);
39-
firstWord = match?.[0] || '';
40-
remainingText = text.slice(firstWord.length);
38+
39+
if (match) {
40+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
41+
const [fullMatch, spaces = '', char = '', punctuation = ''] = match;
42+
firstWord = spaces + char + punctuation;
43+
remainingText = text.slice(firstWord.length);
44+
} else {
45+
firstWord = '';
46+
remainingText = text;
47+
}
4148
} else {
42-
// Other characters (e.g., English): use space-based word separation
43-
firstWord = /^\s*\S+/.exec(text)?.[0] || '';
44-
remainingText = text.slice(firstWord.length);
49+
// For non-CJK text (including mixed content), match until space or CJK char
50+
const match =
51+
// eslint-disable-next-line regexp/no-unused-capturing-group
52+
/^(\s*\S+?)(?=[\s\u1100-\u11FF\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]|$)/.exec(
53+
text
54+
);
55+
56+
if (match) {
57+
firstWord = match[0];
58+
remainingText = text.slice(firstWord.length);
59+
} else {
60+
firstWord = text;
61+
remainingText = '';
62+
}
4563
}
4664

4765
return { firstWord, remainingText };

0 commit comments

Comments
 (0)