From c6ba9ca0749cb1dda5120b2b0e2ed6272a5177e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E8=8C=82=E6=BA=90?= Date: Sat, 1 Jun 2024 18:21:50 +0800 Subject: [PATCH] feat: clean code --- .../io/github/jmecn/text/EmojiIterator.java | 2 +- .../java/io/github/jmecn/text/EmojiRun.java | 62 ++++ .../jmecn/font/shaping/GMarkParser.java | 58 ---- .../jmecn/font/shaping/TestCharDetect.java | 264 ------------------ .../github/jmecn/font/shaping/TestEmoji.java | 67 ----- .../jmecn/font/shaping/TestEmojiIter.java | 48 ---- .../github/jmecn/font/shaping/TextSpan.java | 13 - .../io/github/jmecn/text/TestBidiRun.java | 128 +++++++++ .../github/jmecn/text/TestEmojiIterator.java | 163 +++++++++++ .../github/jmecn/text/TestRichTextToSpan.java | 129 +++++++++ 10 files changed, 483 insertions(+), 451 deletions(-) create mode 100644 lib/src/main/java/io/github/jmecn/text/EmojiRun.java delete mode 100644 lib/src/test/java/io/github/jmecn/font/shaping/GMarkParser.java delete mode 100644 lib/src/test/java/io/github/jmecn/font/shaping/TestCharDetect.java delete mode 100644 lib/src/test/java/io/github/jmecn/font/shaping/TestEmoji.java delete mode 100644 lib/src/test/java/io/github/jmecn/font/shaping/TestEmojiIter.java delete mode 100644 lib/src/test/java/io/github/jmecn/font/shaping/TextSpan.java create mode 100644 lib/src/test/java/io/github/jmecn/text/TestBidiRun.java create mode 100644 lib/src/test/java/io/github/jmecn/text/TestEmojiIterator.java create mode 100644 lib/src/test/java/io/github/jmecn/text/TestRichTextToSpan.java diff --git a/lib/src/main/java/io/github/jmecn/text/EmojiIterator.java b/lib/src/main/java/io/github/jmecn/text/EmojiIterator.java index 4d1fe17..50af78e 100644 --- a/lib/src/main/java/io/github/jmecn/text/EmojiIterator.java +++ b/lib/src/main/java/io/github/jmecn/text/EmojiIterator.java @@ -79,7 +79,7 @@ public int getTextEnd() { } public boolean next() { - if (this.end >= this.nChars - 1) { + if (this.end >= this.nChars) { return false; } diff --git a/lib/src/main/java/io/github/jmecn/text/EmojiRun.java b/lib/src/main/java/io/github/jmecn/text/EmojiRun.java new file mode 100644 index 0000000..f4df0ff --- /dev/null +++ b/lib/src/main/java/io/github/jmecn/text/EmojiRun.java @@ -0,0 +1,62 @@ +package io.github.jmecn.text; + +import java.util.Objects; + +/** + * desc: + * + * @author yanmaoyuan + */ +public class EmojiRun { + + private final boolean isEmoji; + private final int unicodeStart; + private final int unicodeEnd; + private final int textStart; + private final int textEnd; + + EmojiRun(boolean isEmoji, int unicodeStart, int unicodeEnd, int textStart, int textEnd) { + this.isEmoji = isEmoji; + this.unicodeStart = unicodeStart; + this.unicodeEnd = unicodeEnd; + this.textStart = textStart; + this.textEnd = textEnd; + } + + public boolean isEmoji() { + return isEmoji; + } + + public int getUnicodeStart() { + return unicodeStart; + } + + public int getUnicodeEnd() { + return unicodeEnd; + } + + public int getTextStart() { + return textStart; + } + + public int getTextEnd() { + return textEnd; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof EmojiRun)) { + return false; + } + EmojiRun that = (EmojiRun) o; + return isEmoji == that.isEmoji && unicodeStart == that.unicodeStart && unicodeEnd == that.unicodeEnd && textStart == that.textStart && textEnd == that.textEnd; + } + + @Override + public int hashCode() { + return Objects.hash(isEmoji, unicodeStart, unicodeEnd, textStart, textEnd); + } +} diff --git a/lib/src/test/java/io/github/jmecn/font/shaping/GMarkParser.java b/lib/src/test/java/io/github/jmecn/font/shaping/GMarkParser.java deleted file mode 100644 index 080ac6b..0000000 --- a/lib/src/test/java/io/github/jmecn/font/shaping/GMarkParser.java +++ /dev/null @@ -1,58 +0,0 @@ -package io.github.jmecn.font.shaping; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -class GMarkTag { - private String name; - private String attributes; - private String content; - - public GMarkTag(String name, String attributes, String content) { - this.name = name; - this.attributes = attributes; - this.content = content; - } - - public String getName() { - return name; - } - - public String getAttributes() { - return attributes; - } - - public String getContent() { - return content; - } -} - -public class GMarkParser { - private static final Pattern tagPattern = Pattern.compile("<(\\w+)(.*?)>(.*?)"); - - public static List extractTags(String gmarkText) { - List tags = new ArrayList<>(); - Matcher matcher = tagPattern.matcher(gmarkText); - while (matcher.find()) { - String tagName = matcher.group(1); - String attributes = matcher.group(2); - String content = matcher.group(3); - GMarkTag tag = new GMarkTag(tagName, attributes, content); - tags.add(tag); - } - return tags; - } - - public static void main(String[] args) { - String gmarkText = "这是一个美丽的新世界。Hello world我希望大家永远开心"; - List tags = extractTags(gmarkText); - for (GMarkTag tag : tags) { - System.out.println("Tag: " + tag.getName()); - System.out.println("Attributes: " + tag.getAttributes()); - System.out.println("Content: " + tag.getContent()); - System.out.println(); - } - } -} \ No newline at end of file diff --git a/lib/src/test/java/io/github/jmecn/font/shaping/TestCharDetect.java b/lib/src/test/java/io/github/jmecn/font/shaping/TestCharDetect.java deleted file mode 100644 index 5e5bd73..0000000 --- a/lib/src/test/java/io/github/jmecn/font/shaping/TestCharDetect.java +++ /dev/null @@ -1,264 +0,0 @@ -package io.github.jmecn.font.shaping; - -import com.ibm.icu.lang.UScript; -import com.ibm.icu.lang.UScriptRun; -import org.junit.jupiter.api.Test; - -import java.text.Bidi; -import java.text.BreakIterator; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * desc: - * - * @author yanmaoyuan - * @date 2024/5/21 - */ -public class TestCharDetect { - static final String TEXT = "Love and peace." +// latin - "爱与和平。世界是我们的,也是你们的。" +// Han - "الحب 123والسلام" + // Arabic - "사랑과 평화" + // Hangul - "👋🤔️" // emoji - ; - - @Test - void testPropertyDetect() { - for (int i = 0; i < TEXT.length(); i++) { - int codepoint = Character.codePointAt(TEXT, i); - byte dir = Character.getDirectionality(codepoint); - Character.UnicodeScript script = Character.UnicodeScript.of(codepoint); - - System.out.printf("[U+%04X] %s %s %s, %s\n", codepoint, Character.getName(codepoint), Character.getType(codepoint), dir, script); - } - } - - @Test void testLineBreak() { - BreakIterator iterator = BreakIterator.getLineInstance(); - iterator.setText(TEXT); - // 迭代并分割文本 - int start = iterator.first(); - int end; - while ((end = iterator.next()) != BreakIterator.DONE) { - String line = TEXT.substring(start, end); - System.out.println(line); - start = end; - } - } - @Test void testSimpleBidi() { - List bidiRuns = extractBidiRuns(TEXT); - - // 输出每个 BidiRun 的文本和方向性 - for (BidiRun bidiRun : bidiRuns) { - System.out.printf("Directionality: %d %s\n", bidiRun.getDirectionality(), bidiRun.getText()); - } - } - - @Test void testUScriptRun() { - UScriptRun run = new UScriptRun(TEXT); - while (run.next()) { - int start = run.getScriptStart(); - int limit = run.getScriptLimit(); - int script = run.getScriptCode(); - System.out.printf("Script %s from %d to %d\n", UScript.getName(script), start, limit); - } - } - - @Test void testBidi() { - Bidi bidi = new Bidi(TEXT, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); - System.out.printf("isMixed:%b, runCount:%d\n", bidi.isMixed(), bidi.getRunCount()); - - for (int i = 0; i < bidi.getRunCount(); i++) { - int start = bidi.getRunStart(i); - int limit = bidi.getRunLimit(i); - System.out.printf("start=%d, limit=%d, level=%d, %s\n", start, limit, bidi.getRunLevel(i), TEXT.substring(start, limit));// 0-left_to_right, 1-right_to_left - } - } - - // 将字符串分解为多个 BidiRun - private static List extractBidiRuns(String text) { - List bidiRuns = new ArrayList<>(); - StringBuilder runText = new StringBuilder(); - byte currentDirectionality = -1; // 初始方向性为 -1,表示未知 - - // 遍历字符串中的每个字符 - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - byte directionality = Character.getDirectionality(c); - - // 如果当前字符的方向性与前一个字符不同,或者当前字符是控制字符,则结束当前 Run,并添加到列表中 - if (directionality != currentDirectionality || Character.isMirrored(c)) { - if (runText.length() > 0) { - bidiRuns.add(new BidiRun(runText.toString(), currentDirectionality)); - runText.setLength(0); - } - currentDirectionality = directionality; - } - - // 将当前字符添加到当前 Run 中 - runText.append(c); - } - - // 添加最后一个 Run - if (runText.length() > 0) { - bidiRuns.add(new BidiRun(runText.toString(), currentDirectionality)); - } - - return bidiRuns; - } - - // 表示一个 Bidi Run 的类 - static class BidiRun { - private final String text; - private final byte directionality; - - public BidiRun(String text, byte directionality) { - this.text = text; - this.directionality = directionality; - } - - public String getText() { - return text; - } - - public byte getDirectionality() { - return directionality; - } - } - - private static final Pattern tagPattern = Pattern.compile("<(b|i|u|color|span|style)(.*?)>(.*?)"); - - public static void parse(String gmarkText) { - Matcher matcher = tagPattern.matcher(gmarkText); - while (matcher.find()) { - String tag = matcher.group(1); - String attributes = matcher.group(2); - String content = matcher.group(3); - System.out.println("Tag: " + tag); - if (!attributes.isEmpty()) { - System.out.println("Attributes: " + attributes); - } - System.out.println("Content: " + content); - } - } - - @Test void testParseMarker() { - String gmarkText = "Hello world !"; - parse(gmarkText); - } - - public static List extractTagContents(TextSpan parentSpan) { - List contents = new ArrayList<>(); - Matcher matcher = tagPattern.matcher(parentSpan.text); - int lastEnd = 0; - while (matcher.find()) { - // 添加标签之前的文本部分 - String beforeTag = parentSpan.text.substring(lastEnd, matcher.start()); - if (!beforeTag.isEmpty()) { - contents.add(new TextSpan(beforeTag, parentSpan.attributes)); - } - String tag = matcher.group(1); - String attributes = matcher.group(2); - // 添加标签内的内容 - String content = matcher.group(3); - - List attrList = new ArrayList<>(); - if (parentSpan.attributes != null) { - // 外层优先级低,放在前面 - attrList.addAll(parentSpan.attributes); - } - // 内层优先级高,放在后面。 - attrList.add(tag + ":" + attributes); - if (!content.isEmpty()) { - TextSpan span = new TextSpan(content, attrList); - if (tagPattern.matcher(content).find()) { - List spans = extractTagContents(span); - contents.addAll(spans); - } else { - contents.add(span); - } - } - // 更新上一个标签结束的位置 - lastEnd = matcher.end(); - } - // 添加剩余的文本部分 - String remainder = parentSpan.text.substring(lastEnd); - if (!remainder.isEmpty()) { - contents.add(new TextSpan(remainder, parentSpan.attributes)); - } - return contents; - } - - public static List extractTagContents(String gmarkText) { - List contents = new ArrayList<>(); - Matcher matcher = tagPattern.matcher(gmarkText); - int lastEnd = 0; - while (matcher.find()) { - // 添加标签之前的文本部分 - String beforeTag = gmarkText.substring(lastEnd, matcher.start()); - if (!beforeTag.isEmpty()) { - contents.add(new TextSpan(beforeTag, null)); - } - String tag = matcher.group(1); - String attributes = matcher.group(2); - // 添加标签内的内容 - String content = matcher.group(3); - - List attrList = new ArrayList<>(); - attrList.add(tag + ":" + attributes); - if (!content.isEmpty()) { - TextSpan span = new TextSpan(content, attrList); - if (tagPattern.matcher(content).find()) { - List spans = extractTagContents(span); - contents.addAll(spans); - } else { - contents.add(span); - } - } - // 更新上一个标签结束的位置 - lastEnd = matcher.end(); - } - // 添加剩余的文本部分 - String remainder = gmarkText.substring(lastEnd); - if (!remainder.isEmpty()) { - contents.add(new TextSpan(remainder, null)); - } - return contents; - } - - @Test void parseTag() { - String gmarkText = "This is a nice place. "; - List extractedContents = extractTagContents(gmarkText); - for (TextSpan content : extractedContents) { - System.out.println(content); - } - } - - static class TextSpan { - String text; - List attributes; - - public TextSpan(String text, List attributes) { - this.text = text; - this.attributes = attributes; - } - - public void add(List attributes) { - if (this.attributes == null) { - this.attributes = new ArrayList<>(); - } - this.attributes.addAll(attributes); - } - @Override - public String toString() { - return "TextSpan{" + - "text='" + text + '\'' + - ", attributes=" + attributes + - '}'; - } - } -} diff --git a/lib/src/test/java/io/github/jmecn/font/shaping/TestEmoji.java b/lib/src/test/java/io/github/jmecn/font/shaping/TestEmoji.java deleted file mode 100644 index b8fb71d..0000000 --- a/lib/src/test/java/io/github/jmecn/font/shaping/TestEmoji.java +++ /dev/null @@ -1,67 +0,0 @@ -package io.github.jmecn.font.shaping; - -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -/** - * desc: - * - * @author yanmaoyuan - */ -class TestEmoji { - - @Test void charDetect() { - String text = "Hello😊"; - - // the string looks like only have 6 chars, but emoji is a surrogate pair, so the length is 7 - assertFalse(6 == text.length()); - assertTrue(7 == text.length()); - - // the 6th and 7th char is a surrogate pair - assertTrue(Character.isHighSurrogate(text.charAt(5))); - assertTrue(Character.isLowSurrogate(text.charAt(6))); - - // the codepoint is not equal to the char - assertEquals(0xD83D, text.charAt(5)); - assertEquals(0x1F60A, Character.codePointAt(text, 5)); - - // 0xDE0A is a control character, it is not a high surrogate - assertEquals(0xDE0A, text.charAt(6)); - assertEquals(0xDE0A, Character.codePointAt(text, 6)); - - // print all chars - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - int codepoint = Character.codePointAt(text, i); - System.out.printf("char=%c, charAt=0x%X, codepoint=0x%X isHighSurrogate=%b, isLowSurrogate=%b\n", c, (int)c, codepoint, Character.isHighSurrogate(c), Character.isLowSurrogate(c)); - } - } - - @Test void testFitzpatrickModifier() { - String text = "\uD83E\uDDD1\uD83E\uDDD1\uD83C\uDFFB\uD83E\uDDD1\uD83C\uDFFC\uD83E\uDDD1\uD83C\uDFFD\uD83E\uDDD1\uD83C\uDFFE\uD83E\uDDD1\uD83C\uDFFF"; - System.out.println(text); - System.out.println("\uD83C\uDFFB"); - System.out.println("\uD83C\uDFFC"); - System.out.println("\uD83C\uDFFD"); - System.out.println("\uD83C\uDFFE"); - System.out.println("\uD83C\uDFFF"); - - // print all chars - for (int i = 0; i < text.length(); i++) { - char c = text.charAt(i); - int codepoint = Character.codePointAt(text, i); - System.out.printf("char=%c, charAt=0x%X, codepoint=0x%X isHighSurrogate=%b, isLowSurrogate=%b\n", c, (int)c, codepoint, Character.isHighSurrogate(c), Character.isLowSurrogate(c)); - } - } - - @Test void testEmojiZwj() { - String name = "👨‍👩‍👧‍👦"; - assertEquals(11, name.length()); - assertEquals("\uD83D\uDC68\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66", name); - System.out.println("\uD83D\uDC68"); - System.out.println("\uD83D\uDC69"); - System.out.println("\uD83D\uDC67"); - System.out.println("\uD83D\uDC66"); - } -} diff --git a/lib/src/test/java/io/github/jmecn/font/shaping/TestEmojiIter.java b/lib/src/test/java/io/github/jmecn/font/shaping/TestEmojiIter.java deleted file mode 100644 index ecaf291..0000000 --- a/lib/src/test/java/io/github/jmecn/font/shaping/TestEmojiIter.java +++ /dev/null @@ -1,48 +0,0 @@ -package io.github.jmecn.font.shaping; - -import io.github.jmecn.text.EmojiIterator; -import io.github.jmecn.text.Unichar; -import org.junit.jupiter.api.Test; - -/** - * desc: - * - * @author yanmaoyuan - */ -class TestEmojiIter { - - void process(String text) { - char[] chars = text.toCharArray(); - EmojiIterator iter = new EmojiIterator(chars); - - Unichar[] unichars = iter.getUnicodeChars(); - for (Unichar unichar : unichars) { - System.out.println(unichar); - } - System.out.println("Unicode count:" + unichars.length); - System.out.println(text); - System.out.println("Character count:" + chars.length); - while (iter.next()) { - int start = iter.getStart(); - int end = iter.getEnd(); - int ts = iter.getTextStart(); - int te = iter.getTextEnd(); - String substr = text.substring(ts, te); - System.out.printf("isEmoji:%b, unicode:[%d, %d), text:[%d, %s), %s\n", iter.isEmoji(), start, end, ts, te, substr); - } - } - @Test void testSentence() { - String text = "Hello" + "🙋🧑🧑🏻🧑🏼🧑🏽🧑🏾🧑🏿" + "world" + "🍰🐒" + "一家人" + "👨‍👩‍👧‍👦"; - process(text); - } - - @Test void testZwjSequenceWithText() { - String text = "我" + "👨‍👩‍👧‍👦"; - process(text); - } - - @Test void testZwjSequence() { - String text = "👨‍👩‍👧‍👦"; - process(text); - } -} diff --git a/lib/src/test/java/io/github/jmecn/font/shaping/TextSpan.java b/lib/src/test/java/io/github/jmecn/font/shaping/TextSpan.java deleted file mode 100644 index 62aea69..0000000 --- a/lib/src/test/java/io/github/jmecn/font/shaping/TextSpan.java +++ /dev/null @@ -1,13 +0,0 @@ -package io.github.jmecn.font.shaping; - -import java.util.List; - -/** - * desc: - * - * @author yanmaoyuan - */ -public class TextSpan { - private String text; - private List attributes; -} \ No newline at end of file diff --git a/lib/src/test/java/io/github/jmecn/text/TestBidiRun.java b/lib/src/test/java/io/github/jmecn/text/TestBidiRun.java new file mode 100644 index 0000000..0b872a9 --- /dev/null +++ b/lib/src/test/java/io/github/jmecn/text/TestBidiRun.java @@ -0,0 +1,128 @@ +package io.github.jmecn.text; + +import com.ibm.icu.lang.UScript; +import com.ibm.icu.lang.UScriptRun; +import org.junit.jupiter.api.Test; + +import java.text.Bidi; +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.List; + +/** + * desc: + * + * @author yanmaoyuan + */ +class TestBidiRun { + static final String TEXT = "Love and peace." +// latin + "爱与和平。世界是我们的,也是你们的。" +// Han + "الحب 123والسلام" + // Arabic + "사랑과 평화" + // Hangul + "👋🤔️" // emoji + ; + + @Test + void testPropertyDetect() { + for (int i = 0; i < TEXT.length(); i++) { + int codepoint = Character.codePointAt(TEXT, i); + byte dir = Character.getDirectionality(codepoint); + Character.UnicodeScript script = Character.UnicodeScript.of(codepoint); + + System.out.printf("[U+%04X] %s %s %s, %s\n", codepoint, Character.getName(codepoint), Character.getType(codepoint), dir, script); + } + } + + @Test void testLineBreak() { + BreakIterator iterator = BreakIterator.getLineInstance(); + iterator.setText(TEXT); + // 迭代并分割文本 + int start = iterator.first(); + int end; + while ((end = iterator.next()) != BreakIterator.DONE) { + String line = TEXT.substring(start, end); + System.out.println(line); + start = end; + } + } + @Test void testSimpleBidi() { + List bidiRuns = extractBidiRuns(TEXT); + + // 输出每个 BidiRun 的文本和方向性 + for (BidiRun bidiRun : bidiRuns) { + System.out.printf("Directionality: %d %s\n", bidiRun.getDirectionality(), bidiRun.getText()); + } + } + + @Test void testUScriptRun() { + UScriptRun run = new UScriptRun(TEXT); + while (run.next()) { + int start = run.getScriptStart(); + int limit = run.getScriptLimit(); + int script = run.getScriptCode(); + System.out.printf("Script %s from %d to %d\n", UScript.getName(script), start, limit); + } + } + + @Test void testBidi() { + Bidi bidi = new Bidi(TEXT, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); + System.out.printf("isMixed:%b, runCount:%d\n", bidi.isMixed(), bidi.getRunCount()); + + for (int i = 0; i < bidi.getRunCount(); i++) { + int start = bidi.getRunStart(i); + int limit = bidi.getRunLimit(i); + System.out.printf("start=%d, limit=%d, level=%d, %s\n", start, limit, bidi.getRunLevel(i), TEXT.substring(start, limit));// 0-left_to_right, 1-right_to_left + } + } + + // 将字符串分解为多个 BidiRun + private static List extractBidiRuns(String text) { + List bidiRuns = new ArrayList<>(); + StringBuilder runText = new StringBuilder(); + byte currentDirectionality = -1; // 初始方向性为 -1,表示未知 + + // 遍历字符串中的每个字符 + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + byte directionality = Character.getDirectionality(c); + + // 如果当前字符的方向性与前一个字符不同,或者当前字符是控制字符,则结束当前 Run,并添加到列表中 + if (directionality != currentDirectionality || Character.isMirrored(c)) { + if (runText.length() > 0) { + bidiRuns.add(new BidiRun(runText.toString(), currentDirectionality)); + runText.setLength(0); + } + currentDirectionality = directionality; + } + + // 将当前字符添加到当前 Run 中 + runText.append(c); + } + + // 添加最后一个 Run + if (runText.length() > 0) { + bidiRuns.add(new BidiRun(runText.toString(), currentDirectionality)); + } + + return bidiRuns; + } + + // 表示一个 Bidi Run 的类 + static class BidiRun { + private final String text; + private final byte directionality; + + public BidiRun(String text, byte directionality) { + this.text = text; + this.directionality = directionality; + } + + public String getText() { + return text; + } + + public byte getDirectionality() { + return directionality; + } + } +} diff --git a/lib/src/test/java/io/github/jmecn/text/TestEmojiIterator.java b/lib/src/test/java/io/github/jmecn/text/TestEmojiIterator.java new file mode 100644 index 0000000..76dea98 --- /dev/null +++ b/lib/src/test/java/io/github/jmecn/text/TestEmojiIterator.java @@ -0,0 +1,163 @@ +package io.github.jmecn.text; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * desc: + * + * @author yanmaoyuan + */ +class TestEmojiIterator { + + void test(String text, EmojiRun[] expectedList) { + display(text); + + List actualList = new ArrayList<>(); + EmojiIterator iterator = new EmojiIterator(text.toCharArray()); + while (iterator.next()) { + actualList.add(new EmojiRun(iterator.isEmoji(), iterator.getStart(), iterator.getEnd(), iterator.getTextStart(), iterator.getTextEnd())); + } + + assertEquals(expectedList.length, actualList.size(), "size:" + text); + int size = expectedList.length; + for (int i = 0; i < size; i++) { + EmojiRun expected = expectedList[i]; + EmojiRun actual = actualList.get(i); + assertEquals(expected.isEmoji(), actual.isEmoji(), "isEmoji:" + text); + assertEquals(expected.getUnicodeStart(), actual.getUnicodeStart(), "unicodeStart:" + text); + assertEquals(expected.getUnicodeEnd(), actual.getUnicodeEnd(), "unicodeEnd:" + text); + assertEquals(expected.getTextStart(), actual.getTextStart(), "textStart:" + text); + assertEquals(expected.getTextEnd(), actual.getTextEnd(), "textEnd:" + text); + } + } + + void display(String text) { + char[] chars = text.toCharArray(); + EmojiIterator iter = new EmojiIterator(chars); + Unichar[] unichars = iter.getUnicodeChars(); + System.out.printf(">>>> %s <<<<\nunicode count:%d, character count:%d\n", text, unichars.length, chars.length); + System.out.println("[id]: unicode, string, emoji, text"); + int runs = 0; + + while (iter.next()) { + int start = iter.getStart(); + int end = iter.getEnd(); + int ts = iter.getTextStart(); + int te = iter.getTextEnd(); + String substr = text.substring(ts, te); + System.out.printf("[%2d]: [%2d, %2d), [%2d, %2d), %5b, %s\n", runs++, start, end, ts, te, iter.isEmoji(), substr); + } + System.out.println(); + } + + static class TestData { + String text; + EmojiRun[] expectedList; + + TestData(String text, EmojiRun[] expectedList) { + this.text = text; + this.expectedList = expectedList; + } + } + + static List getTestData() { + String text; + EmojiRun[] expectedList; + + List list = new ArrayList<>(); + + // emoji base: smile + text = "\uD83D\uDE0A";// 😊 + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 1, 0, 2) + }; + list.add(new TestData(text, expectedList)); + + // zero-width joiner + // family: man and woman and girl and boy + text = "\uD83D\uDC68\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";// 👨‍👩‍👧‍👦 + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 7, 0, 11) + }; + list.add(new TestData(text, expectedList)); + + // emoji fitzpatrick modifier + // a hand with light skin tone + text = "\u270B\uD83C\uDFFB"; // ✋🏻 + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 2, 0, 3) + }; + list.add(new TestData(text, expectedList)); + + // emoji fitzpatrick modifier and zero-width joiner + // a female firefighter with medium-darker skin tone + text = "\uD83D\uDC69\uD83C\uDFFD\u200D\uD83D\uDE92"; // 👩🏽‍🚒 + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 4, 0, 7) + }; + list.add(new TestData(text, expectedList)); + + // alphanum: cool button + text = "\uD83C\uDD92";// 🆒 + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 1, 0, 2) + }; + list.add(new TestData(text, expectedList)); + + // flag: China + text = "\uD83C\uDDE8\uD83C\uDDF3"; // 🇨🇳 + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 2, 0, 4) + }; + list.add(new TestData(text, expectedList)); + + // flag: pirate flag + text = "\uD83C\uDFF4\u200D\u2620\uFE0F";// 🏴‍☠️ + expectedList = new EmojiRun[] { + new EmojiRun(true, 0, 4, 0, 5) + }; + list.add(new TestData(text, expectedList)); + + // keycap: #️⃣*️⃣0️⃣1️⃣2️⃣3️⃣4️⃣5️⃣6️⃣7️⃣8️⃣9️⃣ + text = "#\uFE0F\u20E3*\uFE0F\u20E30\uFE0F\u20E31\uFE0F\u20E32\uFE0F\u20E33\uFE0F\u20E34\uFE0F\u20E35\uFE0F\u20E36\uFE0F\u20E37\uFE0F\u20E38\uFE0F\u20E39\uFE0F\u20E3"; + expectedList = new EmojiRun[]{ + new EmojiRun(true, 0, 36, 0, 36), + }; + list.add(new TestData(text, expectedList)); + + // complex text + text = "Hello, 你好,🌍世界!"; + expectedList = new EmojiRun[] { + new EmojiRun(false, 0, 10, 0, 10), + new EmojiRun(true, 10, 11, 10, 12), + new EmojiRun(false, 11, 14, 12, 15), + }; + list.add(new TestData(text, expectedList)); + + // complex emoji combined with text + text = "Hello" + "🙋🧑🧑🏻🧑🏼🧑🏽🧑🏾🧑🏿" + "world" + "🍰🐒" + "家庭" + "👨‍👩‍👧‍👦"; + expectedList = new EmojiRun[] { + new EmojiRun(false, 0, 5, 0, 5), + new EmojiRun(true, 5, 17, 5, 29), + new EmojiRun(false, 17, 22, 29, 34), + new EmojiRun(true, 22, 24, 34, 38), + new EmojiRun(false, 24, 26, 38, 40), + new EmojiRun(true, 26, 33, 40, 51) + }; + + list.add(new TestData(text, expectedList)); + + return list; + } + + @Test void testAll() { + for (TestData data : getTestData()) { + test(data.text, data.expectedList); + } + } +} diff --git a/lib/src/test/java/io/github/jmecn/text/TestRichTextToSpan.java b/lib/src/test/java/io/github/jmecn/text/TestRichTextToSpan.java new file mode 100644 index 0000000..8612802 --- /dev/null +++ b/lib/src/test/java/io/github/jmecn/text/TestRichTextToSpan.java @@ -0,0 +1,129 @@ +package io.github.jmecn.text; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Test case to split rich text into text spans. + * + * @author yanmaoyuan + */ +public class TestRichTextToSpan { + + private static final Pattern tagPattern = Pattern.compile("<(b|i|u|color|span|style)(.*?)>(.*?)"); + + public static List extractTagContents(TextSpan parentSpan) { + List contents = new ArrayList<>(); + Matcher matcher = tagPattern.matcher(parentSpan.text); + int lastEnd = 0; + while (matcher.find()) { + // 添加标签之前的文本部分 + String beforeTag = parentSpan.text.substring(lastEnd, matcher.start()); + if (!beforeTag.isEmpty()) { + contents.add(new TextSpan(beforeTag, parentSpan.attributes)); + } + String tag = matcher.group(1); + String attributes = matcher.group(2); + // 添加标签内的内容 + String content = matcher.group(3); + + List attrList = new ArrayList<>(); + if (parentSpan.attributes != null) { + // 外层优先级低,放在前面 + attrList.addAll(parentSpan.attributes); + } + // 内层优先级高,放在后面。 + attrList.add(tag + ":" + attributes); + if (!content.isEmpty()) { + TextSpan span = new TextSpan(content, attrList); + if (tagPattern.matcher(content).find()) { + List spans = extractTagContents(span); + contents.addAll(spans); + } else { + contents.add(span); + } + } + // 更新上一个标签结束的位置 + lastEnd = matcher.end(); + } + // 添加剩余的文本部分 + String remainder = parentSpan.text.substring(lastEnd); + if (!remainder.isEmpty()) { + contents.add(new TextSpan(remainder, parentSpan.attributes)); + } + return contents; + } + + public static List extractTagContents(String gmarkText) { + List contents = new ArrayList<>(); + Matcher matcher = tagPattern.matcher(gmarkText); + int lastEnd = 0; + while (matcher.find()) { + // 添加标签之前的文本部分 + String beforeTag = gmarkText.substring(lastEnd, matcher.start()); + if (!beforeTag.isEmpty()) { + contents.add(new TextSpan(beforeTag, null)); + } + String tag = matcher.group(1); + String attributes = matcher.group(2); + // 添加标签内的内容 + String content = matcher.group(3); + + List attrList = new ArrayList<>(); + attrList.add(tag + ":" + attributes); + if (!content.isEmpty()) { + TextSpan span = new TextSpan(content, attrList); + if (tagPattern.matcher(content).find()) { + List spans = extractTagContents(span); + contents.addAll(spans); + } else { + contents.add(span); + } + } + // 更新上一个标签结束的位置 + lastEnd = matcher.end(); + } + // 添加剩余的文本部分 + String remainder = gmarkText.substring(lastEnd); + if (!remainder.isEmpty()) { + contents.add(new TextSpan(remainder, null)); + } + return contents; + } + + @Test void parseTag() { + String gmarkText = "This is a nice place. "; + List extractedContents = extractTagContents(gmarkText); + for (TextSpan content : extractedContents) { + System.out.println(content); + } + } + + static class TextSpan { + String text; + List attributes; + + public TextSpan(String text, List attributes) { + this.text = text; + this.attributes = attributes; + } + + public void add(List attributes) { + if (this.attributes == null) { + this.attributes = new ArrayList<>(); + } + this.attributes.addAll(attributes); + } + @Override + public String toString() { + return "TextSpan{" + + "text='" + text + '\'' + + ", attributes=" + attributes + + '}'; + } + } +}