From b8a43b08aea86ddae87573990555c65e48dd29c5 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Sun, 14 Jul 2024 21:00:24 +0300 Subject: [PATCH] variationselector: fix edge cases --- .../emojis-with-extra-variations.json | 1 + variationselector/emojis-with-variations.json | 1 - variationselector/generate.sh | 12 +++ variationselector/variationselector.go | 42 +++++--- variationselector/variationselector_test.go | 100 ++++++++++++++++++ 5 files changed, 140 insertions(+), 16 deletions(-) create mode 100644 variationselector/emojis-with-extra-variations.json delete mode 100644 variationselector/emojis-with-variations.json diff --git a/variationselector/emojis-with-extra-variations.json b/variationselector/emojis-with-extra-variations.json new file mode 100644 index 0000000..48a4a1e --- /dev/null +++ b/variationselector/emojis-with-extra-variations.json @@ -0,0 +1 @@ +["*","0","1","2","3","4","5","6","7","8","9","⌚","⌛","⏩","⏪","⏫","⏬","⏰","⏳","◽","◾","☔","☕","♈","♉","♊","♋","♌","♍","♎","♏","♐","♑","♒","♓","♿","⚓","⚡","⚪","⚫","⚽","⚾","⛄","⛅","⛎","⛔","⛪","⛲","⛳","⛵","⛺","⛽","✅","✊","✋","✨","❌","❎","❓","❔","❕","❗","➕","➖","➗","➰","➿","⬛","⬜","⭐","⭕","🀄","🈚","🈯","🌍","🌎","🌏","🌕","🌜","🍸","🎓","🎧","🎬","🎭","🎮","🏂","🏄","🏆","🏊","🏠","🏭","🐈","🐕","🐟","🐦","👂","👆","👇","👈","👉","👍","👎","👓","👪","👽","💣","💰","💳","💻","💿","📋","📚","📟","📤","📥","📦","📪","📫","📬","📭","📷","📹","📺","📻","🔈","🔍","🔒","🔓","🕐","🕑","🕒","🕓","🕔","🕕","🕖","🕗","🕘","🕙","🕚","🕛","🕜","🕝","🕞","🕟","🕠","🕡","🕢","🕣","🕤","🕥","🕦","🕧","😐","🚇","🚍","🚑","🚔","🚘","🚭","🚲","🚹","🚺","🚼"] diff --git a/variationselector/emojis-with-variations.json b/variationselector/emojis-with-variations.json deleted file mode 100644 index 488fefa..0000000 --- a/variationselector/emojis-with-variations.json +++ /dev/null @@ -1 +0,0 @@ 
-["*","0","1","2","3","4","5","6","7","8","9","©","®","‼","⁉","™","ℹ","↔","↕","↖","↗","↘","↙","↩","↪","⌚","⌛","⌨","⏏","⏩","⏪","⏫","⏬","⏭","⏮","⏯","⏰","⏱","⏲","⏳","⏸","⏹","⏺","Ⓜ","▪","▫","▶","◀","◻","◼","◽","◾","☀","☁","☂","☃","☄","☎","☑","☔","☕","☘","☝","☠","☢","☣","☦","☪","☮","☯","☸","☹","☺","♀","♂","♈","♉","♊","♋","♌","♍","♎","♏","♐","♑","♒","♓","♟","♠","♣","♥","♦","♨","♻","♾","♿","⚒","⚓","⚔","⚕","⚖","⚗","⚙","⚛","⚜","⚠","⚡","⚧","⚪","⚫","⚰","⚱","⚽","⚾","⛄","⛅","⛈","⛎","⛏","⛑","⛓","⛔","⛩","⛪","⛰","⛱","⛲","⛳","⛴","⛵","⛷","⛸","⛹","⛺","⛽","✂","✅","✈","✉","✊","✋","✌","✍","✏","✒","✔","✖","✝","✡","✨","✳","✴","❄","❇","❌","❎","❓","❔","❕","❗","❣","❤","➕","➖","➗","➡","➰","➿","⤴","⤵","⬅","⬆","⬇","⬛","⬜","⭐","⭕","〰","〽","㊗","㊙","🀄","🅰","🅱","🅾","🅿","🈂","🈚","🈯","🈷","🌍","🌎","🌏","🌕","🌜","🌡","🌤","🌥","🌦","🌧","🌨","🌩","🌪","🌫","🌬","🌶","🍸","🍽","🎓","🎖","🎗","🎙","🎚","🎛","🎞","🎟","🎧","🎬","🎭","🎮","🏂","🏄","🏆","🏊","🏋","🏌","🏍","🏎","🏔","🏕","🏖","🏗","🏘","🏙","🏚","🏛","🏜","🏝","🏞","🏟","🏠","🏭","🏳","🏵","🏷","🐈","🐕","🐟","🐦","🐿","👁","👂","👆","👇","👈","👉","👍","👎","👓","👪","👽","💣","💰","💳","💻","💿","📋","📚","📟","📤","📥","📦","📪","📫","📬","📭","📷","📹","📺","📻","📽","🔈","🔍","🔒","🔓","🕉","🕊","🕐","🕑","🕒","🕓","🕔","🕕","🕖","🕗","🕘","🕙","🕚","🕛","🕜","🕝","🕞","🕟","🕠","🕡","🕢","🕣","🕤","🕥","🕦","🕧","🕯","🕰","🕳","🕴","🕵","🕶","🕷","🕸","🕹","🖇","🖊","🖋","🖌","🖍","🖐","🖥","🖨","🖱","🖲","🖼","🗂","🗃","🗄","🗑","🗒","🗓","🗜","🗝","🗞","🗡","🗣","🗨","🗯","🗳","🗺","😐","🚇","🚍","🚑","🚔","🚘","🚭","🚲","🚹","🚺","🚼","🛋","🛍","🛎","🛏","🛠","🛡","🛢","🛣","🛤","🛥","🛩","🛰","🛳"] diff --git a/variationselector/generate.sh b/variationselector/generate.sh index 29f6f58..160052c 100755 --- a/variationselector/generate.sh +++ b/variationselector/generate.sh @@ -15,3 +15,15 @@ echo -e "\n$( | awk '{ for (i = 1; i <= NF; i++) {printf("\\U%8s", $i) }; printf("\n") }' \ | sed 's/ /0/g' )" | jq -RcM '[inputs]' > fully-qualified-variations.json + +python < 1 { + panic(fmt.Sprintf("emoji %s is more than one rune long", emoji)) + } + emojisWithExtraVariations[i] = fmt.Sprintf(`\x{%X}`, 
emojiRunes[0]) } - variationReplacer = strings.NewReplacer(replaceInput...) + variationPattern := fmt.Sprintf(`(^|[^\x{200D}])(%s)([^\x{FE0F}\x{FE0E}\x{200D}\x{1F3FB}\x{1F3FC}\x{1F3FD}\x{1F3FE}\x{1F3FF}]|$)`, strings.Join(emojisWithExtraVariations, "|")) + variationRegex = regexp.MustCompile(variationPattern) var fullyQualifiedVariations []string err = json.Unmarshal(fullyQualifiedVariationsJSON, &fullyQualifiedVariations) if err != nil { panic(err) } - replaceInput = make([]string, 2*len(fullyQualifiedVariations)) + replaceInput := make([]string, 2*len(fullyQualifiedVariations)) for i, emoji := range fullyQualifiedVariations { replaceInput[i*2] = strings.ReplaceAll(emoji, VS16, "") replaceInput[(i*2)+1] = emoji @@ -71,7 +81,8 @@ const VS16 = "\ufe0f" // // This will remove all variation selectors first to make sure it doesn't add duplicates. func Add(val string) string { - return skinToneReplacer.Replace(variationReplacer.Replace(Remove(val))) + initOnce.Do(doInit) + return variationRegex.ReplaceAllString(FullyQualify(val), "$1$2\ufe0f$3") } // Remove removes all emoji variation selectors in the given string. @@ -89,5 +100,6 @@ func Remove(val string) string { // // N.B. This method is not currently used by the Matrix spec, but it is included as bridging to other networks may need it. 
func FullyQualify(val string) string { - return fullyQualifier.Replace(Remove(val)) + initOnce.Do(doInit) + return skinToneReplacer.Replace(fullyQualifier.Replace(Remove(val))) } diff --git a/variationselector/variationselector_test.go b/variationselector/variationselector_test.go index 8521ef5..250838e 100644 --- a/variationselector/variationselector_test.go +++ b/variationselector/variationselector_test.go @@ -7,15 +7,89 @@ package variationselector_test import ( + "encoding/json" "fmt" + "net/http" "strconv" + "strings" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.mau.fi/util/exerrors" "go.mau.fi/util/variationselector" ) +func TestAdd_Full(t *testing.T) { + resp := get(t, "https://raw.githubusercontent.com/milesj/emojibase/master/packages/data/en/data.raw.json") + var emojis []emojibaseEmoji + exerrors.PanicIfNotNil(json.NewDecoder(resp.Body).Decode(&emojis)) + for _, e := range emojis { + compareEmoji(t, e.Emoji, variationselector.Add) + for _, s := range e.Skins { + compareEmoji(t, s.Emoji, variationselector.Add) + } + } +} + +func TestFullyQualify_Full(t *testing.T) { + resp := get(t, "https://raw.githubusercontent.com/iamcal/emoji-data/master/emoji.json") + var emojis []iamcalEmoji + exerrors.PanicIfNotNil(json.NewDecoder(resp.Body).Decode(&emojis)) + for _, e := range emojis { + compareEmoji(t, unifiedToUnicode(e.Unified), variationselector.FullyQualify) + for _, s := range e.SkinVariations { + compareEmoji(t, unifiedToUnicode(s.Unified), variationselector.FullyQualify) + } + } +} + +func get(t *testing.T, url string) *http.Response { + req, err := http.NewRequest(http.MethodGet, url, nil) + require.NoError(t, err) + req.Header.Set("User-Agent", "GitHub actions @ https://github.com/mautrix/go-util/blob/main/variationselector/variationselector_test.go") + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + return resp +} + +type emojibaseEmoji struct { + Emoji string `json:"emoji"` + Hexcode 
string `json:"hexcode"` + Skins []emojibaseEmoji `json:"skins"` +} + +type iamcalEmoji struct { + Unified string `json:"unified"` + SkinVariations map[string]iamcalEmoji `json:"skin_variations"` +} + +func unifiedToUnicode(input string) string { + parts := strings.Split(input, "-") + output := make([]rune, len(parts)) + for i, part := range parts { + output[i] = rune(exerrors.Must(strconv.ParseInt(part, 16, 32))) + } + return string(output) +} + +func unicodeToUnified(input string) string { + runes := []rune(input) + output := make([]string, len(runes)) + for i, r := range runes { + output[i] = fmt.Sprintf("%X", r) + } + return strings.Join(output, "-") +} + +func compareEmoji(t *testing.T, orig string, fn func(string) string) { + proc := fn(orig) + if proc != orig { + t.Errorf("emoji: %s\nexpected: %s\ngot: %s", orig, unicodeToUnified(orig), unicodeToUnified(proc)) + } +} + func TestAdd(t *testing.T) { assert.Equal(t, "\U0001f44d\U0001f3fd", variationselector.Add("\U0001f44d\U0001f3fd")) assert.Equal(t, "\U0001f44d\ufe0f", variationselector.Add("\U0001f44d")) @@ -23,6 +97,7 @@ func TestAdd(t *testing.T) { assert.Equal(t, "4\ufe0f\u20e3", variationselector.Add("4\u20e3")) assert.Equal(t, "4\ufe0f\u20e3", variationselector.Add("4\ufe0f\u20e3")) assert.Equal(t, "\U0001f914", variationselector.Add("\U0001f914")) + assert.Equal(t, "\U0001f408\u200d\u2b1b", variationselector.Add("\U0001f408\u200d\u2b1b")) } func TestFullyQualify(t *testing.T) { @@ -35,6 +110,7 @@ func TestFullyQualify(t *testing.T) { assert.Equal(t, "\u263a\ufe0f", variationselector.FullyQualify("\u263a")) assert.Equal(t, "\U0001f3f3\ufe0f\u200D\U0001f308", variationselector.FullyQualify("\U0001f3f3\u200D\U0001f308")) assert.Equal(t, "\U0001f3f3\ufe0f\u200D\U0001f308", variationselector.FullyQualify("\U0001f3f3\ufe0f\u200D\U0001f308")) + assert.Equal(t, "\U0001f408\u200d\u2b1b", variationselector.FullyQualify("\U0001f408\u200d\u2b1b")) } func TestRemove(t *testing.T) { @@ -80,3 +156,27 @@ func ExampleRemove() { 
// "\U0001f44d" // "\U0001f44d" } + +func BenchmarkAdd(b *testing.B) { + for i := 0; i < b.N; i++ { + variationselector.Add("\U0001f44d\U0001f3fd") + variationselector.Add("\U0001f44d") + variationselector.Add("\U0001f44d\ufe0f") + variationselector.Add("4\u20e3") + variationselector.Add("4\ufe0f\u20e3") + variationselector.Add("\U0001f914") + variationselector.Add("\U0001f408\u200d\u2b1b") + } +} + +func BenchmarkFullyQualify(b *testing.B) { + for i := 0; i < b.N; i++ { + variationselector.FullyQualify("\U0001f44d\U0001f3fd") + variationselector.FullyQualify("\U0001f44d") + variationselector.FullyQualify("\U0001f44d\ufe0f") + variationselector.FullyQualify("4\u20e3") + variationselector.FullyQualify("4\ufe0f\u20e3") + variationselector.FullyQualify("\U0001f914") + variationselector.FullyQualify("\U0001f408\u200d\u2b1b") + } +}