Skip to content

Commit

Permalink
variationselector: fix edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
tulir committed Jul 14, 2024
1 parent 9b4a606 commit b8a43b0
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 16 deletions.
1 change: 1 addition & 0 deletions variationselector/emojis-with-extra-variations.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
["*","0","1","2","3","4","5","6","7","8","9","⌚","⌛","⏩","⏪","⏫","⏬","⏰","⏳","◽","◾","☔","☕","♈","♉","♊","♋","♌","♍","♎","♏","♐","♑","♒","♓","♿","⚓","⚡","⚪","⚫","⚽","⚾","⛄","⛅","⛎","⛔","⛪","⛲","⛳","⛵","⛺","⛽","✅","✊","✋","✨","❌","❎","❓","❔","❕","❗","➕","➖","➗","➰","➿","⬛","⬜","⭐","⭕","🀄","🈚","🈯","🌍","🌎","🌏","🌕","🌜","🍸","🎓","🎧","🎬","🎭","🎮","🏂","🏄","🏆","🏊","🏠","🏭","🐈","🐕","🐟","🐦","👂","👆","👇","👈","👉","👍","👎","👓","👪","👽","💣","💰","💳","💻","💿","📋","📚","📟","📤","📥","📦","📪","📫","📬","📭","📷","📹","📺","📻","🔈","🔍","🔒","🔓","🕐","🕑","🕒","🕓","🕔","🕕","🕖","🕗","🕘","🕙","🕚","🕛","🕜","🕝","🕞","🕟","🕠","🕡","🕢","🕣","🕤","🕥","🕦","🕧","😐","🚇","🚍","🚑","🚔","🚘","🚭","🚲","🚹","🚺","🚼"]
1 change: 0 additions & 1 deletion variationselector/emojis-with-variations.json

This file was deleted.

12 changes: 12 additions & 0 deletions variationselector/generate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,15 @@ echo -e "\n$(
| awk '{ for (i = 1; i <= NF; i++) {printf("\\U%8s", $i) }; printf("\n") }' \
| sed 's/ /0/g'
)" | jq -RcM '[inputs]' > fully-qualified-variations.json

# Write emojis-with-extra-variations.json: the entries of emojis-with-variations.json
# whose VS16-suffixed form is not listed in fully-qualified-variations.json.
# The heredoc delimiter is quoted so the shell passes the Python source through verbatim.
python <<'EOF'
import json

# The JSON files may contain raw emoji, so read and write them as UTF-8
# explicitly instead of relying on the locale's default encoding.
with open("fully-qualified-variations.json", encoding="utf-8") as f:
    fully_qualified = set(json.load(f))
with open("emojis-with-variations.json", encoding="utf-8") as f:
    emojis_with_variations = json.load(f)
# Keep only the emoji for which "<emoji> + U+FE0F" is not a fully qualified sequence.
emojis_with_variations = [x for x in emojis_with_variations if f"{x}\ufe0f" not in fully_qualified]
with open("emojis-with-extra-variations.json", "w", encoding="utf-8") as f:
    json.dump(emojis_with_variations, f, ensure_ascii=False, separators=(",", ":"))
EOF
# Remove the unfiltered input list now that the filtered file has been written.
rm -f emojis-with-variations.json
42 changes: 27 additions & 15 deletions variationselector/variationselector.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,47 +11,57 @@ package variationselector
import (
_ "embed"
"encoding/json"
"fmt"
"regexp"
"strings"
"sync"
)

//go:generate ./generate.sh

//go:embed emojis-with-variations.json
var emojisWithVariationsJSON []byte
//go:embed emojis-with-extra-variations.json
var emojisWithExtraVariationsJSON []byte

//go:embed fully-qualified-variations.json
var fullyQualifiedVariationsJSON []byte

var variationReplacer, fullyQualifier *strings.Replacer
var fullyQualifier *strings.Replacer

// The variation replacer will add incorrect variation selectors before skin tones, this removes those.
var initOnce sync.Once
var variationRegex *regexp.Regexp

// skinToneReplacer cleans up after the fully qualifying replacer, which will
// add incorrect variation selectors before skin tones: a U+FE0F that is
// directly followed by one of the five skin tone modifiers
// (U+1F3FB..U+1F3FF) or by U+FE0E is dropped, keeping the following code point.
var skinToneReplacer = func() *strings.Replacer {
	followers := []string{
		"\U0001F3FB",
		"\U0001F3FC",
		"\U0001F3FD",
		"\U0001F3FE",
		"\U0001F3FF",
		"\ufe0e",
	}
	oldNew := make([]string, 0, 2*len(followers))
	for _, follower := range followers {
		// Each pair maps "U+FE0F + follower" to the bare follower.
		oldNew = append(oldNew, "\ufe0f"+follower, follower)
	}
	return strings.NewReplacer(oldNew...)
}()

func init() {
var emojisWithVariations []string
err := json.Unmarshal(emojisWithVariationsJSON, &emojisWithVariations)
func doInit() {
var emojisWithExtraVariations []string
err := json.Unmarshal(emojisWithExtraVariationsJSON, &emojisWithExtraVariations)
if err != nil {
panic(err)
}
replaceInput := make([]string, 2*len(emojisWithVariations))
for i, emoji := range emojisWithVariations {
replaceInput[i*2] = emoji
replaceInput[(i*2)+1] = emoji + VS16
for i, emoji := range emojisWithExtraVariations {
emojiRunes := []rune(emoji)
if len(emojiRunes) > 1 {
panic(fmt.Sprintf("emoji %s is more than one rune long", emoji))
}
emojisWithExtraVariations[i] = fmt.Sprintf(`\x{%X}`, emojiRunes[0])
}
variationReplacer = strings.NewReplacer(replaceInput...)
variationPattern := fmt.Sprintf(`(^|[^\x{200D}])(%s)([^\x{FE0F}\x{FE0E}\x{200D}\x{1F3FB}\x{1F3FC}\x{1F3FD}\x{1F3FE}\x{1F3FF}]|$)`, strings.Join(emojisWithExtraVariations, "|"))
variationRegex = regexp.MustCompile(variationPattern)

var fullyQualifiedVariations []string
err = json.Unmarshal(fullyQualifiedVariationsJSON, &fullyQualifiedVariations)
if err != nil {
panic(err)
}
replaceInput = make([]string, 2*len(fullyQualifiedVariations))
replaceInput := make([]string, 2*len(fullyQualifiedVariations))
for i, emoji := range fullyQualifiedVariations {
replaceInput[i*2] = strings.ReplaceAll(emoji, VS16, "")
replaceInput[(i*2)+1] = emoji
Expand All @@ -71,7 +81,8 @@ const VS16 = "\ufe0f"
//
// This will remove all variation selectors first to make sure it doesn't add duplicates.
func Add(val string) string {
return skinToneReplacer.Replace(variationReplacer.Replace(Remove(val)))
initOnce.Do(doInit)
return variationRegex.ReplaceAllString(FullyQualify(val), "$1$2\ufe0f$3")
}

// Remove removes all emoji variation selectors in the given string.
Expand All @@ -89,5 +100,6 @@ func Remove(val string) string {
//
// N.B. This method is not currently used by the Matrix spec, but it is included as bridging to other networks may need it.
func FullyQualify(val string) string {
return fullyQualifier.Replace(Remove(val))
initOnce.Do(doInit)
return skinToneReplacer.Replace(fullyQualifier.Replace(Remove(val)))
}
100 changes: 100 additions & 0 deletions variationselector/variationselector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,97 @@
package variationselector_test

import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"go.mau.fi/util/exerrors"
"go.mau.fi/util/variationselector"
)

// TestAdd_Full downloads the emojibase dataset and checks that
// variationselector.Add returns every listed emoji, and every skin variant of
// it, unchanged.
//
// The test fetches its data from raw.githubusercontent.com, so it needs
// network access.
func TestAdd_Full(t *testing.T) {
	resp := get(t, "https://raw.githubusercontent.com/milesj/emojibase/master/packages/data/en/data.raw.json")
	// Close the body so the underlying connection is released.
	defer resp.Body.Close()

	var emojis []emojibaseEmoji
	// Fail only this test on malformed data instead of panicking the whole test binary.
	if err := json.NewDecoder(resp.Body).Decode(&emojis); err != nil {
		t.Fatalf("decoding emojibase data: %v", err)
	}
	for _, e := range emojis {
		compareEmoji(t, e.Emoji, variationselector.Add)
		for _, s := range e.Skins {
			compareEmoji(t, s.Emoji, variationselector.Add)
		}
	}
}

// TestFullyQualify_Full downloads the iamcal emoji-data dataset, turns each
// "unified" code point list (including those of the skin variations) into a
// string, and checks that variationselector.FullyQualify returns it unchanged.
//
// The test fetches its data from raw.githubusercontent.com, so it needs
// network access.
func TestFullyQualify_Full(t *testing.T) {
	resp := get(t, "https://raw.githubusercontent.com/iamcal/emoji-data/master/emoji.json")
	// Close the body so the underlying connection is released.
	defer resp.Body.Close()

	var emojis []iamcalEmoji
	// Fail only this test on malformed data instead of panicking the whole test binary.
	if err := json.NewDecoder(resp.Body).Decode(&emojis); err != nil {
		t.Fatalf("decoding iamcal emoji data: %v", err)
	}
	for _, e := range emojis {
		compareEmoji(t, unifiedToUnicode(e.Unified), variationselector.FullyQualify)
		for _, s := range e.SkinVariations {
			compareEmoji(t, unifiedToUnicode(s.Unified), variationselector.FullyQualify)
		}
	}
}

// get sends a GET request to url and returns the response. The test is failed
// immediately if the request cannot be built or sent, or if the server answers
// with a status other than 200 OK.
//
// On success the caller owns the response and is responsible for closing its
// body.
func get(t *testing.T, url string) *http.Response {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()
	req, err := http.NewRequest(http.MethodGet, url, nil)
	require.NoError(t, err)
	req.Header.Set("User-Agent", "GitHub actions @ https://github.com/mautrix/go-util/blob/main/variationselector/variationselector_test.go")
	resp, err := http.DefaultClient.Do(req)
	require.NoError(t, err)
	if resp.StatusCode != http.StatusOK {
		// Without this check an error page would only show up later as a
		// confusing JSON decode failure.
		_ = resp.Body.Close()
		t.Fatalf("GET %s: unexpected status %s", url, resp.Status)
	}
	return resp
}

type (
	// emojibaseEmoji is the subset of an emojibase data entry that the tests
	// decode: the emoji string, its hexcode and the nested skin variants.
	emojibaseEmoji struct {
		Emoji   string           `json:"emoji"`
		Hexcode string           `json:"hexcode"`
		Skins   []emojibaseEmoji `json:"skins"`
	}

	// iamcalEmoji is the subset of an iamcal emoji-data entry that the tests
	// decode: the "unified" code point list and the skin variations, which
	// have the same shape and are stored in a map with string keys.
	iamcalEmoji struct {
		Unified        string                 `json:"unified"`
		SkinVariations map[string]iamcalEmoji `json:"skin_variations"`
	}
)

// unifiedToUnicode turns a dash-separated list of hexadecimal code points,
// such as "1F44D-1F3FD", into the string consisting of those code points.
//
// Parse errors are handed to exerrors.Must (presumably a panic; confirm
// against that package).
func unifiedToUnicode(input string) string {
	var decoded strings.Builder
	for _, hexCodepoint := range strings.Split(input, "-") {
		codepoint := exerrors.Must(strconv.ParseInt(hexCodepoint, 16, 32))
		decoded.WriteRune(rune(codepoint))
	}
	return decoded.String()
}

// unicodeToUnified renders input as its code points in uppercase hexadecimal,
// joined with dashes (for example "1F44D-FE0F"). An empty input yields an
// empty string.
func unicodeToUnified(input string) string {
	hexCodepoints := make([]string, 0, len(input))
	for _, codepoint := range input {
		hexCodepoints = append(hexCodepoints, fmt.Sprintf("%X", codepoint))
	}
	return strings.Join(hexCodepoints, "-")
}

// compareEmoji applies fn to orig and reports a test error if the result
// differs from orig. The error message shows both strings as dash-separated
// hexadecimal code points so that invisible characters such as variation
// selectors are visible.
func compareEmoji(t *testing.T, orig string, fn func(string) string) {
	// Attribute failures to the calling test instead of this helper.
	t.Helper()
	proc := fn(orig)
	if proc != orig {
		t.Errorf("emoji: %s\nexpected: %s\ngot: %s", orig, unicodeToUnified(orig), unicodeToUnified(proc))
	}
}

// TestAdd checks variationselector.Add against a fixed table of inputs and
// the outputs expected for them.
func TestAdd(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{input: "\U0001f44d\U0001f3fd", expected: "\U0001f44d\U0001f3fd"},
		{input: "\U0001f44d", expected: "\U0001f44d\ufe0f"},
		{input: "\U0001f44d\ufe0f", expected: "\U0001f44d\ufe0f"},
		{input: "4\u20e3", expected: "4\ufe0f\u20e3"},
		{input: "4\ufe0f\u20e3", expected: "4\ufe0f\u20e3"},
		{input: "\U0001f914", expected: "\U0001f914"},
		{input: "\U0001f408\u200d\u2b1b", expected: "\U0001f408\u200d\u2b1b"},
	}
	for _, tc := range cases {
		assert.Equal(t, tc.expected, variationselector.Add(tc.input))
	}
}

func TestFullyQualify(t *testing.T) {
Expand All @@ -35,6 +110,7 @@ func TestFullyQualify(t *testing.T) {
assert.Equal(t, "\u263a\ufe0f", variationselector.FullyQualify("\u263a"))
assert.Equal(t, "\U0001f3f3\ufe0f\u200D\U0001f308", variationselector.FullyQualify("\U0001f3f3\u200D\U0001f308"))
assert.Equal(t, "\U0001f3f3\ufe0f\u200D\U0001f308", variationselector.FullyQualify("\U0001f3f3\ufe0f\u200D\U0001f308"))
assert.Equal(t, "\U0001f408\u200d\u2b1b", variationselector.Add("\U0001f408\u200d\u2b1b"))
}

func TestRemove(t *testing.T) {
Expand Down Expand Up @@ -80,3 +156,27 @@ func ExampleRemove() {
// "\U0001f44d"
// "\U0001f44d"
}

// BenchmarkAdd measures variationselector.Add by running it over a fixed set
// of seven sample inputs on every benchmark iteration.
func BenchmarkAdd(b *testing.B) {
	samples := [...]string{
		"\U0001f44d\U0001f3fd",
		"\U0001f44d",
		"\U0001f44d\ufe0f",
		"4\u20e3",
		"4\ufe0f\u20e3",
		"\U0001f914",
		"\U0001f408\u200d\u2b1b",
	}
	for iter := 0; iter < b.N; iter++ {
		for _, sample := range samples {
			variationselector.Add(sample)
		}
	}
}

// BenchmarkFullyQualify measures variationselector.FullyQualify by running it
// over a fixed set of seven sample inputs on every benchmark iteration.
func BenchmarkFullyQualify(b *testing.B) {
	samples := [...]string{
		"\U0001f44d\U0001f3fd",
		"\U0001f44d",
		"\U0001f44d\ufe0f",
		"4\u20e3",
		"4\ufe0f\u20e3",
		"\U0001f914",
		"\U0001f408\u200d\u2b1b",
	}
	for iter := 0; iter < b.N; iter++ {
		for _, sample := range samples {
			variationselector.FullyQualify(sample)
		}
	}
}

0 comments on commit b8a43b0

Please sign in to comment.