Skip to content

Commit

Permalink
brain/*: new tokenizing algorithm
Browse files Browse the repository at this point in the history
Spaces are now included in tokens, so we don't need every brain
implementation to add them manually. Punctuation and symbols are also
separated from words, which should give more variety.

Fixes #45.
  • Loading branch information
zephyrtronium committed Aug 10, 2024
1 parent 74e2908 commit 933e6d0
Show file tree
Hide file tree
Showing 12 changed files with 200 additions and 227 deletions.
18 changes: 9 additions & 9 deletions brain/braintest/braintest.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,63 +44,63 @@ var messages = [...]struct {
User: userhash.Hash{2},
Tag: "kessoku",
Time: time.Unix(0, 0),
Tokens: these("member", "bocchi"),
Tokens: these("member ", "bocchi "),
},
{
ID: uuid.UUID{2},
User: userhash.Hash{2},
Tag: "kessoku",
Time: time.Unix(1, 0),
Tokens: these("member", "ryou"),
Tokens: these("member ", "ryou "),
},
{
ID: uuid.UUID{3},
User: userhash.Hash{3},
Tag: "kessoku",
Time: time.Unix(2, 0),
Tokens: these("member", "nijika"),
Tokens: these("member ", "nijika "),
},
{
ID: uuid.UUID{4},
User: userhash.Hash{3},
Tag: "kessoku",
Time: time.Unix(3, 0),
Tokens: these("member", "kita"),
Tokens: these("member ", "kita "),
},
{
ID: uuid.UUID{5},
User: userhash.Hash{2},
Tag: "sickhack",
Time: time.Unix(0, 0),
Tokens: these("member", "bocchi"),
Tokens: these("member ", "bocchi "),
},
{
ID: uuid.UUID{6},
User: userhash.Hash{2},
Tag: "sickhack",
Time: time.Unix(1, 0),
Tokens: these("member", "ryou"),
Tokens: these("member ", "ryou "),
},
{
ID: uuid.UUID{7},
User: userhash.Hash{3},
Tag: "sickhack",
Time: time.Unix(2, 0),
Tokens: these("member", "nijika"),
Tokens: these("member ", "nijika "),
},
{
ID: uuid.UUID{8},
User: userhash.Hash{3},
Tag: "sickhack",
Time: time.Unix(3, 0),
Tokens: these("member", "kita"),
Tokens: these("member ", "kita "),
},
{
ID: uuid.UUID{9},
User: userhash.Hash{4},
Tag: "sickhack",
Time: time.Unix(43, 0),
Tokens: these("manager", "seika"),
Tokens: these("manager ", "seika "),
},
}

Expand Down
2 changes: 0 additions & 2 deletions brain/braintest/braintest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ func (m *membrain) Speak(ctx context.Context, tag string, prompt []string, w []b
}
t := u[rand.IntN(len(u))]
w = append(w, t...)
w = append(w, ' ')
s = brain.ReduceEntropy(t)
} else {
s = brain.ReduceEntropy(prompt[len(prompt)-1])
Expand All @@ -142,7 +141,6 @@ func (m *membrain) Speak(ctx context.Context, tag string, prompt []string, w []b
break
}
w = append(w, t...)
w = append(w, ' ')
s = brain.ReduceEntropy(t)
}
return w, nil
Expand Down
1 change: 0 additions & 1 deletion brain/kvbrain/speak.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ func (br *Brain) Speak(ctx context.Context, tag string, prompt []string, w []byt
break
}
w = append(w, b...)
w = append(w, ' ')
search = search.Drop(search.Len() - l - 1).Prepend(brain.ReduceEntropy(string(b)))
}
return w, nil
Expand Down
106 changes: 52 additions & 54 deletions brain/kvbrain/speak_test.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package kvbrain

import (
"bytes"
"context"
"errors"
"maps"
"slices"
"strings"
"testing"

"github.com/dgraph-io/badger/v4"
Expand All @@ -23,101 +21,101 @@ func TestSpeak(t *testing.T) {
name string
kvs [][2]string
prompt []string
want [][]string
want []string
}{
{
name: "empty",
kvs: nil,
want: [][]string{
want: []string{
// Even with no thoughts head empty, we expect to get empty,
// non-error results when we speak. Our test currently records
// what it gets as a joined string for convenience, so we want
// an empty string in here, even though we really should be
// getting an empty slice.
{""},
"",
},
},
{
name: "single",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uu), ""},
},
want: [][]string{
{"bocchi"},
want: []string{
"bocchi ",
},
},
{
name: "longer",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uu), "ryou"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "nijika"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "kita"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uu), "ryou "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "nijika "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "kita "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
want: [][]string{
{"bocchi", "ryou", "nijika", "kita"},
want: []string{
"bocchi ryou nijika kita ",
},
},
{
name: "entropy",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "BOCCHI"},
{mkey("kessoku", "bocchi\xff\xff", uu), "RYOU"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "NIJIKA"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "KITA"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "BOCCHI "},
{mkey("kessoku", "bocchi \xff\xff", uu), "RYOU "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "NIJIKA "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "KITA "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
want: [][]string{
{"BOCCHI", "RYOU", "NIJIKA", "KITA"},
want: []string{
"BOCCHI RYOU NIJIKA KITA ",
},
},
{
name: "prompted",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uu), "ryou"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "nijika"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "kita"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uu), "ryou "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "nijika "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "kita "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
prompt: []string{"bocchi"},
want: [][]string{
{"ryou", "nijika", "kita"},
prompt: []string{"bocchi "},
want: []string{
"ryou nijika kita ",
},
},
{
name: "prompted-entropy",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "BOCCHI"},
{mkey("kessoku", "bocchi\xff\xff", uu), "RYOU"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "NIJIKA"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "KITA"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "BOCCHI "},
{mkey("kessoku", "bocchi \xff\xff", uu), "RYOU "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "NIJIKA "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "KITA "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
prompt: []string{"bocchi"},
want: [][]string{
{"RYOU", "NIJIKA", "KITA"},
prompt: []string{"bocchi "},
want: []string{
"RYOU NIJIKA KITA ",
},
},
{
name: "uniform",
kvs: [][2]string{
{mkey("kessoku", "\xff", uuid.UUID{1}), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uuid.UUID{1}), "ryou"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uuid.UUID{1}), ""},
{mkey("kessoku", "\xff", uuid.UUID{2}), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uuid.UUID{2}), "nijika"},
{mkey("kessoku", "nijika\xffbocchi\xff\xff", uuid.UUID{2}), ""},
{mkey("kessoku", "\xff", uuid.UUID{3}), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uuid.UUID{3}), "kita"},
{mkey("kessoku", "kita\xffbocchi\xff\xff", uuid.UUID{3}), ""},
{mkey("kessoku", "\xff", uuid.UUID{1}), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uuid.UUID{1}), "ryou "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uuid.UUID{1}), ""},
{mkey("kessoku", "\xff", uuid.UUID{2}), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uuid.UUID{2}), "nijika "},
{mkey("kessoku", "nijika \xffbocchi \xff\xff", uuid.UUID{2}), ""},
{mkey("kessoku", "\xff", uuid.UUID{3}), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uuid.UUID{3}), "kita "},
{mkey("kessoku", "kita \xffbocchi \xff\xff", uuid.UUID{3}), ""},
},
want: [][]string{
{"bocchi", "ryou"},
{"bocchi", "nijika"},
{"bocchi", "kita"},
want: []string{
"bocchi ryou ",
"bocchi nijika ",
"bocchi kita ",
},
},
// TODO(zeph): test tag isolation
Expand All @@ -140,15 +138,15 @@ func TestSpeak(t *testing.T) {
br := New(db)
want := make(map[string]bool, len(c.want))
for _, v := range c.want {
want[strings.Join(v, " ")] = true
want[v] = true
}
got := make(map[string]bool, len(c.want))
for range 256 {
m, err := br.Speak(ctx, "kessoku", slices.Clone(c.prompt), nil)
if err != nil {
t.Errorf("failed to speak: %v", err)
}
got[string(bytes.TrimSpace(m))] = true
got[string(m)] = true
}
if !maps.Equal(want, got) {
t.Errorf("wrong results: want %v, got %v", want, got)
Expand Down
40 changes: 0 additions & 40 deletions brain/learn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,46 +12,6 @@ import (
"github.com/zephyrtronium/robot/userhash"
)

// TestTokens checks that brain.Tokens produces the expected token lists
// for a variety of inputs, and that it appends into the destination slice
// in place rather than reallocating its backing array.
func TestTokens(t *testing.T) {
	// Shorthand for building expected token slices.
	toks := func(words ...string) []string { return words }
	cases := []struct {
		name string
		in   string
		want []string
	}{
		{"single", "single", toks("single")},
		{"many", "many words in this message", toks("many", "words", "in", "this", "message")},
		{"a", "a word", toks("a word")},
		{"an", "an word", toks("an word")},
		{"the", "the word", toks("the word")},
		{"aend", "word a", toks("word", "a")},
		{"anend", "word an", toks("word", "an")},
		{"theend", "word the", toks("word", "the")},
		{"aaa", "a a a", toks("a", "a", "a")},
		{"ananan", "an an an", toks("an an", "an")},
		{"thethethe", "the the the", toks("the the", "the")},
		{"meme", "a x y", toks("a", "x", "y")},
		{"spaces", "x y", toks("x", "y")},
		{"tabs", "x\ty", toks("x", "y")},
		{"unicode", "x\u2002y", toks("x", "y")},
		{"spaceend", "x y ", toks("x", "y")},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Pre-size the destination so Tokens should never need to grow it;
			// remember the address of the first element to detect reallocation.
			buf := make([]string, len(tc.want))
			first := &buf[0]
			got := brain.Tokens(buf[:0], tc.in)
			if diff := cmp.Diff(tc.want, got); diff != "" {
				t.Errorf("wrong tokens from %q:\n%s", tc.in, diff)
			}
			if first != &buf[0] {
				t.Error("first element pointer changed")
			}
		})
	}
}

type testLearner struct {
learned []brain.Tuple
forgot []brain.Tuple
Expand Down
3 changes: 1 addition & 2 deletions brain/speak.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ func Speak(ctx context.Context, s Speaker, tag, prompt string) (string, error) {
builderPool.Put(w[:0])
tokensPool.Put(toks[:0])
}()
w = slices.Grow(w, len(prompt))
w = slices.Grow(w, len(prompt)+1)
for i, t := range toks {
w = append(w, t...)
w = append(w, ' ')
toks[i] = ReduceEntropy(t)
}
slices.Reverse(toks)
Expand Down
6 changes: 3 additions & 3 deletions brain/speak_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,21 @@ func TestSpeak(t *testing.T) {
name: "prompted",
prompt: "bocchi ryo nijika",
append: nil,
want: []string{"nijika", "ryo", "bocchi"},
want: []string{"nijika ", "ryo ", "bocchi "},
say: "bocchi ryo nijika",
},
{
name: "prompted-add",
prompt: "bocchi ryo nijika",
append: []byte("kita"),
want: []string{"nijika", "ryo", "bocchi"},
want: []string{"nijika ", "ryo ", "bocchi "},
say: "bocchi ryo nijika kita",
},
{
name: "entropy",
prompt: "BOCCHI RYO NIJIKA",
append: []byte("KITA"),
want: []string{"nijika", "ryo", "bocchi"},
want: []string{"nijika ", "ryo ", "bocchi "},
say: "BOCCHI RYO NIJIKA KITA",
},
}
Expand Down
1 change: 0 additions & 1 deletion brain/sqlbrain/speak.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ func (br *Brain) Speak(ctx context.Context, tag string, prompt []string, w []byt
break
}
w = append(w, b...)
w = append(w, ' ')
search = search.Drop(search.Len() - l - 1).Prepend(brain.ReduceEntropy(string(b)))
}
return w, nil
Expand Down
Loading

0 comments on commit 933e6d0

Please sign in to comment.