Skip to content

Commit

Permalink
brain/*: new tokenizing algorithm
Browse files Browse the repository at this point in the history
Spaces are now included in tokens, so we don't need every brain
implementation to add them manually. Punctuation and symbols are also
separated from words, which should give more variety.

Fixes #45.
  • Loading branch information
zephyrtronium committed Aug 10, 2024
1 parent 74e2908 commit 933e6d0
Show file tree
Hide file tree
Showing 12 changed files with 200 additions and 227 deletions.
18 changes: 9 additions & 9 deletions brain/braintest/braintest.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,63 +44,63 @@ var messages = [...]struct {
User: userhash.Hash{2},
Tag: "kessoku",
Time: time.Unix(0, 0),
Tokens: these("member", "bocchi"),
Tokens: these("member ", "bocchi "),
},
{
ID: uuid.UUID{2},
User: userhash.Hash{2},
Tag: "kessoku",
Time: time.Unix(1, 0),
Tokens: these("member", "ryou"),
Tokens: these("member ", "ryou "),
},
{
ID: uuid.UUID{3},
User: userhash.Hash{3},
Tag: "kessoku",
Time: time.Unix(2, 0),
Tokens: these("member", "nijika"),
Tokens: these("member ", "nijika "),
},
{
ID: uuid.UUID{4},
User: userhash.Hash{3},
Tag: "kessoku",
Time: time.Unix(3, 0),
Tokens: these("member", "kita"),
Tokens: these("member ", "kita "),
},
{
ID: uuid.UUID{5},
User: userhash.Hash{2},
Tag: "sickhack",
Time: time.Unix(0, 0),
Tokens: these("member", "bocchi"),
Tokens: these("member ", "bocchi "),
},
{
ID: uuid.UUID{6},
User: userhash.Hash{2},
Tag: "sickhack",
Time: time.Unix(1, 0),
Tokens: these("member", "ryou"),
Tokens: these("member ", "ryou "),
},
{
ID: uuid.UUID{7},
User: userhash.Hash{3},
Tag: "sickhack",
Time: time.Unix(2, 0),
Tokens: these("member", "nijika"),
Tokens: these("member ", "nijika "),
},
{
ID: uuid.UUID{8},
User: userhash.Hash{3},
Tag: "sickhack",
Time: time.Unix(3, 0),
Tokens: these("member", "kita"),
Tokens: these("member ", "kita "),
},
{
ID: uuid.UUID{9},
User: userhash.Hash{4},
Tag: "sickhack",
Time: time.Unix(43, 0),
Tokens: these("manager", "seika"),
Tokens: these("manager ", "seika "),
},
}

Expand Down
2 changes: 0 additions & 2 deletions brain/braintest/braintest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ func (m *membrain) Speak(ctx context.Context, tag string, prompt []string, w []b
}
t := u[rand.IntN(len(u))]
w = append(w, t...)
w = append(w, ' ')
s = brain.ReduceEntropy(t)
} else {
s = brain.ReduceEntropy(prompt[len(prompt)-1])
Expand All @@ -142,7 +141,6 @@ func (m *membrain) Speak(ctx context.Context, tag string, prompt []string, w []b
break
}
w = append(w, t...)
w = append(w, ' ')
s = brain.ReduceEntropy(t)
}
return w, nil
Expand Down
1 change: 0 additions & 1 deletion brain/kvbrain/speak.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ func (br *Brain) Speak(ctx context.Context, tag string, prompt []string, w []byt
break
}
w = append(w, b...)
w = append(w, ' ')
search = search.Drop(search.Len() - l - 1).Prepend(brain.ReduceEntropy(string(b)))
}
return w, nil
Expand Down
106 changes: 52 additions & 54 deletions brain/kvbrain/speak_test.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package kvbrain

import (
"bytes"
"context"
"errors"
"maps"
"slices"
"strings"
"testing"

"github.com/dgraph-io/badger/v4"
Expand All @@ -23,101 +21,101 @@ func TestSpeak(t *testing.T) {
name string
kvs [][2]string
prompt []string
want [][]string
want []string
}{
{
name: "empty",
kvs: nil,
want: [][]string{
want: []string{
// Even with no thoughts head empty, we expect to get empty,
// non-error results when we speak. Our test currently records
// what it gets as a joined string for convenience, so we want
// an empty string in here, even though we really should be
// getting an empty slice.
{""},
"",
},
},
{
name: "single",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uu), ""},
},
want: [][]string{
{"bocchi"},
want: []string{
"bocchi ",
},
},
{
name: "longer",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uu), "ryou"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "nijika"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "kita"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uu), "ryou "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "nijika "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "kita "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
want: [][]string{
{"bocchi", "ryou", "nijika", "kita"},
want: []string{
"bocchi ryou nijika kita ",
},
},
{
name: "entropy",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "BOCCHI"},
{mkey("kessoku", "bocchi\xff\xff", uu), "RYOU"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "NIJIKA"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "KITA"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "BOCCHI "},
{mkey("kessoku", "bocchi \xff\xff", uu), "RYOU "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "NIJIKA "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "KITA "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
want: [][]string{
{"BOCCHI", "RYOU", "NIJIKA", "KITA"},
want: []string{
"BOCCHI RYOU NIJIKA KITA ",
},
},
{
name: "prompted",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uu), "ryou"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "nijika"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "kita"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uu), "ryou "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "nijika "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "kita "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
prompt: []string{"bocchi"},
want: [][]string{
{"ryou", "nijika", "kita"},
prompt: []string{"bocchi "},
want: []string{
"ryou nijika kita ",
},
},
{
name: "prompted-entropy",
kvs: [][2]string{
{mkey("kessoku", "\xff", uu), "BOCCHI"},
{mkey("kessoku", "bocchi\xff\xff", uu), "RYOU"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uu), "NIJIKA"},
{mkey("kessoku", "nijika\xffryou\xffbocchi\xff\xff", uu), "KITA"},
{mkey("kessoku", "kita\xffnijika\xffryou\xffbocchi\xff\xff", uu), ""},
{mkey("kessoku", "\xff", uu), "BOCCHI "},
{mkey("kessoku", "bocchi \xff\xff", uu), "RYOU "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uu), "NIJIKA "},
{mkey("kessoku", "nijika \xffryou \xffbocchi \xff\xff", uu), "KITA "},
{mkey("kessoku", "kita \xffnijika \xffryou \xffbocchi \xff\xff", uu), ""},
},
prompt: []string{"bocchi"},
want: [][]string{
{"RYOU", "NIJIKA", "KITA"},
prompt: []string{"bocchi "},
want: []string{
"RYOU NIJIKA KITA ",
},
},
{
name: "uniform",
kvs: [][2]string{
{mkey("kessoku", "\xff", uuid.UUID{1}), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uuid.UUID{1}), "ryou"},
{mkey("kessoku", "ryou\xffbocchi\xff\xff", uuid.UUID{1}), ""},
{mkey("kessoku", "\xff", uuid.UUID{2}), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uuid.UUID{2}), "nijika"},
{mkey("kessoku", "nijika\xffbocchi\xff\xff", uuid.UUID{2}), ""},
{mkey("kessoku", "\xff", uuid.UUID{3}), "bocchi"},
{mkey("kessoku", "bocchi\xff\xff", uuid.UUID{3}), "kita"},
{mkey("kessoku", "kita\xffbocchi\xff\xff", uuid.UUID{3}), ""},
{mkey("kessoku", "\xff", uuid.UUID{1}), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uuid.UUID{1}), "ryou "},
{mkey("kessoku", "ryou \xffbocchi \xff\xff", uuid.UUID{1}), ""},
{mkey("kessoku", "\xff", uuid.UUID{2}), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uuid.UUID{2}), "nijika "},
{mkey("kessoku", "nijika \xffbocchi \xff\xff", uuid.UUID{2}), ""},
{mkey("kessoku", "\xff", uuid.UUID{3}), "bocchi "},
{mkey("kessoku", "bocchi \xff\xff", uuid.UUID{3}), "kita "},
{mkey("kessoku", "kita \xffbocchi \xff\xff", uuid.UUID{3}), ""},
},
want: [][]string{
{"bocchi", "ryou"},
{"bocchi", "nijika"},
{"bocchi", "kita"},
want: []string{
"bocchi ryou ",
"bocchi nijika ",
"bocchi kita ",
},
},
// TODO(zeph): test tag isolation
Expand All @@ -140,15 +138,15 @@ func TestSpeak(t *testing.T) {
br := New(db)
want := make(map[string]bool, len(c.want))
for _, v := range c.want {
want[strings.Join(v, " ")] = true
want[v] = true
}
got := make(map[string]bool, len(c.want))
for range 256 {
m, err := br.Speak(ctx, "kessoku", slices.Clone(c.prompt), nil)
if err != nil {
t.Errorf("failed to speak: %v", err)
}
got[string(bytes.TrimSpace(m))] = true
got[string(m)] = true
}
if !maps.Equal(want, got) {
t.Errorf("wrong results: want %v, got %v", want, got)
Expand Down
40 changes: 0 additions & 40 deletions brain/learn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,46 +12,6 @@ import (
"github.com/zephyrtronium/robot/userhash"
)

// TestTokens checks that brain.Tokens produces the expected token lists
// for a variety of inputs, and that it appends into the destination slice
// in place rather than reallocating its backing array.
func TestTokens(t *testing.T) {
	// Shorthand for building expected token slices.
	toks := func(words ...string) []string { return words }
	cases := []struct {
		name string
		in   string
		want []string
	}{
		{"single", "single", toks("single")},
		{"many", "many words in this message", toks("many", "words", "in", "this", "message")},
		{"a", "a word", toks("a word")},
		{"an", "an word", toks("an word")},
		{"the", "the word", toks("the word")},
		{"aend", "word a", toks("word", "a")},
		{"anend", "word an", toks("word", "an")},
		{"theend", "word the", toks("word", "the")},
		{"aaa", "a a a", toks("a", "a", "a")},
		{"ananan", "an an an", toks("an an", "an")},
		{"thethethe", "the the the", toks("the the", "the")},
		{"meme", "a x y", toks("a", "x", "y")},
		{"spaces", "x y", toks("x", "y")},
		{"tabs", "x\ty", toks("x", "y")},
		{"unicode", "x\u2002y", toks("x", "y")},
		{"spaceend", "x y ", toks("x", "y")},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Pre-size the destination so Tokens should never need to grow it;
			// remember the address of the first element to detect reallocation.
			buf := make([]string, len(tc.want))
			first := &buf[0]
			got := brain.Tokens(buf[:0], tc.in)
			if diff := cmp.Diff(tc.want, got); diff != "" {
				t.Errorf("wrong tokens from %q:\n%s", tc.in, diff)
			}
			if first != &buf[0] {
				t.Error("first element pointer changed")
			}
		})
	}
}

type testLearner struct {
learned []brain.Tuple
forgot []brain.Tuple
Expand Down
3 changes: 1 addition & 2 deletions brain/speak.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ func Speak(ctx context.Context, s Speaker, tag, prompt string) (string, error) {
builderPool.Put(w[:0])
tokensPool.Put(toks[:0])
}()
w = slices.Grow(w, len(prompt))
w = slices.Grow(w, len(prompt)+1)
for i, t := range toks {
w = append(w, t...)
w = append(w, ' ')
toks[i] = ReduceEntropy(t)
}
slices.Reverse(toks)
Expand Down
6 changes: 3 additions & 3 deletions brain/speak_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,21 @@ func TestSpeak(t *testing.T) {
name: "prompted",
prompt: "bocchi ryo nijika",
append: nil,
want: []string{"nijika", "ryo", "bocchi"},
want: []string{"nijika ", "ryo ", "bocchi "},
say: "bocchi ryo nijika",
},
{
name: "prompted-add",
prompt: "bocchi ryo nijika",
append: []byte("kita"),
want: []string{"nijika", "ryo", "bocchi"},
want: []string{"nijika ", "ryo ", "bocchi "},
say: "bocchi ryo nijika kita",
},
{
name: "entropy",
prompt: "BOCCHI RYO NIJIKA",
append: []byte("KITA"),
want: []string{"nijika", "ryo", "bocchi"},
want: []string{"nijika ", "ryo ", "bocchi "},
say: "BOCCHI RYO NIJIKA KITA",
},
}
Expand Down
1 change: 0 additions & 1 deletion brain/sqlbrain/speak.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ func (br *Brain) Speak(ctx context.Context, tag string, prompt []string, w []byt
break
}
w = append(w, b...)
w = append(w, ' ')
search = search.Drop(search.Len() - l - 1).Prepend(brain.ReduceEntropy(string(b)))
}
return w, nil
Expand Down
Loading

0 comments on commit 933e6d0

Please sign in to comment.