-
Notifications
You must be signed in to change notification settings - Fork 38
/
gomarkov.go
153 lines (140 loc) · 4.1 KB
/
gomarkov.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package gomarkov
import (
"encoding/json"
"errors"
"fmt"
"math/rand"
"sync"
"time"
)
// Tokens are wrapped around a sequence of words to maintain the
// start and end transition counts
const (
StartToken = "^"
EndToken = "$"
)
// Chain is a markov chain instance
type Chain struct {
Order int
statePool *spool
frequencyMat map[int]sparseArray
lock *sync.RWMutex
}
// PRNG is a pseudo-random number generator compatible with math/rand interfaces.
type PRNG interface {
// Intn returns a number number in the half-open interval [0,n)
Intn(int) int
}
type chainJSON struct {
Order int `json:"int"`
SpoolMap map[string]int `json:"spool_map"`
FreqMat map[int]sparseArray `json:"freq_mat"`
}
var defaultPrng = rand.New(rand.NewSource(time.Now().UnixNano()))
// MarshalJSON ...
func (chain Chain) MarshalJSON() ([]byte, error) {
obj := chainJSON{
chain.Order,
chain.statePool.stringMap,
chain.frequencyMat,
}
return json.Marshal(obj)
}
// UnmarshalJSON ...
func (chain *Chain) UnmarshalJSON(b []byte) error {
var obj chainJSON
err := json.Unmarshal(b, &obj)
if err != nil {
return err
}
chain.Order = obj.Order
intMap := make(map[int]string)
for k, v := range obj.SpoolMap {
intMap[v] = k
}
chain.statePool = &spool{
stringMap: obj.SpoolMap,
intMap: intMap,
}
chain.frequencyMat = obj.FreqMat
chain.lock = new(sync.RWMutex)
return nil
}
// NewChain creates an instance of Chain
func NewChain(order int) *Chain {
chain := Chain{Order: order}
chain.statePool = &spool{
stringMap: make(map[string]int),
intMap: make(map[int]string),
}
chain.frequencyMat = make(map[int]sparseArray, 0)
chain.lock = new(sync.RWMutex)
return &chain
}
// Add adds the transition counts to the chain for a given sequence of words
func (chain *Chain) Add(input []string) {
startTokens := array(StartToken, chain.Order)
endTokens := array(EndToken, chain.Order)
tokens := make([]string, 0)
tokens = append(tokens, startTokens...)
tokens = append(tokens, input...)
tokens = append(tokens, endTokens...)
pairs := MakePairs(tokens, chain.Order)
for i := 0; i < len(pairs); i++ {
pair := pairs[i]
currentIndex := chain.statePool.add(pair.CurrentState.key())
nextIndex := chain.statePool.add(pair.NextState)
chain.lock.Lock()
if chain.frequencyMat[currentIndex] == nil {
chain.frequencyMat[currentIndex] = make(sparseArray, 0)
}
chain.frequencyMat[currentIndex][nextIndex]++
chain.lock.Unlock()
}
}
// TransitionProbability returns the transition probability between two states
func (chain *Chain) TransitionProbability(next string, current NGram) (float64, error) {
if len(current) != chain.Order {
return 0, errors.New("N-gram length does not match chain order")
}
currentIndex, currentExists := chain.statePool.get(current.key())
nextIndex, nextExists := chain.statePool.get(next)
if !currentExists || !nextExists {
return 0, nil
}
arr := chain.frequencyMat[currentIndex]
sum := float64(arr.sum())
freq := float64(arr[nextIndex])
return freq / sum, nil
}
// Generate generates new text based on an initial seed of words
func (chain *Chain) Generate(current NGram) (string, error) {
return chain.GenerateDeterministic(current, defaultPrng)
}
// GenerateDeterministic generates new text deterministically, based on an initial seed of words and using a specified PRNG.
// Use it for reproducibly pseudo-random results (i.e. pass the same PRNG and same state every time).
func (chain *Chain) GenerateDeterministic(current NGram, prng PRNG) (string, error) {
if len(current) != chain.Order {
return "", errors.New("N-gram length does not match chain order")
}
if current[len(current)-1] == EndToken {
// Dont generate anything after the end token
return "", nil
}
currentIndex, currentExists := chain.statePool.get(current.key())
if !currentExists {
return "", fmt.Errorf("Unknown ngram %v", current)
}
arr := chain.frequencyMat[currentIndex]
sum := arr.sum()
randN := prng.Intn(sum)
keys := arr.orderedKeys()
for _, key := range keys {
freq := arr[key]
randN -= freq
if randN <= 0 {
return chain.statePool.intMap[key], nil
}
}
return "", nil
}