-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathstorage.go
81 lines (64 loc) · 1.94 KB
/
storage.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package sentences
import "encoding/json"
// SetString is an implementation of a set of strings
// probably not the best way to do this but oh well.
type SetString map[string]int
// Add adds a string key to the set
func (ss SetString) Add(str string) {
ss[str] = 1
}
// Remove deletes a string key from the set
func (ss SetString) Remove(str string) {
delete(ss, str)
}
// Has checks whether a key exists in the set
func (ss SetString) Has(str string) bool {
if ss[str] == 0 {
return false
}
return true
}
// Array returns and array of keys from the set
func (ss SetString) Array() []string {
arr := make([]string, 0, len(ss))
for key := range ss {
arr = append(arr, key)
}
return arr
}
// Storage stores data used to perform sentence boundary detection with punkt
// This is where all the training data gets stored for future use
type Storage struct {
AbbrevTypes SetString `json:"AbbrevTypes"`
Collocations SetString `json:"Collocations"`
SentStarters SetString `json:"SentStarters"`
OrthoContext SetString `json:"OrthoContext"`
}
// LoadTraining is the primary function to load JSON training data. By default, the sentence tokenizer
// loads in english automatically, but other languages could be loaded into a
// binary file using the `make <lang>` command.
func LoadTraining(data []byte) (*Storage, error) {
var storage Storage
err := json.Unmarshal(data, &storage)
if err != nil {
return nil, err
}
return &storage, nil
}
// NewStorage creates the default storage container
func NewStorage() *Storage {
return &Storage{SetString{}, SetString{}, SetString{}, SetString{}}
}
// Used in the training to add a type to the ortho context
func (p *Storage) addOrthoContext(typ string, flag int) {
p.OrthoContext[typ] |= flag
}
// IsAbbr detemines if any of the tokens are an abbreviation
func (p *Storage) IsAbbr(tokens ...string) bool {
for _, token := range tokens {
if p.AbbrevTypes.Has(token) {
return true
}
}
return false
}