Skip to content

Commit

Permalink
Improve tests, README
Browse files Browse the repository at this point in the history
  • Loading branch information
schollz committed Apr 24, 2017
1 parent 842466b commit 56fb6b1
Show file tree
Hide file tree
Showing 5 changed files with 303 additions and 56 deletions.
29 changes: 25 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ fmt.Println(cm.ClosestN("kind gizard",3))

```golang
// Calculate accuracy
fmt.Println(cm.Accuracy())
fmt.Println(cm.AccuracyMutatingWords())
// ~ 66 % (still way better than Levenshtein which hits 0% with this particular set)

// Improve accuracy by adding more bags
bagSizes = []int{2, 3, 4}
cm = closestmatch.New(wordsToTest, bagSizes)
fmt.Println(cm.Accuracy())
fmt.Println(cm.AccuracyMutatingWords())
// accuracy improves to ~ 76 %
```

Expand All @@ -70,9 +70,9 @@ fmt.Println(cm2.Closest("lizard wizard"))
// prints "The Lizard Wizard"
```

### Accuracy and Speed
### Advantages

*closestmatch* is more accurate than Levenshtein for long strings (like in the test corpus). If you run `go test` the tests will pass which validate that Levenshtein performs < 50% accuracy and *closestmatch* performs with > 90% accuracy (usually it is 95-98%).
*closestmatch* is more accurate than Levenshtein for long strings (like in the test corpus).

*closestmatch* is ~20x faster than [a fast implementation of Levenshtein](https://groups.google.com/forum/#!topic/golang-nuts/YyH1f_qCZVc). Try it yourself with the benchmarks:

Expand All @@ -92,6 +92,27 @@ BenchmarkClosestOne-8 104603530 4855916 -95.36%

The `New()` function in *closestmatch* is much slower than the one in *levenshtein* because it needs to do precomputation.

### Disadvantages

*closestmatch* does worse for matching lists of single words, like a dictionary. For comparison:


```
$ cd $GOPATH/src/github.com/schollz/closestmatch && go test
Accuracy with mutating words in book list: 90.0%
Accuracy with mutating letters in book list: 100.0%
Accuracy with mutating letters in dictionary: 38.9%
```

By contrast, *levenshtein* performs slightly better for a single-word dictionary (but worse for longer names, like book titles):

```
$ cd $GOPATH/src/github.com/schollz/closestmatch/levenshtein && go test
Accuracy with mutating words in book list: 40.0%
Accuracy with mutating letters in book list: 100.0%
Accuracy with mutating letters in dictionary: 64.8%
```

## License

MIT
102 changes: 70 additions & 32 deletions closestmatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ type ClosestMatch struct {
ID map[uint32]IDInfo
}

// IDInfo carries the information about the keys
type IDInfo struct {
Key string
NumSubstrings int
Expand Down Expand Up @@ -70,12 +71,12 @@ func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) {
for j := range jobs {
m := make(map[string]int)
if ids, ok := cm.SubstringToID[j.substring]; ok {
weight := 100000 / len(ids)
weight := 200000 / len(ids)
for id := range ids {
if _, ok2 := m[cm.ID[id].Key]; !ok2 {
m[cm.ID[id].Key] = 0
}
m[cm.ID[id].Key] += weight
m[cm.ID[id].Key] += 1 + 0*weight
}
}
results <- result{m: m}
Expand Down Expand Up @@ -179,15 +180,15 @@ func (cm *ClosestMatch) splitWord(word string) map[string]struct{} {
return wordHash
}

// Accuracy runs some basic tests against the wordlist to
// AccuracyMutatingWords runs some basic tests against the wordlist to
// see how accurate this bag-of-characters method is against
// the target dataset
func (cm *ClosestMatch) Accuracy() float64 {
func (cm *ClosestMatch) AccuracyMutatingWords() float64 {
rand.Seed(1)
percentCorrect := 0.0
numTrials := 0.0

for wordTrials := 0; wordTrials < 100; wordTrials++ {
for wordTrials := 0; wordTrials < 200; wordTrials++ {

var testString, originalTestString string
testStringNum := rand.Intn(len(cm.ID))
Expand All @@ -201,24 +202,20 @@ func (cm *ClosestMatch) Accuracy() float64 {
break
}

// remove a random word
for trial := 0; trial < 4; trial++ {
words := strings.Split(originalTestString, " ")
var words []string
choice := rand.Intn(3)
if choice == 0 {
// remove a random word
words = strings.Split(originalTestString, " ")
if len(words) < 3 {
continue
}
deleteWordI := rand.Intn(len(words))
words = append(words[:deleteWordI], words[deleteWordI+1:]...)
testString = strings.Join(words, " ")
if cm.Closest(testString) == originalTestString {
percentCorrect += 1.0
}
numTrials += 1.0
}

// remove a random word and reverse
for trial := 0; trial < 4; trial++ {
words := strings.Split(originalTestString, " ")
} else if choice == 1 {
// remove a random word and reverse
words = strings.Split(originalTestString, " ")
if len(words) > 1 {
deleteWordI := rand.Intn(len(words))
words = append(words[:deleteWordI], words[deleteWordI+1:]...)
Expand All @@ -229,15 +226,9 @@ func (cm *ClosestMatch) Accuracy() float64 {
continue
}
testString = strings.Join(words, " ")
if cm.Closest(testString) == originalTestString {
percentCorrect += 1.0
}
numTrials += 1.0
}

// remove a random word and shuffle and replace random letter
for trial := 0; trial < 4; trial++ {
words := strings.Split(originalTestString, " ")
} else {
// remove a random word and shuffle and replace 2 random letters
words = strings.Split(originalTestString, " ")
if len(words) > 1 {
deleteWordI := rand.Intn(len(words))
words = append(words[:deleteWordI], words[deleteWordI+1:]...)
Expand All @@ -255,18 +246,65 @@ func (cm *ClosestMatch) Accuracy() float64 {
testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
ii = rand.Intn(len(testString))
testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
if cm.Closest(testString) == originalTestString {
percentCorrect += 1.0
}
closest := cm.Closest(testString)
if closest == originalTestString {
percentCorrect += 1.0
} else {
//fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
}
numTrials += 1.0
}
return 100.0 * percentCorrect / numTrials
}

// AccuracyMutatingLetters runs some basic tests against the wordlist to
// see how accurate this bag-of-characters method is against
// the target dataset when mutating individual letters (adding, removing, changing)
func (cm *ClosestMatch) AccuracyMutatingLetters() float64 {
rand.Seed(1)
percentCorrect := 0.0
numTrials := 0.0

for wordTrials := 0; wordTrials < 200; wordTrials++ {

var testString, originalTestString string
testStringNum := rand.Intn(len(cm.ID))
i := 0
for id := range cm.ID {
i++
if i != testStringNum {
continue
}
numTrials += 1.0
originalTestString = cm.ID[id].Key
break
}
testString = originalTestString

// letters to replace with
letters := "abcdefghijklmnopqrstuvwxyz"

// test the original string
if cm.Closest(testString) == originalTestString {
choice := rand.Intn(3)
if choice == 0 {
// replace random letter
ii := rand.Intn(len(testString))
testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
} else if choice == 1 {
// delete random letter
ii := rand.Intn(len(testString))
testString = testString[:ii] + testString[ii+1:]
} else {
// add random letter
ii := rand.Intn(len(testString))
testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii:]
}
closest := cm.Closest(testString)
if closest == originalTestString {
percentCorrect += 1.0
} else {
//fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
}
numTrials += 1.0

}

return 100.0 * percentCorrect / numTrials
Expand Down
36 changes: 26 additions & 10 deletions closestmatch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,10 @@ func ExampleMatchingSimple() {
}

func ExampleMatchingN() {
cm := New(test.WordsToTest, []int{1, 2, 3})
fmt.Println(cm.ClosestN("war by hg wells", 3))
cm := New(test.WordsToTest, []int{4})
fmt.Println(cm.ClosestN("war h.g. wells", 3))
// Output:
// [the war of the worlds by h. g. wells the time machine by h. g. wells tractatus logico-philosophicus by ludwig wittgenstein]
// [the war of the worlds by h. g. wells the time machine by h. g. wells war and peace by graf leo tolstoy]
}

func ExampleMatchingBigList() {
Expand All @@ -109,16 +109,32 @@ func ExampleMatchingBigList() {
// island of a thousand mirrors by nayomi munaweera
}

func TestAccuracy(t *testing.T) {
cm := New(test.WordsToTest, []int{1, 2})
accuracy := cm.Accuracy()
if accuracy < 90 {
t.Errorf("Accuracy should be higher than %2.1f", accuracy)
}
// TestAccuracyBookWords builds a matcher over the titles in test/books.list
// (lower-cased, one entry per line, bag sizes 4 and 5) and prints the
// percentage returned by AccuracyMutatingWords. It only reports the figure;
// no accuracy threshold is enforced.
func TestAccuracyBookWords(t *testing.T) {
	bText, err := ioutil.ReadFile("test/books.list")
	if err != nil {
		// Without the fixture the matcher would be built from an empty list
		// and the reported accuracy would be meaningless.
		t.Fatalf("reading test/books.list: %v", err)
	}
	wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
	cm := New(wordsToTest, []int{4, 5})
	accuracy := cm.AccuracyMutatingWords()
	fmt.Printf("Accuracy with mutating words in book list:\t%2.1f%%\n", accuracy)
}

// TestAccuracyBookletters builds a matcher over the titles in test/books.list
// (lower-cased, one entry per line, bag size 5) and prints the percentage
// returned by AccuracyMutatingLetters. It only reports the figure; no
// accuracy threshold is enforced.
func TestAccuracyBookletters(t *testing.T) {
	bText, err := ioutil.ReadFile("test/books.list")
	if err != nil {
		// Without the fixture the matcher would be built from an empty list
		// and the reported accuracy would be meaningless.
		t.Fatalf("reading test/books.list: %v", err)
	}
	wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
	cm := New(wordsToTest, []int{5})
	accuracy := cm.AccuracyMutatingLetters()
	fmt.Printf("Accuracy with mutating letters in book list:\t%2.1f%%\n", accuracy)
}

// TestAccuracyDictionaryletters builds a matcher over the single-word entries
// in test/popular.txt (lower-cased, one entry per line, bag sizes 2, 3 and 4)
// and prints the percentage returned by AccuracyMutatingLetters. It only
// reports the figure; no accuracy threshold is enforced.
func TestAccuracyDictionaryletters(t *testing.T) {
	bText, err := ioutil.ReadFile("test/popular.txt")
	if err != nil {
		// Without the fixture the matcher would be built from an empty list
		// and the reported accuracy would be meaningless.
		t.Fatalf("reading test/popular.txt: %v", err)
	}
	wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
	cm := New(wordsToTest, []int{2, 3, 4})
	// The test name and the printed label both say "mutating letters", so use
	// the letter-mutation benchmark rather than the word-mutation one.
	accuracy := cm.AccuracyMutatingLetters()
	fmt.Printf("Accuracy with mutating letters in dictionary:\t%2.1f%%\n", accuracy)
}

func TestSaveLoad(t *testing.T) {
cm := New(test.WordsToTest, []int{2, 3})
cm := New(test.WordsToTest, []int{2, 3, 4})
err := cm.Save("test.txt")
if err != nil {
t.Error(err)
Expand Down
Loading

0 comments on commit 56fb6b1

Please sign in to comment.