Improve tests, README

schollz · Apr 24, 2017 · 56fb6b1 · 56fb6b1
1 parent 842466b
commit 56fb6b1
Show file tree

Hide file tree

Showing 5 changed files with 303 additions and 56 deletions.
diff --git a/README.md b/README.md
@@ -48,13 +48,13 @@ fmt.Println(cm.ClosestN("kind gizard",3))
 
 ```golang
 // Calculate accuracy
-fmt.Println(cm.Accuracy())
+fmt.Println(cm.AccuracyMutatingWords())
 // ~ 66 % (still way better than Levenshtein which hits 0% with this particular set)
 
 // Improve accuracy by adding more bags
 bagSizes = []int{2, 3, 4}
 cm = closestmatch.New(wordsToTest, bagSizes)
-fmt.Println(cm.Accuracy())
+fmt.Println(cm.AccuracyMutatingWords())
 // accuracy improves to ~ 76 %
 ```
 
@@ -70,9 +70,9 @@ fmt.Println(cm2.Closest("lizard wizard"))
 // prints "The Lizard Wizard"
 ```
 
-### Accuracy and Speed
+### Advantages
 
-*closestmatch* is more accurate than Levenshtein for long strings (like in the test corpus). If you run `go test` the tests will pass which validate that Levenshtein performs < 50% accuracy and *closestmatch* performs with > 90% accuracy (usually it is 95-98%). 
+*closestmatch* is more accurate than Levenshtein for long strings (like in the test corpus). 
 
 *closestmatch* is ~20x faster than [a fast implementation of Levenshtein](https://groups.google.com/forum/#!topic/golang-nuts/YyH1f_qCZVc). Try it yourself with the benchmarks:
 
@@ -92,6 +92,27 @@ BenchmarkClosestOne-8     104603530     4855916       -95.36%
 
 The `New()` function in *closestmatch* is slower than *levenshtein* because precomputation is needed.
 
+### Disadvantages
+
+*closestmatch* is less accurate when matching against lists of single words, such as a dictionary. For comparison:
+
+
+```
+$ cd $GOPATH/src/github.com/schollz/closestmatch && go test
+Accuracy with mutating words in book list:      90.0%
+Accuracy with mutating letters in book list:    100.0%
+Accuracy with mutating letters in dictionary:   38.9%
+```
+
+Levenshtein, by contrast, performs better on a single-word dictionary (64.8% vs. 38.9%) but worse on longer names, like book titles:
+
+```
+$ cd $GOPATH/src/github.com/schollz/closestmatch/levenshtein && go test
+Accuracy with mutating words in book list:      40.0%
+Accuracy with mutating letters in book list:    100.0%
+Accuracy with mutating letters in dictionary:   64.8%
+```
+
 ## License
 
 MIT
diff --git a/closestmatch.go b/closestmatch.go
@@ -17,6 +17,7 @@ type ClosestMatch struct {
 	ID             map[uint32]IDInfo
 }
 
+// IDInfo carries the information about the keys
 type IDInfo struct {
 	Key           string
 	NumSubstrings int
@@ -70,12 +71,12 @@ func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) {
 	for j := range jobs {
 		m := make(map[string]int)
 		if ids, ok := cm.SubstringToID[j.substring]; ok {
-			weight := 100000 / len(ids)
+			weight := 200000 / len(ids)
 			for id := range ids {
 				if _, ok2 := m[cm.ID[id].Key]; !ok2 {
 					m[cm.ID[id].Key] = 0
 				}
-				m[cm.ID[id].Key] += weight
+				m[cm.ID[id].Key] += 1 + 0*weight
 			}
 		}
 		results <- result{m: m}
@@ -179,15 +180,15 @@ func (cm *ClosestMatch) splitWord(word string) map[string]struct{} {
 	return wordHash
 }
 
-// Accuracy runs some basic tests against the wordlist to
+// AccuracyMutatingWords runs some basic tests against the wordlist to
 // see how accurate this bag-of-characters method is against
 // the target dataset
-func (cm *ClosestMatch) Accuracy() float64 {
+func (cm *ClosestMatch) AccuracyMutatingWords() float64 {
 	rand.Seed(1)
 	percentCorrect := 0.0
 	numTrials := 0.0
 
-	for wordTrials := 0; wordTrials < 100; wordTrials++ {
+	for wordTrials := 0; wordTrials < 200; wordTrials++ {
 
 		var testString, originalTestString string
 		testStringNum := rand.Intn(len(cm.ID))
@@ -201,24 +202,20 @@ func (cm *ClosestMatch) Accuracy() float64 {
 			break
 		}
 
-		// remove a random word
-		for trial := 0; trial < 4; trial++ {
-			words := strings.Split(originalTestString, " ")
+		var words []string
+		choice := rand.Intn(3)
+		if choice == 0 {
+			// remove a random word
+			words = strings.Split(originalTestString, " ")
 			if len(words) < 3 {
 				continue
 			}
 			deleteWordI := rand.Intn(len(words))
 			words = append(words[:deleteWordI], words[deleteWordI+1:]...)
 			testString = strings.Join(words, " ")
-			if cm.Closest(testString) == originalTestString {
-				percentCorrect += 1.0
-			}
-			numTrials += 1.0
-		}
-
-		// remove a random word and reverse
-		for trial := 0; trial < 4; trial++ {
-			words := strings.Split(originalTestString, " ")
+		} else if choice == 1 {
+			// remove a random word and reverse
+			words = strings.Split(originalTestString, " ")
 			if len(words) > 1 {
 				deleteWordI := rand.Intn(len(words))
 				words = append(words[:deleteWordI], words[deleteWordI+1:]...)
@@ -229,15 +226,9 @@ func (cm *ClosestMatch) Accuracy() float64 {
 				continue
 			}
 			testString = strings.Join(words, " ")
-			if cm.Closest(testString) == originalTestString {
-				percentCorrect += 1.0
-			}
-			numTrials += 1.0
-		}
-
-		// remove a random word and shuffle and replace random letter
-		for trial := 0; trial < 4; trial++ {
-			words := strings.Split(originalTestString, " ")
+		} else {
+			// remove a random word and shuffle and replace 2 random letters
+			words = strings.Split(originalTestString, " ")
 			if len(words) > 1 {
 				deleteWordI := rand.Intn(len(words))
 				words = append(words[:deleteWordI], words[deleteWordI+1:]...)
@@ -255,18 +246,65 @@ func (cm *ClosestMatch) Accuracy() float64 {
 			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
 			ii = rand.Intn(len(testString))
 			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
-			if cm.Closest(testString) == originalTestString {
-				percentCorrect += 1.0
+		}
+		closest := cm.Closest(testString)
+		if closest == originalTestString {
+			percentCorrect += 1.0
+		} else {
+			//fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
+		}
+		numTrials += 1.0
+	}
+	return 100.0 * percentCorrect / numTrials
+}
+
+// AccuracyMutatingLetters runs some basic tests against the wordlist to
+// see how accurate this bag-of-characters method is against
+// the target dataset when mutating individual letters (adding, removing, changing)
+func (cm *ClosestMatch) AccuracyMutatingLetters() float64 {
+	rand.Seed(1)
+	percentCorrect := 0.0
+	numTrials := 0.0
+
+	for wordTrials := 0; wordTrials < 200; wordTrials++ {
+
+		var testString, originalTestString string
+		testStringNum := rand.Intn(len(cm.ID))
+		i := 0
+		for id := range cm.ID {
+			i++
+			if i != testStringNum {
+				continue
 			}
-			numTrials += 1.0
+			originalTestString = cm.ID[id].Key
+			break
 		}
+		testString = originalTestString
+
+		// letters to replace with
+		letters := "abcdefghijklmnopqrstuvwxyz"
 
-		// test the original string
-		if cm.Closest(testString) == originalTestString {
+		choice := rand.Intn(3)
+		if choice == 0 {
+			// replace random letter
+			ii := rand.Intn(len(testString))
+			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
+		} else if choice == 1 {
+			// delete random letter
+			ii := rand.Intn(len(testString))
+			testString = testString[:ii] + testString[ii+1:]
+		} else {
+			// add random letter
+			ii := rand.Intn(len(testString))
+			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii:]
+		}
+		closest := cm.Closest(testString)
+		if closest == originalTestString {
 			percentCorrect += 1.0
+		} else {
+			//fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
 		}
 		numTrials += 1.0
-
 	}
 
 	return 100.0 * percentCorrect / numTrials

diff --git a/closestmatch_test.go b/closestmatch_test.go
@@ -93,10 +93,10 @@ func ExampleMatchingSimple() {
 }
 
 func ExampleMatchingN() {
-	cm := New(test.WordsToTest, []int{1, 2, 3})
-	fmt.Println(cm.ClosestN("war by hg wells", 3))
+	cm := New(test.WordsToTest, []int{4})
+	fmt.Println(cm.ClosestN("war h.g. wells", 3))
 	// Output:
-	// [the war of the worlds by h. g. wells the time machine by h. g. wells tractatus logico-philosophicus by ludwig wittgenstein]
+	// [the war of the worlds by h. g. wells the time machine by h. g. wells war and peace by graf leo tolstoy]
 }
 
 func ExampleMatchingBigList() {
@@ -109,16 +109,32 @@ func ExampleMatchingBigList() {
 	// island of a thousand mirrors by nayomi munaweera
 }
 
-func TestAccuracy(t *testing.T) {
-	cm := New(test.WordsToTest, []int{1, 2})
-	accuracy := cm.Accuracy()
-	if accuracy < 90 {
-		t.Errorf("Accuracy should be higher than %2.1f", accuracy)
-	}
+func TestAccuracyBookWords(t *testing.T) {
+	bText, _ := ioutil.ReadFile("test/books.list")
+	wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
+	cm := New(wordsToTest, []int{4, 5})
+	accuracy := cm.AccuracyMutatingWords()
+	fmt.Printf("Accuracy with mutating words in book list:\t%2.1f%%\n", accuracy)
+}
+
+func TestAccuracyBookletters(t *testing.T) {
+	bText, _ := ioutil.ReadFile("test/books.list")
+	wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
+	cm := New(wordsToTest, []int{5})
+	accuracy := cm.AccuracyMutatingLetters()
+	fmt.Printf("Accuracy with mutating letters in book list:\t%2.1f%%\n", accuracy)
+}
+
+func TestAccuracyDictionaryletters(t *testing.T) {
+	bText, _ := ioutil.ReadFile("test/popular.txt")
+	wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
+	cm := New(wordsToTest, []int{2, 3, 4})
+	accuracy := cm.AccuracyMutatingWords()
+	fmt.Printf("Accuracy with mutating letters in dictionary:\t%2.1f%%\n", accuracy)
 }
 
 func TestSaveLoad(t *testing.T) {
-	cm := New(test.WordsToTest, []int{2, 3})
+	cm := New(test.WordsToTest, []int{2, 3, 4})
 	err := cm.Save("test.txt")
 	if err != nil {
 		t.Error(err)