Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to associate an object to the match. #14

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@

# THIS IS A MODIFIED VERSION OF CLOSESTMATCH! THE DOCUMENTATION BELOW WILL BE INACCURATE UNTIL I UPDATE IT. This fork allows an arbitrary value (`interface{}`) to be attached to each searchable item, so a match can return anything we want.

# closestmatch :page_with_curl:

<a href="#"><img src="https://img.shields.io/badge/version-2.1.0-brightgreen.svg?style=flat-square" alt="Version"></a>
Expand Down
45 changes: 29 additions & 16 deletions closestmatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,26 @@ type ClosestMatch struct {
// IDInfo is the per-candidate record stored in ClosestMatch.ID, one entry
// for every key passed to New.
type IDInfo struct {
// Key is the candidate string exactly as supplied to New (not lowercased).
Key string
// NumSubstrings is the number of substrings splitWord produced for the
// lowercased Key.
NumSubstrings int
// Data is the caller-supplied value associated with Key in the map given
// to New; it is stored as-is.
Data interface{}
}

// New returns a new structure for performing closest matches
func New(possible []string, subsetSize []int) *ClosestMatch {
func New(possible map[string]interface{}, subsetSize []int) *ClosestMatch {
cm := new(ClosestMatch)
cm.SubstringSizes = subsetSize
cm.SubstringToID = make(map[string]map[uint32]struct{})
cm.ID = make(map[uint32]IDInfo)
for i, s := range possible {
substrings := cm.splitWord(strings.ToLower(s))
cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)}
i := 0
for k, m := range possible {
substrings := cm.splitWord(strings.ToLower(k))
cm.ID[uint32(i)] = IDInfo{Key: k, NumSubstrings: len(substrings), Data: m}
for substring := range substrings {
if _, ok := cm.SubstringToID[substring]; !ok {
cm.SubstringToID[substring] = make(map[uint32]struct{})
}
cm.SubstringToID[substring][uint32(i)] = struct{}{}
}
i++
}

return cm
Expand Down Expand Up @@ -77,16 +80,23 @@ func (cm *ClosestMatch) Save(filename string) error {
return enc.Encode(cm)
}

// workerResult pairs the score accumulated for one candidate key with the
// data attached to that key, so the data travels with the score through
// worker, match and rankByWordCount.
type workerResult struct {
// Value is the running match score; worker starts it at 0 and adds to it,
// and match sums the values reported for the same key.
Value int
// Data is copied from the candidate's IDInfo.Data.
Data interface{}
}

func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) {
for j := range jobs {
m := make(map[string]int)
m := make(map[string]workerResult)
if ids, ok := cm.SubstringToID[j.substring]; ok {
weight := 1000 / len(ids)
for id := range ids {
if _, ok2 := m[cm.ID[id].Key]; !ok2 {
m[cm.ID[id].Key] = 0
m[cm.ID[id].Key] = workerResult{Value: 0, Data: cm.ID[id].Data}
}
m[cm.ID[id].Key] += 1 + 1000/len(cm.ID[id].Key) + weight
item := m[cm.ID[id].Key]
item.Value += 1 + 1000/len(cm.ID[id].Key) + weight
m[cm.ID[id].Key] = item
}
}
results <- result{m: m}
Expand All @@ -98,10 +108,10 @@ type job struct {
}

type result struct {
m map[string]int
m map[string]workerResult
}

func (cm *ClosestMatch) match(searchWord string) map[string]int {
func (cm *ClosestMatch) match(searchWord string) map[string]workerResult {
searchSubstrings := cm.splitWord(searchWord)
searchSubstringsLen := len(searchSubstrings)

Expand All @@ -118,12 +128,14 @@ func (cm *ClosestMatch) match(searchWord string) map[string]int {
}
close(jobs)

m := make(map[string]int)
m := make(map[string]workerResult)
for a := 1; a <= searchSubstringsLen; a++ {
r := <-results
for key := range r.m {
if _, ok := m[key]; ok {
m[key] += r.m[key]
x := m[key]
x.Value += r.m[key].Value
m[key] = x
} else {
m[key] = r.m[key]
}
Expand All @@ -142,22 +154,22 @@ func (cm *ClosestMatch) Closest(searchWord string) string {
}

// ClosestN searches for the `searchWord` and returns the n closest matches
func (cm *ClosestMatch) ClosestN(searchWord string, max int) []string {
matches := make([]string, 0, max)
func (cm *ClosestMatch) ClosestN(searchWord string, max int) []interface{} {
matches := make([]interface{}, 0, max)
for i, pair := range rankByWordCount(cm.match(searchWord)) {
if i >= max {
break
}
matches = append(matches, pair.Key)
matches = append(matches, pair.Data)
}
return matches
}

func rankByWordCount(wordFrequencies map[string]int) PairList {
func rankByWordCount(wordFrequencies map[string]workerResult) PairList {
pl := make(PairList, len(wordFrequencies))
i := 0
for k, v := range wordFrequencies {
pl[i] = Pair{k, v}
pl[i] = Pair{k, v.Value, v.Data}
i++
}
sort.Sort(sort.Reverse(pl))
Expand All @@ -167,6 +179,7 @@ func rankByWordCount(wordFrequencies map[string]int) PairList {
// Pair is one entry of a PairList: a candidate key together with its score
// and attached data, as built by rankByWordCount.
type Pair struct {
// Key is the candidate string (the map key from the match results).
Key string
// Value is the candidate's score, copied from workerResult.Value.
// NOTE(review): presumably the field PairList sorts on — confirm against
// the PairList methods.
Value int
// Data is the value attached to the candidate, copied from
// workerResult.Data; ClosestN returns this field for each match.
Data interface{}
}

type PairList []Pair
Expand Down
91 changes: 56 additions & 35 deletions closestmatch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,17 @@ import (
"strings"
"testing"

"github.com/schollz/closestmatch/test"
"github.com/Yugloocamai/closestmatch/test"
)

func BenchmarkNew(b *testing.B) {
for i := 0; i < b.N; i++ {
New(test.WordsToTest, []int{3})
New(test.BooksToTest, []int{3})
}
}

func BenchmarkSplitOne(b *testing.B) {
cm := New(test.WordsToTest, []int{3})
cm := New(test.BooksToTest, []int{3})
searchWord := test.SearchWords[0]
b.ResetTimer()
for i := 0; i < b.N; i++ {
Expand All @@ -26,8 +26,8 @@ func BenchmarkSplitOne(b *testing.B) {

func BenchmarkClosestOne(b *testing.B) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{3})
books := test.GetBooks(string(bText))
cm := New(books, []int{3})
searchWord := test.SearchWords[0]
b.ResetTimer()
for i := 0; i < b.N; i++ {
Expand All @@ -37,8 +37,8 @@ func BenchmarkClosestOne(b *testing.B) {

func BenchmarkClosest3(b *testing.B) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{3})
books := test.GetBooks(string(bText))
cm := New(books, []int{3})
searchWord := test.SearchWords[0]
b.ResetTimer()
for i := 0; i < b.N; i++ {
Expand All @@ -48,8 +48,8 @@ func BenchmarkClosest3(b *testing.B) {

func BenchmarkClosest30(b *testing.B) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{3})
books := test.GetBooks(string(bText))
cm := New(books, []int{3})
searchWord := test.SearchWords[0]
b.ResetTimer()
for i := 0; i < b.N; i++ {
Expand All @@ -59,8 +59,8 @@ func BenchmarkClosest30(b *testing.B) {

func BenchmarkFileLoad(b *testing.B) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{3, 4})
books := test.GetBooks(string(bText))
cm := New(books, []int{3, 4})
cm.Save("test/books.list.cm.gz")
b.ResetTimer()
for i := 0; i < b.N; i++ {
Expand All @@ -70,16 +70,22 @@ func BenchmarkFileLoad(b *testing.B) {

func BenchmarkFileSave(b *testing.B) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{3, 4})
books := test.GetBooks(string(bText))
cm := New(books, []int{3, 4})
b.ResetTimer()
for i := 0; i < b.N; i++ {
cm.Save("test/books.list.cm.gz")
}
}

func ExampleMatchingSmall() {
cm := New([]string{"love", "loving", "cat", "kit", "cats"}, []int{4})
loveCats := make(map[string]interface{})
loveCats["love"] = map[string]string{"name": "love"}
loveCats["loving"] = map[string]string{"name": "loving"}
loveCats["cat"] = map[string]string{"name": "cat"}
loveCats["kit"] = map[string]string{"name": "kit"}
loveCats["cats"] = map[string]string{"name": "cats"}
cm := New(loveCats, []int{4})
fmt.Println(cm.splitWord("love"))
fmt.Println(cm.splitWord("kit"))
fmt.Println(cm.Closest("kit"))
Expand All @@ -91,7 +97,13 @@ func ExampleMatchingSmall() {
}

func ExampleMatchingSimple() {
cm := New(test.WordsToTest, []int{3})

booksLines := strings.Split(strings.ToLower(test.Books), "\n")
wordsToTest := make(map[string]interface{})
for _, v := range booksLines {
wordsToTest[v] = map[string]string{"words": v}
}
cm := New(wordsToTest, []int{3})
for _, searchWord := range test.SearchWords {
fmt.Printf("'%s' matched '%s'\n", searchWord, cm.Closest(searchWord))
}
Expand All @@ -100,21 +112,26 @@ func ExampleMatchingSimple() {
// 'mysterious afur at styles by christie' matched 'the mysterious affair at styles by agatha christie'
// 'hard times by charles dickens' matched 'hard times by charles dickens'
// 'complete william shakespeare' matched 'the complete works of william shakespeare by william shakespeare'
// 'war by hg wells' matched 'the war of the worlds by h. g. wells'
// 'War by HG Wells' matched 'the war of the worlds by h. g. wells'

}

func ExampleMatchingN() {
cm := New(test.WordsToTest, []int{4})
fmt.Println(cm.ClosestN("war h.g. wells", 3))
cm := New(test.BooksToTest, []int{4})
results := cm.ClosestN("war h.g. wells", 3)
var slice []string
for _, v := range results {
slice = append(slice, v.(map[string]string)["name"])
}
fmt.Println(slice)
// Output:
// [the war of the worlds by h. g. wells the time machine by h. g. wells war and peace by graf leo tolstoy]
}

func ExampleMatchingBigList() {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{3})
books := test.GetBooks(string(bText))
cm := New(books, []int{3})
searchWord := "island of a thod mirrors"
fmt.Println(cm.Closest(searchWord))
// Output:
Expand All @@ -123,12 +140,12 @@ func ExampleMatchingBigList() {

func ExampleMatchingCatcher() {
bText, _ := ioutil.ReadFile("test/catcher.txt")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{5})
books := test.GetBooks(string(bText))
cm := New(books, []int{5})
searchWord := "catcher in the rye by jd salinger"
for i, match := range cm.ClosestN(searchWord, 3) {
if i == 2 {
fmt.Println(match)
fmt.Println(match.(map[string]string)["name"])
}
}
// Output:
Expand All @@ -137,12 +154,12 @@ func ExampleMatchingCatcher() {

func ExampleMatchingPotter() {
bText, _ := ioutil.ReadFile("test/potter.txt")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{5})
books := test.GetBooks(string(bText))
cm := New(books, []int{5})
searchWord := "harry potter and the half blood prince by j.k. rowling"
for i, match := range cm.ClosestN(searchWord, 3) {
if i == 1 {
fmt.Println(match)
fmt.Println(match.(map[string]string)["name"])
}
}
// Output:
Expand All @@ -151,36 +168,40 @@ func ExampleMatchingPotter() {

func TestAccuracyBookWords(t *testing.T) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{4, 5})
books := test.GetBooks(string(bText))
cm := New(books, []int{4, 5})
accuracy := cm.AccuracyMutatingWords()
fmt.Printf("Accuracy with mutating words in book list:\t%2.1f%%\n", accuracy)
}

func TestAccuracyBookLetters(t *testing.T) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
cm := New(wordsToTest, []int{5})
books := test.GetBooks(string(bText))
cm := New(books, []int{5})
accuracy := cm.AccuracyMutatingLetters()
fmt.Printf("Accuracy with mutating letters in book list:\t%2.1f%%\n", accuracy)
}

func TestAccuracyDictionaryLetters(t *testing.T) {
bText, _ := ioutil.ReadFile("test/popular.txt")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
words := strings.Split(strings.ToLower(string(bText)), "\n")
wordsToTest := make(map[string]interface{})
for _, v := range words {
wordsToTest[v] = map[string]string{"word": v}
}
cm := New(wordsToTest, []int{2, 3, 4})
accuracy := cm.AccuracyMutatingWords()
fmt.Printf("Accuracy with mutating letters in dictionary:\t%2.1f%%\n", accuracy)
}

func TestSaveLoad(t *testing.T) {
bText, _ := ioutil.ReadFile("test/books.list")
wordsToTest := strings.Split(strings.ToLower(string(bText)), "\n")
books := test.GetBooks(string(bText))
type TestStruct struct {
cm *ClosestMatch
}
tst := new(TestStruct)
tst.cm = New(wordsToTest, []int{5})
tst.cm = New(books, []int{5})
err := tst.cm.Save("test.gob")
if err != nil {
t.Error(err)
Expand All @@ -191,8 +212,8 @@ func TestSaveLoad(t *testing.T) {
if err != nil {
t.Error(err)
}
answer2 := tst2.cm.Closest("war of the worlds by hg wells")
answer1 := tst.cm.Closest("war of the worlds by hg wells")
answer2 := tst2.cm.Closest("war of the worlds")
answer1 := tst.cm.Closest("war of the worlds")
if answer1 != answer2 {
t.Errorf("Differing answers: '%s' '%s'", answer1, answer2)
}
Expand Down
Loading