-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathcod_benchmark_test.go
146 lines (133 loc) · 3.55 KB
/
cod_benchmark_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
package lshensemble
import (
"bufio"
"fmt"
"io/ioutil"
"log"
"math/rand"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// Parameters of the Canadian Open Data benchmark.
const (
// benchmarkSeed is passed to rand.Seed before the query domains are picked.
benchmarkSeed = 42
// fracQuery is the fraction of the loaded domains that is used as queries.
fracQuery = 0.01
// minDomainSize is the minimum number of values a domain must have;
// domains with fewer values are skipped when loading.
minDomainSize = 10
)
var (
// thresholds lists the threshold values the benchmark is run with, one
// full benchmark round per value.
thresholds = []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}
)
// Benchmark_CanadianOpenData loads the domains, selects a fixed fraction of
// them as queries and runs benchmarkCOD once for every value in thresholds.
//
// Running this function requires a `_cod_domains` directory
// in the current directory.
// The `_cod_domains` directory should contain domain files,
// which are line-separated files.
func Benchmark_CanadianOpenData(b *testing.B) {
	// Read raw domains
	start := time.Now()
	var rawDomains []rawDomain
	fmt.Println()
	for domain := range readDomains("_cod_domains") {
		// Ignore domains with fewer than minDomainSize values
		if len(domain.values) < minDomainSize {
			continue
		}
		rawDomains = append(rawDomains, domain)
		// "\r" rewrites the same terminal line to show progress.
		fmt.Printf("\rRead %d domains", len(rawDomains))
	}
	fmt.Println()
	log.Printf("Read %d domains in %s", len(rawDomains), time.Since(start))

	// Select queries
	numQuery := int(fracQuery * float64(len(rawDomains)))
	queries := make([]rawDomain, 0, numQuery)
	// Seed the global source so the same permutation, and therefore the
	// same set of queries, is drawn on every run over the same data.
	rand.Seed(int64(benchmarkSeed))
	for _, i := range rand.Perm(len(rawDomains))[:numQuery] {
		queries = append(queries, rawDomains[i])
	}

	// Run benchmark
	for _, threshold := range thresholds {
		log.Printf("Canadian Open Data benchmark threshold = %.2f", threshold)
		benchmarkCOD(rawDomains, queries, threshold)
	}
}
// benchmarkCOD runs one benchmark round at the given threshold.
//
// It derives three names tagged with the threshold (formatted with two
// decimals) and hands them to benchmarkLinearscan, benchmarkLshEnsemble and
// benchmarkAccuracy, in that order. The names are presumably output file
// paths, with benchmarkAccuracy comparing the first two -- confirm against
// those functions.
func benchmarkCOD(rawDomains, queries []rawDomain, threshold float64) {
	var (
		scanName     = fmt.Sprintf("_cod_linearscan_threshold=%.2f", threshold)
		ensembleName = fmt.Sprintf("_cod_lshensemble_threshold=%.2f", threshold)
		accuracyName = fmt.Sprintf("_cod_accuracy_threshold=%.2f", threshold)
	)

	benchmarkLinearscan(rawDomains, queries, threshold, scanName)
	benchmarkLshEnsemble(rawDomains, queries, threshold, ensembleName)
	benchmarkAccuracy(scanName, ensembleName, accuracyName)
}
// rawDomain is one domain as loaded by readDomains.
type rawDomain struct {
// values is the set of distinct values of the domain: readDomains stores
// every lowercased line of the domain file as a key mapped to true.
values map[string]bool
// key identifies the domain; readDomains sets it to the domain's file name.
key string
}
// byKey orders a slice of domain pointers by ascending key. It provides the
// Len/Swap/Less method set expected by the standard sort package.
type byKey []*rawDomain

// Len reports the number of domains in the slice.
func (s byKey) Len() int {
	return len(s)
}

// Swap exchanges the domains at positions i and j.
func (s byKey) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}

// Less reports whether the key at position i sorts before the key at j.
func (s byKey) Less(i, j int) bool {
	left, right := s[i], s[j]
	return left.key < right.key
}
// readDomains streams the domains stored as files directly inside dir.
//
// The directory listing is read before the function returns; a failure to
// list dir panics in the caller's goroutine. The files are then loaded one at
// a time by a background goroutine and sent on the returned unbuffered
// channel, which is closed after the last file. The caller must drain the
// channel, otherwise that goroutine stays blocked on its send.
//
// Each domain's key is the file name and its values are the distinct
// lowercased lines of the file. Subdirectories are skipped. Any error while
// opening or reading a domain file panics (in the background goroutine).
func readDomains(dir string) chan rawDomain {
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		// Keep the underlying error: "does not exist" and "permission
		// denied" need different fixes.
		msg := fmt.Sprintf("Error reading domain directory %s, does it exist? (%v)", dir, err)
		panic(msg)
	}
	out := make(chan rawDomain)
	go func() {
		defer close(out)
		for _, file := range files {
			// A directory cannot be scanned for lines; without this check
			// it would surface as a read error (or a bogus empty domain).
			if file.IsDir() {
				continue
			}
			key := file.Name()
			out <- rawDomain{
				values: readDomainValues(filepath.Join(dir, key)),
				key:    key,
			}
		}
	}()
	return out
}

// readDomainValues returns the set of distinct lowercased lines of the file
// at path. It panics if the file cannot be opened or fully read.
func readDomainValues(path string) map[string]bool {
	domainFile, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer domainFile.Close()

	values := make(map[string]bool)
	scanner := bufio.NewScanner(domainFile)
	for scanner.Scan() {
		values[strings.ToLower(scanner.Text())] = true
	}
	// Scan returns false on both EOF and failure, so the error has to be
	// checked after the loop; inside the loop it is always nil.
	if err := scanner.Err(); err != nil {
		panic(err)
	}
	return values
}
// queryResult is one record consumed by outputQueryResults, which writes it
// out as a single tab-separated line.
type queryResult struct {
// candidates are written after the key and duration, one column each;
// outputQueryResults type-asserts every element to string.
candidates []interface{}
// queryKey is written as the first column; outputQueryResults
// type-asserts it to string.
queryKey interface{}
// duration is written as the second column using its String form.
// Presumably the time the query took -- confirm against the producers.
duration time.Duration
}
// outputQueryResults drains results and writes one line per result to the
// file outputFilename, which is created or truncated.
//
// Each line has the form
//
//	queryKey \t duration \t candidate1 \t candidate2 ... \n
//
// so a result without candidates still ends in a tab before the newline.
// queryKey and every candidate must hold a string; anything else panics on
// the type assertion. The function returns once results is closed.
//
// It panics if the file cannot be created, or if writing, flushing or closing
// it fails, so a truncated output file is never mistaken for a complete one.
func outputQueryResults(results chan queryResult, outputFilename string) {
	f, err := os.Create(outputFilename)
	if err != nil {
		panic(err)
	}
	// Releases the file if a type assertion below panics. On the normal
	// path the explicit Close at the end runs first and this one is a no-op.
	defer f.Close()

	out := bufio.NewWriter(f)
	for result := range results {
		out.WriteString(result.queryKey.(string))
		out.WriteString("\t")
		out.WriteString(result.duration.String())
		out.WriteString("\t")
		for i, candidate := range result.candidates {
			if i > 0 {
				out.WriteString("\t")
			}
			out.WriteString(candidate.(string))
		}
		out.WriteString("\n")
	}

	// bufio.Writer remembers the first write error and reports it from
	// Flush, so this single check covers every WriteString above.
	if err := out.Flush(); err != nil {
		panic(err)
	}
	if err := f.Close(); err != nil {
		panic(err)
	}
}