-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathocr-evaluation-strict.py
85 lines (68 loc) · 2.4 KB
/
ocr-evaluation-strict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import fileinput
import codecs
import json
import csv
import os
import sys
import json
from collections import defaultdict
# ---------------------------------------------------------------------------
# Command-line arguments
#   argv[1] dd_output_tsv    : TSV file with (docid, candidate_id, word) rows
#   argv[2] eval_path_base   : directory containing one <docid>.seq file per doc
#   argv[3] output_stat_path : file that receives the per-document statistics
#   argv[4] doclist          : optional file, one entry per line, loaded into
#                              docid_filter (presumably docids to restrict to)
# ---------------------------------------------------------------------------
dd_output_tsv = ''
eval_path_base = ''
output_stat_path = ''
docid_filter = None  # stays None when no doclist argument is given
if len(sys.argv) >= 4:
    dd_output_tsv = sys.argv[1]
    eval_path_base = sys.argv[2]
    output_stat_path = sys.argv[3]
    if len(sys.argv) >= 5:
        doclist = sys.argv[4]
        # Use a context manager so the doclist handle is closed right away
        # instead of being leaked until interpreter shutdown.
        with open(doclist) as doclist_file:
            docid_filter = [entry.strip() for entry in doclist_file]
else:
    # Single-string print calls emit exactly the same text as the former
    # comma-separated print statements (including the double space after
    # "pypy") and are valid under both Python 2 and Python 3.
    print('Usage: %s dd_output_tsv eval_path_base output_stat_path [doclist]'
          % sys.argv[0])
    print('e.g. pypy  %s /tmp/ocr-output-words-cuneiform-all.tsv '
          'data/test-evaluation/ eval-results-cuni.txt' % sys.argv[0])
    sys.exit(1)
# GENERATE DATA with one pass
#
# Streams the candidate TSV row by row (docid <TAB> candidate_id <TAB> word)
# and groups the words per document and per candidate:
#   doc_candid_word_index: docid: { candid : [w1,w2,..] }
#   doc_candidate_ids:     docid: [ candid1, candid2.. ]   (first-seen order)
#
# Raises ValueError (with file name and line number) for a non-blank row that
# does not have exactly three tab-separated fields.
doc_candid_word_index = {}
doc_candidate_ids = {}

# docid_filter is read from the optional doclist argument but was never
# applied anywhere; honour it here. None means "keep every document".
# NOTE(review): assumes the doclist entries use the same docid spelling as the
# first TSV column -- confirm against the data.
wanted_docids = None if docid_filter is None else set(docid_filter)

# Iterate over the file object instead of readlines() so the whole TSV is not
# held in memory, and let the context manager close the handle.
with open(dd_output_tsv) as tsv_file:
    for lineno, raw_line in enumerate(tsv_file, 1):
        line = raw_line.strip().split('\t')
        if line == ['']:
            # Blank line (e.g. trailing newline at end of file): nothing to do.
            continue
        if len(line) != 3:
            raise ValueError(
                '%s:%d: expected 3 tab-separated fields '
                '(docid, candidate_id, word), got %d'
                % (dd_output_tsv, lineno, len(line)))
        docid, candidate_id, word = line
        if wanted_docids is not None and docid not in wanted_docids:
            continue
        if docid not in doc_candid_word_index:
            doc_candid_word_index[docid] = {}
            doc_candidate_ids[docid] = []
        data = doc_candid_word_index[docid]
        if candidate_id not in data:
            data[candidate_id] = []
            # Remember the order in which candidates first appear, since a
            # plain dict does not guarantee ordering under Python 2.
            doc_candidate_ids[docid].append(candidate_id)
        data[candidate_id].append(word)
# Reshape the per-document index into the structure passed to the matcher:
#   eval_data: docid: [ [(candid, [w1,w2,..])], [(candid, [w1,w2,..])], .. ]
# Each candidate becomes a one-element list holding a (candidate_id, words)
# tuple; candidates keep the order recorded in doc_candidate_ids.
eval_data = {}
for doc_key, words_by_candidate in doc_candid_word_index.items():
    eval_data[doc_key] = [
        [(cand_key, words_by_candidate[cand_key])]
        for cand_key in doc_candidate_ids[doc_key]
    ]
print(' '.join(['Finished processing', str(len(eval_data)), 'documents']))
sys.path.append('util/')
import candmatch  # Use our script here

# Match every document's candidates against its reference sequence
# (<eval_path_base>/<docid>.seq, one entry per line) and write one
# tab-separated row per document to output_stat_path:
#   docid <TAB> matches <TAB> len(eval_sequence) <TAB> matches/len(eval_sequence)
# Documents without a .seq file are reported on stdout and skipped.
#
# The context manager guarantees the statistics file is flushed and closed
# even if matching raises part-way through.
with open(output_stat_path, 'w') as fout:
    # Sorted iteration makes the output order reproducible; plain dict order
    # is arbitrary under Python 2.
    for docid in sorted(eval_data):
        data = eval_data[docid]
        evalpath = eval_path_base + '/' + docid + '.seq'
        if not os.path.exists(evalpath):
            print('Error: cannot find path: %s' % evalpath)
            continue
        with open(evalpath) as seq_file:
            eval_sequence = [entry.rstrip('\n') for entry in seq_file]
        print('Matching %s ...' % docid)
        # candmatch.Match returns a 5-tuple; only the match count is used here.
        matches, _matched_candidate_ids, _f, _path, _records = \
            candmatch.Match(data, eval_sequence)
        total = len(eval_sequence)
        # An empty .seq file used to abort the whole run with
        # ZeroDivisionError; report a ratio of 0.0 for it instead.
        ratio = matches / float(total) if total else 0.0
        # Same text the former "print >>sys.stderr, ..." statement produced
        # (note the two spaces before MATCHES).
        sys.stderr.write('DOCID: %s  MATCHES: %s / %d (%.4f)\n'
                         % (docid, matches, total, ratio))
        fout.write('\t'.join([str(x) for x in [docid,
                                               matches,
                                               total,
                                               ratio]]) + '\n')