"""Converts MS MARCO queries and corpus into TFRecord that will be consumed by BERT.
The main necessary inputs are:
- Passage Collection (tsv file)
- Pairs of Query-Relevant Passage (called qrels in TREC's nomenclature)
- Pairs of Query-Candidate Passage (called run in TREC's nomenclature)
The outputs is a TFRecord file and a text file with query id and doc id mappings.
"""
import collections
import os
import tensorflow as tf
import time
import tokenization


flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_string(
"output_folder", None, "Folder where the TFRecord files will be writen.")
flags.DEFINE_string(
"vocab", None, "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_string(
"collection_path", None,
"Path to the tsv file containing the MS MARCO documents.")
flags.DEFINE_string(
"queries", None, "Path to the <query id; query text> pairs.")
flags.DEFINE_string(
"run", None, "Path to the query id / candidate doc ids pairs.")
flags.DEFINE_string(
"qrels", None, "Path to the query id / relevant doc ids pairs.")
flags.DEFINE_integer(
"max_seq_length", 512,
"The maximum total input sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated, and sequences shorter "
"than this will be padded.")
flags.DEFINE_integer(
"max_query_length", 64,
"The maximum query sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated.")
flags.DEFINE_integer(
"num_docs", 1000, "The number of docs per query.")
def convert_dataset(data, collection, tokenizer):
  """Converts <query, candidate docs> pairs into tf.train.Examples and writes
  them to a TFRecord file, logging query/doc id pairs to a side text file."""
  ids_file = open(os.path.join(FLAGS.output_folder, 'query_doc_ids.txt'), 'w')
  output_path = os.path.join(FLAGS.output_folder, 'dataset.tf')
start_time = time.time()
  # An arbitrary doc id used to pad queries that have fewer than
  # FLAGS.num_docs candidates.
  random_title = list(collection.keys())[0]
with tf.python_io.TFRecordWriter(output_path) as writer:
for i, query_id in enumerate(data):
query, qrels, doc_titles = data[query_id]
query = tokenization.convert_to_unicode(query)
query_ids = tokenization.convert_to_bert_input(
text=query,
max_seq_length=FLAGS.max_query_length,
tokenizer=tokenizer,
add_cls=True)
query_ids_tf = tf.train.Feature(
int64_list=tf.train.Int64List(value=query_ids))
doc_titles = doc_titles[:FLAGS.num_docs]
      # Pad with an arbitrary doc so we always have FLAGS.num_docs docs per
      # query.
      doc_titles += max(0, FLAGS.num_docs - len(doc_titles)) * [random_title]
labels = [
1 if doc_title in qrels else 0
for doc_title in doc_titles
]
if i % 1000 == 0:
print('query: {}; len qrels: {}'.format(query, len(qrels)))
print('sum labels: {}'.format(sum(labels)))
for j, (label, doc_title) in enumerate(zip(labels, doc_titles)):
print('doc {}, label {}, title: {}\n{}\n'.format(
j, label, doc_title, collection[doc_title]))
print()
doc_token_ids = [
tokenization.convert_to_bert_input(
text=tokenization.convert_to_unicode(collection[doc_title]),
max_seq_length=FLAGS.max_seq_length - len(query_ids),
tokenizer=tokenizer,
add_cls=False)
for doc_title in doc_titles
]
for doc_token_id, label, doc_title in zip(
doc_token_ids, labels, doc_titles):
ids_file.write('{}\t{}\n'.format(query_id, doc_title))
doc_ids_tf = tf.train.Feature(
int64_list=tf.train.Int64List(value=doc_token_id))
labels_tf = tf.train.Feature(
int64_list=tf.train.Int64List(value=[label]))
len_gt_titles_tf = tf.train.Feature(
int64_list=tf.train.Int64List(value=[len(qrels)]))
features = tf.train.Features(feature={
'query_ids': query_ids_tf,
'doc_ids': doc_ids_tf,
'label': labels_tf,
'len_gt_titles': len_gt_titles_tf,
})
example = tf.train.Example(features=features)
writer.write(example.SerializeToString())
if i % 1000 == 0:
print('wrote {} of {} queries'.format(i, len(data)))
time_passed = time.time() - start_time
est_hours = (len(data) - i) * time_passed / (max(1.0, i) * 3600)
print('estimated total hours to save: {}'.format(est_hours))
  ids_file.close()


def load_qrels(path):
  """Loads qrels into a dict of key: query_id, value: set of relevant doc ids."""
qrels = collections.defaultdict(set)
with open(path) as f:
for i, line in enumerate(f):
query_id, _, doc_id, relevance = line.rstrip().split('\t')
if int(relevance) >= 1:
qrels[query_id].add(doc_id)
if i % 1000 == 0:
print('Loading qrels {}'.format(i))
  return qrels


def load_queries(path):
  """Loads queries into a dict of key: query_id, value: query text."""
queries = {}
with open(path) as f:
for i, line in enumerate(f):
query_id, query = line.rstrip().split('\t')
queries[query_id] = query
if i % 1000 == 0:
print('Loading queries {}'.format(i))
  return queries


def load_run(path):
  """Loads run into a dict of key: query_id, value: list of candidate doc ids."""
# We want to preserve the order of runs so we can pair the run file with the
# TFRecord file.
run = collections.OrderedDict()
with open(path) as f:
for i, line in enumerate(f):
query_id, doc_title, rank = line.split('\t')
if query_id not in run:
run[query_id] = []
run[query_id].append((doc_title, int(rank)))
if i % 1000000 == 0:
print('Loading run {}'.format(i))
# Sort candidate docs by rank.
sorted_run = collections.OrderedDict()
for query_id, doc_titles_ranks in run.items():
    doc_titles_ranks.sort(key=lambda x: x[1])
    doc_titles = [doc_title for doc_title, _ in doc_titles_ranks]
sorted_run[query_id] = doc_titles
  return sorted_run


def merge(qrels, run, queries):
  """Merges qrels, run, and queries into a single dict of key: query_id,
  value: tuple(query text, relevant doc ids, candidate doc ids)."""
data = collections.OrderedDict()
for query_id, candidate_doc_ids in run.items():
query = queries[query_id]
relevant_doc_ids = set()
if qrels:
relevant_doc_ids = qrels[query_id]
data[query_id] = (query, relevant_doc_ids, candidate_doc_ids)
  return data


def load_collection(path):
  """Loads tsv collection into a dict of key: doc id, value: doc text."""
collection = {}
with open(path) as f:
for i, line in enumerate(f):
doc_id, doc_text = line.rstrip().split('\t')
collection[doc_id] = doc_text.replace('\n', ' ')
if i % 1000000 == 0:
print('Loading collection, doc {}'.format(i))
  return collection


def main(_):
print('Loading Tokenizer...')
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab, do_lower_case=True)
if not os.path.exists(FLAGS.output_folder):
os.mkdir(FLAGS.output_folder)
qrels = None
if FLAGS.qrels:
qrels = load_qrels(path=FLAGS.qrels)
queries = load_queries(path=FLAGS.queries)
run = load_run(path=FLAGS.run)
data = merge(qrels=qrels, run=run, queries=queries)
print('Loading Collection...')
collection = load_collection(FLAGS.collection_path)
print('Converting to TFRecord...')
convert_dataset(data=data, collection=collection, tokenizer=tokenizer)
  print('Done!')


if __name__ == '__main__':
flags.mark_flag_as_required('output_folder')
flags.mark_flag_as_required('collection_path')
flags.mark_flag_as_required('vocab')
flags.mark_flag_as_required('queries')
flags.mark_flag_as_required('run')
tf.app.run(main)