-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_baseline.sh
41 lines (35 loc) · 1.09 KB
/
index_baseline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Build one Lucene index per corpus setting: title, simplified, full.
#
# For each setting the script
#   1. runs text2text/filter_corpus.py on data/corpus.jsonl with
#      --setting <setting> and --output_dir data/<setting>_corpus, then
#   2. runs pyserini.index.lucene with data/<setting>_corpus as --input and
#      indexing/trec-pds-<setting>/ as --index.
#
# All paths are relative to the current working directory.
#
# Env:
#   PYTHON  Interpreter used for both steps (default: python3). Set
#           PYTHON=python to use whatever `python` resolves to.

# Stop at the first failing command (or unset variable) so that a failed
# filter step is never followed by indexing of a missing or stale corpus
# directory, and so the script's exit status reflects the failure.
set -eu

# One interpreter for both steps; mixing `python3` and `python` can pick up
# two different installations.
PYTHON="${PYTHON:-python3}"

mkdir -p data indexing

for setting in title simplified full; do
  corpus_dir="data/${setting}_corpus"
  index_dir="indexing/trec-pds-${setting}/"

  "$PYTHON" text2text/filter_corpus.py \
    --input_jsonl data/corpus.jsonl \
    --output_dir "$corpus_dir" \
    --setting "$setting"

  "$PYTHON" -m pyserini.index.lucene \
    --collection JsonCollection \
    --input "$corpus_dir" \
    --index "$index_dir" \
    --generator DefaultLuceneDocumentGenerator \
    --threads 4
done