Skip to content

Commit

Permalink
Readme testcase (#1077)
Browse files Browse the repository at this point in the history
- move integration/resources/sample* into tests/resources
- unit test for the added command line (encode and index) in readme
  • Loading branch information
crystina-z authored Mar 21, 2022
1 parent eb013ea commit 1f6ba77
Show file tree
Hide file tree
Showing 14 changed files with 206 additions and 6 deletions.
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -538,16 +538,18 @@ python -m pyserini.index.faiss \
--pq
```
4. [Flat](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexFlat.html)

This command converts embeddings stored in the `.jsonl` format into the Faiss flat format,
producing the same files as running `pyserini.encode` with `--to-faiss` specified.
```bash
python -m pyserini.index.faiss \
--input path/to/encoded/corpus \ # either in the Faiss or the jsonl format
--input path/to/encoded/corpus \ # in jsonl format
--output path/to/output/index \
```
Note that this would generate the same files with `pyserini.encode` with `--to-faiss` specified.

Once the index is built, you can use `FaissSearcher` to search in the collection:
```python
from pyserini.dsearch import FaissSearcher
from pyserini.search import FaissSearcher

searcher = FaissSearcher(
'indexes/dindex-sample-dpr-multi',
Expand Down
3 changes: 2 additions & 1 deletion pyserini/index/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
parser.add_argument('--efC', type=int, default=256, required=False)
parser.add_argument('--pq', action="store_true", required=False)
parser.add_argument('--pq-m', type=int, default=192, required=False)
parser.add_argument('--pq-nbits', type=int, default=8, required=False)
parser.add_argument('--threads', type=int, default=12, required=False)
args = parser.parse_args()

Expand Down Expand Up @@ -68,7 +69,7 @@
index = faiss.IndexHNSWFlat(args.dim, args.M, faiss.METRIC_INNER_PRODUCT)
index.hnsw.efConstruction = args.efC
elif args.pq:
index = faiss.IndexPQ(args.dim, args.pq_m, 8, faiss.METRIC_INNER_PRODUCT)
index = faiss.IndexPQ(args.dim, args.pq_m, args.pq_nbits, faiss.METRIC_INNER_PRODUCT)
else:
index = faiss.IndexFlatIP(args.dim)
index.verbose = True
Expand Down
File renamed without changes.
File renamed without changes.
87 changes: 85 additions & 2 deletions tests/test_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,33 @@
# limitations under the License.
#

import os
import json
import faiss
import unittest
import pathlib as pl


from pyserini.encode import JsonlCollectionIterator, TctColBertDocumentEncoder, DprDocumentEncoder, \
UniCoilDocumentEncoder
from pyserini.util import get_cache_home


class TestSearch(unittest.TestCase):
def setUp(self):
self.docids = []
self.texts = []
with open('tests/resources/simple_cacm_corpus.json') as f:
self.test_file = 'tests/resources/simple_cacm_corpus.json'

with open(self.test_file) as f:
for line in f:
self.texts.append(json.loads(line)['contents'])
line = json.loads(line)
self.docids.append(line['id'])
self.texts.append(line['contents'])

def assertIsFile(self, path):
    """Fail with an AssertionError unless ``path`` resolves to a regular file."""
    resolved = pl.Path(path).resolve()
    if resolved.is_file():
        return
    raise AssertionError("File does not exist: %s" % str(path))

def test_dpr_encoder(self):
encoder = DprDocumentEncoder('facebook/dpr-ctx_encoder-multiset-base', device='cpu')
Expand All @@ -52,6 +66,75 @@ def test_unicoil_encoder(self):
self.assertAlmostEqual(vectors[2]['rounding'], 3.9474332332611084, places=4)
self.assertAlmostEqual(vectors[2]['commercial'], 3.288801670074463, places=4)

def test_tct_colbert_v2_encoder_cmd(self):
    """Run ``pyserini.encode`` from the command line and check its jsonl output.

    Encodes ``self.test_file`` with the TCT-ColBERT v2 encoder on CPU, then
    verifies that ``embeddings.jsonl`` was written, that its ids and
    contents match the source corpus, and that selected components of the
    first and third vectors equal the recorded reference values.
    """
    cache_dir = get_cache_home()
    index_dir = f'{cache_dir}/temp_index'
    cmd = f'python -m pyserini.encode \
        input   --corpus {self.test_file} \
                --fields text \
        output  --embeddings {index_dir} \
        encoder --encoder castorini/tct_colbert-v2-hnp-msmarco \
                --fields text \
                --batch 1 \
                --device cpu'
    status = os.system(cmd)
    self.assertEqual(status, 0)

    embedding_json_fn = os.path.join(index_dir, 'embeddings.jsonl')
    self.assertIsFile(embedding_json_fn)

    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(embedding_json_fn) as f:
        embeddings = [json.loads(line) for line in f]

    self.assertListEqual([entry["id"] for entry in embeddings], self.docids)
    self.assertListEqual(
        [entry["contents"] for entry in embeddings],
        [entry.strip() for entry in self.texts],
    )

    self.assertAlmostEqual(embeddings[0]['vector'][0], 0.12679848074913025, places=4)
    self.assertAlmostEqual(embeddings[0]['vector'][-1], -0.0037349488120526075, places=4)
    self.assertAlmostEqual(embeddings[2]['vector'][0], 0.03678430616855621, places=4)
    self.assertAlmostEqual(embeddings[2]['vector'][-1], 0.13209162652492523, places=4)

def test_tct_colbert_v2_encoder_cmd_shard(self):
    """Encode the corpus in two shards, merge them, and check the merged index.

    Each shard is encoded with ``--to-faiss`` into ``temp_index-<shard id>``
    and must produce a ``docid`` and an ``index`` file. The shards are then
    merged with ``pyserini.index.merge_faiss_indexes``; the test expects the
    merged files under ``temp_index-full`` and compares the doc ids and
    selected vector components against the reference values.
    """
    cache_dir = get_cache_home()

    for shard_i in range(2):
        index_dir = f'{cache_dir}/temp_index-{shard_i}'
        cmd = f'python -m pyserini.encode \
            input   --corpus {self.test_file} \
                    --fields text \
                    --shard-id {shard_i} \
                    --shard-num 2 \
            output  --embeddings {index_dir} \
                    --to-faiss \
            encoder --encoder castorini/tct_colbert-v2-hnp-msmarco \
                    --fields text \
                    --batch 1 \
                    --device cpu'
        status = os.system(cmd)
        self.assertEqual(status, 0)
        self.assertIsFile(os.path.join(index_dir, 'docid'))
        self.assertIsFile(os.path.join(index_dir, 'index'))

    cmd = f'python -m pyserini.index.merge_faiss_indexes --prefix {cache_dir}/temp_index- --shard-num 2'
    index_dir = f'{cache_dir}/temp_index-full'
    docid_fn = os.path.join(index_dir, 'docid')
    index_fn = os.path.join(index_dir, 'index')

    status = os.system(cmd)
    self.assertEqual(status, 0)
    self.assertIsFile(docid_fn)
    self.assertIsFile(index_fn)

    index = faiss.read_index(index_fn)
    vectors = index.reconstruct_n(0, index.ntotal)

    # Use a context manager so the docid file handle is closed promptly.
    with open(docid_fn, "r") as f:
        merged_docids = [docid.strip() for docid in f]

    self.assertListEqual(merged_docids, self.docids)
    self.assertAlmostEqual(vectors[0][0], 0.12679848074913025, places=4)
    self.assertAlmostEqual(vectors[0][-1], -0.0037349488120526075, places=4)
    self.assertAlmostEqual(vectors[2][0], 0.03678430616855621, places=4)
    self.assertAlmostEqual(vectors[2][-1], 0.13209162652492523, places=4)


class TestJsonlCollectionIterator(unittest.TestCase):
def setUp(self):
Expand Down
114 changes: 114 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import os
import faiss
import unittest
import pathlib as pl


from pyserini.util import get_cache_home


class TestSearch(unittest.TestCase):
    """Command-line tests for building Faiss indexes with ``pyserini.index.faiss``."""

    def setUp(self):
        """Load the document ids and contents of the sample corpus."""
        self.docids = []
        self.texts = []
        self.test_file = 'tests/resources/simple_cacm_corpus.json'
        with open(self.test_file) as f:
            for line in f:
                line = json.loads(line)
                self.docids.append(line['id'])
                self.texts.append(line['contents'])

    def assertIsFile(self, path):
        """Raise AssertionError unless ``path`` resolves to a regular file."""
        if not pl.Path(path).resolve().is_file():
            raise AssertionError("File does not exist: %s" % str(path))

    def prepare_encoded_collection(self):
        """Encode the sample corpus with ``--to-faiss`` and return the output directory.

        Runs ``pyserini.encode`` on CPU with the TCT-ColBERT v2 encoder and
        asserts that the ``docid`` and ``index`` files were produced.
        """
        cache_dir = get_cache_home()
        encoded_corpus_dir = f'{cache_dir}/temp_index'
        cmd = f'python -m pyserini.encode \
            input   --corpus {self.test_file} \
                    --fields text \
            output  --embeddings {encoded_corpus_dir} \
                    --to-faiss \
            encoder --encoder castorini/tct_colbert-v2-hnp-msmarco \
                    --fields text \
                    --batch 1 \
                    --device cpu'
        status = os.system(cmd)
        self.assertEqual(status, 0)
        self.assertIsFile(os.path.join(encoded_corpus_dir, 'docid'))
        self.assertIsFile(os.path.join(encoded_corpus_dir, 'index'))
        return encoded_corpus_dir

    def test_faiss_hnsw(self):
        """Build an HNSW index (``--M 3``) and check doc ids and stored vectors."""
        cache_dir = get_cache_home()
        index_dir = f'{cache_dir}/temp_hnsw'
        encoded_corpus_dir = self.prepare_encoded_collection()
        cmd = f'python -m pyserini.index.faiss \
            --input {encoded_corpus_dir} \
            --output {index_dir} \
            --M 3 \
            --hnsw'

        status = os.system(cmd)
        self.assertEqual(status, 0)

        docid_fn = os.path.join(index_dir, 'docid')
        index_fn = os.path.join(index_dir, 'index')
        self.assertIsFile(docid_fn)
        self.assertIsFile(index_fn)

        index = faiss.read_index(index_fn)
        vectors = index.reconstruct_n(0, index.ntotal)

        # Use a context manager so the docid file handle is closed promptly.
        with open(docid_fn, "r") as f:
            indexed_docids = [docid.strip() for docid in f]

        self.assertListEqual(indexed_docids, self.docids)
        self.assertAlmostEqual(vectors[0][0], 0.12679848074913025, places=4)
        self.assertAlmostEqual(vectors[0][-1], -0.0037349488120526075, places=4)
        self.assertAlmostEqual(vectors[2][0], 0.03678430616855621, places=4)
        self.assertAlmostEqual(vectors[2][-1], 0.13209162652492523, places=4)

    def test_faiss_pq(self):
        """Build a product-quantization index and check doc ids and reconstructed vectors.

        The reference values asserted for documents 0 and 2 are identical.
        """
        cache_dir = get_cache_home()
        index_dir = f'{cache_dir}/temp_pq'
        encoded_corpus_dir = self.prepare_encoded_collection()
        cmd = f'python -m pyserini.index.faiss \
            --input {encoded_corpus_dir} \
            --output {index_dir} \
            --pq-m 3 \
            --efC 1 \
            --pq-nbits 128 \
            --pq'

        status = os.system(cmd)
        self.assertEqual(status, 0)

        docid_fn = os.path.join(index_dir, 'docid')
        index_fn = os.path.join(index_dir, 'index')
        self.assertIsFile(docid_fn)
        self.assertIsFile(index_fn)

        index = faiss.read_index(index_fn)
        vectors = index.reconstruct_n(0, index.ntotal)

        # Use a context manager so the docid file handle is closed promptly.
        with open(docid_fn, "r") as f:
            indexed_docids = [docid.strip() for docid in f]

        self.assertListEqual(indexed_docids, self.docids)
        self.assertAlmostEqual(vectors[0][0], 0.04343192, places=4)
        self.assertAlmostEqual(vectors[0][-1], 0.075478144, places=4)
        self.assertAlmostEqual(vectors[2][0], 0.04343192, places=4)
        self.assertAlmostEqual(vectors[2][-1], 0.075478144, places=4)

0 comments on commit 1f6ba77

Please sign in to comment.