First, I wanted to say great work on bm25-pt. It is a very interesting implementation, and the idea of pre-computing scores and using sparse multiplication inspired a personal project of mine.
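For context, here is a rough sketch of how I understood that idea in my own project. This is not bm25-pt's actual implementation: the function names, the k1/b defaults, the use of scipy.sparse instead of torch sparse tensors, and the binary query vectors are all my own simplifications.

import numpy as np
from scipy import sparse

def precompute_bm25_matrix(token_id_docs, vocab_size, k1=1.5, b=0.75):
    # token_id_docs: list of token-id lists, one per document
    n_docs = len(token_id_docs)
    doc_lens = np.array([len(doc) for doc in token_id_docs], dtype=np.float64)
    avgdl = doc_lens.mean()

    # term-frequency matrix (n_docs x vocab_size), stored sparsely
    rows, cols, vals = [], [], []
    for d, doc in enumerate(token_id_docs):
        ids, counts = np.unique(doc, return_counts=True)
        rows.extend([d] * len(ids))
        cols.extend(ids.tolist())
        vals.extend(counts.tolist())
    tf = sparse.csr_matrix((vals, (rows, cols)), shape=(n_docs, vocab_size), dtype=np.float64)

    # document frequency and the usual BM25 idf
    df = np.bincount(tf.indices, minlength=vocab_size)
    idf = np.log(1.0 + (n_docs - df + 0.5) / (df + 0.5))

    # replace each stored tf value with its full BM25 contribution,
    # so scoring later is just a sparse matrix product
    scores = tf.copy()
    for d in range(n_docs):
        lo, hi = scores.indptr[d], scores.indptr[d + 1]
        f = scores.data[lo:hi]
        norm = f + k1 * (1.0 - b + b * doc_lens[d] / avgdl)
        scores.data[lo:hi] = idf[scores.indices[lo:hi]] * f * (k1 + 1.0) / norm
    return scores

def score_queries(score_matrix, token_id_queries, vocab_size):
    # binary query-term matrix (n_queries x vocab_size); one sparse
    # multiplication then scores every query against every document
    rows, cols = [], []
    for q, query in enumerate(token_id_queries):
        for t in set(query):
            rows.append(q)
            cols.append(t)
    q_mat = sparse.csr_matrix(
        (np.ones(len(rows)), (rows, cols)),
        shape=(len(token_id_queries), vocab_size),
    )
    return (q_mat @ score_matrix.T).toarray()  # dense (n_queries, n_docs)

Storing the per-(document, term) BM25 contributions up front is what turns query-time scoring into a single sparse matrix product, which is the part of bm25-pt I found so appealing.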
I've attempted to benchmark a few algorithms, including bm25-pt, on the public BEIR datasets. For bm25-pt, I found that it runs out of memory on the larger datasets, including NQ, MSMARCO, HotpotQA, etc.

Here's one of the notebooks, specifically for MSMARCO. As shown in the logs, it runs out of memory (30 GB) before completing.

Is this normal behavior, or might there be an issue with my implementation or benchmarking method?
Here's a code snippet (full code below):
# ...
timer = Timer("[bm25-pt]")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

t = timer.start("Tokenize Corpus")
tokenized_corpus = utils.huggingface.batch_tokenize(tokenizer, corpus_lst)
timer.stop(t, show=True, n_total=len(corpus_lst))

t = timer.start("Tokenize Queries")
queries_tokenized = utils.huggingface.batch_tokenize(tokenizer, queries_lst)
timer.stop(t, show=True, n_total=len(queries_lst))

num_tokens = sum(len(doc) for doc in tokenized_corpus)
print(f"Number of Tokens: {num_tokens:,}")
print(f"Number of Tokens / Doc: {num_tokens / len(corpus_lst):.2f}")
print("-" * 50)

t = timer.start("Index")
model = bm25_pt.BM25(tokenizer=tokenizer, device='cpu')
model.index(corpus_lst)
timer.stop(t)

# we deduct the time taken to tokenize the corpus from the indexing time
timer.results['Index']['elapsed'] -= timer.elapsed("Tokenize Corpus")
# We can now show the time taken to index the corpus
timer.show("Index", n_total=len(corpus_lst))

results = []
scores = []

t_score = timer.start("Score")
timer.pause("Score")
t_query = timer.start("Query")

batches = get_batches(queries_lst, batch_size=batch_size)
num_batches = len(queries_lst) // batch_size + 1

for batch in tqdm(batches, total=num_batches, desc="bm25-pt Scoring", leave=False, disable=not verbose):
    timer.resume(t_score)
    raw_scores_batch = model.score_batch(batch)
    timer.pause(t_score)
    raw_scores_batch = raw_scores_batch.cpu().numpy()

    for raw_scores in raw_scores_batch:
        result, score = compute_top_k_from_scores(
            raw_scores, corpus=corpus_ids, k=top_k, with_scores=True
        )
        results.append(result)
        scores.append(score)
# ...
See full code
import json
import os
from pathlib import Path
import time

import beir.util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
import numpy as np
from tqdm.auto import tqdm
import Stemmer

import bm25_pt
from transformers import AutoTokenizer

from utils.benchmark import get_max_memory_usage, Timer
import utils.huggingface
from utils.beir import (
    BASE_URL,
    clean_results_keys,
    merge_cqa_dupstack,
    postprocess_results_for_eval,
)
def get_batches(lst, batch_size=32):
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]
def compute_top_k_from_scores(scores, corpus=None, k=10, sorting=False, with_scores=False):
    if not isinstance(scores, np.ndarray):
        scores = np.array(scores)

    # top_n = np.argsort(scores)[::-1][:n]
    top_n = np.argpartition(scores, -k)
    # use np.take to select the last k elements
    top_n = np.take(top_n, np.argsort(scores[top_n])[-k:])

    if sorting:
        # sort in descending order
        top_n = top_n[np.argsort(scores[top_n])][::-1]

    if corpus is None:
        results = top_n
    else:
        results = [corpus[i] for i in top_n]

    if with_scores:
        top_scores = scores[top_n]
        return results, top_scores
    else:
        return results


def main(dataset, n_threads=1, top_k=1000, batch_size=32, save_dir="datasets", result_dir="results", verbose=False):
    #### Download dataset and unzip the dataset
    data_path = beir.util.download_and_unzip(BASE_URL.format(dataset), save_dir)

    if dataset == "cqadupstack":
        merge_cqa_dupstack(data_path)

    if dataset == "msmarco":
        split = "dev"
    else:
        split = "test"

    corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split=split)

    corpus_ids, corpus_lst = [], []
    for key, val in corpus.items():
        corpus_ids.append(key)
        corpus_lst.append(val["title"] + " " + val["text"])

    qids, queries_lst = [], []
    for key, val in queries.items():
        qids.append(key)
        queries_lst.append(val)

    print("=" * 50)
    print("Dataset: ", dataset)
    print(f"Corpus Size: {len(corpus_lst):,}")
    print(f"Queries Size: {len(queries_lst):,}")
    timer = Timer("[bm25-pt]")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    t = timer.start("Tokenize Corpus")
    tokenized_corpus = utils.huggingface.batch_tokenize(tokenizer, corpus_lst)
    timer.stop(t, show=True, n_total=len(corpus_lst))

    t = timer.start("Tokenize Queries")
    queries_tokenized = utils.huggingface.batch_tokenize(tokenizer, queries_lst)
    timer.stop(t, show=True, n_total=len(queries_lst))

    num_tokens = sum(len(doc) for doc in tokenized_corpus)
    print(f"Number of Tokens: {num_tokens:,}")
    print(f"Number of Tokens / Doc: {num_tokens / len(corpus_lst):.2f}")
    print("-" * 50)

    t = timer.start("Index")
    model = bm25_pt.BM25(tokenizer=tokenizer, device='cpu')
    model.index(corpus_lst)
    timer.stop(t)

    # we deduct the time taken to tokenize the corpus from the indexing time
    timer.results['Index']['elapsed'] -= timer.elapsed("Tokenize Corpus")
    # We can now show the time taken to index the corpus
    timer.show("Index", n_total=len(corpus_lst))
    results = []
    scores = []

    t_score = timer.start("Score")
    timer.pause("Score")
    t_query = timer.start("Query")

    batches = get_batches(queries_lst, batch_size=batch_size)
    num_batches = len(queries_lst) // batch_size + 1

    for batch in tqdm(batches, total=num_batches, desc="bm25-pt Scoring", leave=False, disable=not verbose):
        timer.resume(t_score)
        raw_scores_batch = model.score_batch(batch)
        timer.pause(t_score)
        raw_scores_batch = raw_scores_batch.cpu().numpy()

        for raw_scores in raw_scores_batch:
            result, score = compute_top_k_from_scores(
                raw_scores, corpus=corpus_ids, k=top_k, with_scores=True
            )
            results.append(result)
            scores.append(score)
    queried_results = np.array(results)
    queried_scores = np.array(scores)

    timer.stop(t_score)
    timer.stop(t_query)

    # we deduct the time taken to tokenize the queries from the scoring time
    timer.results['Score']['elapsed'] -= timer.elapsed("Tokenize Queries")
    timer.results['Query']['elapsed'] -= timer.elapsed("Tokenize Queries")
    # We can now show the time taken to score the queries
    timer.show("Score", n_total=len(queries_lst))
    timer.show("Query", n_total=len(queries_lst))

    results_dict = postprocess_results_for_eval(queried_results, queried_scores, qids)
    ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(
        qrels, results_dict, [1, 10, 100, 1000]
    )

    max_mem_gb = get_max_memory_usage("GB")

    print("-" * 50)
    print(f"Max Memory Usage: {max_mem_gb:.4f} GB")
    print(ndcg)
    print(recall)
    print("=" * 50)
    # Save everything to json
    save_dict = {
        "model": "bm25-pt",
        "dataset": dataset,
        "stemmer": "snowball",
        "tokenizer": "skl",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        "n_threads": n_threads,
        "top_k": top_k,
        "max_mem_gb": max_mem_gb,
        "stats": {
            "num_docs": len(corpus_lst),
            "num_queries": len(queries_lst),
            "num_tokens": num_tokens,
        },
        "timing": timer.to_dict(underscore=True, lowercase=True),
        "scores": {
            "ndcg": clean_results_keys(ndcg),
            "map": clean_results_keys(_map),
            "recall": clean_results_keys(recall),
            "precision": clean_results_keys(precision),
        },
    }

    result_dir = Path(result_dir) / save_dict["model"]
    result_dir.mkdir(parents=True, exist_ok=True)
    save_path = Path(result_dir) / f"{dataset}-{os.urandom(8).hex()}.json"

    with open(save_path, "w") as f:
        json.dump(save_dict, f, indent=2)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Benchmark bm25-pt on a dataset.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-d",
        "--dataset",
        type=str,
        default="fiqa",
        help="Dataset to benchmark on.",
    )
    parser.add_argument(
        "-t",
        "--n_threads",
        type=int,
        default=1,
        help="Number of threads to run in parallel.",
    )
    parser.add_argument(
        "--num_runs",
        type=int,
        default=1,
        help="Number of runs to repeat main.",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=1000,
        help="Number of top-k documents to retrieve.",
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable profiling",
    )
    parser.add_argument(
        "--result_dir",
        type=str,
        default="results",
        help="Directory to save results.",
    )
    parser.add_argument(
        "--save_dir",
        type=str,
        default="datasets",
        help="Directory to save datasets.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size for scoring.",
    )
    kwargs = vars(parser.parse_args())
    profile = kwargs.pop("profile")
    num_runs = kwargs.pop("num_runs")

    if profile:
        import cProfile
        import pstats

        if num_runs > 1:
            raise ValueError("Cannot profile with multiple runs.")

        cProfile.run("main(**kwargs)", filename="bm25pt.prof")
        p = pstats.Stats("bm25pt.prof")
        p.sort_stats("time").print_stats(50)
    else:
        for _ in range(num_runs):
            main(**kwargs)
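For completeness, I run the script roughly like this (the file name is just what I happened to call it locally):

python bm25_pt_benchmark.py -d msmarco --top_k 1000 --batch_size 32

i.e. using the -d/--dataset, --top_k, and --batch_size arguments defined above, with the other options left at their defaults.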