From 6fa4796e94aa83c6db5061918e30d48c259f527b Mon Sep 17 00:00:00 2001 From: Mirko Lenz Date: Thu, 7 Dec 2023 16:00:41 +0100 Subject: [PATCH] test: update retrieval test case --- tests/test_retrieve.py | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tests/test_retrieve.py b/tests/test_retrieve.py index cf22547..b405e64 100644 --- a/tests/test_retrieve.py +++ b/tests/test_retrieve.py @@ -1,17 +1,48 @@ +import pandas as pd + import cbrkit +query_name = 42 +casebase_file = "data/cars-1k.csv" + -def test_retrieve(): - casebase: cbrkit.model.Casebase[dict[str, str]] = cbrkit.load_path( - "data/cars-1k.csv" +def test_retrieve_default(): + casebase: cbrkit.model.Casebase[dict[str, str]] = cbrkit.load_path(casebase_file) + query = casebase[query_name] + + result = cbrkit.retrieve( + casebase, + query=query, + similarity_func="datatypes", + casebase_limit=5, ) - query_name = "42" + + assert len(casebase) == 999 # csv contains header + assert len(result.ranking) == len(casebase) + assert len(result.casebase) == 5 + assert result.ranking[0] == query_name + + +# TODO: Create some taxonomy similarity measure +custom_sim_func = cbrkit.case_sim.factories.by_attributes( + { + "manufacturer": cbrkit.data_sim.strings.levenshtein(), + "miles": cbrkit.data_sim.numeric.linear(max=1000000), + }, + aggregate=cbrkit.case_sim.aggregate(), +) + + +# TODO: Pandas dataframe is indexed by int, but should use strings instead! +def test_retrieve_custom(): + df = pd.read_csv(casebase_file) + casebase = cbrkit.load_dataframe(df) query = casebase[query_name] result = cbrkit.retrieve( casebase, query=query, - similarity_func="equality", + similarity_func=custom_sim_func, casebase_limit=5, )