bug: run parameter "ranking_mode" does not override init param in meta field ranker #7375

Merged · 4 commits · Mar 19, 2024
haystack/components/rankers/meta_field.py (12 changes: 8 additions & 4 deletions)
```diff
@@ -250,7 +250,7 @@ def run(
         # Add the docs missing the meta_field back on the end
         sorted_by_meta = [doc for meta, doc in tuple_sorted_by_meta]
         sorted_documents = sorted_by_meta + docs_missing_meta_field
-        sorted_documents = self._merge_rankings(documents, sorted_documents, weight)
+        sorted_documents = self._merge_rankings(documents, sorted_documents, weight, ranking_mode)
         return {"documents": sorted_documents[:top_k]}

     def _parse_meta(
```
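With this change, the mode passed at query time wins for that call. A minimal usage sketch, mirroring the new test added below (it assumes a Haystack 2.x install that includes this fix):

```python
from haystack import Document
from haystack.components.rankers import MetaFieldRanker

docs = [
    Document(content="abc", meta={"rating": 1.3}, score=0.3),
    Document(content="abc", meta={"rating": 0.7}, score=0.4),
    Document(content="abc", meta={"rating": 2.1}, score=0.6),
]

# The __init__ value acts as the default ranking mode.
ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5)
out_default = ranker.run(documents=docs)

# The run-level argument now overrides the __init__ value for this call only.
out_rrf = ranker.run(documents=docs, ranking_mode="reciprocal_rank_fusion")
```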
```diff
@@ -296,18 +296,22 @@ def _parse_meta(
         return meta_values

     def _merge_rankings(
-        self, documents: List[Document], sorted_documents: List[Document], weight: float
+        self,
+        documents: List[Document],
+        sorted_documents: List[Document],
+        weight: float,
+        ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"],
     ) -> List[Document]:
         """
         Merge the two different rankings for Documents sorted both by their content and by their meta field.
         """
         scores_map: Dict = defaultdict(int)

-        if self.ranking_mode == "reciprocal_rank_fusion":
+        if ranking_mode == "reciprocal_rank_fusion":
             for i, (document, sorted_doc) in enumerate(zip(documents, sorted_documents)):
                 scores_map[document.id] += self._calculate_rrf(rank=i) * (1 - weight)
                 scores_map[sorted_doc.id] += self._calculate_rrf(rank=i) * weight
-        elif self.ranking_mode == "linear_score":
+        elif ranking_mode == "linear_score":
             for i, (document, sorted_doc) in enumerate(zip(documents, sorted_documents)):
                 score = float(0)
                 if document.score is None:
```
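The expected RRF score asserted in the new test below can be reproduced by hand. A minimal sketch, assuming `_calculate_rrf` computes the standard reciprocal-rank-fusion term `1 / (k + rank)` with `k = 61` (the constant itself is not visible in this diff; `k = 61` is the value that reproduces the asserted number):

```python
def rrf(rank: int, k: int = 61) -> float:
    # Standard reciprocal rank fusion term (assumed form; k inferred from the test).
    return 1.0 / (k + rank)

weight = 0.5
# The document with rating 1.3 sits at rank 0 in the input order and at
# rank 1 after sorting by "rating" in descending order, so it ends up on top:
merged = rrf(0) * (1 - weight) + rrf(1) * weight
print(merged)  # 0.01626123744050767
```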
New release note file:

```diff
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixed a bug in the `MetaFieldRanker`: when the `ranking_mode` parameter was overridden in the `run` method,
+    the component was incorrectly using the `ranking_mode` parameter set in the `__init__` method.
```
test/components/rankers/test_metafield.py (15 changes: 15 additions & 0 deletions)
```diff
@@ -221,3 +221,18 @@ def test_linear_score_raises_raises_warning_if_doc_without_score(self, caplog):
         with caplog.at_level(logging.WARNING):
             ranker.run(documents=docs_before)
         assert "The score wasn't provided; defaulting to 0." in caplog.text
+
+    def test_different_ranking_mode_for_init_vs_run(self):
+        ranker = MetaFieldRanker(meta_field="rating", ranking_mode="linear_score", weight=0.5)
+        docs_before = [
+            Document(content="abc", meta={"rating": 1.3}, score=0.3),
+            Document(content="abc", meta={"rating": 0.7}, score=0.4),
+            Document(content="abc", meta={"rating": 2.1}, score=0.6),
+        ]
+        output = ranker.run(documents=docs_before)
+        docs_after = output["documents"]
+        assert docs_after[0].score == 0.8
+
+        output = ranker.run(documents=docs_before, ranking_mode="reciprocal_rank_fusion")
+        docs_after = output["documents"]
+        assert docs_after[0].score == 0.01626123744050767
```
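Where the `linear_score` expectation comes from: a hedged sketch for the top document only (this assumes the meta-ranked side of the merge contributes `(1 - rank / num_docs) * weight`, a formula consistent with the asserted value; only the rank-0 case matters here):

```python
weight = 0.5
retrieval_score = 0.6  # the doc with rating 2.1 carries score=0.6
meta_rank = 0          # it sorts first by "rating"
num_docs = 3

merged = retrieval_score * (1 - weight) + (1 - meta_rank / num_docs) * weight
print(merged)  # 0.8, matching the first assertion
```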
Review comment (Contributor):
Let's update to use `pytest.approx`, e.g.:

```diff
-        assert docs_after[0].score == 0.01626123744050767
+        assert docs_after[0].score == pytest.approx(0.016261, abs=1e-5)
```

since doing float comparisons to that many decimals can be fragile.
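For reference, `pytest.approx(0.016261, abs=1e-5)` accepts any value within `1e-5` of `0.016261`, so the test no longer depends on an exact float representation:

```python
import pytest

# The exact merged score differs from 0.016261 by about 2.4e-7,
# comfortably inside the abs=1e-5 tolerance.
assert 0.01626123744050767 == pytest.approx(0.016261, abs=1e-5)
```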