Skip to content

Commit

Permalink
fix: route not identified files to dead end edge (#7589)
Browse files Browse the repository at this point in the history
* fix: route not identified files to dead end edge

* tests: fix unit tests

* tests: add file classifier pipeline run

* fix: typing
  • Loading branch information
ArzelaAscoIi authored Apr 24, 2024
1 parent b45ecb3 commit 7479db0
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 2 deletions.
2 changes: 1 addition & 1 deletion haystack/nodes/file_classifier/file_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def run(self, file_paths: Union[Path, List[Path], str, List[str], List[Union[Pat
paths[0],
self.supported_types,
)
return None, None
return {"file_paths": paths}, "output_dead_end"
raise ValueError(
f"Files of type '{extension}' ({paths[0]}) are not supported. "
f"The supported types are: {self.supported_types}. "
Expand Down
3 changes: 2 additions & 1 deletion test/nodes/test_filetype_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ def test_filetype_classifier_raise_on_error_disabled_unsupported_file_types(tmp_
caplog.clear()
with caplog.at_level(logging.WARNING):
output, edge = node.run(test_file)
assert edge == output == None
assert edge == "output_dead_end"
assert output == {"file_paths": [test_file]}
assert (
f"Unsupported files of type '{file_type}' ({test_file!s}) found. Unsupported file types will be ignored"
in caplog.text
Expand Down
21 changes: 21 additions & 0 deletions test/pipelines/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
import ssl
import json
import platform
Expand All @@ -16,6 +17,7 @@
from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.document_stores.memory import InMemoryDocumentStore
from haystack.nodes.file_classifier.file_type import FileTypeClassifier
from haystack.nodes.other.join_docs import JoinDocuments
from haystack.nodes.base import BaseComponent
from haystack.nodes.retriever.sparse import BM25Retriever
Expand Down Expand Up @@ -2191,6 +2193,25 @@ def test_pipeline_execution_using_join_preserves_changed_query():
assert res["query"] == "This is a test."


@pytest.mark.unit
@pytest.mark.parametrize("file_type", ["csv", "xml"])
def test_pipeline_execution_can_handle_unknown_edge_for_classifier(file_type: str) -> None:
    """Check that a pipeline survives a file type the classifier does not expect.

    Unexpected file types have to be routed to a dead end edge. Returning
    ``None`` as the classification result is not an option: the pipeline
    raises an error while trying to route that result to the next node, see
    https://github.com/deepset-ai/haystack/blob/b45ecb355636c3227185e97ee595006c06d17470/haystack/pipelines/base.py#L573

    Fix PR: https://github.com/deepset-ai/haystack/pull/7589
    """
    # The file is never created on disk; only its path is handed to the pipeline.
    unexpected_file = f"./test.{file_type}"

    # raise_on_error=False: the classifier must not raise for this input.
    file_classifier = FileTypeClassifier(raise_on_error=False)
    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(
        component=file_classifier,
        name="FileTypeClassifier",
        inputs=["File"],
    )

    batch_result = indexing_pipeline.run_batch(file_paths=[unexpected_file])

    # The input path is expected back unchanged, as a Path object.
    expected_paths = [Path(unexpected_file)]
    assert batch_result["file_paths"] == expected_paths


@pytest.mark.unit
def test_update_config_hash():
fake_configs = {
Expand Down

0 comments on commit 7479db0

Please sign in to comment.