From 7479db028a5e0b450394cba851bc61e8c4db4e3b Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi <37148029+ArzelaAscoIi@users.noreply.github.com> Date: Wed, 24 Apr 2024 14:49:38 +0200 Subject: [PATCH] fix: route not identified files to dead end edge (#7589) * fix: route not identified files to dead end edge * tests: fix unit tests * tests: add file classifier pipeline run * fix: typing --- haystack/nodes/file_classifier/file_type.py | 2 +- test/nodes/test_filetype_classifier.py | 3 ++- test/pipelines/test_pipeline.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/file_classifier/file_type.py b/haystack/nodes/file_classifier/file_type.py index 24b8caf460..67a3d8802b 100644 --- a/haystack/nodes/file_classifier/file_type.py +++ b/haystack/nodes/file_classifier/file_type.py @@ -134,7 +134,7 @@ def run(self, file_paths: Union[Path, List[Path], str, List[str], List[Union[Pat paths[0], self.supported_types, ) - return None, None + return {"file_paths": paths}, "output_dead_end" raise ValueError( f"Files of type '{extension}' ({paths[0]}) are not supported. " f"The supported types are: {self.supported_types}. " diff --git a/test/nodes/test_filetype_classifier.py b/test/nodes/test_filetype_classifier.py index e35c3c0cea..5b0f12c8da 100644 --- a/test/nodes/test_filetype_classifier.py +++ b/test/nodes/test_filetype_classifier.py @@ -176,7 +176,8 @@ def test_filetype_classifier_raise_on_error_disabled_unsupported_file_types(tmp_ caplog.clear() with caplog.at_level(logging.WARNING): output, edge = node.run(test_file) - assert edge == output == None + assert edge == "output_dead_end" + assert output == {"file_paths": [test_file]} assert ( f"Unsupported files of type '{file_type}' ({test_file!s}) found. Unsupported file types will be ignored" in caplog.text diff --git a/test/pipelines/test_pipeline.py b/test/pipelines/test_pipeline.py index 4713822b50..f9737f110e 100644 --- a/test/pipelines/test_pipeline.py +++ b/test/pipelines/test_pipeline.py @@ -1,3 +1,4 @@ +from pathlib import Path import ssl import json import platform @@ -16,6 +17,7 @@ from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.document_stores.memory import InMemoryDocumentStore +from haystack.nodes.file_classifier.file_type import FileTypeClassifier from haystack.nodes.other.join_docs import JoinDocuments from haystack.nodes.base import BaseComponent from haystack.nodes.retriever.sparse import BM25Retriever @@ -2191,6 +2193,25 @@ def test_pipeline_execution_using_join_preserves_changed_query(): assert res["query"] == "This is a test." +@pytest.mark.unit +@pytest.mark.parametrize("file_type", ["csv", "xml"]) +def test_pipeline_execution_can_handle_unknown_edge_for_classifier(file_type: str) -> None: + """Tests running a classifier against an unexpected file type. + + We need to route not expected file types against a dead end node. + Simply returning "None" for a classification does not work, since + the pipeline will raise an error, trying to route the result to the next node + here: https://github.com/deepset-ai/haystack/blob/b45ecb355636c3227185e97ee595006c06d17470/haystack/pipelines/base.py#L573 + + See fix pr: https://github.com/deepset-ai/haystack/pull/7589 + """ + classifier = FileTypeClassifier(raise_on_error=False) + pipeline = Pipeline() + pipeline.add_node(component=classifier, name="FileTypeClassifier", inputs=["File"]) + res = pipeline.run_batch(file_paths=[f"./test.{file_type}"]) + assert res["file_paths"] == [Path(f"./test.{file_type}")] + + @pytest.mark.unit def test_update_config_hash(): fake_configs = {