From dab13622f94dfb62298b1633cbaa2efa90f50444 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Fri, 20 Dec 2024 12:26:36 +0900 Subject: [PATCH 1/5] fix: parsed file with specific file_type could be concatenated --- autorag/data/parse/run.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py index 502f052d7..ec7730075 100644 --- a/autorag/data/parse/run.py +++ b/autorag/data/parse/run.py @@ -107,7 +107,14 @@ def run_parser( module_params, ) ) - list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))) + + _files = {} + for result, filepath in zip(results, filepaths): + _files[filepath].append(result) if filepath in _files.keys() else _files.update({filepath: [result]}) + # Save files with a specific file type as Parquet files. + for filepath, value in _files.items(): + pd.concat(value).to_parquet(filepath, index=False) + filenames = list(map(lambda x: os.path.basename(x), filepaths)) summary_df = pd.DataFrame( @@ -122,7 +129,7 @@ def run_parser( # concat all parquet files here if not all_files. if not all_files: - dataframes = [pd.read_parquet(file) for file in filepaths] + dataframes = [pd.read_parquet(file) for file in _files.keys()] combined_df = pd.concat(dataframes, ignore_index=True) combined_df.to_parquet( os.path.join(project_dir, "parsed_result.parquet"), index=False From 99c9f936b793c180b621905167ed063e7bf771e1 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Fri, 20 Dec 2024 15:33:23 +0900 Subject: [PATCH 2/5] feat: convert name of var --- autorag/data/parse/run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py index ec7730075..ce3544667 100644 --- a/autorag/data/parse/run.py +++ b/autorag/data/parse/run.py @@ -128,8 +128,9 @@ def run_parser( summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False) # concat all parquet files here if not all_files. + _filepaths = list(_files.keys()) if not all_files: - dataframes = [pd.read_parquet(file) for file in _files.keys()] + dataframes = [pd.read_parquet(file) for file in _filepaths] combined_df = pd.concat(dataframes, ignore_index=True) combined_df.to_parquet( os.path.join(project_dir, "parsed_result.parquet"), index=False From b24d95f99983af7d67cc4404d0b5506b3e2909cd Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Fri, 20 Dec 2024 15:35:31 +0900 Subject: [PATCH 3/5] fix: apply ruff format --- autorag/data/parse/run.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py index ce3544667..e1951e133 100644 --- a/autorag/data/parse/run.py +++ b/autorag/data/parse/run.py @@ -107,10 +107,12 @@ def run_parser( module_params, ) ) - - _files = {} + + _files = {} for result, filepath in zip(results, filepaths): - _files[filepath].append(result) if filepath in _files.keys() else _files.update({filepath: [result]}) + _files[filepath].append(result) if filepath in _files.keys() else _files.update( + {filepath: [result]} + ) # Save files with a specific file type as Parquet files. for filepath, value in _files.items(): pd.concat(value).to_parquet(filepath, index=False) From bd7a60bb234ee1766b9362004e40eb365b17ddc3 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Thu, 26 Dec 2024 14:07:44 +0900 Subject: [PATCH 4/5] feat: add tests --- tests/autorag/embedding/test_base.py | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tests/autorag/embedding/test_base.py diff --git a/tests/autorag/embedding/test_base.py b/tests/autorag/embedding/test_base.py new file mode 100644 index 000000000..1feb9a748 --- /dev/null +++ b/tests/autorag/embedding/test_base.py @@ -0,0 +1,39 @@ +import pytest +from autorag.embedding.base import EmbeddingModel + + +def test_load_embedding_model(): + # Test loading a supported embedding model + embedding = EmbeddingModel.load("mock") + assert embedding is not None + + # Test loading an unsupported embedding model + with pytest.raises( + ValueError, match="Embedding model 'unsupported_model' is not supported" + ): + EmbeddingModel.load("unsupported_model") + + +def test_load_embedding_model_from_dict(): + # Test loading with missing keys + with pytest.raises( + ValueError, match="Both 'type' and 'model_name' must be provided" + ): + EmbeddingModel.load_from_dict([{"type": "openai"}]) + + # Test loading with unsupported type + with pytest.raises( + ValueError, match="Embedding model type 'unsupported_type' is not supported" + ): + EmbeddingModel.load_from_dict( + [{"type": "unsupported_type", "model_name": "some-model"}] + ) + + # Test loading with multiple items + with pytest.raises(ValueError, match="Only one embedding model is supported"): + EmbeddingModel.load_from_dict( + [ + {"type": "openai", "model_name": "text-embedding-ada-002"}, + {"type": "huggingface", "model_name": "BAAI/bge-small-en-v1.5"}, + ] + ) From 69dfb3a3b58aa5f79acb1975999faf9ef8472880 Mon Sep 17 00:00:00 2001 From: Um Changyong Date: Thu, 26 Dec 2024 14:14:52 +0900 Subject: [PATCH 5/5] Revert "feat: add tests" This reverts commit bd7a60bb234ee1766b9362004e40eb365b17ddc3. --- tests/autorag/embedding/test_base.py | 39 ---------------------------- 1 file changed, 39 deletions(-) delete mode 100644 tests/autorag/embedding/test_base.py diff --git a/tests/autorag/embedding/test_base.py b/tests/autorag/embedding/test_base.py deleted file mode 100644 index 1feb9a748..000000000 --- a/tests/autorag/embedding/test_base.py +++ /dev/null @@ -1,39 +0,0 @@ -import pytest -from autorag.embedding.base import EmbeddingModel - - -def test_load_embedding_model(): - # Test loading a supported embedding model - embedding = EmbeddingModel.load("mock") - assert embedding is not None - - # Test loading an unsupported embedding model - with pytest.raises( - ValueError, match="Embedding model 'unsupported_model' is not supported" - ): - EmbeddingModel.load("unsupported_model") - - -def test_load_embedding_model_from_dict(): - # Test loading with missing keys - with pytest.raises( - ValueError, match="Both 'type' and 'model_name' must be provided" - ): - EmbeddingModel.load_from_dict([{"type": "openai"}]) - - # Test loading with unsupported type - with pytest.raises( - ValueError, match="Embedding model type 'unsupported_type' is not supported" - ): - EmbeddingModel.load_from_dict( - [{"type": "unsupported_type", "model_name": "some-model"}] - ) - - # Test loading with multiple items - with pytest.raises(ValueError, match="Only one embedding model is supported"): - EmbeddingModel.load_from_dict( - [ - {"type": "openai", "model_name": "text-embedding-ada-002"}, - {"type": "huggingface", "model_name": "BAAI/bge-small-en-v1.5"}, - ] - )