From dab13622f94dfb62298b1633cbaa2efa90f50444 Mon Sep 17 00:00:00 2001
From: Um Changyong <changyong.um@sfa.co.kr>
Date: Fri, 20 Dec 2024 12:26:36 +0900
Subject: [PATCH 1/5] fix: concatenate parsed results that share one output
 file path

---
 autorag/data/parse/run.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
index 502f052d7..ec7730075 100644
--- a/autorag/data/parse/run.py
+++ b/autorag/data/parse/run.py
@@ -107,7 +107,14 @@ def run_parser(
 				module_params,
 			)
 		)
-	list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))
+  
+	_files = {} 
+	for result, filepath in zip(results, filepaths):
+			_files[filepath].append(result) if filepath in _files.keys() else _files.update({filepath: [result]})
+	# Save files with a specific file type as Parquet files.
+	for filepath, value in _files.items():
+		pd.concat(value).to_parquet(filepath, index=False)
+
 	filenames = list(map(lambda x: os.path.basename(x), filepaths))
 
 	summary_df = pd.DataFrame(
@@ -122,7 +129,7 @@ def run_parser(
 
 	# concat all parquet files here if not all_files.
 	if not all_files:
-		dataframes = [pd.read_parquet(file) for file in filepaths]
+		dataframes = [pd.read_parquet(file) for file in _files.keys()]
 		combined_df = pd.concat(dataframes, ignore_index=True)
 		combined_df.to_parquet(
 			os.path.join(project_dir, "parsed_result.parquet"), index=False

From 99c9f936b793c180b621905167ed063e7bf771e1 Mon Sep 17 00:00:00 2001
From: Um Changyong <changyong.um@sfa.co.kr>
Date: Fri, 20 Dec 2024 15:33:23 +0900
Subject: [PATCH 2/5] feat: store parsed file paths in _filepaths variable

---
 autorag/data/parse/run.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
index ec7730075..ce3544667 100644
--- a/autorag/data/parse/run.py
+++ b/autorag/data/parse/run.py
@@ -128,8 +128,9 @@ def run_parser(
 	summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)
 
 	# concat all parquet files here if not all_files.
+	_filepaths = list(_files.keys())
 	if not all_files:
-		dataframes = [pd.read_parquet(file) for file in _files.keys()]
+		dataframes = [pd.read_parquet(file) for file in _filepaths]
 		combined_df = pd.concat(dataframes, ignore_index=True)
 		combined_df.to_parquet(
 			os.path.join(project_dir, "parsed_result.parquet"), index=False

From b24d95f99983af7d67cc4404d0b5506b3e2909cd Mon Sep 17 00:00:00 2001
From: Um Changyong <changyong.um@sfa.co.kr>
Date: Fri, 20 Dec 2024 15:35:31 +0900
Subject: [PATCH 3/5] fix: apply ruff format

---
 autorag/data/parse/run.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
index ce3544667..e1951e133 100644
--- a/autorag/data/parse/run.py
+++ b/autorag/data/parse/run.py
@@ -107,10 +107,12 @@ def run_parser(
 				module_params,
 			)
 		)
-  
-	_files = {} 
+
+	_files = {}
 	for result, filepath in zip(results, filepaths):
-			_files[filepath].append(result) if filepath in _files.keys() else _files.update({filepath: [result]})
+		_files[filepath].append(result) if filepath in _files.keys() else _files.update(
+			{filepath: [result]}
+		)
 	# Save files with a specific file type as Parquet files.
 	for filepath, value in _files.items():
 		pd.concat(value).to_parquet(filepath, index=False)

From bd7a60bb234ee1766b9362004e40eb365b17ddc3 Mon Sep 17 00:00:00 2001
From: Um Changyong <changyong.um@sfa.co.kr>
Date: Thu, 26 Dec 2024 14:07:44 +0900
Subject: [PATCH 4/5] feat: add tests

---
 tests/autorag/embedding/test_base.py | 39 ++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 tests/autorag/embedding/test_base.py

diff --git a/tests/autorag/embedding/test_base.py b/tests/autorag/embedding/test_base.py
new file mode 100644
index 000000000..1feb9a748
--- /dev/null
+++ b/tests/autorag/embedding/test_base.py
@@ -0,0 +1,39 @@
+import pytest
+from autorag.embedding.base import EmbeddingModel
+
+
+def test_load_embedding_model():
+	# Test loading a supported embedding model
+	embedding = EmbeddingModel.load("mock")
+	assert embedding is not None
+
+	# Test loading an unsupported embedding model
+	with pytest.raises(
+		ValueError, match="Embedding model 'unsupported_model' is not supported"
+	):
+		EmbeddingModel.load("unsupported_model")
+
+
+def test_load_embedding_model_from_dict():
+	# Test loading with missing keys
+	with pytest.raises(
+		ValueError, match="Both 'type' and 'model_name' must be provided"
+	):
+		EmbeddingModel.load_from_dict([{"type": "openai"}])
+
+	# Test loading with unsupported type
+	with pytest.raises(
+		ValueError, match="Embedding model type 'unsupported_type' is not supported"
+	):
+		EmbeddingModel.load_from_dict(
+			[{"type": "unsupported_type", "model_name": "some-model"}]
+		)
+
+	# Test loading with multiple items
+	with pytest.raises(ValueError, match="Only one embedding model is supported"):
+		EmbeddingModel.load_from_dict(
+			[
+				{"type": "openai", "model_name": "text-embedding-ada-002"},
+				{"type": "huggingface", "model_name": "BAAI/bge-small-en-v1.5"},
+			]
+		)

From 69dfb3a3b58aa5f79acb1975999faf9ef8472880 Mon Sep 17 00:00:00 2001
From: Um Changyong <changyong.um@sfa.co.kr>
Date: Thu, 26 Dec 2024 14:14:52 +0900
Subject: [PATCH 5/5] Revert "feat: add tests"

This reverts commit bd7a60bb234ee1766b9362004e40eb365b17ddc3.
---
 tests/autorag/embedding/test_base.py | 39 ----------------------------
 1 file changed, 39 deletions(-)
 delete mode 100644 tests/autorag/embedding/test_base.py

diff --git a/tests/autorag/embedding/test_base.py b/tests/autorag/embedding/test_base.py
deleted file mode 100644
index 1feb9a748..000000000
--- a/tests/autorag/embedding/test_base.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import pytest
-from autorag.embedding.base import EmbeddingModel
-
-
-def test_load_embedding_model():
-	# Test loading a supported embedding model
-	embedding = EmbeddingModel.load("mock")
-	assert embedding is not None
-
-	# Test loading an unsupported embedding model
-	with pytest.raises(
-		ValueError, match="Embedding model 'unsupported_model' is not supported"
-	):
-		EmbeddingModel.load("unsupported_model")
-
-
-def test_load_embedding_model_from_dict():
-	# Test loading with missing keys
-	with pytest.raises(
-		ValueError, match="Both 'type' and 'model_name' must be provided"
-	):
-		EmbeddingModel.load_from_dict([{"type": "openai"}])
-
-	# Test loading with unsupported type
-	with pytest.raises(
-		ValueError, match="Embedding model type 'unsupported_type' is not supported"
-	):
-		EmbeddingModel.load_from_dict(
-			[{"type": "unsupported_type", "model_name": "some-model"}]
-		)
-
-	# Test loading with multiple items
-	with pytest.raises(ValueError, match="Only one embedding model is supported"):
-		EmbeddingModel.load_from_dict(
-			[
-				{"type": "openai", "model_name": "text-embedding-ada-002"},
-				{"type": "huggingface", "model_name": "BAAI/bge-small-en-v1.5"},
-			]
-		)