Merge pull request #141 from chonkie-ai/development

[FIX] Minor fixes + Stylistic enhancements for TQDM and Multiprocessing
chonkie-ai · Jan 7, 2025 · 3042b0d
2 parents 993f40e + 9d6d31f
commit 3042b0d
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 19 deletions.
diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py
@@ -7,9 +7,11 @@
 from multiprocessing import Pool, cpu_count
 from typing import Any, Callable, List, Union
 
+from tqdm import tqdm
+
 from chonkie.types import Chunk
 
-from tqdm import tqdm
+
 class BaseChunker(ABC):
     """Abstract base class for all chunker implementations.
 
@@ -246,11 +248,11 @@ def _process_batch_sequential(self,
         return [
                 self.chunk(t) for t in tqdm(
                     texts,
-                    desc="🦛 CHONKING",
+                    desc="🦛",
                     disable=not show_progress_bar,
-                    unit="texts",
-                    bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", 
-                    ascii=' >=')
+                        unit="doc",
+                    bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", 
+                    ascii=' o')
         ]
 
     def _process_batch_multiprocessing(self,
@@ -264,12 +266,12 @@ def _process_batch_multiprocessing(self,
         with Pool(processes=num_workers) as pool:
             results = []
             with tqdm(total=total,
-                     desc="🦛 CHONKING",
+                     desc="🦛",
                      disable=not show_progress_bar,
-                     unit="texts",
-                     bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
-                     ascii=' >=') as pbar:
-                for result in pool.imap_unordered(self.chunk, texts, chunksize=chunksize):
+                     unit="doc",
+                     bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
+                     ascii=' o') as pbar:
+                for result in pool.imap(self.chunk, texts, chunksize=chunksize):
                     results.append(result)
                     pbar.update()
             return results

diff --git a/src/chonkie/chunker/semantic.py b/src/chonkie/chunker/semantic.py
@@ -115,13 +115,13 @@ def __init__(
             self.embedding_model = AutoEmbeddings.get_embeddings(embedding_model, **kwargs)
         else:
             raise ValueError(
-                "embedding_model must be a string or BaseEmbeddings instance"
+                f"{embedding_model} is not a valid embedding model"
             )
 
         # Probably the dependency is not installed
         if self.embedding_model is None:
             raise ImportError(
-                "embedding_model is not a valid embedding model",
+                f"{embedding_model} is not a valid embedding model",
                 "Please install the `semantic` extra to use this feature",
             )
 

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
@@ -2,11 +2,13 @@
 
 from typing import Any, Generator, List, Tuple, Union
 
+from tqdm import trange
+
 from chonkie.types import Chunk
 
 from .base import BaseChunker
 
-from tqdm import trange
+
 class TokenChunker(BaseChunker):
     """Chunker that splits text into chunks of a specified token size.
 
@@ -191,11 +193,11 @@ def chunk_batch(
         for i in trange(0,
                         len(texts),
                         batch_size,
-                        desc="🦛 CHONKING",
+                        desc="🦛",
                         disable=not show_progress_bar, 
-                        unit="batches",
-                        bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
-                        ascii=' >='):
+                        unit="batch",
+                        bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
+                        ascii=' o'):
             batch_texts = texts[i : min(i + batch_size, len(texts))]
             chunks.extend(self._process_text_batch(batch_texts))
         return chunks

diff --git a/src/chonkie/embeddings/auto.py b/src/chonkie/embeddings/auto.py
@@ -1,4 +1,5 @@
-import warnings
+"""AutoEmbeddings is a factory class for automatically loading embeddings."""
+
 from typing import Any, Union
 
 from .base import BaseEmbeddings
@@ -63,7 +64,7 @@ def get_embeddings(
                     try:
                         return embeddings_cls(model, **kwargs)
                     except Exception as e:
-                        warnings.warn(f"Failed to load {embeddings_cls.__name__}: {e}")
+                        raise ValueError(f"Failed to load {embeddings_cls.__name__}: {e}")
             except Exception:
                 # Fall back to SentenceTransformerEmbeddings if no matching implementation is found
                 from .sentence_transformer import SentenceTransformerEmbeddings

diff --git a/src/chonkie/embeddings/base.py b/src/chonkie/embeddings/base.py
@@ -1,6 +1,9 @@
+"""Base class for all embeddings implementations."""
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Callable, List, Union
 
+import numpy as np
+
 # for type checking
 if TYPE_CHECKING:
     import numpy as np