Skip to content

Commit

Permalink
Merge pull request #141 from chonkie-ai/development
Browse files (browse the repository at this point in the history)
[FIX] Minor fixes + Stylistic enhancements for TQDM and Multiprocessing
  • Loading branch information
bhavnicksm authored Jan 7, 2025
2 parents 993f40e + 9d6d31f commit 3042b0d
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 19 deletions.
22 changes: 12 additions & 10 deletions src/chonkie/chunker/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,9 +7,11 @@
from multiprocessing import Pool, cpu_count
from typing import Any, Callable, List, Union

from tqdm import tqdm

from chonkie.types import Chunk

from tqdm import tqdm

class BaseChunker(ABC):
"""Abstract base class for all chunker implementations.
Expand Down Expand Up @@ -246,11 +248,11 @@ def _process_batch_sequential(self,
return [
self.chunk(t) for t in tqdm(
texts,
desc="🦛 CHONKING",
desc="🦛",
disable=not show_progress_bar,
unit="texts",
bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' >=')
unit="doc",
bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' o')
]

def _process_batch_multiprocessing(self,
Expand All @@ -264,12 +266,12 @@ def _process_batch_multiprocessing(self,
with Pool(processes=num_workers) as pool:
results = []
with tqdm(total=total,
desc="🦛 CHONKING",
desc="🦛",
disable=not show_progress_bar,
unit="texts",
bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' >=') as pbar:
for result in pool.imap_unordered(self.chunk, texts, chunksize=chunksize):
unit="doc",
bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' o') as pbar:
for result in pool.imap(self.chunk, texts, chunksize=chunksize):
results.append(result)
pbar.update()
return results
Expand Down
4 changes: 2 additions & 2 deletions src/chonkie/chunker/semantic.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -115,13 +115,13 @@ def __init__(
self.embedding_model = AutoEmbeddings.get_embeddings(embedding_model, **kwargs)
else:
raise ValueError(
"embedding_model must be a string or BaseEmbeddings instance"
f"{embedding_model} is not a valid embedding model"
)

# Probably the dependency is not installed
if self.embedding_model is None:
raise ImportError(
"embedding_model is not a valid embedding model",
f"{embedding_model} is not a valid embedding model",
"Please install the `semantic` extra to use this feature",
)

Expand Down
12 changes: 7 additions & 5 deletions src/chonkie/chunker/token.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

from typing import Any, Generator, List, Tuple, Union

from tqdm import trange

from chonkie.types import Chunk

from .base import BaseChunker

from tqdm import trange

class TokenChunker(BaseChunker):
"""Chunker that splits text into chunks of a specified token size.
Expand Down Expand Up @@ -191,11 +193,11 @@ def chunk_batch(
for i in trange(0,
len(texts),
batch_size,
desc="🦛 CHONKING",
desc="🦛",
disable=not show_progress_bar,
unit="batches",
bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' >='):
unit="batch",
bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} batches chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' o'):
batch_texts = texts[i : min(i + batch_size, len(texts))]
chunks.extend(self._process_text_batch(batch_texts))
return chunks
Expand Down
5 changes: 3 additions & 2 deletions src/chonkie/embeddings/auto.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
import warnings
"""AutoEmbeddings is a factory class for automatically loading embeddings."""

from typing import Any, Union

from .base import BaseEmbeddings
Expand Down Expand Up @@ -63,7 +64,7 @@ def get_embeddings(
try:
return embeddings_cls(model, **kwargs)
except Exception as e:
warnings.warn(f"Failed to load {embeddings_cls.__name__}: {e}")
raise ValueError(f"Failed to load {embeddings_cls.__name__}: {e}")
except Exception:
# Fall back to SentenceTransformerEmbeddings if no matching implementation is found
from .sentence_transformer import SentenceTransformerEmbeddings
Expand Down
3 changes: 3 additions & 0 deletions src/chonkie/embeddings/base.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
"""Base class for all embeddings implementations."""
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Callable, List, Union

import numpy as np

# for type checking
if TYPE_CHECKING:
import numpy as np
Expand Down

0 comments on commit 3042b0d

Please sign in to comment.