Refactor OverlapRefinery context handling and update tests
- Renamed method for obtaining overlap context from `_get_overlap_context` to `_get_prefix_overlap_context` to clarify its purpose.
- Updated test assertions to reflect changes in context handling, ensuring the last chunk has no context and verifying context for all other chunks.
- Added new tests for prefix mode functionality in OverlapRefinery, ensuring correct context management and merging behavior.
- Adjusted existing tests to improve clarity and accuracy in context validation.
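To make the prefix-mode change concrete, here is a minimal usage sketch mirroring the new tests further down; the import paths, the `Chunk` constructor arguments, and the sample text are assumptions for illustration and are not part of this commit.

```python
from chonkie import OverlapRefinery   # assumed import path
from chonkie.types import Chunk       # assumed; fields mirror the test fixtures

# Two hypothetical chunks standing in for the `basic_chunks` fixture.
text = "This is the first chunk of text. This is the second chunk of text."
chunks = [
    Chunk(text=text[:33], start_index=0, end_index=33, token_count=8),
    Chunk(text=text[33:], start_index=33, end_index=len(text), token_count=8),
]

# In prefix mode, each chunk after the first gets a Context taken from the
# tail of the previous chunk (now produced by `_get_prefix_overlap_context`).
refinery = OverlapRefinery(context_size=4, mode="prefix")
refined = refinery.refine(chunks)

assert refined[0].context is None      # first chunk has no prefix context
assert refined[1].context is not None  # later chunks carry previous-chunk context
```

The new tests in tests/refinery/test_overlap_refinery.py assert exactly this behavior, plus text and index bookkeeping when merge_context=True.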
bhavnicksm committed Dec 5, 2024
1 parent 621061f · commit e17e6d0
Showing 2 changed files with 55 additions and 17 deletions.
src/chonkie/refinery/overlap.py: 2 changes (1 addition & 1 deletion)
@@ -389,7 +389,7 @@ def _refine_prefix(self, chunks: List[Chunk]) -> List[Chunk]:
# Process remaining chunks
for i in range(1, len(refined_chunks)):
# Get context from previous chunk
- context = self._get_overlap_context(chunks[i - 1])
+ context = self._get_prefix_overlap_context(chunks[i - 1])
setattr(refined_chunks[i], "context", context)

# Optionally update chunk text to include context
tests/refinery/test_overlap_refinery.py: 70 changes (54 additions & 16 deletions)
@@ -109,11 +109,11 @@ def test_overlap_refinery_basic_chunks_approximate(basic_chunks):
refinery = OverlapRefinery(context_size=4) # Small context for testing
refined = refinery.refine(basic_chunks)

- # First chunk should have no context
- assert refined[0].context is None
+ # Last chunk should have no context
+ assert refined[-1].context is None

# Subsequent chunks should have context from previous chunks
- for i in range(1, len(refined)):
+ for i in range(len(refined) - 1):
assert refined[i].context is not None
assert isinstance(refined[i].context, Context)
assert refined[i].context.token_count <= 4
@@ -125,24 +125,21 @@ def test_overlap_refinery_basic_chunks_exact(basic_chunks, tokenizer):
refined = refinery.refine(basic_chunks)

# Check context for subsequent chunks
- for i in range(1, len(refined)):
+ for i in range(len(refined) - 1):
assert refined[i].context is not None
assert isinstance(refined[i].context, Context)
# Verify exact token count using tokenizer
actual_tokens = len(tokenizer.encode(refined[i].context.text))
assert actual_tokens <= 4


def test_overlap_refinery_sentence_chunks(sentence_chunks):
"""Test overlap calculation with SentenceChunks."""
refinery = OverlapRefinery(context_size=4)
refined = refinery.refine(sentence_chunks)

- # Check context for second chunk
- assert refined[1].context is not None
- assert isinstance(refined[1].context, Context)
- assert refined[1].context.token_count <= 4
-
+ assert refined[1].context is not None, f"Second chunk should have context, got {refined[1].context}"
+ assert isinstance(refined[1].context, Context), f"Context should be a Context, got {type(refined[1].context)}"

def test_overlap_refinery_no_merge_context(basic_chunks):
"""Test behavior when merge_context is False."""
@@ -161,7 +158,7 @@ def test_overlap_refinery_context_size_limits(basic_chunks):
refined = refinery.refine(basic_chunks)

# Check that no context exceeds size limit
- for chunk in refined[1:]: # Skip first chunk
+ for chunk in refined[:-1]: # Skip last chunk
assert chunk.context.token_count <= 2


@@ -184,14 +181,10 @@ def test_overlap_refinery_merge_context(basic_chunks, tokenizer):

refined = refinery.refine(chunks_copy)

- # First chunk should be unchanged
- assert refined[0].text == basic_chunks[0].text
- assert refined[0].token_count == basic_chunks[0].token_count
-
# Subsequent chunks should have context prepended
- for i in range(1, len(refined)):
+ for i in range(len(refined) - 1):
assert refined[i].context is not None
- assert refined[i].text.startswith(refined[i].context.text)
+ assert refined[i].text.endswith(refined[i].context.text)
# Verify token count increase
original_tokens = len(tokenizer.encode(basic_chunks[i].text))
new_tokens = len(tokenizer.encode(refined[i].text))
@@ -217,5 +210,50 @@ def test_overlap_refinery_mixed_chunk_types():
refinery.refine(chunks)


+ def test_overlap_refinery_prefix_mode(basic_chunks):
+ """Test that OverlapRefinery works correctly in prefix mode."""
+ refinery = OverlapRefinery(context_size=4, mode="prefix")
+ refined = refinery.refine(basic_chunks)
+
+ # First chunk should have no context
+ assert refined[0].context is None
+
+ # Subsequent chunks should have context from previous chunks
+ for i in range(1, len(refined)):
+ assert refined[i].context is not None
+ assert isinstance(refined[i].context, Context)
+ assert refined[i].context.token_count <= 4
+ # Verify context comes from previous chunk
+ assert refined[i].context.text in basic_chunks[i-1].text
+ # Verify context ends at the end of previous chunk
+ assert refined[i].context.end_index == basic_chunks[i-1].end_index
+
+ def test_overlap_refinery_prefix_mode_with_merge(basic_chunks, tokenizer):
+ """Test that OverlapRefinery merges context correctly in prefix mode."""
+ refinery = OverlapRefinery(
+ context_size=4,
+ tokenizer=tokenizer,
+ mode="prefix",
+ merge_context=True,
+ approximate=False
+ )
+ refined = refinery.refine(basic_chunks)
+
+ # First chunk should be unchanged
+ assert refined[0].text == basic_chunks[0].text
+ assert refined[0].token_count == basic_chunks[0].token_count
+
+ # Subsequent chunks should have context prepended
+ for i in range(1, len(refined)):
+ assert refined[i].context is not None
+ # Verify text starts with context
+ assert refined[i].text.startswith(refined[i].context.text)
+ # Verify token count increase
+ original_tokens = len(tokenizer.encode(basic_chunks[i].text))
+ new_tokens = len(tokenizer.encode(refined[i].text))
+ assert new_tokens > original_tokens
+ # Verify start index is from context
+ assert refined[i].start_index == refined[i].context.start_index
+
if __name__ == "__main__":
pytest.main()
