Eval modalities2 #306

Open: wants to merge 17 commits into main from eval_modalities2
Changes from all commits (17):
8e2fd79
chore(tokenizer wrapper): Adapted a wrapper for sp tokenizer.
ajude2s Dec 16, 2024
e5c3b28
fix: checkpoint conversion to HF
flxst Dec 16, 2024
17585fd
chore(tokenizer wrapper): SP Tokenizer wrapper for Modalities tokenizer.
ajude2s Dec 18, 2024
b115bce
chore(tokenizer wrapper): SP Tokenizer wrapper for Modalities tokenizer.
ajude2s Dec 19, 2024
d26858b
Merge branch 'fix/checkpoint_conversion_to_hf' into eval_modalities2
ajude2s Dec 19, 2024
f7fb4be
chore(tokenizer wrapper): Testing
ajude2s Jan 13, 2025
78ee8df
Merge remote-tracking branch 'origin/main' into eval_modalities2
BlueCrescent Feb 6, 2025
4c046b1
Merge remote-tracking branch 'origin/main' into eval_modalities2
BlueCrescent Feb 12, 2025
123d3d2
chore: Merge remote-tracking branch 'origin/main' into eval_modalities2
BlueCrescent Feb 21, 2025
02bac94
fix(huggingface): Fixed bug in hf adapter config (probably caused by …
BlueCrescent Feb 21, 2025
f63ac51
test(huggingface): Required update of test config for tests to pass.
BlueCrescent Feb 25, 2025
99c5788
test(huggingface): Added additional tests for checkpoint conversion.
BlueCrescent Feb 26, 2025
85139fd
Added type hints for the tests.
ajude2s Feb 26, 2025
dcb1de8
Merge remote-tracking branch 'origin/eval_modalities2' into eval_moda…
ajude2s Feb 26, 2025
c215304
Removed mismatch cases from the test.
ajude2s Feb 26, 2025
e971024
Added the copyright notice of the llama2 implementation of the tokeni…
ajude2s Feb 27, 2025
07285d2
Merge branch 'main' into eval_modalities2
BlueCrescent Mar 5, 2025
295 changes: 279 additions & 16 deletions src/modalities/models/huggingface_adapters/hf_adapter.py

Large diffs are not rendered by default.
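
The hf_adapter.py diff itself is not rendered, but the tokenizer parity test added below constructs the new HFTokenizerAdapter from an HFModelAdapterConfig and calls tokenize, encode, and decode on it. A minimal sketch of that inferred surface follows; the internals are an assumption based on test usage, not the actual code:

# Inferred from test usage only; the real implementation in hf_adapter.py is not shown in this diff.
class HFTokenizerAdapter:
    def __init__(self, config: HFModelAdapterConfig):
        # Assumption: the wrapped Modalities tokenizer is built from the adapter config.
        self._tokenizer = get_tokenizer_from_config(config.config, tokenizer_type="tokenizer")

    def tokenize(self, text: str) -> list[str]:
        return self._tokenizer.tokenize(text)

    def encode(self, text: str) -> list[int]:
        return self._tokenizer.encode(text)

    def decode(self, token_ids: list[int]) -> str:
        return self._tokenizer.decode(token_ids)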

15 changes: 14 additions & 1 deletion src/modalities/models/utils.py
@@ -3,7 +3,7 @@
from pydantic import BaseModel

from modalities.config.component_factory import ComponentFactory
from modalities.config.pydanctic_if_types import PydanticPytorchModuleType
from modalities.config.pydanctic_if_types import PydanticPytorchModuleType, PydanticTokenizerIFType
from modalities.registry.components import COMPONENTS
from modalities.registry.registry import Registry

@@ -54,3 +54,16 @@ class PydanticConfig(BaseModel):

    components = component_factory.build_components(config_dict=config, components_model_type=PydanticConfig)
    return getattr(components, model_type.value)


def get_tokenizer_from_config(config: dict, tokenizer_type: str):
    registry = Registry(COMPONENTS)
    component_factory = ComponentFactory(registry=registry)

    class PydanticConfig(BaseModel):
        tokenizer: PydanticTokenizerIFType

    components = component_factory.build_components(
        config_dict=config, components_model_type=PydanticConfig
    )
    return getattr(components, tokenizer_type)
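
For orientation, a minimal usage sketch of the new helper, assuming a YAML config whose top-level tokenizer section describes a registered tokenizer component (the path below is illustrative):

from modalities.config.config import load_app_config_dict
from modalities.models.utils import get_tokenizer_from_config

# Illustrative path; any config with a top-level "tokenizer" component section should work.
config = load_app_config_dict(config_file_path="configs/example.yaml")
tokenizer = get_tokenizer_from_config(config, tokenizer_type="tokenizer")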
27 changes: 26 additions & 1 deletion tests/checkpointing/test_checkpoint_conversion.py
@@ -93,7 +93,11 @@ def prediction_key() -> str:

@pytest.fixture()
def hf_model_from_checkpoint(
    checkpoint_conversion: CheckpointConversion,
    pytorch_model: NNModel,
    device: str,
    prediction_key: str,
    hf_model: NNModel,
) -> NNModel:
    AutoConfig.register(model_type="modalities", config=HFModelAdapterConfig)
    AutoModelForCausalLM.register(config_class=HFModelAdapterConfig, model_class=HFModelAdapter)
@@ -147,3 +151,24 @@ def test_models_before_and_after_conversion_are_equal(
    for p1, p2, p3 in zip(hf_model.parameters(), pytorch_model.parameters(), hf_model_from_checkpoint.parameters()):
        assert torch.equal(p1, p2)
        assert torch.equal(p1, p3)


@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires a GPU.")
def test_hf_model_can_generate(hf_model: AutoModelForCausalLM):
    assert hf_model.can_generate()


@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires a GPU.")
def test_hf_model_from_checkpoint_can_generate(hf_model_from_checkpoint: AutoModelForCausalLM):
    assert hf_model_from_checkpoint.can_generate()


@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires a GPU.")
def test_hf_model_and_hf_model_from_checkpoint_generate_same(
    hf_model: AutoModelForCausalLM,
    hf_model_from_checkpoint: AutoModelForCausalLM,
    test_tensor: torch.Tensor,
):
    res = hf_model.generate(test_tensor, max_length=20)
    res_from_checkpoint = hf_model_from_checkpoint.generate(test_tensor, max_length=20)
    assert (res == res_from_checkpoint).all()
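
The test_tensor fixture used by the generate-parity test is not part of this hunk. A plausible definition, given purely as an assumption for readability, is a small batch of token ids on the test device:

# Hypothetical fixture, not in this diff; shape and vocab bound are illustrative.
@pytest.fixture()
def test_tensor(device: str) -> torch.Tensor:
    return torch.randint(0, 100, size=(1, 10)).to(device)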
111 changes: 111 additions & 0 deletions tests/tokenization/test_tokenizer_parity.py
@@ -0,0 +1,111 @@
from pathlib import Path

import pytest
import sentencepiece as spm
from transformers import PreTrainedTokenizerFast

from modalities.config.config import load_app_config_dict
from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapterConfig, HFTokenizerAdapter


# Tokenize using SentencePiece
def tokenize_with_sp(sp_tokenizer, text: str):
    tokens = sp_tokenizer.encode(text, out_type=str)
    token_ids = sp_tokenizer.encode(text, out_type=int)
    decoded_text = sp_tokenizer.decode(token_ids)
    return tokens, token_ids, decoded_text


# Tokenize using Hugging Face
def tokenize_with_hf(hf_tokenizer, text):
    tokens = hf_tokenizer.tokenize(text)
    token_ids = hf_tokenizer.encode(text, add_special_tokens=False)
    decoded_text = hf_tokenizer.decode(token_ids)
    return tokens, token_ids, decoded_text


# Tokenize using the wrapper tokenizer
def tokenize_with_wrapper(wrapper_tokenizer, text):
    tokens = wrapper_tokenizer.tokenize(text)
    token_ids = wrapper_tokenizer.encode(text)
    decoded_text = wrapper_tokenizer.decode(token_ids)
    return tokens, token_ids, decoded_text


# Load SentencePiece tokenizer
def load_sp_tokenizer(sp_model_path):
    sp = spm.SentencePieceProcessor()
    sp.load(sp_model_path)
    return sp


@pytest.fixture
def sp_tokenizer_path():
    return "tests/tokenization/tokenizer_files/sp_tokenizer/en_32k_tokenizer.model"


# Fixtures for tokenizers
@pytest.fixture
def sp_tokenizer(sp_tokenizer_path: str):
    tokenizer = load_sp_tokenizer(sp_tokenizer_path)
    return tokenizer


@pytest.fixture
def hf_tokenizer_path():
    return "tests/tokenization/tokenizer_files/converted_to_hf_tokenizer"


@pytest.fixture
def hf_tokenizer(hf_tokenizer_path: str):
    tokenizer = PreTrainedTokenizerFast.from_pretrained(hf_tokenizer_path)
    return tokenizer


@pytest.fixture()
def config_file_path() -> Path:
    return Path("tests/tokenization/tokenizer_files/modalities_config/dclm_2_7B_50B_continue.yaml")


@pytest.fixture()
def config_dict(config_file_path: Path) -> dict:
    return load_app_config_dict(config_file_path=config_file_path)


@pytest.fixture
def wrapper_tokenizer(config_dict: dict):
    config_adapter = HFModelAdapterConfig(config=config_dict)
    tokenizer = HFTokenizerAdapter(config=config_adapter)
    return tokenizer


# Parametrized test function
@pytest.mark.parametrize("text", [
    "This is a simple sentence with punctuation! How does it handle commas, semicolons, and exclamation marks?",
    "URLs like https://www.example.com or ftp://server.org/test are quite common.",
    "Programming code: def tokenize(text): return text.split() # Python code as input.",
    "Special characters: ~!@#$%^&*()_+-={}|[]\\:\";'<>?,./` and spaces.",
    "Long sentence: In a land far, far away, there lived a programmer who loved tokenizers so much that they created thousands of tests, each weirder than the last, to ensure that every edge case imaginable was covered.",
    "Mathematical equations: E = mc^2 or f(x) = ax^2 + bx + c are common in technical text.",
    "Random string: ajsdkfhwjeio2340298hfsdjkf@@@!!!***.",
    "Numbers: 1234567890, 1,000,000, and 3.14159 are common in text as well.",
])
def test_tokenizations(sp_tokenizer: spm.SentencePieceProcessor, hf_tokenizer: PreTrainedTokenizerFast,
                       wrapper_tokenizer: HFTokenizerAdapter, text: str):
    # Tokenize using all tokenizers
    sp_data = tokenize_with_sp(sp_tokenizer, text)
    hf_data = tokenize_with_hf(hf_tokenizer, text)
    wrapper_data = tokenize_with_wrapper(wrapper_tokenizer, text)

    sp_tokens, sp_token_ids, sp_decoded = sp_data
    hf_tokens, hf_token_ids, hf_decoded = hf_data
    wrapper_tokens, wrapper_token_ids, wrapper_decoded = wrapper_data

    # Token Equivalence
    assert sp_tokens == hf_tokens == wrapper_tokens, f"Token mismatch for text: {text}"

    # Token ID Equivalence
    assert sp_token_ids == hf_token_ids == wrapper_token_ids, f"Token ID mismatch for text: {text}"

    # Round-Trip Text Parity
    assert sp_decoded == hf_decoded == wrapper_decoded, f"Round-trip text mismatch for text: {text}"
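
The suite is a plain pytest module, so it can presumably be run with the usual invocation:

pytest tests/tokenization/test_tokenizer_parity.py -v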
(new file; by its contents, the converted tokenizer's special tokens map, exact path not rendered in this view)
@@ -0,0 +1,7 @@
{
  "bos_token": "<BOS>",
  "eos_token": "<EOS>",
  "mask_token": "<MASK>",
  "pad_token": "<PAD>",
  "unk_token": "<UNK>"
}
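
This mapping is the file that PreTrainedTokenizerFast.from_pretrained reads to wire up special tokens. A quick sanity check, assuming the JSON belongs to the converted tokenizer directory used in the fixtures above:

from transformers import PreTrainedTokenizerFast

# Assumption: this special tokens map ships with the converted HF tokenizer from the fixtures.
tok = PreTrainedTokenizerFast.from_pretrained("tests/tokenization/tokenizer_files/converted_to_hf_tokenizer")
assert tok.bos_token == "<BOS>" and tok.eos_token == "<EOS>"
assert tok.pad_token == "<PAD>" and tok.unk_token == "<UNK>"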