From 3ba2f9ff2667bb17eb7a31ef1bc1fa029878cf38 Mon Sep 17 00:00:00 2001 From: Agus <56895847+plaguss@users.noreply.github.com> Date: Tue, 31 Oct 2023 17:00:25 +0100 Subject: [PATCH] feat: add `push_to_huggingface` method to the `ArgillaTrainer` (#3976) # Description This PR adds a new method `push_to_huggingface` to the `ArgillaTrainer` to simplify uploading the trained models to the [huggingface model hub](https://huggingface.co/models). This option is implemented for the following models: - [x] `transformers` - [x] `peft` - [x] `setfit` - [x] `spacy` - [x] `spacy-transformers` - [x] `trl` - [ ] `sentence_transformers` This framework doesn't work as of currently. The following message is written at the corresponding test: *This framework is not implemented yet. Cross-Encoder models don't implement the functionality* *for pushing a model to huggingface, and SentenceTransformer models have the functionality* *but is outdated and doesn't work with the current versions of 'huggingface-hub'.* *The present test is let here for the future, when we either implement the functionality* *in 'argilla', or 'sentence-transformers'.* - [ ] `openai` *Doesn't apply* Closes #3633 **Type of change** - [x] New feature (non-breaking change which adds functionality) - [x] Refactor (change restructuring the codebase without changing functionality) - [x] Improvement (change adding some improvement to an existing functionality) **How Has This Been Tested** (Please describe the tests that you ran to verify your changes. 
And ideally, reference `tests`) - [x] `tests/integration/client/feedback/training/test_openai.py` - [x] `tests/integration/client/feedback/training/test_sentence_transformers.py` - [x] `tests/integration/client/feedback/training/test_trainer.py` - [x] `tests/integration/client/feedback/training/test_trl.py` **Checklist** - [x] I added relevant documentation - [x] I followed the style guidelines of this project - [x] I did a self-review of my code - [ ] I made corresponding changes to the documentation - [x] My changes generate no new warnings - [x] I have added tests that prove my fix is effective or that my feature works - [ ] I filled out [the contributor form](https://tally.so/r/n9XrxK) (see text above) - [x] I have added relevant notes to the `CHANGELOG.md` file (See https://keepachangelog.com/) --------- Co-authored-by: David Berenstein Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 4 + docs/_source/practical_guides/fine_tune.md | 77 +++++++++++- environment_dev.yml | 1 + pyproject.toml | 1 + src/argilla/client/feedback/training/base.py | 53 ++++++++- .../feedback/training/frameworks/openai.py | 3 + .../feedback/training/frameworks/peft.py | 22 ++++ .../frameworks/sentence_transformers.py | 17 +++ .../feedback/training/frameworks/setfit.py | 22 +++- .../feedback/training/frameworks/spacy.py | 78 +++++++++++- .../training/frameworks/span_marker.py | 5 + .../training/frameworks/transformers.py | 25 ++++ .../feedback/training/frameworks/trl.py | 34 +++++- .../client/feedback/training/test_openai.py | 35 ++++++ .../training/test_sentence_transformers.py | 42 +++++++ .../client/feedback/training/test_trainer.py | 94 ++++++++++++++- .../client/feedback/training/test_trl.py | 112 +++++++++++++++++- tests/integration/conftest.py | 21 ++++ 18 files changed, 634 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d1879393b..e65722a282 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ 
-16,6 +16,10 @@ These are the section headers that we use: ## [Unreleased]() +### Added + +- Added functionality to push your models to huggingface hub with `ArgillaTrainer.push_to_huggingface` ([#3976](https://github.com/argilla-io/argilla/pull/3976)). + ### Fixed - Fix svg images out of screen with too large images ([#4047](https://github.com/argilla-io/argilla/pull/4047)) diff --git a/docs/_source/practical_guides/fine_tune.md b/docs/_source/practical_guides/fine_tune.md index fd60a92571..e418463d24 100644 --- a/docs/_source/practical_guides/fine_tune.md +++ b/docs/_source/practical_guides/fine_tune.md @@ -82,7 +82,15 @@ A `TrainingTask` is used to define how the data should be processed and formatte | for_direct_preference_optimization | `prompt-chosen-rejected` | `Union[Tuple[str, str, str], Iterator[Tuple[str, str, str]]]` | ✗ | | for_chat_completion | `chat-turn-role-content` | `Union[Tuple[str, str, str, str], Iterator[Tuple[str, str, str, str]]]`| ✗ | -#### Model card generation +#### Huggingface hub Integration + +This section presents some integrations with the huggingface 🤗[model hub](https://huggingface.co/docs/hub/models-the-hub), the easiest way to share your argilla models, as well as possibility to generate an automated model card. + +:::{note} +Take a look at the following [sample model](https://huggingface.co/plaguss/test_model) in the 🤗huggingface hub with the autogenerated model card, and check [https://huggingface.co/models?other=argilla](https://huggingface.co/models?other=argilla) for shared Argilla models to come. +::: + +##### Model card generation The `ArgillaTrainer` automatically generates a [model card](https://huggingface.co/docs/hub/model-cards) when saving the model. After calling `trainer.train(output_dir="my_model")`, you should see the model card under the same output dir you passed through the train method: `./my_model/README.md`. 
Most of the fields in the card are automatically generated when possible, but the following fields can be (optionally) updated via the `framework_kwargs` variable of the `ArgillaTrainer` like so: @@ -90,7 +98,6 @@ The `ArgillaTrainer` automatically generates a [model card](https://huggingface. model_card_kwargs = { "language": ["en", "es"], "license": "Apache-2.0", - "model_id": "all-MiniLM-L6-v2", "dataset_name": "argilla/emotion", "tags": ["nlp", "few-shot-learning", "argilla", "setfit"], "model_summary": "Small summary of what the model does", @@ -117,6 +124,72 @@ Even though its generated internally, you can get the card by calling the `gener argilla_model_card = trainer.generate_model_card("my_model") ``` +##### Upload your models to Huggingface Hub + + +If you don't have huggingface hub installed yet, you can do it with the following command: + +```console +pip install huggingface_hub +``` + +:::{note} + +If your chosen framework is `spacy` or `spacy-transformers` you should also install the following dependency: + +```console +pip install spacy-huggingface-hub +``` +::: + +And then select the environment, depending on whether you are working with a script or from a jupyter notebook: + +::::{tab-set} + +:::{tab-item} Console + +Run the following command from a console window and insert your 🤗huggingface hub token: + +```console +huggingface-cli login +``` + +::: + +:::{tab-item} Notebook + +Run the following command from a notebook cell and insert your 🤗huggingface hub token: + + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +::: + +:::: + +Internally, the token will be used when calling the `push_to_huggingface` method. + +Be sure to take a look at the huggingface hub +[requirements](https://huggingface.co/docs/hub/repositories-getting-started#requirements) in case you need more help publishing your models.
+ +After your model is trained, you just need to call `push_to_huggingface` and wait for your model to be pushed to the hub (by default, a model card will be generated, put the argument to `False` if you don't want it): + +```python +# spaCy based models: +repo_id = output_dir + +# Every other framework: +repo_id = "organization/model-name" # for example: argilla/newest-model + +trainer.push_to_huggingface(repo_id, generate_card=True) +``` + +Due to the spaCy behavior when pushing models, the repo_id is automatically generated internally, you need to pass the path to where the model was saved (the same `output_dir` variable you may pass to the `train` method), and it will work out just the same. + ### Tasks #### Text Classification diff --git a/environment_dev.yml b/environment_dev.yml index 49223ab2ad..7a2a37a437 100644 --- a/environment_dev.yml +++ b/environment_dev.yml @@ -47,6 +47,7 @@ dependencies: - spacy==3.5.3 - https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz - spacy-transformers>=1.2.5 + - spacy-huggingface-hub >= 0.0.10 - transformers[torch]>=4.30.0 # <- required for DPO with TRL - evaluate - seqeval diff --git a/pyproject.toml b/pyproject.toml index 579eb124e6..3902ab9ff2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,6 +100,7 @@ integrations = [ "snorkel >= 0.9.7", "spacy == 3.5.3", "spacy-transformers >= 1.2.5", + "spacy-huggingface-hub >= 0.0.10", "transformers[torch] >= 4.30.0", "evaluate", "seqeval", diff --git a/src/argilla/client/feedback/training/base.py b/src/argilla/client/feedback/training/base.py index 981406f179..ecce91804b 100644 --- a/src/argilla/client/feedback/training/base.py +++ b/src/argilla/client/feedback/training/base.py @@ -19,6 +19,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from huggingface_hub import HfApi + from argilla.client.feedback.schemas.records import FeedbackRecord from 
argilla.client.feedback.training.schemas import TrainingTaskForTextClassification, TrainingTaskTypes from argilla.client.models import Framework, TextClassificationRecord @@ -269,11 +271,11 @@ def save(self, output_dir: str, generate_card: bool = True) -> None: if generate_card: self.generate_model_card(output_dir) - def generate_model_card(self, output_dir: str) -> "ArgillaModelCard": + def generate_model_card(self, output_dir: Optional[str] = None) -> "ArgillaModelCard": """Generate and return a model card based on the model card data. Args: - output_dir: Folder where the model card will be written. + output_dir: If given, folder where the model card will be written. Returns: model_card: The model card. @@ -288,11 +290,46 @@ def generate_model_card(self, output_dir: str) -> "ArgillaModelCard": template_path=ArgillaModelCard.default_template_path, ) - model_card_path = Path(output_dir) / "README.md" - model_card.save(model_card_path) - self._logger.info(f"Model card generated at: {model_card_path}") + if output_dir: + model_card_path = Path(output_dir) / "README.md" + model_card.save(model_card_path) + self._logger.info(f"Model card generated at: {model_card_path}") + return model_card + def push_to_huggingface(self, repo_id: str, generate_card: Optional[bool] = True, **kwargs) -> None: + """Push your model to [huggingface's model hub](https://huggingface.co/models). + + Args: + repo_id: + The name of the repository you want to push your model and tokenizer to. + It should contain your organization name when pushing to a given organization. + generate_card: + Whether to generate (and push) a model card for your model. Defaults to True. + """ + if not kwargs.get("token"): + # Try obtaining the token with huggingface_hub utils as a last resort, or let it fail. + from huggingface_hub import HfFolder + + if token := HfFolder.get_token(): + kwargs["token"] = token + + # One last check for the tests. 
We use a different env var name + than the one gathered with HfFolder.get_token + if token := kwargs.get("token", os.environ.get("HF_HUB_ACCESS_TOKEN", None)): + kwargs["token"] = token + + url = self._trainer.push_to_huggingface(repo_id, **kwargs) + + if generate_card: + model_card = self.generate_model_card() + # For spacy based models, overwrite the repo_id with the url variable returned + # from its trainer. + if getattr(self._trainer, "language", None): + repo_id = url + + model_card.push_to_hub(repo_id, repo_type="model", token=kwargs["token"]) + class ArgillaTrainerSkeleton(ABC): def __init__( @@ -360,3 +397,9 @@ def get_model_card_data(self, card_data_kwargs: Dict[str, Any]) -> "FrameworkCar """ Generates a `FrameworkCardData` instance to generate a model card from. """ + + @abstractmethod + def push_to_huggingface(self, repo_id: str, **kwargs) -> Optional[str]: + """ + Uploads the model to [Huggingface Hub](https://huggingface.co/docs/hub/models-the-hub). + """ diff --git a/src/argilla/client/feedback/training/frameworks/openai.py b/src/argilla/client/feedback/training/frameworks/openai.py index 0bbe7a1d17..fdd96a367d 100644 --- a/src/argilla/client/feedback/training/frameworks/openai.py +++ b/src/argilla/client/feedback/training/frameworks/openai.py @@ -67,3 +67,6 @@ def get_model_card_data(self, **card_data_kwargs) -> "OpenAIModelCardData": task=self._task, **card_data_kwargs, ) + + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + raise NotImplementedError("This method is not implemented for `ArgillaOpenAITrainer`.") diff --git a/src/argilla/client/feedback/training/frameworks/peft.py b/src/argilla/client/feedback/training/frameworks/peft.py index 6a6b76de23..cd81c60d8d 100644 --- a/src/argilla/client/feedback/training/frameworks/peft.py +++ b/src/argilla/client/feedback/training/frameworks/peft.py @@ -16,6 +16,7 @@ from argilla.client.feedback.training.frameworks.transformers import ArgillaTransformersTrainer from argilla.training.peft
import ArgillaPeftTrainer as ArgillaPeftTrainerV1 +from argilla.utils.dependency import requires_dependencies if TYPE_CHECKING: from argilla.client.feedback.integrations.huggingface.model_card import PeftModelCardData @@ -43,3 +44,24 @@ def get_model_card_data(self, **card_data_kwargs) -> "PeftModelCardData": update_config_kwargs=self.lora_kwargs, **card_data_kwargs, ) + + @requires_dependencies("huggingface_hub") + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + """Uploads the model to [huggingface's model hub](https://huggingface.co/models). + + The full list of parameters can be seen at: + [huggingface_hub](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.ModelHubMixin.push_to_hub). + + Args: + repo_id: + The name of the repository you want to push your model and tokenizer to. + It should contain your organization name when pushing to a given organization. + """ + if not self._transformers_model: + raise ValueError( + "The model must be initialized prior to this point. You can either call `train` or `init_model`." 
+ ) + model_url = self._transformers_model.push_to_hub(repo_id, **kwargs) + self._logger.info(f"Model pushed to: {model_url}") + tokenizer_url = self._transformers_tokenizer.push_to_hub(repo_id, **kwargs) + self._logger.info(f"Tokenizer pushed to: {tokenizer_url}") diff --git a/src/argilla/client/feedback/training/frameworks/sentence_transformers.py b/src/argilla/client/feedback/training/frameworks/sentence_transformers.py index 610b15506b..86d0d1dc8e 100644 --- a/src/argilla/client/feedback/training/frameworks/sentence_transformers.py +++ b/src/argilla/client/feedback/training/frameworks/sentence_transformers.py @@ -367,3 +367,20 @@ def get_model_card_data(self, **card_data_kwargs) -> "SentenceTransformerCardDat trainer_cls=self._trainer_cls, **card_data_kwargs, ) + + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + """Uploads the model to [huggingface's model hub](https://huggingface.co/models). + + The full list of parameters can be seen at: + [sentence-transformer api docs](https://www.sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.save_to_hub). + + Args: + repo_id: + The name of the repository you want to push your model and tokenizer to. + It should contain your organization name when pushing to a given organization. + + Raises: + NotImplementedError: + For `CrossEncoder` models, that currently aren't implemented underneath. 
+ """ + raise NotImplementedError("This method is not implemented for `ArgillaSentenceTransformersTrainer`.") diff --git a/src/argilla/client/feedback/training/frameworks/setfit.py b/src/argilla/client/feedback/training/frameworks/setfit.py index 265a2ceb13..d5223bfa45 100644 --- a/src/argilla/client/feedback/training/frameworks/setfit.py +++ b/src/argilla/client/feedback/training/frameworks/setfit.py @@ -18,7 +18,7 @@ from argilla.client.feedback.training.frameworks.transformers import ArgillaTransformersTrainer from argilla.client.models import TextClassificationRecord from argilla.training.setfit import ArgillaSetFitTrainer as ArgillaSetFitTrainerV1 -from argilla.utils.dependency import require_dependencies +from argilla.utils.dependency import require_dependencies, requires_dependencies if TYPE_CHECKING: from argilla.client.feedback.integrations.huggingface.model_card import SetFitModelCardData @@ -66,3 +66,23 @@ def get_model_card_data(self, **card_data_kwargs) -> "SetFitModelCardData": update_config_kwargs={**self.setfit_model_kwargs, **self.setfit_trainer_kwargs}, **card_data_kwargs, ) + + @requires_dependencies("huggingface_hub") + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + """Uploads the model to [huggingface's model hub](https://huggingface.co/models). + + The full list of parameters can be seen at: + [huggingface_hub](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.ModelHubMixin.push_to_hub). + + Args: + repo_id: + The name of the repository you want to push your model and tokenizer to. + It should contain your organization name when pushing to a given organization. + + Raises: + NotImplementedError: If the model doesn't exist, meaning it hasn't been instantiated yet. + """ + if not self.__trainer: + raise ValueError("The `trainer` must be initialized prior to this point. 
You should call `train`.") + url = self.__trainer.push_to_hub(repo_id, **kwargs) + self._logger.info(f"Model pushed to: {url}") diff --git a/src/argilla/client/feedback/training/frameworks/spacy.py b/src/argilla/client/feedback/training/frameworks/spacy.py index 0833a006a4..2820861135 100644 --- a/src/argilla/client/feedback/training/frameworks/spacy.py +++ b/src/argilla/client/feedback/training/frameworks/spacy.py @@ -13,6 +13,8 @@ # limitations under the License. import logging +from pathlib import Path +from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Optional from typing_extensions import Literal @@ -22,7 +24,7 @@ from argilla.training.spacy import ArgillaSpaCyTrainer as ArgillaSpaCyTrainerV1 from argilla.training.spacy import ArgillaSpaCyTransformersTrainer as ArgillaSpaCyTransformersTrainerV1 from argilla.training.spacy import _ArgillaSpaCyTrainerBase as _ArgillaSpaCyTrainerBaseV1 -from argilla.utils.dependency import require_dependencies +from argilla.utils.dependency import require_dependencies, requires_dependencies if TYPE_CHECKING: from argilla.client.feedback.integrations.huggingface.model_card import ( @@ -139,6 +141,80 @@ def __init__( self.init_training_args() + @requires_dependencies("spacy-huggingface-hub") + def push_to_huggingface(self, output_dir: str, **kwargs) -> str: + r"""Uploads the model to [huggingface's model hub](https://huggingface.co/models). + + With spacy we don't need the `repo_id` as in the other frameworks, that + variable is generated internally by `spacy_huggingface_hub`, we need the + path to the nlp pipeline to package it and push it. + + See Also: + The optional arguments are the following: + namespace: Name of organization to which the pipeline should be uploaded. + commit_msg: Commit message to use for update + verbose: Output additional info for debugging, e.g. the full generated hub metadata. + + Args: + output_dir: The same path passed to `save` method. 
The path where the nlp pipeline + should be saved to. + + Returns: + model_name: + The model name will be used in the base trainer to find the repo to push the model card. + If the url of a model is: https://huggingface.co//, + pass /. + """ + from spacy.cli.package import package + from spacy_huggingface_hub import push + + if self._nlp is None: + raise ValueError( + "No pipeline was initialized, you must call either `init_model` or `train` before calling this method." + ) + + output_dir = Path(output_dir) + with TemporaryDirectory() as tmpdirname: + output_dir_pkg = Path(tmpdirname) / "spacy-packaged" + output_dir_pkg.mkdir(exist_ok=True, parents=True) + + if not output_dir.is_dir(): + raise ValueError( + f"output_dir: '{output_dir.resolve()}' doesn't exist, you must pass the path to the folder of the trained model." + ) + self._logger.info("Packaging nlp pipeline") + package( + input_dir=output_dir.resolve(), + output_dir=output_dir_pkg, + create_sdist=False, + create_wheel=True, + name=output_dir.stem, + ) + self._logger.info(f"spacy pipeline packaged at: {output_dir_pkg}") + + # The following line obtains the full path to the .whl file: + # The output dir contains a single package name. Inside this package + # there will be a `dist` folder containing the packages. As we always + # force to generate only the `wheel` package option, there we can only + # find the .whl file. In case we generated both the wheel and sdist, + # we could find it by getting the file with .whl extension. 
+ whl_path = next((next(output_dir_pkg.iterdir()) / "dist").iterdir()) + # Remove unused parameters from push to avoid errors: + expected_kwargs = set(("namespace", "commit_msg", "silent", "verbose")) + for kw in tuple(kwargs.keys()): + if kw not in expected_kwargs: + kwargs.pop(kw) + + self._logger.info(f"Pushing: {whl_path} to huggingface hub.") + result = push(whl_path, **kwargs) + url = result["url"] + + self._logger.info(f"Model pushed to: {url}") + # Passing the model name generated with spacy-huggingface-hub to use + # it in the base ArgillaTrainer, it's easier to grab + # the generated repo name than forcing the user to pass an argument. + return url.replace("https://huggingface.co/", "") + class ArgillaSpaCyTrainer(ArgillaSpaCyTrainerV1, _ArgillaSpaCyTrainerBase): def __init__(self, freeze_tok2vec: bool = False, **kwargs) -> None: diff --git a/src/argilla/client/feedback/training/frameworks/span_marker.py b/src/argilla/client/feedback/training/frameworks/span_marker.py index 46e687706c..df8bae6f11 100644 --- a/src/argilla/client/feedback/training/frameworks/span_marker.py +++ b/src/argilla/client/feedback/training/frameworks/span_marker.py @@ -66,3 +66,8 @@ def get_model_card_data(self, **card_data_kwargs) -> "FrameworkCardData": raise NotImplementedError( "This method has to be implemented after `FeedbackDataset` allows for token classification." ) + + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + raise NotImplementedError( + "This method has to be implemented after `FeedbackDataset` allows for token classification."
+ ) diff --git a/src/argilla/client/feedback/training/frameworks/transformers.py b/src/argilla/client/feedback/training/frameworks/transformers.py index ae26f115c6..48ddd550c6 100644 --- a/src/argilla/client/feedback/training/frameworks/transformers.py +++ b/src/argilla/client/feedback/training/frameworks/transformers.py @@ -19,6 +19,7 @@ from argilla.client.feedback.training.base import ArgillaTrainerSkeleton from argilla.client.feedback.training.schemas import TrainingTaskForQuestionAnswering, TrainingTaskForTextClassification from argilla.training.transformers import ArgillaTransformersTrainer as ArgillaTransformersTrainerV1 +from argilla.utils.dependency import requires_dependencies if TYPE_CHECKING: from argilla.client.feedback.integrations.huggingface.model_card import TransformersModelCardData @@ -100,3 +101,27 @@ def get_model_card_data(self, **card_data_kwargs) -> "TransformersModelCardData" update_config_kwargs=self.trainer_kwargs, **card_data_kwargs, ) + + @requires_dependencies("huggingface_hub") + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + """Uploads the transformer model and tokenizer to [huggingface's model hub](https://huggingface.co/models). + + The full list of parameters can be seen at: + [huggingface_hub](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.ModelHubMixin.push_to_hub). + + Args: + repo_id: + The name of the repository you want to push your model and tokenizer to. + It should contain your organization name when pushing to a given organization. + + Raises: + ValueError: If the model doesn't exist, meaning it hasn't been instantiated yet. + """ + if not self._transformers_model: + raise ValueError( + "The model must be initialized prior to this point. You can either call `train` or `init_model`."
+ ) + model_url = self._transformers_model.push_to_hub(repo_id, **kwargs) + self._logger.info(f"Model pushed to: {model_url}") + tokenizer_url = self._transformers_tokenizer.push_to_hub(repo_id, **kwargs) + self._logger.info(f"Tokenizer pushed to: {tokenizer_url}") diff --git a/src/argilla/client/feedback/training/frameworks/trl.py b/src/argilla/client/feedback/training/frameworks/trl.py index 52c6af9ed0..d2a08f7e5b 100644 --- a/src/argilla/client/feedback/training/frameworks/trl.py +++ b/src/argilla/client/feedback/training/frameworks/trl.py @@ -23,7 +23,7 @@ TrainingTaskForSFT, ) from argilla.training.utils import filter_allowed_args -from argilla.utils.dependency import require_dependencies +from argilla.utils.dependency import require_dependencies, requires_dependencies if TYPE_CHECKING: import transformers @@ -419,3 +419,35 @@ def get_model_card_data(self, **card_data_kwargs) -> "TRLModelCardData": update_config_kwargs={**self.training_args_kwargs, **self.trainer_kwargs}, **card_data_kwargs, ) + + @requires_dependencies("huggingface_hub") + def push_to_huggingface(self, repo_id: str, **kwargs) -> None: + """Uploads the transformer model and tokenizer to [huggingface's model hub](https://huggingface.co/models). + + The full list of parameters for PPO can be seen: + [here](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.ModelHubMixin.push_to_hub) + and for the remaining types of trainers, + [here](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.push_to_hub). + + Args: + repo_id: + The name of the repository you want to push your model and tokenizer to. + It should contain your organization name when pushing to a given organization. + + Raises: + NotImplementedError: If the model doesn't exist, meaning it hasn't been instantiated yet. 
+ """ + if isinstance(self._task, TrainingTaskForPPO): + # `PPOTrainer` inherits from `trl.trainer.BaseTrainer` + model_url = self._trainer.push_to_hub(repo_id, **kwargs) + else: + # The remaining models inherit from `transformers.Trainer`. + # The `Trainer` itself generates the repo_id variable using the args + # passed to the `TrainingArguments`. To keep a similar API we overwrite + # the args.hub_model_id with the argument repo_id, and the repo_id + # will be obtained from there. + self._trainer.args.hub_model_id = repo_id + model_url = self._trainer.model.push_to_hub(repo_id, **kwargs) + tokenizer_url = self._trainer.tokenizer.push_to_hub(repo_id, **kwargs) + + self._logger.info(f"Model pushed to: {model_url}") diff --git a/tests/integration/client/feedback/training/test_openai.py b/tests/integration/client/feedback/training/test_openai.py index d71873ab04..6bd12528a1 100644 --- a/tests/integration/client/feedback/training/test_openai.py +++ b/tests/integration/client/feedback/training/test_openai.py @@ -17,6 +17,7 @@ import argilla as rg import pytest from argilla.client.feedback.training.schemas import TrainingTaskForChatCompletionFormat +from argilla.feedback import ArgillaTrainer, FeedbackDataset, TrainingTask from tests.integration.client.feedback.helpers import formatting_func_chat_completion @@ -55,3 +56,37 @@ def test_training_task_for_chat_completion(mocked_openai): framework="openai", ) trainer.train("mock") + + +def test_push_to_huggingface(mocked_openai): + dataset = FeedbackDataset.from_huggingface("argilla/customer_assistant") + # adapation from LlamaIndex's TEXT_QA_PROMPT_TMPL_MSGS[1].content + user_message_prompt = """Context information is below. + --------------------- + {context_str} + --------------------- + Given the context information and not prior knowledge but keeping your Argilla Cloud assistant style, answer the query. 
+ Query: {query_str} + Answer: + """ + # adapation from LlamaIndex's TEXT_QA_SYSTEM_PROMPT + system_prompt = """You are an expert customer service assistant for the Argilla Cloud product that is trusted around the world.""" + + def formatting_func(sample: dict): + from uuid import uuid4 + + if sample["response"]: + chat = str(uuid4()) + user_message = user_message_prompt.format(context_str=sample["context"], query_str=sample["user-message"]) + return [ + (chat, "0", "system", system_prompt), + (chat, "1", "user", user_message), + (chat, "2", "assistant", sample["response"][0]["value"]), + ] + else: + return None + + task = TrainingTask.for_chat_completion(formatting_func=formatting_func) + trainer = ArgillaTrainer(dataset=dataset, task=task, framework="openai") + with pytest.raises(NotImplementedError, match="This method is not implemented for `ArgillaOpenAITrainer`."): + trainer.push_to_huggingface("mocked", generate_card=True) diff --git a/tests/integration/client/feedback/training/test_sentence_transformers.py b/tests/integration/client/feedback/training/test_sentence_transformers.py index 98c5681704..439c154d16 100644 --- a/tests/integration/client/feedback/training/test_sentence_transformers.py +++ b/tests/integration/client/feedback/training/test_sentence_transformers.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json from typing import TYPE_CHECKING, Callable, List, Union import pytest @@ -206,3 +207,44 @@ def test_prepare_for_training_sentence_transformers_with_defaults( assert len(eval_trainer.predict([["first sentence", "second sentence"], ["to compare", "another one"]])) == 2 assert len(eval_trainer.predict(["first sentence", ["to compare", "another one"]])) == 2 + + +@pytest.mark.usefixtures( + "feedback_dataset_guidelines", + "feedback_dataset_fields", + "feedback_dataset_questions", + "feedback_dataset_records", +) +def test_push_to_huggingface( + feedback_dataset_guidelines: str, + feedback_dataset_fields: List["AllowedFieldTypes"], + feedback_dataset_questions: List["AllowedQuestionTypes"], + feedback_dataset_records: List[FeedbackRecord], + mocked_trainer_push_to_huggingface, +) -> None: + # This framework is not implemented yet. Cross-Encoder models don't implement the functionality + # for pushing a model to huggingface, and SentenceTransformer models have the functionality + # but is outdated and doesn't work with the current versions of 'huggingface-hub'. + # The present test is let here for the future, when we either implement the functionality + # in 'argilla', or to 'sentence-transformers'. + + dataset = FeedbackDataset( + guidelines=feedback_dataset_guidelines, + fields=feedback_dataset_fields, + questions=feedback_dataset_questions, + ) + dataset.add_records(records=feedback_dataset_records * 2) + + task = TrainingTask.for_sentence_similarity(formatting_func=formatting_func_sentence_transformers) + + model = "all-MiniLM-L6-v2" + + trainer = ArgillaTrainer(dataset=dataset, task=task, framework=__FRAMEWORK__, model=model) + + trainer.update_config(max_steps=1) + + train_with_cleanup(trainer, __OUTPUT_DIR__) + with pytest.raises( + NotImplementedError, match="This method is not implemented for `ArgillaSentenceTransformersTrainer`." 
+ ): + trainer.push_to_huggingface("mocked", generate_card=True) diff --git a/tests/integration/client/feedback/training/test_trainer.py b/tests/integration/client/feedback/training/test_trainer.py index 60618287ee..56ceef8639 100644 --- a/tests/integration/client/feedback/training/test_trainer.py +++ b/tests/integration/client/feedback/training/test_trainer.py @@ -46,8 +46,16 @@ from argilla.client.models import Framework from transformers import AutoModelForSequenceClassification, AutoTokenizer +from tests.integration.training.helpers import train_with_cleanup + __OUTPUT_DIR__ = "tmp" +# To mimick the tests from huggingface_hub: https://github.com/huggingface/huggingface_hub/blob/v0.18.0.rc0/tests/testing_constants.py +HF_HUB_CONSTANTS = { + "HF_HUB_ENDPOINT_STAGING": "https://hub-ci.huggingface.co", + "HF_HUB_TOKEN": "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL", +} + @pytest.mark.parametrize( "framework", @@ -55,7 +63,6 @@ Framework("spacy"), Framework("spacy-transformers"), Framework("transformers"), - Framework("spark-nlp"), Framework("span_marker"), Framework("setfit"), Framework("peft"), @@ -365,3 +372,88 @@ def test_tokenizer_warning_wrong_framework( tokenizer = AutoTokenizer.from_pretrained("gpt2") with pytest.warns(UserWarning, match="Passing a tokenizer is not supported for the setfit framework."): ArgillaTrainer(dataset=dataset, task=task, framework="setfit", tokenizer=tokenizer) + + +@pytest.mark.parametrize( + "framework", + [ + # Framework("spacy"), + # Framework("spacy-transformers"), + # Framework("transformers"), + # Framework("setfit"), + # Framework("peft"), + # The FeedbackDataset needs to work with token classification for this framework to work. 
+ Framework("span_marker"), + ], +) +@pytest.mark.usefixtures( + "feedback_dataset_guidelines", + "feedback_dataset_fields", + "feedback_dataset_questions", + "feedback_dataset_records", +) +def test_push_to_huggingface( + framework: Union[Framework, str], + feedback_dataset_guidelines: str, + feedback_dataset_fields: List["AllowedFieldTypes"], + feedback_dataset_questions: List["AllowedQuestionTypes"], + feedback_dataset_records: List[FeedbackRecord], + mocked_trainer_push_to_huggingface, +) -> None: + dataset = FeedbackDataset( + guidelines=feedback_dataset_guidelines, + fields=feedback_dataset_fields, + questions=feedback_dataset_questions, + ) + dataset.add_records(records=feedback_dataset_records * 2) + + questions = [ + question for question in dataset.questions if isinstance(question, (LabelQuestion, MultiLabelQuestion)) + ] + label = LabelQuestionUnification(question=questions[0]) + task = TrainingTask.for_text_classification(text=dataset.fields[0], label=label) + + if framework == Framework("span_marker"): + with pytest.raises( + NotImplementedError, + match=f"Framework {framework} is not supported for this {TrainingTaskForTextClassification}.", + ): + ArgillaTrainer(dataset=dataset, task=task, framework=framework) + return + + else: + if framework == Framework("spacy"): + model = "en_core_web_sm" + elif framework == Framework("setfit"): + model = "all-MiniLM-L6-v2" + else: + model = "prajjwal1/bert-tiny" + + trainer = ArgillaTrainer(dataset=dataset, task=task, framework=framework, model=model) + + # We need to initialize the model (is faster than calling the whole training process) before calling push_to_huggingface. + # The remaining models need to call the train method first. 
+ repo_id = "mocked" + if framework in (Framework("transformers"), Framework("peft")): + trainer.update_config(num_iterations=1) + trainer._trainer.init_model(new=True) + elif framework in (Framework("setfit"), Framework("spacy"), Framework("spacy-transformers")): + if framework in (Framework("spacy"), Framework("spacy-transformers")): + trainer.update_config(max_steps=1) + repo_id = __OUTPUT_DIR__ + else: + trainer.update_config(num_iterations=1) + else: + trainer._trainer.init_model() + + # We have to train the model and push it with spacy before removing the + # generated folder, as it needs to be packaged. + if framework in (Framework("spacy"), Framework("spacy-transformers")): + trainer.train(__OUTPUT_DIR__) + else: + train_with_cleanup(trainer, __OUTPUT_DIR__) + + # This functionality is mocked, no need to check the generated card too. + trainer.push_to_huggingface(repo_id, generate_card=False) + if Path(__OUTPUT_DIR__).exists(): + shutil.rmtree(__OUTPUT_DIR__) diff --git a/tests/integration/client/feedback/training/test_trl.py b/tests/integration/client/feedback/training/test_trl.py index 3d1deecee3..7ca305b941 100644 --- a/tests/integration/client/feedback/training/test_trl.py +++ b/tests/integration/client/feedback/training/test_trl.py @@ -15,7 +15,7 @@ import os import re from collections import Counter -from typing import TYPE_CHECKING, Any, Dict, Iterator, List +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List import pytest from argilla.client.feedback.dataset import FeedbackDataset @@ -58,6 +58,24 @@ def try_wrong_format(dataset, task, format_func: Any) -> None: trainer.train(OUTPUT_DIR) +def formatting_func_sft(sample: Dict[str, Any]) -> Iterator[str]: + # For example, the sample must be most frequently rated as "1" in question-2 and + # label "b" from "question-3" must have not been set by any annotator + ratings = [ + annotation["value"] + for annotation in sample["question-2"] + if annotation["status"] == "submitted" and 
annotation["value"] is not None + ] + labels = [ + annotation["value"] + for annotation in sample["question-3"] + if annotation["status"] == "submitted" and annotation["value"] is not None + ] + if ratings and Counter(ratings).most_common(1)[0][0] == 1 and "b" not in labels: + return f"### Text\n{sample['text']}" + return None + + def test_prepare_for_training_sft( feedback_dataset_guidelines: str, feedback_dataset_fields: List["AllowedFieldTypes"], @@ -109,6 +127,23 @@ def test_prepare_for_training_sft( assert trainer._trainer._transformers_tokenizer.test_value == 12 +def formatting_func_rm(sample: Dict[str, Any]): + # The FeedbackDataset isn't really set up for RM, so we'll just use an arbitrary example here + labels = [ + annotation["value"] + for annotation in sample["question-3"] + if annotation["status"] == "submitted" and annotation["value"] is not None + ] + if labels: + # Three cases for the tests: None, one tuple and yielding multiple tuples + if labels[0] == "a": + return None + elif labels[0] == "b": + return sample["text"], sample["text"][:5] + elif labels[0] == "c": + return [(sample["text"], sample["text"][5:10]), (sample["text"], sample["text"][:5])] + + def test_prepare_for_training_rm( feedback_dataset_guidelines: str, feedback_dataset_fields: List["AllowedFieldTypes"], @@ -160,6 +195,10 @@ def test_prepare_for_training_rm( assert trainer._trainer._transformers_tokenizer.test_value == 12 +def formatting_func_ppo(sample: Dict[str, Any]): + return sample["text"] + + def test_prepare_for_training_ppo( feedback_dataset_guidelines: str, feedback_dataset_fields: List["AllowedFieldTypes"], @@ -222,6 +261,26 @@ def test_prepare_for_training_ppo( assert trainer._trainer._transformers_tokenizer.test_value == 12 +def formatting_func_dpo(sample: Dict[str, Any]): + # The FeedbackDataset isn't really set up for DPO, so we'll just use an arbitrary example here + labels = [ + annotation["value"] + for annotation in sample["question-3"] + if annotation["status"] 
== "submitted" and annotation["value"] is not None + ] + if labels: + # Three cases for the tests: None, one tuple and yielding multiple tuples + if labels[0] == "a": + return None + elif labels[0] == "b": + return sample["text"][::-1], sample["text"], sample["text"][:5] + elif labels[0] == "c": + return [ + (sample["text"], sample["text"][::-1], sample["text"][:5]), + (sample["text"][::-1], sample["text"], sample["text"][:5]), + ] + + def test_prepare_for_training_dpo( feedback_dataset_guidelines: str, feedback_dataset_fields: List["AllowedFieldTypes"], @@ -303,3 +362,54 @@ def test_sft_with_peft( trainer.train(tmp_path) assert "adapter_config.json" in os.listdir(tmp_path) assert "adapter_model.bin" in os.listdir(tmp_path) + + +# @pytest.mark.slow +@pytest.mark.parametrize( + "formatting_func, training_task", + ( + (formatting_func_sft, TrainingTask.for_supervised_fine_tuning), + (formatting_func_rm, TrainingTask.for_reward_modeling), + (formatting_func_ppo, TrainingTask.for_proximal_policy_optimization), + (formatting_func_dpo, TrainingTask.for_direct_preference_optimization), + ), +) +@pytest.mark.usefixtures( + "feedback_dataset_guidelines", + "feedback_dataset_fields", + "feedback_dataset_questions", + "feedback_dataset_records", +) +def test_push_to_huggingface( + formatting_func: Callable, + training_task: Callable, + feedback_dataset_guidelines: str, + feedback_dataset_fields: List["AllowedFieldTypes"], + feedback_dataset_questions: List["AllowedQuestionTypes"], + feedback_dataset_records: List[FeedbackRecord], + mocked_trainer_push_to_huggingface, +) -> None: + dataset = FeedbackDataset( + guidelines=feedback_dataset_guidelines, + fields=feedback_dataset_fields, + questions=feedback_dataset_questions, + ) + dataset.add_records(records=feedback_dataset_records * 2) + + task = training_task(formatting_func) + model = "sshleifer/tiny-gpt2" + + trainer = ArgillaTrainer(dataset=dataset, task=task, framework="trl", model=model) + + if training_task == 
TrainingTask.for_proximal_policy_optimization: + from transformers import pipeline + from trl import PPOConfig + + reward_model = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb") + trainer.update_config(config=PPOConfig(batch_size=1, ppo_epochs=2), reward_model=reward_model) + else: + trainer.update_config(max_steps=1) + + train_with_cleanup(trainer, OUTPUT_DIR) + + trainer.push_to_huggingface("mocked", generate_card=False) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 8805e50a6a..f712aa81dd 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -356,3 +356,24 @@ def mocked_openai(mocker): mocker.patch("openai.FineTuningJob.create", return_value=response) mocker.patch("openai.FineTune.create", return_value=response) mocker.patch("openai.File.create", return_value=response) + + +@pytest.fixture +def mocked_trainer_push_to_huggingface(mocker: "MockerFixture"): + # Mock the push_to_huggingface methods for the different trainers, + # most of the functionality is already tested by the frameworks themselves.
+ # For transformers' model and tokenizer + mocker.patch("transformers.PreTrainedModel.push_to_hub", return_value="model_url") + mocker.patch("transformers.PreTrainedTokenizer.push_to_hub", return_value="model_url") + # For setfit + mocker.patch("setfit.trainer.SetFitTrainer.push_to_hub", return_value="model_url") + # For peft + mocker.patch("peft.PeftModel.push_to_hub", return_value="model_url") + mocker.patch("transformers.PreTrainedTokenizerBase.push_to_hub", return_value="model_url") + # For spacy and spacy-transformers + mocker.patch("spacy_huggingface_hub.push", return_value={"url": "model_url"}) + # For trl + mocker.patch("trl.trainer.sft_trainer.SFTTrainer.push_to_hub", return_value="model_url") + mocker.patch("trl.trainer.reward_trainer.RewardTrainer.push_to_hub", return_value="model_url") + mocker.patch("trl.trainer.base.BaseTrainer.push_to_hub", return_value="model_url") + mocker.patch("trl.trainer.dpo_trainer.DPOTrainer.push_to_hub", return_value="model_url")