diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e4e74caa2..ccc7b0c11d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ These are the section headers that we use: - Added strategy to handle and translate errors from the server for `401` HTTP status code` ([#4362](https://github.com/argilla-io/argilla/pull/4362)) - Added integration for `textdescriptives` using `TextDescriptivesExtractor` to configure `metadata_properties` in `FeedbackDataset` and `FeedbackRecord`. ([#4400](https://github.com/argilla-io/argilla/pull/4400)). Contributed by @m-newhauser - Added `POST /api/v1/me/responses/bulk` endpoint to create responses in bulk for current user. ([#4380](https://github.com/argilla-io/argilla/pull/4380)) +- Added list support for term metadata properties. (Closes [#4359](https://github.com/argilla-io/argilla/issues/4359)) - Added new CLI task to reindex datasets and records into the search engine. ([#4404](https://github.com/argilla-io/argilla/pull/4404)) ### Changed diff --git a/docs/_source/practical_guides/create_update_dataset/metadata.md b/docs/_source/practical_guides/create_update_dataset/metadata.md index 0a90dbaaa5..19560a3039 100644 --- a/docs/_source/practical_guides/create_update_dataset/metadata.md +++ b/docs/_source/practical_guides/create_update_dataset/metadata.md @@ -72,7 +72,11 @@ dataset.delete_metadata_properties(metadata_properties="groups") ### Format `metadata` -Record metadata can include any information about the record that is not part of the fields in the form of a dictionary. If you want the metadata to correspond with the metadata properties configured for your dataset so that these can be used for filtering and sorting records, make sure that the key of the dictionary corresponds with the metadata property `name`. 
When the key doesn't correspond, this will be considered extra metadata that will get stored with the record (as long as `allow_extra_metadata` is set to `True` for the dataset), but will not be usable for filtering and sorting. +Record metadata can include any information about the record that is not part of the fields in the form of a dictionary. If you want the metadata to correspond with the metadata properties configured for your dataset so that these can be used for filtering and sorting records, make sure that the key of the dictionary corresponds with the metadata property `name`. When the key doesn't correspond, this will be considered extra metadata that will get stored with the record (as long as `allow_extra_metadata` is set to `True` for the dataset), but will not be usable for filtering and sorting. Each metadata value is normally a single value (a string for a `TermsMetadataProperty`, a number for an `IntegerMetadataProperty` or `FloatMetadataProperty`). For a `TermsMetadataProperty`, you can also provide multiple values in the form of a list of strings. + +::::{tab-set} + +:::{tab-item} Single Metadata ```python record = rg.FeedbackRecord( @@ -80,10 +84,23 @@ record = rg.FeedbackRecord( metadata={"source": "encyclopedia", "text_length":150} ) ``` +::: + +:::{tab-item} Multiple Metadata +```python +record = rg.FeedbackRecord( + fields={...}, + metadata={"source": ["encyclopedia", "wikipedia"], "text_length":150} +) +``` + +::: + +:::: #### Add `metadata` -Once the `metadata_properties` were defined, to add metadata to the records, it slightly depends on whether you are using a `FeedbackDataset` or a `RemoteFeedbackDataset`. For an end-to-end example, check our [tutorial on adding metadata](/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-metadata-003.ipynb). +Once the `metadata_properties` were defined, to add metadata to the records, it slightly depends on whether you are using a `FeedbackDataset` or a `RemoteFeedbackDataset`.
For an end-to-end example, check our [tutorial on adding metadata](/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-metadata-003.ipynb). Remember that you can either define a single metadata value for a metadata property or aggregate metadata values for the `TermsMetadataProperty` in the form of a list for the cases where one record falls into multiple metadata categories. ```{note} The dataset not yet pushed to Argilla or pulled from HuggingFace Hub is an instance of `FeedbackDataset` whereas the dataset pulled from Argilla is an instance of `RemoteFeedbackDataset`. The difference between the two is that the former is a local one and the changes made on it stay locally. On the other hand, the latter is a remote one and the changes made on it are directly reflected on the dataset on the Argilla server, which can make your process faster. @@ -202,4 +219,4 @@ for record in dataset: record.metadata["my_metadata"] = "my_value" modified_records.append(record) rg.log(name="my_dataset", records=modified_records) -``` \ No newline at end of file +``` diff --git a/docs/_source/practical_guides/create_update_dataset/records.md b/docs/_source/practical_guides/create_update_dataset/records.md index 525793d18a..53178f83ca 100644 --- a/docs/_source/practical_guides/create_update_dataset/records.md +++ b/docs/_source/practical_guides/create_update_dataset/records.md @@ -22,7 +22,7 @@ After configuring a `FeedbackDataset`, as shown in the [previous guide](/practic record = rg.FeedbackRecord( fields={ "question": "Why can camels survive long without water?", - "answer": "Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time." + "answer": "Camels use the fat in their humps to keep them filled with energy and hydration for long periods." 
}, metadata={"source": "encyclopedia"}, vectors={"my_vector": [...], "my_other_vector": [...]}, @@ -46,7 +46,12 @@ record = rg.FeedbackRecord( ``` #### Format `metadata` -Record metadata can include any information about the record that is not part of the fields in the form of a dictionary. If you want the metadata to correspond with the metadata properties configured for your dataset so that these can be used for filtering and sorting records, make sure that the key of the dictionary corresponds with the metadata property `name`. When the key doesn't correspond, this will be considered extra metadata that will get stored with the record (as long as `allow_extra_metadata` is set to `True` for the dataset), but will not be usable for filtering and sorting. + +Record metadata can include any information about the record that is not part of the fields in the form of a dictionary. If you want the metadata to correspond with the metadata properties configured for your dataset so that these can be used for filtering and sorting records, make sure that the key of the dictionary corresponds with the metadata property `name`. When the key doesn't correspond, this will be considered extra metadata that will get stored with the record (as long as `allow_extra_metadata` is set to `True` for the dataset), but will not be usable for filtering and sorting. As well as adding one metadata property to a single record, you can also add aggregate metadata values for the `TermsMetadataProperty` in the form of a list. + +::::{tab-set} + +:::{tab-item} Single Metadata ```python record = rg.FeedbackRecord( @@ -54,6 +59,19 @@ record = rg.FeedbackRecord( metadata={"source": "encyclopedia", "text_length":150} ) ``` +::: + +:::{tab-item} Multiple Metadata +```python +record = rg.FeedbackRecord( + fields={...}, + metadata={"source": ["encyclopedia", "wikipedia"], "text_length":150} +) +``` + +::: + +:::: #### Format `vectors` You can associate vectors, like text embeddings, to your records. 
This will enable the [semantic search](filter_dataset.md#semantic-search) in the UI and the Python SDK. These are saved as a dictionary, where the keys correspond to the `name`s of the vector settings that were configured for your dataset and the value is a list of floats. Make sure that the length of the list corresponds to the dimensions set in the vector settings. @@ -510,4 +528,4 @@ rg.delete_records(name="example-dataset", query="metadata.code=33", discard_only ``` ::: -:::: \ No newline at end of file +:::: diff --git a/docs/_source/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-metadata-003.ipynb b/docs/_source/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-metadata-003.ipynb index 9ef26e6186..5724277807 100644 --- a/docs/_source/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-metadata-003.ipynb +++ b/docs/_source/tutorials_and_integrations/tutorials/feedback/end2end_examples/add-metadata-003.ipynb @@ -300,7 +300,7 @@ "\n", "### TermsMetadataProperty\n", "\n", - "The `TermsMetadaProperty` is a metadata property that can be used to filter the metadata of a record based on a list of possible terms or values." + "The `TermsMetadataProperty` is a metadata property that can be used to filter the metadata of a record based on a list of possible terms or values." ] }, { @@ -439,6 +439,31 @@ "dataset_remote.update_records(modified_records)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Aggregate Metadata Values\n", + "\n", + "In addition, we have the opportunity to add multiple metadata values for the `TermsMetadataProperty` to a single record. This is quite useful when a record falls into multiple categories. For the example case at hand, let us imagine that one of the records (or any number of them) is to be annotated by two groups. We can simply encode this information by giving a list of the metadata values. 
Let us see how it is done for the local `FeedbackDataset`; the process is just the same for the `RemoteFeedbackDataset` as above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset[1].metadata[\"group\"] = [\"group-1\", \"group-2\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have seen an example of how to add aggregate metadata values for `TermsMetadataProperty` here. Please note that list values are only supported for `TermsMetadataProperty`; `IntegerMetadataProperty` and `FloatMetadataProperty` still expect a single value." + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/argilla/client/feedback/constants.py b/src/argilla/client/feedback/constants.py index 784c12144c..1eb7a3be4a 100644 --- a/src/argilla/client/feedback/constants.py +++ b/src/argilla/client/feedback/constants.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
+from typing import List, Union from pydantic import StrictFloat, StrictInt, StrictStr @@ -23,7 +24,7 @@ FIELD_TYPE_TO_PYTHON_TYPE = {FieldTypes.text: str} # We are using `pydantic`'s strict types to avoid implicit type conversions METADATA_PROPERTY_TYPE_TO_PYDANTIC_TYPE = { - MetadataPropertyTypes.terms: StrictStr, + MetadataPropertyTypes.terms: Union[StrictStr, List[StrictStr]], MetadataPropertyTypes.integer: StrictInt, MetadataPropertyTypes.float: StrictFloat, } @@ -32,4 +33,5 @@ StrictInt: int, StrictFloat: float, StrictStr: str, + Union[StrictStr, List[StrictStr]]: (str, list), } diff --git a/src/argilla/client/feedback/schemas/metadata.py b/src/argilla/client/feedback/schemas/metadata.py index 5f88f3fc1b..b4ad1716bf 100644 --- a/src/argilla/client/feedback/schemas/metadata.py +++ b/src/argilla/client/feedback/schemas/metadata.py @@ -145,11 +145,21 @@ def server_settings(self) -> Dict[str, Any]: settings["values"] = self.values return settings - def _all_values_exist(self, introduced_value: Optional[str] = None) -> Optional[str]: - if introduced_value is not None and self.values is not None and introduced_value not in self.values: - raise ValueError( - f"Provided '{self.name}={introduced_value}' is not valid, only values in {self.values} are allowed." - ) + def _all_values_exist(self, introduced_value: Optional[Union[str, List[str]]] = None) -> Optional[str]: + if introduced_value is None or self.values is None: + return introduced_value + + if isinstance(introduced_value, str): + values = [introduced_value] + else: + values = introduced_value + + for value in values: + if value not in self.values: + raise ValueError( + f"Provided '{self.name}={value}' is not valid, only values in {self.values} are allowed." 
+ ) + return introduced_value def _validator(self, value: Any) -> Any: diff --git a/src/argilla/server/models/metadata_properties.py b/src/argilla/server/models/metadata_properties.py index 1e3d098328..7a80aaf36c 100644 --- a/src/argilla/server/models/metadata_properties.py +++ b/src/argilla/server/models/metadata_properties.py @@ -43,9 +43,17 @@ class TermsMetadataPropertySettings(BaseMetadataPropertySettings): type: Literal[MetadataPropertyType.terms] values: Optional[List[str]] = None - def check_metadata(self, value: str) -> None: - if self.values is not None and value not in self.values: - raise ValueError(f"'{value}' is not an allowed term.") + def check_metadata(self, value: Union[str, List[str]]) -> None: + if self.values is None: + return + + values = value + if isinstance(values, str): + values = [value] + + for v in values: + if v not in self.values: + raise ValueError(f"'{v}' is not an allowed term.") NT = TypeVar("NT", int, float) diff --git a/tests/unit/server/api/v1/test_records.py b/tests/unit/server/api/v1/test_records.py index e89e0e067b..2a031afbcc 100644 --- a/tests/unit/server/api/v1/test_records.py +++ b/tests/unit/server/api/v1/test_records.py @@ -276,6 +276,39 @@ async def test_update_record_with_no_metadata( } mock_search_engine.index_records.assert_not_called() + async def test_update_record_with_list_terms_metadata( + self, async_client: "AsyncClient", mock_search_engine: SearchEngine, owner_auth_header: dict + ): + dataset = await DatasetFactory.create() + await TermsMetadataPropertyFactory.create(name="terms-metadata-property", dataset=dataset) + record = await RecordFactory.create(dataset=dataset) + + response = await async_client.patch( + f"/api/v1/records/{record.id}", + headers=owner_auth_header, + json={ + "metadata": { + "terms-metadata-property": ["a", "b", "c"], + }, + }, + ) + + assert response.status_code == 200 + assert response.json() == { + "id": str(record.id), + "fields": {"text": "This is a text", "sentiment": 
"neutral"}, + "metadata": { + "terms-metadata-property": ["a", "b", "c"], + }, + "external_id": record.external_id, + "responses": [], + "suggestions": [], + "vectors": {}, + "inserted_at": record.inserted_at.isoformat(), + "updated_at": record.updated_at.isoformat(), + } + mock_search_engine.index_records.assert_called_once_with(dataset, [record]) + async def test_update_record_with_no_suggestions( self, async_client: "AsyncClient", db: "AsyncSession", mock_search_engine: SearchEngine, owner_auth_header: dict ):