Feature to export dataset records as generic python structures (#49)

* test: add integration tests for updating records * feat: add basic update logic to records api * feat: implement update filter logic in dataset records * fix: excess import in suggestion model * feat: add from model class method to response resource * feat: implement suggestion as complete resource with from_model method and properties * feat: implement Record as resource with containers * refactor: restructure records in submodules * refactor: use RecordModel and ResponseModel in record api * test: update tests to use fields and suggestions attributes * feat: implement fields and suggestions attributes within RecordFields and RecordSuggestions * chore: expand record ingestion function with logging and validation * fix: catch missing datasets in erroneous suggestions * chore: document and refactor record resource * chore: refactor and delete submodule records.utils * chore: delete redundant questions module * chore: licensing in records module * test: revise test for new responses as attribute model * feat: implement sorter in ingestion function to make either response or suggestion * feat: add flags for as_suggestion to DatasetRecords to specify suggestion or responses coming in * feat: add core properties to response resource * chore: log dataset name in add and update records * chore: reduce logging every record * chore: naming and formatting in Record * fix: question_name typing in suggestion * chore: delete excess dataset_records code from dataset module * chore: tidy excess imports in record model * test: add testing for export records to generic python structures * feat: implement export mixin for generic python structures * feat: integrate export mixin with dataset records * feat: expose record and metadata property * feat: add serialize to response resource * feat: add serialize method to suggestion resource * docs: add documentation to _generic export mixin * test: update tests for nested export configuration * feat: add to_dict methods in responses and 
suggestions * refactor: refactor generic export for simplicity and to_dict methods * feat: implement to_dict method for resources * feat: move to_dict out of export module
argilla-io · Apr 15, 2024 · 6ce7709 · 6ce7709
1 parent 7695a68
commit 6ce7709
Show file tree

Hide file tree

Showing 15 changed files with 439 additions and 186 deletions.
diff --git a/src/argilla_sdk/__init__.py b/src/argilla_sdk/__init__.py
@@ -19,3 +19,4 @@
 from argilla_sdk.settings import *  # noqa
 from argilla_sdk.suggestions import *  # noqa
 from argilla_sdk.responses import *  # noqa
+from argilla_sdk.records import *  # noqa
diff --git a/src/argilla_sdk/_models/_record.py b/src/argilla_sdk/_models/_record.py
@@ -1,5 +1,4 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
-from uuid import UUID, uuid4
 
 from pydantic import Field, field_serializer
 

diff --git a/src/argilla_sdk/datasets/__init__.py b/src/argilla_sdk/datasets/__init__.py
@@ -13,4 +13,3 @@
 # limitations under the License.
 
 from argilla_sdk.datasets._resource import Dataset  # noqa
-from argilla_sdk.datasets._record import Record  # noqa
diff --git a/src/argilla_sdk/datasets/_dataset_records.py b/src/argilla_sdk/datasets/_dataset_records.py
diff --git a/src/argilla_sdk/datasets/_record.py b/src/argilla_sdk/datasets/_record.py
diff --git a/src/argilla_sdk/datasets/_resource.py b/src/argilla_sdk/datasets/_resource.py
@@ -73,8 +73,6 @@ def __init__(
 
     @property
     def records(self) -> "DatasetRecords":
-        if not self.is_published:
-            raise DatasetNotPublished("Cannot access records before publishing the dataset. Call `publish` first.")
         return self.__records
 
     @property
@@ -137,7 +135,7 @@ def _configure(self, settings: Settings, publish: bool = False) -> "Dataset":
 
         if publish:
             self.__publish()
-        return self.get() # type: ignore
+        return self.get()  # type: ignore
 
     def __define_settings(
         self,

diff --git a/src/argilla_sdk/records/_dataset_records.py b/src/argilla_sdk/records/_dataset_records.py
@@ -19,6 +19,7 @@
 from argilla_sdk._resource import Resource
 from argilla_sdk.client import Argilla
 from argilla_sdk.records._resource import Record
+from argilla_sdk.records._export import GenericExportMixin
 
 if TYPE_CHECKING:
     from argilla_sdk.datasets import Dataset
@@ -76,7 +77,7 @@ def _list(self) -> Sequence[Record]:
             yield Record.from_model(model=record_model, dataset=self.__dataset)
 
 
-class DatasetRecords(Resource):
+class DatasetRecords(Resource, GenericExportMixin):
     """
     This class is used to work with records from a dataset.
 
@@ -167,7 +168,20 @@ def update(
             message=f"Updated {len(records_to_update)} records and added {len(records_to_add)} records to dataset {self.__dataset.name}",
             level="info",
         )
-
+
+    def pull(self) -> None:
+        """Fetch all records from the server and update the local records."""
+        records = self.__list_records_from_server()
+        self.__records = [Record.from_model(model=record, dataset=self.__dataset) for record in records]
+
+    def to_dict(self, flatten: bool = True, orient: str = "names") -> Dict[str, Any]:
+        """Return the records as a dictionary."""
+        return self._export_to_dict(records=self.__records, flatten=flatten, orient=orient)
+
+    def to_list(self, flatten: bool = True) -> List[Dict[str, Any]]:
+        """Return the records as a list of dictionaries."""
+        return self._export_to_list(records=self.__records, flatten=flatten)
+
     ############################
     # Utility methods
     ############################

diff --git a/src/argilla_sdk/records/_export/__init__.py b/src/argilla_sdk/records/_export/__init__.py
@@ -0,0 +1 @@
+from argilla_sdk.records._export._generic import GenericExportMixin  # noqa
diff --git a/src/argilla_sdk/records/_export/_datasets.py b/src/argilla_sdk/records/_export/_datasets.py
diff --git a/src/argilla_sdk/records/_export/_generic.py b/src/argilla_sdk/records/_export/_generic.py
@@ -0,0 +1,104 @@
+# Copyright 2024-present, Argilla, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, TYPE_CHECKING, Union
+from collections import defaultdict
+
+if TYPE_CHECKING:
+    from argilla_sdk import Record
+
+
+class GenericExportMixin:
+    """This is a mixin class for DatasetRecords and Export classes.
+    It handles methods for exporting records to generic python formats."""
+
+    def _export_to_dict(
+        self, records: List["Record"], flatten=True, orient="names"
+    ) -> Dict[str, Union[str, float, int, list]]:
+        """Export records to a dictionary with either names or record index as keys.
+        Args:
+            records (List[Record]): List of Record objects to export.
+            flatten (bool): The structure of the exported dictionary.
+                - True: The record fields, metadata, suggestions and responses will be flattened.
+                - False: The record fields, metadata, suggestions and responses will be nested.
+            orient (str): The orientation of the exported dictionary.
+                - "names": The keys of the dictionary will be the names of the fields, metadata, suggestions and responses.
+                - "index": The keys of the dictionary will be the external_id of the records.
+        Returns:
+            dataset_records (Dict[str, Union[str, float, int, list]]): The exported records in a dictionary format.
+        """
+        if orient == "names":
+            dataset_records: dict = defaultdict(list)
+            for record in records:
+                for key, value in self.__record_to_dict(record=record, flatten=flatten).items():
+                    dataset_records[key].append(value)
+        elif orient == "index":
+            dataset_records: dict = {}
+            for record in records:
+                dataset_records[record.external_id] = self.__record_to_dict(record=record, flatten=flatten)
+        else:
+            raise ValueError(f"Invalid value for orient parameter: {orient}")
+        return dataset_records
+
+    def _export_to_list(self, records: List["Record"], flatten=True) -> List[Dict[str, Union[str, float, int, list]]]:
+        """Export records to a list of dictionaries, one dictionary per record.
+        Args:
+            records (List[Record]): List of Record objects to export.
+            flatten (bool): The structure of the exported dictionary.
+                - True: The record fields, metadata, suggestions and responses will be flattened.
+                - False: The record fields, metadata, suggestions and responses will be nested.
+        Returns:
+            dataset_records (List[Dict[str, Union[str, float, int, list]]]): The exported records in a list of dictionaries format.
+        """
+        dataset_records: list = []
+        for record in records:
+            dataset_records.append(self.__record_to_dict(record=record, flatten=flatten))
+        return dataset_records
+
+    def __record_to_dict(self, record: "Record", flatten=True) -> Dict[str, Any]:
+        """Converts a Record object to a dictionary for export.
+        Args:
+            record (Record): The Record object to convert.
+            flatten (bool): The structure of the exported dictionary.
+                - True: The record fields, metadata, suggestions and responses will be flattened
+                        so that their keys become the keys of the record dictionary, using
+                        dot notation for nested keys. i.e. `label.suggestion` and `label.response`
+                - False: The record fields, metadata, suggestions and responses will be nested as
+                        dictionaries within the record dictionary. i.e. `label: {suggestion: ..., response: ...}`
+        Returns:
+            A dictionary representing the record.
+        """
+        record_dict = record.to_dict()
+        if flatten:
+            responses: dict = record_dict.pop("responses")
+            suggestions: dict = record_dict.pop("suggestions")
+            fields: dict = record_dict.pop("fields")
+            metadata: dict = record_dict.pop("metadata")
+            record_dict.update(fields)
+            record_dict.update(metadata)
+            question_names = set(suggestions.keys()).union(responses.keys())
+            for question_name in question_names:
+                _suggestion: Union[Dict, None] = suggestions.get(question_name)
+                if _suggestion:
+                    record_dict[f"{question_name}.suggestion"] = _suggestion.pop("value")
+                    record_dict.update(
+                        {f"{question_name}.suggestion.{key}": value for key, value in _suggestion.items()}
+                    )
+                for _response in responses.get(question_name, []):
+                    user_id = _response.pop("user_id")
+                    record_dict[f"{question_name}.response.{user_id}"] = _response.pop("value")
+                    record_dict.update(
+                        {f"{question_name}.response.{user_id}.{key}": value for key, value in _response.items()}
+                    )
+        return record_dict
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,4 +13,3 @@
		# limitations under the License.

		from argilla_sdk.datasets._resource import Dataset # noqa
		from argilla_sdk.datasets._record import Record # noqa
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from argilla_sdk.records._export._generic import GenericExportMixin # noqa