Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Commit

Permalink
Feature to export dataset records as generic python structures (#49)
Browse files Browse the repository at this point in the history
* test: add integration tests for updating records

* feat: add basic update logic to records api

* feat: implement update filter logic in dataset records

* fix: excess import in suggestion model

* feat: add from model class method to response resource

* feat: implement suggestion as complete resource with from_model method and properties

* feat: implement Record as resource with containers

* refactor: restructure records in submodules

* refactor: use RecordModel and ResponseModel in record api

* test: update tests to use fields and suggestions attributes

* feat: implement fields and suggestions attributes within RecordFields and RecordSuggestions

* chore: expand record ingestion function with logging and validation

* fix: catch missing datasets in erroneous suggestions

* chore: document and refactor record resource

* chore: refactor and delete submodule records.utils

* chore: delete redundant questions module

* chore: licensing in records module

* test: revise test for new responses as attribute model

* feat: implement sorter in ingestion function to make either response or suggestion

* feat: add flags for as_suggestion to DatasetRecords to specify whether suggestions or responses are coming in

* feat: add core properties to response resource

* chore: log dataset name in add and update records

* chore: reduce logging every record

* chore: naming and formatting in Record

* fix: question_name typing in suggestion

* chore: delete excess dataset_records code from dataset module

* chore: tidy excess imports in record model

* test: add testing for export records to generic python structures

* feat: implement export mixin for generic python structures

* feat: integrate export mixin with dataset records

* feat: expose record and metadata property

* feat: add serialize to response resource

* feat: add serialize method to suggestion resource

* docs: add documentation to _generic export mixin

* test: update tests for nested export configuration

* feat: add to_dict methods in responses and suggestions

* refactor: refactor generic export for simplicity and to_dict methods

* feat: implement to_dict method for resources

* feat: move to_dict out of export module
  • Loading branch information
burtenshaw authored Apr 15, 2024
1 parent 7695a68 commit 6ce7709
Show file tree
Hide file tree
Showing 15 changed files with 439 additions and 186 deletions.
1 change: 1 addition & 0 deletions src/argilla_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@
from argilla_sdk.settings import * # noqa
from argilla_sdk.suggestions import * # noqa
from argilla_sdk.responses import * # noqa
from argilla_sdk.records import * # noqa
1 change: 0 additions & 1 deletion src/argilla_sdk/_models/_record.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from typing import Any, Dict, List, Optional, Tuple, Union
from uuid import UUID, uuid4

from pydantic import Field, field_serializer

Expand Down
1 change: 0 additions & 1 deletion src/argilla_sdk/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@
# limitations under the License.

from argilla_sdk.datasets._resource import Dataset # noqa
from argilla_sdk.datasets._record import Record # noqa
104 changes: 0 additions & 104 deletions src/argilla_sdk/datasets/_dataset_records.py

This file was deleted.

72 changes: 0 additions & 72 deletions src/argilla_sdk/datasets/_record.py

This file was deleted.

4 changes: 1 addition & 3 deletions src/argilla_sdk/datasets/_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def __init__(

@property
def records(self) -> "DatasetRecords":
if not self.is_published:
raise DatasetNotPublished("Cannot access records before publishing the dataset. Call `publish` first.")
return self.__records

@property
Expand Down Expand Up @@ -137,7 +135,7 @@ def _configure(self, settings: Settings, publish: bool = False) -> "Dataset":

if publish:
self.__publish()
return self.get() # type: ignore
return self.get() # type: ignore

def __define_settings(
self,
Expand Down
18 changes: 16 additions & 2 deletions src/argilla_sdk/records/_dataset_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from argilla_sdk._resource import Resource
from argilla_sdk.client import Argilla
from argilla_sdk.records._resource import Record
from argilla_sdk.records._export import GenericExportMixin

if TYPE_CHECKING:
from argilla_sdk.datasets import Dataset
Expand Down Expand Up @@ -76,7 +77,7 @@ def _list(self) -> Sequence[Record]:
yield Record.from_model(model=record_model, dataset=self.__dataset)


class DatasetRecords(Resource):
class DatasetRecords(Resource, GenericExportMixin):
"""
This class is used to work with records from a dataset.
Expand Down Expand Up @@ -167,7 +168,20 @@ def update(
message=f"Updated {len(records_to_update)} records and added {len(records_to_add)} records to dataset {self.__dataset.name}",
level="info",
)


def pull(self) -> None:
    """Refresh the local records with the records fetched from the server.

    Every record model returned by the server listing is converted into a
    `Record` bound to this dataset, and the resulting list replaces the
    locally held records.
    """
    refreshed = []
    for record_model in self.__list_records_from_server():
        refreshed.append(Record.from_model(model=record_model, dataset=self.__dataset))
    self.__records = refreshed

def to_dict(self, flatten: bool = True, orient: str = "names") -> Dict[str, Any]:
    """Export the locally held records as a dictionary.

    Args:
        flatten (bool): Passed through to the generic export; selects a flattened
            or a nested structure for each record.
        orient (str): Passed through to the generic export; "names" or "index".
    Returns:
        The dictionary produced by `_export_to_dict` for the local records.
    """
    local_records = self.__records
    exported = self._export_to_dict(records=local_records, flatten=flatten, orient=orient)
    return exported

def to_list(self, flatten: bool = True) -> List[Dict[str, Any]]:
    """Export the locally held records as a list with one dictionary per record.

    Args:
        flatten (bool): Passed through to the generic export; selects a flattened
            or a nested structure for each record.
    Returns:
        The list produced by `_export_to_list` for the local records.
    """
    local_records = self.__records
    exported = self._export_to_list(records=local_records, flatten=flatten)
    return exported

############################
# Utility methods
############################
Expand Down
1 change: 1 addition & 0 deletions src/argilla_sdk/records/_export/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from argilla_sdk.records._export._generic import GenericExportMixin # noqa
Empty file.
104 changes: 104 additions & 0 deletions src/argilla_sdk/records/_export/_generic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2024-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, List, TYPE_CHECKING, Union
from collections import defaultdict

if TYPE_CHECKING:
from argilla_sdk import Record


class GenericExportMixin:
    """Mixin for DatasetRecords and Export classes.

    It provides the methods that export records to generic python structures
    (dictionaries and lists of dictionaries). Records only need to expose a
    `to_dict()` method and an `external_id` attribute.
    """

    def _export_to_dict(
        self, records: List["Record"], flatten: bool = True, orient: str = "names"
    ) -> Dict[str, Any]:
        """Export records to a dictionary with either names or record index as keys.
        Args:
            records (List[Record]): List of Record objects to export.
            flatten (bool): The structure of the exported dictionary.
                - True: The record fields, metadata, suggestions and responses will be flattened.
                - False: The record fields, metadata, suggestions and responses will be nested.
            orient (str): The orientation of the exported dictionary.
                - "names": The keys of the dictionary will be the names of the fields, metadata, suggestions and responses.
                  Every value is a list with exactly one entry per record; records that lack a key get `None`.
                - "index": The keys of the dictionary will be the external_id of the records.
        Returns:
            dataset_records (Dict[str, Any]): The exported records in a dictionary format.
        Raises:
            ValueError: If `orient` is neither "names" nor "index".
        """
        if orient == "names":
            rows = [self.__record_to_dict(record=record, flatten=flatten) for record in records]
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # column order is stable across runs.
            column_names = dict.fromkeys(key for row in rows for key in row)
            # Pad with None so every column stays aligned with the records, even when
            # only some records carry a given suggestion or response.
            return {name: [row.get(name) for row in rows] for name in column_names}
        if orient == "index":
            return {
                record.external_id: self.__record_to_dict(record=record, flatten=flatten) for record in records
            }
        raise ValueError(f"Invalid value for orient parameter: {orient}")

    def _export_to_list(self, records: List["Record"], flatten: bool = True) -> List[Dict[str, Any]]:
        """Export records to a list with one dictionary per record.
        Args:
            records (List[Record]): List of Record objects to export.
            flatten (bool): The structure of the exported dictionary.
                - True: The record fields, metadata, suggestions and responses will be flattened.
                - False: The record fields, metadata, suggestions and responses will be nested.
        Returns:
            dataset_records (List[Dict[str, Any]]): The exported records in a list of dictionaries format.
        """
        return [self.__record_to_dict(record=record, flatten=flatten) for record in records]

    def __record_to_dict(self, record: "Record", flatten: bool = True) -> Dict[str, Any]:
        """Converts a Record object to a dictionary for export.
        Args:
            record (Record): The Record object to convert.
            flatten (bool): The structure of the exported dictionary.
                - True: The record fields, metadata, suggestions and responses will be flattened
                so that their keys becomes the keys of the record dictionary, using
                dot notation for nested keys. i.e. `label.suggestion` and `label.response`
                - False: The record fields, metadata, suggestions and responses will be nested as
                dictionaries within the record dictionary. i.e. `label: {suggestion: ..., response: ...}`
        Returns:
            A dictionary representing the record.
        """
        record_dict = record.to_dict()
        if not flatten:
            return record_dict

        # Work on a shallow copy and read (rather than pop) nested values, so the
        # structure returned by `record.to_dict()` is never mutated by the export.
        flat: Dict[str, Any] = dict(record_dict)
        # A `None` section is treated as an empty one.
        responses: dict = flat.pop("responses") or {}
        suggestions: dict = flat.pop("suggestions") or {}
        fields: dict = flat.pop("fields") or {}
        metadata: dict = flat.pop("metadata") or {}
        flat.update(fields)
        flat.update(metadata)

        # Ordered union of question names (suggestions first, then responses); a plain
        # set would make the key order depend on string hash randomisation.
        for question_name in dict.fromkeys([*suggestions, *responses]):
            suggestion: Union[Dict, None] = suggestions.get(question_name)
            if suggestion:
                flat[f"{question_name}.suggestion"] = suggestion["value"]
                for key, value in suggestion.items():
                    if key != "value":
                        flat[f"{question_name}.suggestion.{key}"] = value
            for response in responses.get(question_name) or []:
                user_id = response["user_id"]
                flat[f"{question_name}.response.{user_id}"] = response["value"]
                for key, value in response.items():
                    if key not in ("user_id", "value"):
                        flat[f"{question_name}.response.{user_id}.{key}"] = value
        return flat
Loading

0 comments on commit 6ce7709

Please sign in to comment.