Skip to content

Commit

Permalink
Merge pull request #3 from magicalpuffin/v0.1
Browse files Browse the repository at this point in the history
V0.1 Release
  • Loading branch information
magicalpuffin authored Jan 14, 2024
2 parents 597547a + 12f5b87 commit c88c45b
Show file tree
Hide file tree
Showing 17 changed files with 939 additions and 209 deletions.
52 changes: 35 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

**WARNING: Library is currently unstable and in beta.**

Library for converting pandas dataframes into pydantic models. This allows conversion between popular python formats for flat and structured data. Pydantic model annotations are matched with pandas dataframe columns. Supports models nested in lists.
This library provides functions for converting Pandas Dataframes to Pydantic Models. This allows you to easily transform data in a table-like format into a json-like format. Pydantic Model annotations are matched with Pandas Dataframe columns. Supports models nested in lists.

[![PyPI - Version](https://img.shields.io/pypi/v/pandas-to-pydantic.svg)](https://pypi.org/project/pandas-to-pydantic)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pandas-to-pydantic.svg)](https://pypi.org/project/pandas-to-pydantic)
Expand Down Expand Up @@ -110,7 +110,9 @@ Returns (output shortened):

## Example 2

Pydantic models can be nested using `list` annotations. This requires another unique field to be available. In this example, it is `AuthorName` and `Genre`.
In this example, Pydantic models are nested using the `list` type annotation. A unique id field must be provided for each parent model to determine how child models are structured. In this example, the unique id column for the `Genre` model is `Genre`, and the unique id column for the `Author` model is `AuthorName`.

Note: Keys in `id_column_map` are field names, not model names. The only exception is the parent level model, whose key is the model name.

For example:

Expand All @@ -128,7 +130,11 @@ class Genre(BaseModel):
Genre: str
AuthorList: list[Author]

dataframe_to_pydantic(book_data, Genre).model_dump()
dataframe_to_pydantic(
data=bookData,
model=Genre,
id_column_map={"Genre": "Genre", "AuthorList": "AuthorName"},
).model_dump()
```

Returns (output shortened)
Expand Down Expand Up @@ -164,6 +170,10 @@ Returns (output shortened)
- Accepts classes created with pydantic.BaseModel
- Supports nested models in lists
- Annotation names must match columns in the dataframe
- id_column_map(`dict[str,str]`)
- Required when nesting Pydantic models
- Each key-value pair maps a field name to the unique id column of the nested Pydantic model in that field
- For the parent level model, use the model name as key

## Returns

Expand All @@ -175,42 +185,50 @@ Returns (output shortened)

This example uses a larger data set with additional nesting.

[Example Library Data](https://github.com/magicalpuffin/pandas-to-pydantic/blob/main/tests/data/libraryData.csv)
[Example Library Data](https://github.com/magicalpuffin/pandas-to-pydantic/blob/main/tests/data/library_data/library_data.csv)

```python
import pandas as pd
from pydantic import BaseModel
from pandas_to_pydantic import dataframe_to_pydantic

# Declare pydantic models
class Book(BaseModel):
BookID: int
Title: str
Genre: str
PublishedYear: int
AvailableCopies: int

class LibraryDetail(BaseModel):
LibraryName: str
Location: str
EstablishedYear: int
BookCollectionSize: int

class Author(BaseModel):
AuthorID: int
AuthorName: str
AuthorBirthdate: str
BookList: list[Book]

class Book(BaseModel):
BookID: int
Title: str
Genre: str
PublishedYear: int

class Library(BaseModel):
LibraryID: int
LibraryName: str
Location: str
EstablishedYear: int
BookCollectionSize: int
Detail: LibraryDetail
AuthorList: list[Author]
BookList: list[Book]

# Input data is a pandas dataframe
data = pd.read_csv(FILE_PATH)

# Convert pandas dataframe to a pydantic root model
library_list_root = dataframe_to_pydantic(data, Library)
library_list_root = dataframe_to_pydantic(
data,
Library,
{
"Library": "LibraryID",
"BookList": "BookID",
"AuthorList": "AuthorID",
},
)

# Access data as a list of pydantic models
library_list_root.root
Expand Down
2 changes: 1 addition & 1 deletion src/pandas_to_pydantic/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION = "0.0.7"
VERSION = "0.1.0"
4 changes: 3 additions & 1 deletion src/pandas_to_pydantic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from pandas_to_pydantic.annotation_utils import expand_annotation, get_base_fields, get_list_fields # noqa: F401
from pandas_to_pydantic.annotation_utils import ( # noqa: F401
get_model_columns,
)
from pandas_to_pydantic.to_pydantic import dataframe_to_pydantic, get_root_list, serialize_dataframe # noqa: F401
120 changes: 78 additions & 42 deletions src/pandas_to_pydantic/annotation_utils.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,109 @@
import types
from typing import Optional

from pydantic import BaseModel
from pydantic._internal._model_construction import ModelMetaclass


def expand_annotation(model: ModelMetaclass) -> dict:
class ModelColumns(BaseModel):
"""
Expands a pydantic model annotations into basic types. Recursively expands nested models.
Describes model fields. Used when mapping Dataframe columns to fields.
Args:
model (ModelMetaclass): pydantic model
Raises:
TypeError: error if not pydantic model
Returns:
dict: key as annotation name, value as type
BaseModel (_type_): Pydantic BaseModel
"""
if not model.__base__ == BaseModel:
error_message = f"{model} is not a BaseModel"
raise TypeError(error_message)

annotations = model.__annotations__.copy()

for key, field_type in annotations.items():
if isinstance(field_type, types.GenericAlias):
# Only expanding lists
if field_type.__origin__ == list:
# Using lists to indicate list structure
annotations[key] = [expand_annotation(field_type.__args__[0])]

return annotations
name: str
id_column: Optional[str]
base_columns: list[str]
list_columns: list["ModelColumns"]
child_columns: list["ModelColumns"]


# TODO
# Combine functionality with list field
def get_base_fields(annotation: dict) -> list[str]:
def get_model_columns(
model: ModelMetaclass, id_column_map: Optional[dict[str, str]] = None, name: Optional[str] = None
) -> ModelColumns:
"""
Gets fields with basic types
Creates ModelColumns for a Pydantic BaseModel
Args:
annotation (dict): key as annotation name, value as type
model (ModelMetaclass): Pydantic BaseModel class
id_column_map (Optional[dict[str, str]], optional): Map of field names and unique ID. Necessary for identifying
and structuring nested objects. Defaults to None.
name (Optional[str], optional): For name field in ModelColumns. If None, uses model.__name__. Defaults to None.
Raises:
TypeError: Error if model is not a Pydantic BaseModel
Returns:
list[str]: key names that are not list type
ModelColumns: ModelColumns generated for the model.
"""
base_fields = []
# TODO consider returning field name
if not model.__base__ == BaseModel:
error_message = f"{model} is not a BaseModel"
raise TypeError(error_message)

for k, v in annotation.items():
if not isinstance(v, list):
base_fields.append(k)
if id_column_map is None:
id_column_map = {}
if name is None:
name = model.__name__

return base_fields
id_column = id_column_map.get(name)
annotations = model.__annotations__

base_columns = []
list_columns = []
child_columns = []

def get_list_fields(annotation: dict) -> list[str]:
for field_name, field_type in annotations.items():
if isinstance(field_type, types.GenericAlias):
if field_type.__origin__ == list:
# TODO reevaluate passed in field name
list_columns.append(get_model_columns(field_type.__args__[0], id_column_map, field_name))
elif isinstance(field_type, ModelMetaclass):
if field_type.__base__ == BaseModel:
child_columns.append(get_model_columns(field_type, id_column_map, field_name))
else:
base_columns.append(field_name)

return ModelColumns(
name=name,
id_column=id_column,
base_columns=base_columns,
list_columns=list_columns,
child_columns=child_columns,
)


# TODO deprecated?
def expand_annotation(model: ModelMetaclass) -> dict:
"""
Gets fields with list types
Expands a pydantic model annotations into basic types. Recursively expands nested models.
Args:
annotation (dict): key as annotation name, value as type
model (ModelMetaclass): pydantic model
Raises:
TypeError: error if not pydantic model
Returns:
list[str]: key names that are list type
dict: key as annotation name, value as type
"""
list_fields = []
if not model.__base__ == BaseModel:
error_message = f"{model} is not a BaseModel"
raise TypeError(error_message)

annotations = model.__annotations__.copy()

for k, v in annotation.items():
if isinstance(v, list):
list_fields.append(k)
for field_name, field_type in annotations.items():
if isinstance(field_type, types.GenericAlias):
# Expanding lists
if field_type.__origin__ == list:
# Using lists to indicate list structure
annotations[field_name] = [expand_annotation(field_type.__args__[0])]
elif isinstance(field_type, ModelMetaclass):
# Expanding pydantic models
if field_type.__base__ == BaseModel:
annotations[field_name] = expand_annotation(field_type)

return list_fields
return annotations
65 changes: 36 additions & 29 deletions src/pandas_to_pydantic/to_pydantic.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,51 @@
from typing import Union
from typing import Optional, Union

import pandas as pd
from pydantic import RootModel
from pydantic._internal._model_construction import ModelMetaclass

from pandas_to_pydantic.annotation_utils import expand_annotation, get_base_fields, get_list_fields
from pandas_to_pydantic.annotation_utils import ModelColumns, get_model_columns


def serialize_dataframe(data: pd.DataFrame, annotation: dict) -> list[dict]:
def serialize_dataframe(data: pd.DataFrame, model_columns: ModelColumns) -> list[dict]:
"""
Converts a dataframe into json-like structure using an annotation
Converts a Pandas Dataframe into a json-like structure
Args:
data (pd.DataFrame): data with columns matching annotation
annotation (dict): key as annotation name, value as type
data (pd.DataFrame): Dataframe with columns matching ModelColumns
model_columns (ModelColumns): ModelColumns object for mapping model fields to columns
Raises:
ValueError: error if column used as id has NA
ValueError: Error for invalid data or ModelColumns
Returns:
list[dict]: data in json-like structure
list[dict]: Data in json-like structure
"""
# TODO maybe only return list if needed
new_list = []
base_fields = get_base_fields(annotation)
list_fields = get_list_fields(annotation)
# Assumes first field is id
id_field = base_fields[0]

if not list_fields:
# Might be bad design, should ensure unique id
return data[base_fields].to_dict(orient="records")
if not model_columns.id_column:
# TODO consider returning child models with base columns
return data[model_columns.base_columns].to_dict(orient="records")

if data[id_field].isna().any():
error_message = f"{id_field} contains NA"
if data[model_columns.id_column].isna().any():
error_message = f"{model_columns.id_column} contains NA"
raise ValueError(error_message)

for value in data[id_field].unique():
slice_data = data[data[id_field] == value]
for value in data[model_columns.id_column].unique():
base_dict = {}

base_dict = slice_data[base_fields].iloc[0].to_dict()
slice_data = data[data[model_columns.id_column] == value]

if list_fields:
# Only one list field is currently supported
base_dict[list_fields[0]] = serialize_dataframe(slice_data, annotation[list_fields[0]][0])
# Using first row for base data
base_dict = {**slice_data[model_columns.base_columns].iloc[0].to_dict()}

for list_model in model_columns.list_columns:
base_dict[list_model.name] = serialize_dataframe(slice_data, list_model)

for child_model in model_columns.child_columns:
# TODO using zero index to work around returning a list
base_dict[child_model.name] = serialize_dataframe(slice_data, child_model)[0]

new_list.append(base_dict)

Expand All @@ -66,19 +69,23 @@ def get_root_list(serialize_data: list[Union[dict, ModelMetaclass]], model: Mode
return root_list


def dataframe_to_pydantic(data: pd.DataFrame, model: ModelMetaclass) -> RootModel:
def dataframe_to_pydantic(
data: pd.DataFrame, model: ModelMetaclass, id_column_map: Optional[dict[str, str]] = None
) -> RootModel:
"""
Converts a dataframe to a pydantic model
Args:
data (pd.DataFrame): input dataframe. Columns must match model
model (ModelMetaclass): target pydantic model
data (pd.DataFrame): Dataframe with columns matching Pydantic Model
model (ModelMetaclass): Target Pydantic Model
id_column_map (Optional[dict[str, str]], optional): Map of field names to their unique id columns. Necessary for
identifying and structuring nested objects. Defaults to None.
Returns:
RootModel: list of pydantic model set to the input data
RootModel: Pydantic root model whose `root` is the list of models built from the input data
"""
target_annotation = expand_annotation(model)
serialize_data = serialize_dataframe(data, target_annotation)
target_model_columns = get_model_columns(model, id_column_map)
serialize_data = serialize_dataframe(data, target_model_columns)
model_list = get_root_list(serialize_data, model)

return model_list
Loading

0 comments on commit c88c45b

Please sign in to comment.