diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 64a08e1..49758fc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -79,4 +79,4 @@ jobs: poetry install poetry run pytest --cov=src --cov-report=xml - name: Upload coverage reports to Codecov with GitHub Action - uses: codecov/codecov-action@v3 \ No newline at end of file + uses: codecov/codecov-action@v3 diff --git a/Makefile b/Makefile index 02e3606..ddea2b5 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.DEFAULT_GOAL := help +.DEFAULT_GOAL := all sources = src tests .PHONY: .poetry # Check that poetry is installed diff --git a/README.md b/README.md index 9583d96..d9d8968 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,58 @@ # pandas-validity [![PyPI - Version](https://img.shields.io/pypi/v/pandas-validity)](https://pypi.org/project/pandas-validity/) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pandas-validity) -[![Test and lint](https://github.com/ohmycoffe/pandas-validity/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/ohmycoffe/pandas-validity/actions/workflows/test.yaml?query=branch%3Amain) -[![codecov](https://codecov.io/gh/ohmycoffe/organize-photos/graph/badge.svg?token=PAN0F7B4E8)](https://codecov.io/gh/ohmycoffe/organize-photos) +[![Test and lint](https://github.com/ohmycoffe/pandas-validity/actions/workflows/test.yml/badge.svg?branch=main)](https://github.com/ohmycoffe/pandas-validity/actions/workflows/test.yml?query=branch%3Amain) +[![codecov](https://codecov.io/gh/ohmycoffe/pandas-validity/graph/badge.svg?token=4K6RV6E9JX)](https://codecov.io/gh/ohmycoffe/pandas-validity) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/) [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/) ![PyPI - 
License](https://img.shields.io/pypi/l/organize-photos) + ## What is it? -**pandas-validity** is a Python library for validation of pandas DataFrames. It provides a `DataFrameValidator` class that serves as a context manager. Within this context, you can perform multiple validations and checks. Any encountered errors are collected and raised at the end of the process. The `DataFrameValidator` raises a `ValidationErrorsGroup` exception to summarize the errors. -## Where to get it? +**pandas-validity** is a Python library for the validation of pandas DataFrames. It provides a `DataFrameValidator` class that serves as a context manager. Within this context, you can perform multiple validations and checks. Any encountered errors are collected and raised at the end of the process. The `DataFrameValidator` raises a `ValidationErrorsGroup` exception to summarize the errors. + +## Installation + You can easily install the latest released version using binary installers from the [Python Package Index (PyPI)](https://pypi.org/project/pandas-validity): ```sh pip install pandas-validity ``` +### Development Installation + +**Prerequisites**: [poetry](https://python-poetry.org/) for environment management + +The source code is currently hosted on GitHub at [ohmycoffe/pandas-validity](https://github.com/ohmycoffe/pandas-validity). 
To get the development version: + ```shell +git clone git@github.com:ohmycoffe/pandas-validity.git +``` + +To install the project and development dependencies: + +```shell +make install +``` + +To run tests: + +```shell +make test +``` + +To view all possible commands, use: + +```shell +make help +``` + ## Usage ```python import pandas as pd import datetime -from pandas_validity.validator import DataFrameValidator +from pandas_validity import DataFrameValidator # Create a sample DataFrame df = pd.DataFrame( @@ -54,6 +85,7 @@ with DataFrameValidator(df) as validator: ``` **Output:** + ```shell Error occurred: () The dataframe has missing columns: ['E'] Error occurred: () The dataframe has redundant columns: ['D'] @@ -72,6 +104,12 @@ Error occurred: () Found 1 m | pandas_validity.exceptions.ValidationError: Found 1 missing value: [{'index': 1, 'column': 'B', 'value': None}] +------------------------------------ ``` +--- + +The library supports the following data types for validation: +- predefined: `"str"`, `"int"`, `"float"`, `"datetime"`, `"bool"` +- or any `Callable` that accepts a data `type/dtype` object and returns a boolean value to indicate the validation status - example: `pd.api.types.is_string_dtype` + ## Development **Prerequisites**: [poetry](https://python-poetry.org/) for environment management @@ -82,7 +120,7 @@ The source code is currently hosted on GitHub at: ```shell git clone git@github.com:ohmycoffe/pandas-validity.git ``` -To install project and development dependencies: +To install the project and development dependencies: ```shell make install ``` diff --git a/pyproject.toml b/pyproject.toml index 8f5b49e..97c39a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,32 @@ +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + [tool.poetry] name = "pandas-validity" -version = "0.1.0" +version = "0.1.1" description = "Validation library for Pandas Dataframe" authors = ["ohmycoffe "] readme = 
"README.md" packages = [{include = "pandas_validity", from = "src"}] +license = "MIT" +repository = "https://github.com/ohmycoffe/pandas-validity" +keywords = ["pandas", "dataframe", "validation"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Software Development :: Libraries", + "License :: OSI Approved :: MIT License", +] + [tool.poetry.dependencies] python = "^3.9" @@ -37,16 +59,13 @@ types-setuptools = "^68.2.0.0" black = "^23.7.0" isort = "^5.12.0" -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" - [tool.isort] profile = "black" [tool.flake8] max-line-length = 88 extend-ignore = "E203" +exclude = ['src/pandas_validity/__init__.py'] [tool.bandit.assert_used] skips = ['tests/**/*.py', 'tests/*.py'] diff --git a/src/pandas_validity/__init__.py b/src/pandas_validity/__init__.py index e69de29..6fd07b3 100644 --- a/src/pandas_validity/__init__.py +++ b/src/pandas_validity/__init__.py @@ -0,0 +1 @@ +from .validator import DataFrameValidator as DataFrameValidator diff --git a/src/pandas_validity/validator.py b/src/pandas_validity/validator.py index 3e934e0..cd80b25 100644 --- a/src/pandas_validity/validator.py +++ b/src/pandas_validity/validator.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections.abc import Mapping import numpy as np import pandas as pd @@ -17,7 +18,6 @@ class DataFrameValidator(AbstractValidator): - """ Context manager to validate pandas dataframes. 
@@ -96,7 +96,7 @@ def has_no_redundant_columns(self, expected_columns: list[str]) -> None: ) def has_valid_data_types( - self, expected_data_types: dict[str, ValidationFunc_T | type | str] + self, expected_data_types: Mapping[str, ValidationFunc_T | type | str] ) -> None: """Check if columns have valid data types""" for col, dtype in self._df.dtypes.items(): diff --git a/tests/test_dataframe_validator.py b/tests/test_dataframe_validator.py index aba8731..c2d4900 100644 --- a/tests/test_dataframe_validator.py +++ b/tests/test_dataframe_validator.py @@ -98,7 +98,9 @@ def test_should_raise_error_if_wrong_datatypes(valid_df: pd.DataFrame): } with pytest.raises(ValidationErrorsGroup) as excinfo: with DataFrameValidator(valid_df) as validator: - validator.has_valid_data_types(expected_data_types=wrong_validators) + validator.has_valid_data_types( + expected_data_types=wrong_validators # pyright: ignore + ) reasons = excinfo.value.args[1] assert len(wrong_validators) == len(reasons)