diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 99a75ea..82b6f8b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,6 +21,8 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r requirements-dev.txt + - name: Lint with black + run: black --check . - name: Lint with flake8 run: | flake8 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5c52aa7..1577149 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,13 @@ repos: entry: mypy language: system types: [python] + - id: black + name: black + description: "Black: The uncompromising Python code formatter" + entry: black + language: system + require_serial: true + files: \.(py|ipynb)$ - id: pytest name: pytest entry: coverage run -m pytest --typeguard-packages=strictly_typed_pandas,tests diff --git a/docs/source/advanced.ipynb b/docs/source/advanced.ipynb index b076fc1..df0ade5 100644 --- a/docs/source/advanced.ipynb +++ b/docs/source/advanced.ipynb @@ -17,6 +17,7 @@ "%autoreload 2\n", "\n", "import sys\n", + "\n", "sys.path.append(\"../\")" ] }, @@ -37,19 +38,22 @@ "source": [ "from strictly_typed_pandas import DataSet\n", "\n", + "\n", "class SchemaA:\n", " name: str\n", "\n", + "\n", "class SchemaB(SchemaA):\n", " id: int\n", "\n", + "\n", "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n", "\n", + "\n", "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", - " return (\n", - " df.assign(id=lambda df: range(df.shape[0]))\n", - " .pipe(DataSet[SchemaB])\n", - " )" + " return df.assign(\n", + " id=lambda df: range(df.shape[0]),\n", + " ).pipe(DataSet[SchemaB])" ] }, { @@ -69,19 +73,19 @@ " id: int\n", " name: str\n", "\n", + "\n", "class SchemaB:\n", " id: int\n", " job: str\n", "\n", + "\n", "class SchemaAB(SchemaA, SchemaB):\n", " pass\n", "\n", + "\n", "df1 = DataSet[SchemaA]({\"id\": [1, 2, 3], \"name\": [\"John\", \"Jane\", \"Jack\"]})\n", "df2 = DataSet[SchemaB]({\"id\": [1, 2, 3], \"job\": \"Data Scientist\"})\n", - "(\n", - " df1.merge(df2, on=\"id\")\n", - " .pipe(DataSet[SchemaAB])\n", - ")" + "df1.merge(df2, on=\"id\").pipe(DataSet[SchemaAB])" ] }, { @@ -102,6 +106,7 @@ " id: int\n", " name: str\n", "\n", + "\n", "DataSet[Schema]()" ] }, @@ -123,18 +128,20 @@ "import pandas as pd\n", "from typing import Any\n", "\n", + "\n", "class Schema:\n", " name: pd.StringDtype\n", " money: np.float64\n", " eggs: np.int64\n", " potatoes: Any\n", "\n", + "\n", "df = DataSet[Schema](\n", " {\n", " \"name\": pd.Series([\"John\", \"Jane\", \"Jack\"], dtype=\"string\"),\n", " \"money\": pd.Series([100.50, 1000.23, 123.45], dtype=np.float64),\n", " \"eggs\": pd.Series([1, 2, 3], dtype=np.int64),\n", - " \"potatoes\": [\"1\", 0, np.nan]\n", + " \"potatoes\": [\"1\", 0, np.nan],\n", " }\n", ")\n", "\n", @@ -158,13 +165,16 @@ "source": [ "from strictly_typed_pandas import IndexedDataSet\n", "\n", + "\n", "class IndexSchema:\n", " id: int\n", " job: str\n", "\n", + "\n", "class DataSchema:\n", " name: str\n", "\n", + "\n", "df = (\n", " pd.DataFrame({\"id\": [1, 2, 3], \"name\": [\"John\", \"Jane\", \"Jack\"], \"job\": \"Data Scientist\"})\n", " .set_index([\"id\", \"job\"])\n", @@ -189,14 +199,14 @@ "class SchemaA:\n", " name: str\n", "\n", + "\n", "class SchemaB(SchemaA):\n", " id: int\n", "\n", + "\n", "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", - " return (\n", - " df.assign(id=1)\n", - " .pipe(DataSet[SchemaB])\n", - " )\n", + " return df.assign(id=1).pipe(DataSet[SchemaB])\n", + "\n", "\n", "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n", "df = foo(df)\n", diff --git a/docs/source/conf.py b/docs/source/conf.py index 73a62ac..f52d1a9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,9 +19,9 @@ # -- Project information ----------------------------------------------------- -project = 'Strictly Typed Pandas' -copyright = '2021, Nanne Aben' -author = 'Nanne Aben' +project = "Strictly Typed Pandas" +copyright = "2021, Nanne Aben" +author = "Nanne Aben" # -- General configuration --------------------------------------------------- @@ -29,10 +29,10 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', 'sphinx_rtd_theme', 'nbsphinx'] +extensions = ["sphinx.ext.autodoc", "sphinx_rtd_theme", "nbsphinx"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -45,9 +45,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/docs/source/deepdive_into_dtypes.ipynb b/docs/source/deepdive_into_dtypes.ipynb index 7a7d61b..bca820f 100644 --- a/docs/source/deepdive_into_dtypes.ipynb +++ b/docs/source/deepdive_into_dtypes.ipynb @@ -17,6 +17,7 @@ "%autoreload 2\n", "\n", "import sys\n", + "\n", "sys.path.append(\"../\")\n", "\n", "import pandas as pd\n", @@ -45,7 +46,7 @@ " {\n", " \"a\": pd.Series([1, 2, 3], dtype=int),\n", " \"b\": pd.Series([1.0, 2.0, 3.0], dtype=float),\n", - " \"c\": pd.Series([True, False, True], dtype=bool)\n", + " \"c\": pd.Series([True, False, True], dtype=bool),\n", " }\n", ")\n", "\n", @@ -106,6 +107,7 @@ " b: float\n", " c: bool\n", "\n", + "\n", "df = DataSet[Schema]()\n", "df.dtypes" ] @@ -121,6 +123,7 @@ " b: np.float64\n", " c: np.bool_\n", "\n", + "\n", "df = DataSet[Schema]()\n", "df.dtypes" ] @@ -142,11 +145,9 @@ " a: np.integer\n", " b: np.float_\n", "\n", + "\n", "df = DataSet[Schema](\n", - " {\n", - " \"a\": pd.Series([1, 2, 3], dtype=np.int64),\n", - " \"b\": pd.Series([1.0, 2.0, 3.0], dtype=np.float64)\n", - " }\n", + " {\"a\": pd.Series([1, 2, 3], dtype=np.int64), \"b\": pd.Series([1.0, 2.0, 3.0], dtype=np.float64)}\n", ")\n", "df.dtypes" ] @@ -169,6 +170,7 @@ " a: np.datetime64\n", " b: np.timedelta64\n", "\n", + "\n", "df = DataSet[Schema]()\n", "df.dtypes" ] @@ -202,6 +204,7 @@ " f: pd.Int64Dtype\n", " h: pd.BooleanDtype\n", "\n", + "\n", "df = DataSet[Schema]()\n", "df.dtypes" ] @@ -224,14 +227,13 @@ "class SchemaA:\n", " a: pd.Int64Dtype\n", "\n", + "\n", "class SchemaB:\n", " a: np.int64\n", "\n", + "\n", "try:\n", - " (\n", - " DataSet[SchemaA]()\n", - " .pipe(DataSet[SchemaB])\n", - " )\n", + " (DataSet[SchemaA]().pipe(DataSet[SchemaB]))\n", "except TypeError as e:\n", " print(e)" ] @@ -254,10 +256,11 @@ " a: str\n", " b: pd.StringDtype\n", "\n", + "\n", "df = DataSet[Schema](\n", " {\n", " \"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\"),\n", - " \"b\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")\n", + " \"b\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\"),\n", " }\n", ")\n", "df.dtypes" @@ -296,6 +299,7 @@ "class Schema:\n", " a: str\n", "\n", + "\n", "df = DataSet[Schema]({\"a\": [\"a\", \"b\", \"c\"]})\n", "df.dtypes" ] @@ -316,11 +320,8 @@ "class Schema:\n", " a: str\n", "\n", - "df = DataSet[Schema](\n", - " {\n", - " \"a\": [None, 42, lambda x: x]\n", - " }\n", - ")\n", + "\n", + "df = DataSet[Schema]({\"a\": [None, 42, lambda x: x]})\n", "df.dtypes" ] }, @@ -340,11 +341,8 @@ "class Schema:\n", " a: pd.StringDtype\n", "\n", - "df = DataSet[Schema](\n", - " {\n", - " \"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")\n", - " }\n", - ")" + "\n", + "df = DataSet[Schema]({\"a\": pd.Series([\"a\", \"b\", \"c\"], dtype=\"string\")})" ] }, { @@ -354,11 +352,7 @@ "outputs": [], "source": [ "try:\n", - " DataSet[Schema](\n", - " {\n", - " \"a\": [None, 42, lambda x: x]\n", - " }\n", - " )\n", + " DataSet[Schema]({\"a\": [None, 42, lambda x: x]})\n", "except TypeError as e:\n", " print(e)" ] @@ -382,10 +376,11 @@ " a: Any\n", " b: Any\n", "\n", + "\n", "df = DataSet[Schema](\n", " {\n", " \"a\": [1, 2, 3],\n", - " \"b\": [\"1\", 2, None]\n", + " \"b\": [\"1\", 2, None],\n", " }\n", ")\n", "df.dtypes" diff --git a/docs/source/getting_started.ipynb b/docs/source/getting_started.ipynb index d5e506e..3e078a4 100644 --- a/docs/source/getting_started.ipynb +++ b/docs/source/getting_started.ipynb @@ -17,6 +17,7 @@ "%autoreload 2\n", "\n", "import sys\n", + "\n", "sys.path.append(\"../\")" ] }, @@ -37,6 +38,7 @@ "source": [ "import pandas as pd\n", "\n", + "\n", "def foo(df: pd.DataFrame) -> pd.DataFrame:\n", " # do stuff\n", " return df" @@ -79,10 +81,12 @@ "source": [ "from strictly_typed_pandas import DataSet\n", "\n", + "\n", "class Schema:\n", " id: int\n", " name: str\n", "\n", + "\n", "def foo(df: DataSet[Schema]) -> DataSet[Schema]:\n", " # do stuff\n", " return df" @@ -160,6 +164,7 @@ " id: int\n", " first_name: str\n", "\n", + "\n", "df = DataSet[AlternativeSchema]({\"id\": [1, 2, 3], \"first_name\": [\"John\", \"Jane\", \"Jack\"]})\n", "try:\n", " res = foo(df)\n", @@ -187,12 +192,13 @@ " id: int\n", " name: str\n", "\n", + "\n", "def foo() -> DataSet[Schema]:\n", " return DataSet[Schema](\n", " {\n", " \"id\": [1, 2, 3],\n", " \"name\": [\"John\", \"Jane\", \"Jack\"],\n", - " \"job\": \"Data Scientist\"\n", + " \"job\": \"Data Scientist\",\n", " }\n", " )" ] @@ -267,8 +273,8 @@ "try:\n", " df[\"id\"] = ids\n", " df.id = ids\n", - " df.loc[:,\"id\"] = ids\n", - " df.iloc[:,0] = ids\n", + " df.loc[:, \"id\"] = ids\n", + " df.iloc[:, 0] = ids\n", " df.assign(id=ids, inplace=True)\n", "except NotImplementedError as e:\n", " print(e)" @@ -329,12 +335,15 @@ "class SchemaA:\n", " name: str\n", "\n", + "\n", "class SchemaB:\n", " id: int\n", " name: str\n", "\n", + "\n", "df = DataSet[SchemaA]({\"name\": [\"John\", \"Jane\", \"Jack\"]})\n", "\n", + "\n", "def foo(df: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", " n = df.shape[0]\n", " ids = range(n)\n", @@ -356,10 +365,9 @@ "outputs": [], "source": [ "def foo(data: DataSet[SchemaA]) -> DataSet[SchemaB]:\n", - " return (\n", - " df.assign(id=lambda df: range(df.shape[0]))\n", - " .pipe(DataSet[SchemaB])\n", - " )" + " return df.assign(\n", + " id=lambda df: range(df.shape[0]),\n", + " ).pipe(DataSet[SchemaB])" ] }, { @@ -388,6 +396,7 @@ " .iloc[:3]\n", " )\n", "\n", + "\n", "res = foo()" ] }, @@ -410,6 +419,7 @@ "source": [ "from typeguard import typechecked\n", "\n", + "\n", "@typechecked\n", "def foo() -> DataSet[Schema]:\n", " return (\n", @@ -418,6 +428,7 @@ " .iloc[:3]\n", " )\n", "\n", + "\n", "try:\n", " res = foo()\n", "except TypeError as e:\n", @@ -445,10 +456,12 @@ "source": [ "from strictly_typed_pandas import DataSet\n", "\n", + "\n", "class Schema:\n", " id: int\n", " name: str\n", "\n", + "\n", "def foo(df: DataSet[Schema]) -> DataSet[Schema]:\n", " # do stuff\n", " return df" @@ -501,4 +514,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..aa4949a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 100 diff --git a/requirements-dev.txt b/requirements-dev.txt index 95cfb6b..5d8a0fd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,8 @@ -mypy==1.6.1 +mypy==1.6.1 flake8==6.1.0 +black[jupyter]==23.10.0 coverage==7.3.2 -pytest==7.4.2 +pytest==7.4.2 papermill==2.4.0 jupyter==1.0.0 sphinx<=7.2.6 diff --git a/setup.py b/setup.py index 8eeb623..8f776a6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ def get_requirements(): - with open('requirements.txt') as f: + with open("requirements.txt") as f: return f.read().splitlines() @@ -26,6 +26,6 @@ def get_long_description(): python_requires=">=3.8.0", classifiers=["Typing :: Typed"], version_config=True, - setup_requires=['setuptools-git-versioning'], + setup_requires=["setuptools-git-versioning"], package_data={"strictly_typed_pandas": ["py.typed"]}, ) diff --git a/strictly_typed_pandas/create_empty_dataframe.py b/strictly_typed_pandas/create_empty_dataframe.py index e0e70d8..aacc1eb 100644 --- a/strictly_typed_pandas/create_empty_dataframe.py +++ b/strictly_typed_pandas/create_empty_dataframe.py @@ -32,10 +32,9 @@ def create_empty_dataframe(schema: Dict[str, Any]) -> pd.DataFrame: return pd.DataFrame(res) -def create_empty_indexed_dataframe(index_schema: Dict[str, Any], data_schema: Dict[str, Any]) -> pd.DataFrame: +def create_empty_indexed_dataframe( + index_schema: Dict[str, Any], data_schema: Dict[str, Any] +) -> pd.DataFrame: df_index = create_empty_dataframe(index_schema) df_data = create_empty_dataframe(data_schema) - return ( - pd.concat([df_index, df_data], axis=1) - .set_index(list(index_schema.keys())) - ) + return pd.concat([df_index, df_data], axis=1).set_index(list(index_schema.keys())) diff --git a/strictly_typed_pandas/dataset.py b/strictly_typed_pandas/dataset.py index 08150c9..e6fe878 100644 --- a/strictly_typed_pandas/dataset.py +++ b/strictly_typed_pandas/dataset.py @@ -5,12 +5,16 @@ from typing import Any, Generic, TypeVar, get_type_hints from strictly_typed_pandas.immutable import ( - _ImmutableiLocIndexer, _ImmutableLocIndexer, immutable_error_msg, inplace_argument_interceptor + _ImmutableiLocIndexer, + _ImmutableLocIndexer, + immutable_error_msg, + inplace_argument_interceptor, ) -from strictly_typed_pandas.validate_schema import ( - check_for_duplicate_columns, validate_schema +from strictly_typed_pandas.validate_schema import check_for_duplicate_columns, validate_schema +from strictly_typed_pandas.create_empty_dataframe import ( + create_empty_dataframe, + create_empty_indexed_dataframe, ) -from strictly_typed_pandas.create_empty_dataframe import create_empty_dataframe, create_empty_indexed_dataframe dataframe_functions = dict(inspect.getmembers(pd.DataFrame, predicate=inspect.isfunction)) @@ -19,10 +23,10 @@ class DataSetBase(pd.DataFrame, ABC): def __init__(self, *args, **kwargs) -> None: - ''' + """ This class is a subclass of `pd.DataFrame`, hence it is initialized with the same parameters as a `DataFrame`. See the Pandas `DataFrame` documentation for more information. - ''' + """ super().__init__(*args, **kwargs) if self.columns.duplicated().any(): @@ -72,15 +76,15 @@ def _continue_initialization(self) -> None: pass # pragma: no cover def to_dataframe(self) -> pd.DataFrame: - ''' + """ Converts the object to a pandas `DataFrame`. - ''' + """ return pd.DataFrame(self) def to_frame(self) -> pd.DataFrame: - ''' + """ Synonym of to to_dataframe(): converts the object to a pandas `DataFrame`. - ''' + """ return self.to_dataframe() @@ -89,7 +93,7 @@ def to_frame(self) -> pd.DataFrame: class DataSet(Generic[T], DataSetBase): - ''' + """ `DataSet` allows for static type checking of pandas DataFrames, for example: .. code-block:: python @@ -107,7 +111,8 @@ class Schema: The `DataSet[Schema]` annotations are compatible with: * `mypy` for type checking during linting-time (i.e. while you write your code). * `typeguard` for type checking during run-time (i.e. while you run your unit tests). - ''' + """ + def _continue_initialization(self) -> None: schema_expected = get_type_hints(self._schema_annotations[0]) @@ -120,7 +125,7 @@ def _continue_initialization(self) -> None: class IndexedDataSet(Generic[T, V], DataSetBase): - ''' + """ `IndexedDataSet` allows for static type checking of indexed pandas DataFrames, for example: .. code-block:: text @@ -150,14 +155,14 @@ class DataSchema: The `IndexedDataSet[Schema]` annotations are compatible with: * `mypy` for type checking during linting-time (i.e. while you write your code). * `typeguard` for type checking during run-time (i.e. while you run your unit tests). - ''' + """ + def _continue_initialization(self) -> None: schema_index_expected = get_type_hints(self._schema_annotations[0]) schema_data_expected = get_type_hints(self._schema_annotations[1]) check_for_duplicate_columns( - set(schema_index_expected.keys()), - set(schema_data_expected.keys()) + set(schema_index_expected.keys()), set(schema_data_expected.keys()) ) if self.shape == (0, 0) and self.index.shape == (0,): diff --git a/strictly_typed_pandas/immutable.py b/strictly_typed_pandas/immutable.py index 8c0921f..368d6b4 100644 --- a/strictly_typed_pandas/immutable.py +++ b/strictly_typed_pandas/immutable.py @@ -5,9 +5,9 @@ immutable_error_msg = ( - "To ensure that the DataSet adheres to its schema, you cannot perform inplace modifications. You can either use " + - "dataset.to_dataframe() to cast the DataSet to a DataFrame, or use operations that return a DataFrame, e.g. " + - "df = df.assign(...)." + "To ensure that the DataSet adheres to its schema, you cannot perform inplace modifications. You can either use " + + "dataset.to_dataframe() to cast the DataSet to a DataFrame, or use operations that return a DataFrame, e.g. " + + "df = df.assign(...)." ) diff --git a/strictly_typed_pandas/pandas_types.py b/strictly_typed_pandas/pandas_types.py index bd17cad..9bc11d8 100644 --- a/strictly_typed_pandas/pandas_types.py +++ b/strictly_typed_pandas/pandas_types.py @@ -12,6 +12,7 @@ def __init__(self, *args, **kwargs) -> None: if hasattr(pd, "StringDtype"): StringDtype = pd.StringDtype else: # pragma: no cover + class StringDtype(BackwardCompatibility): # type: ignore pass @@ -19,6 +20,7 @@ class StringDtype(BackwardCompatibility): # type: ignore if hasattr(pd, "DatetimeTZDtype"): DatetimeTZDtype = pd.DatetimeTZDtype else: # pragma: no cover + class DatetimeTZDtype(BackwardCompatibility): # type: ignore pass @@ -26,6 +28,7 @@ class DatetimeTZDtype(BackwardCompatibility): # type: ignore if hasattr(pd, "CategoricalDtype"): CategoricalDtype = pd.CategoricalDtype else: # pragma: no cover + class CategoricalDtype(BackwardCompatibility): # type: ignore pass @@ -33,6 +36,7 @@ class CategoricalDtype(BackwardCompatibility): # type: ignore if hasattr(pd, "PeriodDtype"): PeriodDtype = pd.PeriodDtype else: # pragma: no cover + class PeriodDtype(BackwardCompatibility): # type: ignore pass @@ -40,6 +44,7 @@ class PeriodDtype(BackwardCompatibility): # type: ignore if hasattr(pd, "SparseDtype"): SparseDtype = pd.SparseDtype else: # pragma: no cover + class SparseDtype(BackwardCompatibility): # type: ignore pass @@ -47,6 +52,7 @@ class SparseDtype(BackwardCompatibility): # type: ignore if hasattr(pd, "IntervalDtype"): IntervalDtype = pd.IntervalDtype else: # pragma: no cover + class IntervalDtype(BackwardCompatibility): # type: ignore pass @@ -54,6 +60,7 @@ class IntervalDtype(BackwardCompatibility): # type: ignore if hasattr(pd, "Int64Dtype"): Int64Dtype = pd.Int64Dtype else: # pragma: no cover + class Int64Dtype(BackwardCompatibility): # type: ignore pass @@ -61,5 +68,6 @@ class Int64Dtype(BackwardCompatibility): # type: ignore if hasattr(pd, "BooleanDtype"): BooleanDtype = pd.BooleanDtype else: # pragma: no cover + class BooleanDtype(BackwardCompatibility): # type: ignore pass diff --git a/strictly_typed_pandas/typeguard.py b/strictly_typed_pandas/typeguard.py index f3f424e..7926335 100644 --- a/strictly_typed_pandas/typeguard.py +++ b/strictly_typed_pandas/typeguard.py @@ -11,7 +11,7 @@ def check_dataset(argname: str, value, expected_type, memo: typeguard._TypeCheck msg.format( argname=argname, schema_expected=typeguard.qualified_name(schema_expected), - class_observed=typeguard.qualified_name(value) + class_observed=typeguard.qualified_name(value), ) ) @@ -22,7 +22,7 @@ def check_dataset(argname: str, value, expected_type, memo: typeguard._TypeCheck msg.format( argname=argname, schema_expected=typeguard.qualified_name(schema_expected), - schema_observed=typeguard.qualified_name(schema_observed) + schema_observed=typeguard.qualified_name(schema_observed), ) ) @@ -32,24 +32,27 @@ def check_indexed_dataset(argname: str, value, expected_type, memo: typeguard._T schema_data_expected = expected_type.__args__[1] if not isinstance(value, IndexedDataSet): msg = ( - "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];" + - "got {class_observed} instead" + "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];" + + "got {class_observed} instead" ) raise TypeError( msg.format( argname=argname, schema_index_expected=typeguard.qualified_name(schema_index_expected), schema_data_expected=typeguard.qualified_name(schema_data_expected), - class_observed=typeguard.qualified_name(value) + class_observed=typeguard.qualified_name(value), ) ) schema_index_observed = value.__orig_class__.__args__[0] schema_data_observed = value.__orig_class__.__args__[1] - if schema_index_observed != schema_index_expected or schema_data_observed != schema_data_expected: + if ( + schema_index_observed != schema_index_expected + or schema_data_observed != schema_data_expected + ): msg = ( - "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];" + - "got IndexedDataSet[{schema_index_observed},{schema_data_observed}] instead" + "Type of {argname} must be a IndexedDataSet[{schema_index_expected},{schema_data_expected}];" + + "got IndexedDataSet[{schema_index_observed},{schema_data_observed}] instead" ) raise TypeError( msg.format( @@ -57,7 +60,7 @@ def check_indexed_dataset(argname: str, value, expected_type, memo: typeguard._T schema_index_expected=typeguard.qualified_name(schema_index_expected), schema_data_expected=typeguard.qualified_name(schema_data_expected), schema_index_observed=typeguard.qualified_name(schema_index_observed), - schema_data_observed=typeguard.qualified_name(schema_data_observed) + schema_data_observed=typeguard.qualified_name(schema_data_observed), ) ) diff --git a/strictly_typed_pandas/validate_schema.py b/strictly_typed_pandas/validate_schema.py index 3dcc0af..995acb6 100644 --- a/strictly_typed_pandas/validate_schema.py +++ b/strictly_typed_pandas/validate_schema.py @@ -23,17 +23,13 @@ def _check_names(names_expected: Set[str], names_observed: Set[str]) -> None: diff = names_observed - names_expected if diff: raise TypeError( - "Data contains the following columns not present in schema: {diff}".format( - diff=diff - ) + "Data contains the following columns not present in schema: {diff}".format(diff=diff) ) diff = names_expected - names_observed if diff: raise TypeError( - "Schema contains the following columns not present in data: {diff}".format( - diff=diff - ) + "Schema contains the following columns not present in data: {diff}".format(diff=diff) ) @@ -54,7 +50,9 @@ def _check_dtypes(schema_expected: Dict[str, Any], schema_observed: Dict[str, An if dtype_observed == dtype_expected or np.issubdtype(dtype_observed, dtype_expected): continue - if isinstance(dtype_expected, ExtensionDtype) and is_dtype_equal(dtype_expected, dtype_observed): + if isinstance(dtype_expected, ExtensionDtype) and is_dtype_equal( + dtype_expected, dtype_observed + ): continue if dtype_observed != object and isinstance(dtype_observed, dtype_expected): diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 3e874ff..2f0cba7 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -15,10 +15,7 @@ class AlternativeSchema: a: int -dictionary = { - "a": [1, 2, 3], - "b": ["a", "b", "c"] -} +dictionary = {"a": [1, 2, 3], "b": ["a", "b", "c"]} def test_empty_dataset() -> None: @@ -42,23 +39,12 @@ def test_dataset_missing_colnames() -> None: def test_dataset_too_many_colnames() -> None: with pytest.raises(TypeError): - DataSet[Schema]( - { - "a": [], - "b": [], - "c": [] - } - ) + DataSet[Schema]({"a": [], "b": [], "c": []}) def test_dataset_check_types() -> None: with pytest.raises(TypeError): - DataSet[Schema]( - { - "a": ["1", "2", "3"], - "b": "" - } - ) + DataSet[Schema]({"a": ["1", "2", "3"], "b": ""}) def test_dataset_immutable() -> None: diff --git a/tests/test_indexed_dataset.py b/tests/test_indexed_dataset.py index 6ad62fb..4a25266 100644 --- a/tests/test_indexed_dataset.py +++ b/tests/test_indexed_dataset.py @@ -32,7 +32,9 @@ def test_empty_indexed_dataset() -> None: assert np.all(df.columns == ["c", "d"]) assert df.index.get_level_values(0).dtype == int - assert df.index.get_level_values(1).dtype == object or isinstance(df.index.get_level_values(1).dtype, StringDtype) + assert df.index.get_level_values(1).dtype == object or isinstance( + df.index.get_level_values(1).dtype, StringDtype + ) assert df.dtypes.iloc[0] == int assert df.dtypes.iloc[1] == object or isinstance(df.dtypes.iloc[1], StringDtype) @@ -40,14 +42,7 @@ def test_empty_indexed_dataset() -> None: def test_indexed_dataset() -> None: ( - pd.DataFrame( - { - "a": [1, 2, 3], - "b": ["a", "b", "c"], - "c": [1, 2, 3], - "d": ["a", "b", "c"] - } - ) + pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1, 2, 3], "d": ["a", "b", "c"]}) .set_index(["a", "b"]) .pipe(IndexedDataSet[IndexSchema, DataSchema]) ) diff --git a/tests/test_type_validation.py b/tests/test_type_validation.py index 75d7b85..e68f25f 100644 --- a/tests/test_type_validation.py +++ b/tests/test_type_validation.py @@ -13,7 +13,7 @@ IntervalDtype, PeriodDtype, SparseDtype, - StringDtype + StringDtype, ) @@ -67,7 +67,9 @@ def test_numpy_types(): check_list_of_types(np.int64, [np.int64, np.int_, int], [float, np.float_]) check_list_of_types(np.float64, [np.float64, np.float_, float], [int, np.int_]) check_list_of_types(np.bool_, [np.bool_, bool], [int, np.int_]) - check_list_of_types(np.datetime64, [np.datetime64], [np.timedelta64, DatetimeTZDtype(tz="UTC"), np.int_]) + check_list_of_types( + np.datetime64, [np.datetime64], [np.timedelta64, DatetimeTZDtype(tz="UTC"), np.int_] + ) check_list_of_types(np.timedelta64, [np.timedelta64], [np.datetime64, np.int64]) @@ -75,18 +77,18 @@ def test_pandas_types(): check_list_of_types( DatetimeTZDtype(tz="UTC"), [DatetimeTZDtype(tz="UTC")], - [np.datetime64, DatetimeTZDtype(tz="GMT"), np.int_] + [np.datetime64, DatetimeTZDtype(tz="GMT"), np.int_], ) check_list_of_types(CategoricalDtype, [CategoricalDtype], [Int64Dtype, np.int_, int]) check_list_of_types( PeriodDtype(freq="D"), [PeriodDtype(freq="D")], - [np.datetime64, PeriodDtype(freq="W"), np.int_] + [np.datetime64, PeriodDtype(freq="W"), np.int_], ) check_list_of_types( SparseDtype(dtype=np.int64), [SparseDtype(dtype=np.int64)], - [np.int64, SparseDtype(dtype=np.float64), int] + [np.int64, SparseDtype(dtype=np.float64), int], ) check_list_of_types(IntervalDtype, [IntervalDtype], [Int64Dtype, np.int_, int]) check_list_of_types(Int64Dtype, [Int64Dtype], [IntervalDtype, np.int64, int]) @@ -116,9 +118,25 @@ class DataSchema: def test_supported_index_data_type(): dtypes = [ - DatetimeTZDtype(tz="UTC"), CategoricalDtype, PeriodDtype(freq="D"), IntervalDtype, str, int, float, np.int_, - np.float_, np.datetime64, np.timedelta64, Any, object, np.object_, SparseDtype(dtype=np.int64), np.bool_, - Int64Dtype, BooleanDtype, StringDtype + DatetimeTZDtype(tz="UTC"), + CategoricalDtype, + PeriodDtype(freq="D"), + IntervalDtype, + str, + int, + float, + np.int_, + np.float_, + np.datetime64, + np.timedelta64, + Any, + object, + np.object_, + SparseDtype(dtype=np.int64), + np.bool_, + Int64Dtype, + BooleanDtype, + StringDtype, ] for dtype in dtypes: if is_backward_compatibility_type(dtype):