Merge pull request #57 from StreetEasy/validation

allow `strict` attribute for legacy schemas
StreetEasy · Mar 14, 2023 · f689aff · f689aff
2 parents f384b11 + 709b098
commit f689aff
Show file tree

Hide file tree

Showing 13 changed files with 33 additions and 22 deletions.
diff --git a/changelog.md b/changelog.md
@@ -1,6 +1,6 @@
 # Changelog
 
-- rename `DfSchema.validate_df` to `DfSchema.validate`
+- rename `DfSchema.validate_df` to `DfSchema.validate` (UNDONE: the name `validate` is already taken by Pydantic's `BaseModel`, so the method stays `validate_df`)
 - updated documentation
 
 v0.0.6:

diff --git a/dfschema/cli.py b/dfschema/cli.py
@@ -63,7 +63,7 @@ def validate(
     Schema = DfSchema.from_file(schema)
 
     try:
-        Schema.validate(df, summary=summary)
+        Schema.validate_df(df, summary=summary)
     except Exception as e:
         typer.echo(f"File violates schema: {e}", err=True)
     else:

diff --git a/dfschema/core/core.py b/dfschema/core/core.py
@@ -103,7 +103,7 @@ def validate_column_presence(self, df: pd.DataFrame) -> None:
             df, schema_col_names, additionalColumns=self.additionalColumns, root=self
         )
 
-    def validate(self, df: pd.DataFrame, summary: bool = True) -> None:
+    def validate_df(self, df: pd.DataFrame, summary: bool = True) -> None:
         """validate DataFrame against this schema
 
         validate dataframe against the schema as a dictionary. will raise
@@ -118,7 +118,7 @@ def validate(self, df: pd.DataFrame, summary: bool = True) -> None:
         path = '/schema.json'
 
         df = pd.DataFrame({'a':[1,2], 'b':[3,4]})
-        dfs.DfSchema.from_file(path).validate(df)
+        dfs.DfSchema.from_file(path).validate_df(df)
         ```
 
         Args:
@@ -136,7 +136,7 @@ def validate(self, df: pd.DataFrame, summary: bool = True) -> None:
             )
 
         if self.shape:
-            self.shape.validate(df, root=self)
+            self.shape.validate_df(df, root=self)
 
         if self.columns:
             self.validate_column_presence(df)
@@ -146,7 +146,7 @@ def validate(self, df: pd.DataFrame, summary: bool = True) -> None:
 
         if self.subsets:
             for subset in self.subsets:
-                subset.validate(df=df, root=self)
+                subset.validate_df(df=df, root=self)
 
         if len(self._exception_pool) > 0:
             error = self._summary_error()
@@ -172,7 +172,7 @@ def validate_sql(
             None
         """
         df = pd.read_sql(sql, con, **(read_sql_kwargs or {}))
-        self.validate(df, summary=summary)
+        self.validate_df(df, summary=summary)
 
     @classmethod
     def from_file(cls, path: Union[str, Path]) -> "DfSchema":
@@ -379,7 +379,7 @@ def validate_column_presence_and_order(self, df: pd.DataFrame) -> None:
             root=self,
         )
 
-    def validate(self, df: pd.DataFrame, root: DfSchema) -> None:
+    def validate_df(self, df: pd.DataFrame, root: DfSchema) -> None:
         """validate DataFrame against this schema
 
         validate dataframe against the schema as a dictionary. will raise
@@ -397,7 +397,7 @@ def validate(self, df: pd.DataFrame, root: DfSchema) -> None:
         filtered_df = self._filter(df, self.predicate)
 
         if self.shape:
-            self.shape.validate(filtered_df, root=self)
+            self.shape.validate_df(filtered_df, root=self)
 
         if self.columns:
             self.validate_column_presence_and_order(filtered_df)

diff --git a/dfschema/core/legacy/v1.py b/dfschema/core/legacy/v1.py
@@ -50,6 +50,8 @@ class V1_DfSchema(BaseModel):
     class Config:
         extra = Extra.forbid
         arbitrary_types_allowed = True
+        allow_population_by_field_name = True
+
 
     version: Optional[str] = Field(
         None,
@@ -63,7 +65,7 @@ class Config:
     custom_settings: Optional[dict] = None
 
     strict_cols: Optional[bool] = Field(
-        False, description="if true, won't support additional columns"
+        False, description="if true, won't allow any additional columns", alias="strict"
     )
     shape: Optional[V1_ShapeSchema] = Field(None, description="shape expectations")
     columns: Union[List[str], V1_ColumnsSchema, None] = Field(

diff --git a/dfschema/core/shape.py b/dfschema/core/shape.py
@@ -22,7 +22,7 @@ class Config:
         extra = Extra.forbid
 
     @exception_collector
-    def validate(self, df: pd.DataFrame) -> None:
+    def validate_df(self, df: pd.DataFrame) -> None:
         """validate shape of the dataframe"""
         for i, el in enumerate(("rows", "cols")):
             exact = getattr(self, el)

diff --git a/dfschema/validate.py b/dfschema/validate.py
@@ -36,4 +36,4 @@ def validate(df: pd.DataFrame, schema: dict, summary: bool = True) -> None:
     """
 
     Schema = DfSchema.from_dict(schema)
-    Schema.validate(df=df, summary=summary)
+    Schema.validate_df(df=df, summary=summary)
diff --git a/makefile b/makefile
@@ -9,7 +9,7 @@ serve_docs:
 	poetry run mkdocs serve
 
 docs:
-	poetry run mkdocs build
+	poetry run mkdocs build -f .config/mkdocs/mkdocs.yml
 
 changelog:
 	poetry run gitchangelog
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dfschema"
-version = "0.0.6"  # set via gitlab-ci
+version = "0.0.7"
 description = "lightweight pandas.DataFrame schema"
 authors = ["Philipp <[email protected]>"]
 readme = "README.md"

diff --git a/tests/test_generate.py b/tests/test_generate.py
@@ -12,7 +12,7 @@ def test_generate_df1(df1):
         sd = DfSchema.from_df(df1, return_dict=True)
         raise Exception(sd, e)
 
-    S.validate(df1)  # type: ignore
+    S.validate_df(df1)  # type: ignore
 
 
 def test_generate_df4(df4):
@@ -26,4 +26,4 @@ def test_generate_df4(df4):
         sd = DfSchema.from_df(df4, return_dict=True)
         raise Exception(sd, e)
 
-    S.validate(df4)  # type: ignore
+    S.validate_df(df4)  # type: ignore
diff --git a/tests/test_read_write.py b/tests/test_read_write.py
@@ -23,7 +23,7 @@ def test_read_schema_file(path, sample_df):
     from dfschema import DfSchema
 
     schema = DfSchema.from_file(path)
-    schema.validate(sample_df)
+    schema.validate_df(sample_df)
 
 
 @pytest.mark.parametrize("format", ["json", "yml"])

diff --git a/tests/test_schemas/v1/good/v2_strict.json b/tests/test_schemas/v1/good/v2_strict.json
@@ -0,0 +1,9 @@
+{   "strict": true,
+    "columns": {
+        "pid": {"na_limit": 0.1,  "dtype": "number"},
+        "unittype": {"na_limit": 0.1, "dtype": "str"},
+        "bedrooms": {"na_limit": 0.1, "dtype": "number"},
+        "bathrooms": {"na_limit": 0.1, "dtype": "number"},
+        "size": {"na_limit": 0.1, "dtype": "number"}
+    }
+}
diff --git a/tests/test_str_patterns.py b/tests/test_str_patterns.py
@@ -26,7 +26,7 @@ def test_string_matching(str_df):
     }
 
     S = DfSchema.from_dict(D)
-    S.validate(str_df)
+    S.validate_df(str_df)
 
 
 def test_string_matching_raises(str_df):
@@ -43,4 +43,4 @@ def test_string_matching_raises(str_df):
 
     S = DfSchema.from_dict(D)
     with pytest.raises(DataFrameSummaryError):
-        S.validate(str_df)
+        S.validate_df(str_df)
diff --git a/tests/test_subsets.py b/tests/test_subsets.py
@@ -28,7 +28,7 @@ def test_subset_dict(df_subset):
     }
 
     S = DfSchema.from_dict(D)
-    S.validate(df_subset)
+    S.validate_df(df_subset)
 
 
 def test_subset_query(df_subset):
@@ -52,7 +52,7 @@ def test_subset_query(df_subset):
     }
 
     S = DfSchema.from_dict(D)
-    S.validate(df_subset)
+    S.validate_df(df_subset)
 
 
 def test_subset_query_raises(df_subset):
@@ -67,4 +67,4 @@ def test_subset_query_raises(df_subset):
     S = DfSchema.from_dict(D)
 
     with pytest.raises(DataFrameSummaryError):
-        S.validate(df_subset)
+        S.validate_df(df_subset)