Skip to content

Commit

Permalink
minProperties, maxProperties, array fixes, ...
Browse files Browse the repository at this point in the history
  • Loading branch information
m-mohr committed May 9, 2024
1 parent 7dd0b29 commit fa38de9
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 34 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `fiboa create-geojson`: Show conversion progress
- `fiboa jsonschema` and `fiboa validate`: Support `geometryTypes` for `geometry` data type in GeoJSON
- `fiboa validate`:
- Basic validation for geometries and bounding boxes in GeoParquet files
- Basic validation for objects, geometries and bounding boxes in GeoParquet files

### Changed

Expand Down
44 changes: 29 additions & 15 deletions fiboa_cli/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,26 @@
def is_enum(schema):
    """
    Tell whether the schema restricts its values through an `enum` list.
    """
    allowed_values = schema.get("enum")
    return isinstance(allowed_values, list)

def is_integer_type(dtype):
    """
    Check whether a data type name denotes an integer type.

    Every name starting with "int" or "uint" (e.g. "int8", "uint64") counts
    as an integer type. Anything that is not a string - such as the None
    returned by `schema.get("type")` for a schema without a type - is not.
    """
    # Guard first: calling startswith() on None would raise AttributeError.
    return isinstance(dtype, str) and dtype.startswith(("int", "uint"))

def is_floating_type(dtype):
    """
    Check whether a data type name is one of the floating point types.
    """
    return dtype in ("float", "double")

def is_numerical_type(dtype):
    """
    Check whether a data type name is an integer or floating point type.

    Non-string input (e.g. None for a schema without a "type") is never
    numerical.
    """
    # The integer check inspects string prefixes, so reject non-strings here
    # instead of letting the helper fail on them.
    if not isinstance(dtype, str):
        return False
    return is_integer_type(dtype) or is_floating_type(dtype)

def is_temporal_type(dtype):
    """
    Check whether a data type name is one of the date/time types.
    """
    return dtype in ("date", "date-time")

def is_scalar_type(dtype):
    """
    Check whether a data type name denotes a scalar (non-container) type.

    Scalar types are "string", "binary", "boolean" and all numerical and
    temporal types. Non-string input (e.g. None for a schema without a
    "type") is never scalar.
    """
    # The numerical check inspects string prefixes, so reject non-strings
    # here instead of letting it fail on them.
    if not isinstance(dtype, str):
        return False
    if dtype in ("string", "binary", "boolean"):
        return True
    return is_numerical_type(dtype) or is_temporal_type(dtype)

def get_geopandas_dtype(type, required = False, schema = {}):
"""
fiboa datatypes to geopandas datatypes
"""
if is_enum(schema) and (type == "string" or type.startswith("int") or type.startswith("uint")):
if is_enum(schema) and (type == "string" or is_integer_type(type)):
return "category"
elif type == "boolean":
if required:
Expand Down Expand Up @@ -107,7 +121,7 @@ def get_pyarrow_type(schema):
dtype = schema.get("type")
if dtype == "boolean":
return pa.bool_()
elif dtype.startswith("int") or dtype.startswith("uint") or dtype == "string" or dtype == "binary":
elif is_integer_type(dtype) or dtype == "string" or dtype == "binary":
return getattr(pa, dtype)()
elif dtype == "float":
return pa.float32()
Expand Down Expand Up @@ -192,19 +206,19 @@ def get_pyarrow_type_for_geopandas(dtype):

# checks pyarrow datatypes
PYTHON_TYPES = {
"boolean": bool,
"int8": int,
"uint8": int,
"int16": int,
"uint16": int,
"int32": int,
"uint32": int,
"int64": int,
"uint64": int,
"float": float,
"double": float,
"binary": str,
"string": str,
"boolean": (bool, np.bool_),
"int8": (int, np.int8),
"uint8": (int, np.uint8),
"int16": (int, np.int16),
"uint16": (int, np.uint16),
"int32": (int, np.int32),
"uint32": (int, np.uint32),
"int64": (int, np.int64),
"uint64": (int, np.uint64),
"float": (float, np.float32),
"double": (float, np.float64),
"binary": (str, np.bytes_),
"string": (str, np.str_),
"array": (list, np.ndarray),
"object": dict,
"date": (datetime.date, np.datetime64),
Expand Down
52 changes: 34 additions & 18 deletions fiboa_cli/validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import pandas as pd

from urllib.parse import urlparse
from shapely.geometry.base import BaseGeometry
from shapely.validation import explain_validity

from .types import PYTHON_TYPES
from .types import PYTHON_TYPES, is_numerical_type, is_scalar_type

# Raw strings hand the regex escapes (\. and \Z) to the `re` module untouched;
# in a normal string literal they are invalid string escapes and trigger a
# warning at compile time.
REGEX_EMAIL = re.compile(r"[^@]+@[^@]+\.[^@]+")
REGEX_UUID = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\Z")
Expand All @@ -17,24 +16,24 @@ def validate_column(data, rules):
# Skip validation for NaN values or implement special handling if required
continue

dtype = rules.get('type')
python_type = PYTHON_TYPES.get(dtype)
if python_type is not None and not isinstance(value, python_type):
return [f"Value '{value}' is not of type {dtype}."]
dtype = rules.get("type")
expected_pytype = PYTHON_TYPES.get(dtype)
if expected_pytype is not None and not isinstance(value, expected_pytype):
actualy_pytype = type(value)
return [f"Value '{value}' is not of type {dtype}, is {actualy_pytype}"]

if isinstance(value, str):
if dtype == "string":
issues = validate_string(value, rules)
elif isinstance(value, (int, float)):
elif is_numerical_type(dtype):
issues = validate_numerical(value, rules)
elif isinstance(value, list):
elif dtype == "array":
issues = validate_array(value, rules)
elif isinstance(value, BaseGeometry):
elif dtype == "geometry":
issues = validate_geometry(value, rules)
elif isinstance(value, dict):
if dtype == 'bounding-box':
issues = validate_bbox(value, rules)
else:
issues = validate_object(value, rules)
elif dtype == "bounding-box":
issues = validate_bbox(value, rules)
elif dtype == "object":
issues = validate_object(value, rules)
else:
continue

Expand Down Expand Up @@ -103,29 +102,46 @@ def validate_numerical(value, rules):
issues.append(f"Value {value} is greater than or equal to the exclusive maximum value of {rules['exclusiveMaximum']}.")
if 'enum' in rules and value not in rules['enum']:
allowed = ", ".join(map(str, rules['enum']))
issues.append(f"String '{value}' is not one of the allowed values in the enumeration: {allowed}")
issues.append(f"Integer '{value}' is not one of the allowed values in the enumeration: {allowed}")
return issues

# Array validation
def validate_array(values, rules):
    """
    Validate an array value against the array-related rules of its schema.

    Checks `minItems`, `maxItems` and `uniqueItems`. Uniqueness is only
    checked when `items.type` names a scalar type; for any other or an
    undeclared item type the check is skipped.

    Returns a list of issue messages, which is empty if nothing was found.
    """
    issues = []

    item_schema = rules.get('items', {})
    count = len(values)

    if 'minItems' in rules and count < rules['minItems']:
        issues.append(f"Array has fewer items than the minimum of {rules['minItems']}.")
    if 'maxItems' in rules and count > rules['maxItems']:
        issues.append(f"Array has more items than the maximum of {rules['maxItems']}.")

    if rules.get('uniqueItems'):
        item_dtype = item_schema.get('type')
        # Building a set requires hashable items, so only do it for declared
        # scalar item types; objects/arrays (dict/list) would raise TypeError.
        # Not supported for non-scalar or undeclared item types.
        if isinstance(item_dtype, str) and is_scalar_type(item_dtype) and count != len(set(values)):
            issues.append("Array items are not unique.")

    # todo: Further validation for 'items' if necessary
    return issues

# Object validation
def validate_object(value, rules):
    """
    Validate an object (dict) value against the object-related schema rules.

    Checks `minProperties` and `maxProperties` against the number of keys
    and reports every key listed in `properties` that is absent from the
    value.

    Returns a list of issue messages, which is empty if nothing was found.
    """
    issues = []

    if 'minProperties' in rules and len(value) < rules['minProperties']:
        issues.append(f"Object has fewer properties than the minimum of {rules['minProperties']}.")
    if 'maxProperties' in rules and len(value) > rules['maxProperties']:
        issues.append(f"Object has more properties than the maximum of {rules['maxProperties']}.")

    # Only the property names are needed here; the per-property schemas are
    # not evaluated yet.
    for key in rules.get('properties', {}):
        if key not in value:
            issues.append(f"Key '{key}' is missing from the object.")
        # todo: Further validation based on the type of property

    return issues

0 comments on commit fa38de9

Please sign in to comment.