diff --git a/CHANGELOG.md b/CHANGELOG.md index b114b79..9e4854a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - `fiboa create-geojson`: Show conversion progress - `fiboa jsonschema` and `fiboa validate`: Support `geometryTypes` for `geometry` data type in GeoJSON - `fiboa validate`: - - Basic validation for geometries and bounding boxes in GeoParquet files + - Basic validation for objects, geometries and bounding boxes in GeoParquet files ### Changed diff --git a/fiboa_cli/types.py b/fiboa_cli/types.py index dac0c89..12cc715 100644 --- a/fiboa_cli/types.py +++ b/fiboa_cli/types.py @@ -8,12 +8,26 @@ def is_enum(schema): return isinstance(schema.get("enum"), list) +def is_integer_type(dtype): + return dtype.startswith("int") or dtype.startswith("uint") + +def is_floating_type(dtype): + return dtype == "float" or dtype == "double" + +def is_numerical_type(dtype): + return is_integer_type(dtype) or is_floating_type(dtype) + +def is_temporal_type(dtype): + return dtype == "date" or dtype == "date-time" + +def is_scalar_type(dtype): + return dtype == "string" or dtype == "binary" or dtype == "boolean" or is_numerical_type(dtype) or is_temporal_type(dtype) def get_geopandas_dtype(type, required = False, schema = {}): """ fiboa datatypes to geopandas datatypes """ - if is_enum(schema) and (type == "string" or type.startswith("int") or type.startswith("uint")): + if is_enum(schema) and (type == "string" or is_integer_type(type)): return "category" elif type == "boolean": if required: @@ -107,7 +121,7 @@ def get_pyarrow_type(schema): dtype = schema.get("type") if dtype == "boolean": return pa.bool_() - elif dtype.startswith("int") or dtype.startswith("uint") or dtype == "string" or dtype == "binary": + elif is_integer_type(dtype) or dtype == "string" or dtype == "binary": return getattr(pa, dtype)() elif dtype == "float": return pa.float32() @@ -192,19 +206,19 @@ def get_pyarrow_type_for_geopandas(dtype): # checks pyarrow datatypes PYTHON_TYPES = { - "boolean": bool, - "int8": int, - "uint8": int, - "int16": int, - "uint16": int, - "int32": int, - "uint32": int, - "int64": int, - "uint64": int, - "float": float, - "double": float, - "binary": str, - "string": str, + "boolean": (bool, np.bool_), + "int8": (int, np.int8), + "uint8": (int, np.uint8), + "int16": (int, np.int16), + "uint16": (int, np.uint16), + "int32": (int, np.int32), + "uint32": (int, np.uint32), + "int64": (int, np.int64), + "uint64": (int, np.uint64), + "float": (float, np.float32), + "double": (float, np.float64), + "binary": (str, np.bytes_), + "string": (str, np.str_), "array": (list, np.ndarray), "object": dict, "date": (datetime.date, np.datetime64), diff --git a/fiboa_cli/validate_data.py b/fiboa_cli/validate_data.py index 12b8b07..23ef30a 100644 --- a/fiboa_cli/validate_data.py +++ b/fiboa_cli/validate_data.py @@ -2,10 +2,9 @@ import pandas as pd from urllib.parse import urlparse -from shapely.geometry.base import BaseGeometry from shapely.validation import explain_validity -from .types import PYTHON_TYPES +from .types import PYTHON_TYPES, is_numerical_type, is_scalar_type REGEX_EMAIL = re.compile("[^@]+@[^@]+\.[^@]+") REGEX_UUID = re.compile("^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\Z") @@ -17,24 +16,24 @@ def validate_column(data, rules): # Skip validation for NaN values or implement special handling if required continue - dtype = rules.get('type') - python_type = PYTHON_TYPES.get(dtype) - if python_type is not None and not isinstance(value, python_type): - return [f"Value '{value}' is not of type {dtype}."] + dtype = rules.get("type") + expected_pytype = PYTHON_TYPES.get(dtype) + if expected_pytype is not None and not isinstance(value, expected_pytype): + actualy_pytype = type(value) + return [f"Value '{value}' is not of type {dtype}, is {actualy_pytype}"] - if isinstance(value, str): + if dtype == "string": issues = validate_string(value, rules) - elif isinstance(value, (int, float)): + elif is_numerical_type(dtype): issues = validate_numerical(value, rules) - elif isinstance(value, list): + elif dtype == "array": issues = validate_array(value, rules) - elif isinstance(value, BaseGeometry): + elif dtype == "geometry": issues = validate_geometry(value, rules) - elif isinstance(value, dict): - if dtype == 'bounding-box': - issues = validate_bbox(value, rules) - else: - issues = validate_object(value, rules) + elif dtype == "bounding-box": + issues = validate_bbox(value, rules) + elif dtype == "object": + issues = validate_object(value, rules) else: continue @@ -103,24 +102,40 @@ def validate_numerical(value, rules): issues.append(f"Value {value} is greater than or equal to the exclusive maximum value of {rules['exclusiveMaximum']}.") if 'enum' in rules and value not in rules['enum']: allowed = ", ".join(map(str, rules['enum'])) - issues.append(f"String '{value}' is not one of the allowed values in the enumeration: {allowed}") + issues.append(f"Integer '{value}' is not one of the allowed values in the enumeration: {allowed}") return issues # Array validation def validate_array(values, rules): issues = [] + + item_schema = rules.get('items', {}) + if 'minItems' in rules and len(values) < rules['minItems']: issues.append(f"Array has fewer items than the minimum of {rules['minItems']}.") if 'maxItems' in rules and len(values) > rules['maxItems']: issues.append(f"Array has more items than the maximum of {rules['maxItems']}.") - if 'uniqueItems' in rules and rules['uniqueItems'] and len(values) != len(set(values)): - issues.append("Array items are not unique.") + + if 'uniqueItems' in rules and rules['uniqueItems']: + item_dtype = item_schema.get('type') + if is_scalar_type(item_dtype) and len(values) != len(set(values)): + issues.append("Array items are not unique.") + else: + pass # not supported for non-scalar types + + # todo: Further validation for 'items' if necessary return issues # Object validation def validate_object(value, rules): issues = [] + + if 'minProperties' in rules and len(value) < rules['minProperties']: + issues.append(f"Object has fewer properties than the minimum of {rules['minProperties']}.") + if 'maxProperties' in rules and len(value) > rules['maxProperties']: + issues.append(f"Object has more properties than the maximum of {rules['maxProperties']}.") + props = rules.get('properties', {}) other_props = rules.get('additionalProperties', False) pattern_props = rules.get('patternProperties', {}) @@ -128,4 +143,5 @@ def validate_object(value, rules): if key not in value: issues.append(f"Key '{key}' is missing from the object.") # todo: Further validation based on the type of property + return issues