Skip to content

Commit dffe06c

Browse files
authored
Merge pull request #36 from IFRCGo/feat/src-validators
Feat/src validators
2 parents 5af5c89 + 23db889 commit dffe06c

24 files changed

+1442
-519
lines changed

Diff for: .pre-commit-config.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ repos:
5050
- id: ruff-format
5151
types_or: [python, pyi, jupyter, toml]
5252

53+
- repo: https://github.com/astral-sh/uv-pre-commit
54+
rev: 0.6.9
55+
hooks:
56+
- id: uv-lock
57+
args: ["--locked", "--offline"]
58+
5359
- repo: https://github.com/RobertCraigie/pyright-python
5460
rev: v1.1.396
5561
hooks:

Diff for: pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dependencies = [
3333
"pytz>=2021.1",
3434
"pandas>=2.2.0",
3535
"lxml>=5.3.0",
36-
"pydantic",
36+
"pydantic>=2.10.6",
3737
]
3838
dynamic = ["version"]
3939

Diff for: pystac_monty/geocoding.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,8 @@ def get_geometry_by_country_name(self, country_name: str) -> Optional[Dict[str,
501501
return None
502502

503503
def get_iso3_from_geometry(self, geometry: Dict[str, Any]) -> Optional[str]:
504-
raise NotImplementedError("Method not implemented")
504+
# FIXME: Implement this later
505+
return "UNK"
505506

506507
def get_geometry_from_iso3(self, iso3: str) -> Optional[Dict[str, Any]]:
507508
raise NotImplementedError("Method not implemented")

Diff for: pystac_monty/sources/desinventar.py

+9-95
Original file line numberDiff line numberDiff line change
@@ -24,107 +24,19 @@
2424
)
2525
from pystac_monty.hazard_profiles import MontyHazardProfiles
2626
from pystac_monty.sources.common import MontyDataTransformer
27-
28-
STAC_EVENT_ID_PREFIX = "desinventar-event-"
29-
STAC_HAZARD_ID_PREFIX = "desinventar-hazard-"
30-
STAC_IMPACT_ID_PREFIX = "desinventar-impact-"
27+
from pystac_monty.validators.desinventar import (
28+
STAC_EVENT_ID_PREFIX,
29+
STAC_HAZARD_ID_PREFIX,
30+
STAC_IMPACT_ID_PREFIX,
31+
DataRow,
32+
GeoDataEntry,
33+
)
3134

3235
logger = logging.getLogger(__name__)
3336

3437
T = typing.TypeVar("T")
3538

3639

37-
class GeoDataEntry(TypedDict):
38-
level: Optional[str]
39-
property_code: Optional[str]
40-
shapefile_data: Optional[gpd.GeoDataFrame]
41-
42-
43-
# Properties extracted from desinventar
44-
class DataRow(pydantic.BaseModel):
45-
serial: str
46-
comment: str | None
47-
# source: str | None
48-
49-
deaths: float | None
50-
injured: float | None
51-
missing: float | None
52-
houses_destroyed: float | None
53-
houses_damaged: float | None
54-
directly_affected: float | None
55-
indirectly_affected: float | None
56-
relocated: float | None
57-
evacuated: float | None
58-
losses_in_dollar: float | None
59-
losses_local_currency: float | None
60-
# education_centers: str | None
61-
# hospitals: str | None
62-
damages_in_crops_ha: float | None
63-
lost_cattle: float | None
64-
damages_in_roads_mts: float | None
65-
66-
level0: str | None
67-
level1: str | None
68-
level2: str | None
69-
# name0: str | None
70-
# name1: str | None
71-
# name2: str | None
72-
# latitude: str | None
73-
# longitude: str | None
74-
75-
# haz_maxvalue: str | None
76-
event: str | None
77-
# glide: str | None
78-
location: str | None
79-
80-
# duration: str | None
81-
year: int
82-
month: int | None
83-
day: int | None
84-
85-
# Added fields
86-
87-
iso3: str
88-
data_source_url: str | None
89-
90-
@property
91-
def event_stac_id(self):
92-
return f"{STAC_EVENT_ID_PREFIX}{self.iso3}-{self.serial}"
93-
94-
@property
95-
def event_title(self):
96-
return f"{self.event} in {self.location} on {self.event_start_date}"
97-
98-
@property
99-
def event_description(self):
100-
return f"{self.event} in {self.location}: {self.comment}"
101-
102-
@property
103-
def event_start_date(self):
104-
if self.year is None:
105-
return
106-
107-
start_year = self.year
108-
start_month = self.month or 1
109-
start_day = self.day or 1
110-
111-
try:
112-
start_dt = datetime(start_year, start_month, start_day)
113-
return pytz.utc.localize(start_dt)
114-
except Exception:
115-
return None
116-
117-
@property
118-
def lowest_level(self):
119-
if self.level2 is not None:
120-
return 'level2'
121-
if self.level1 is not None:
122-
return 'level1'
123-
if self.level0 is not None:
124-
return 'level0'
125-
return None
126-
127-
12840
# TODO: move to common utils
12941
def get_list_item_safe(lst: list[T], index: int, default_value: T | None = None) -> T | None:
13042
try:
@@ -686,6 +598,8 @@ def get_stac_items(self) -> typing.Generator[Item, None, None]:
686598
if event_item := self._create_event_item_from_row(row_data):
687599
yield event_item
688600
yield from self._create_impact_items_from_row(row_data, event_item)
601+
else:
602+
failed_items_count += 1
689603
except Exception:
690604
failed_items_count += 1
691605
logger.error('Failed to process desinventar', exc_info=True)

Diff for: pystac_monty/sources/emdat.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pystac_monty.hazard_profiles import MontyHazardProfiles
2020
from pystac_monty.sources.common import MontyDataSource, MontyDataTransformer
2121
from pystac_monty.utils import rename_columns
22+
from pystac_monty.validators.em_dat import EmdatDataValidator
2223

2324
STAC_EVENT_ID_PREFIX = "emdat-event-"
2425
STAC_HAZARD_ID_PREFIX = "emdat-hazard-"
@@ -40,12 +41,24 @@ def __init__(self, source_url: str, data: Union[str, pd.DataFrame]):
4041
self.df = data
4142
elif isinstance(data, dict):
4243
# If data is a dict, assume it's Json content
43-
data = data["data"]["public_emdat"]["data"]
44+
# data = data["data"]["public_emdat"]["data"]
45+
data = self.source_data_validator(data["data"]["public_emdat"]["data"])
4446
df = pd.DataFrame(data)
4547
self.df = rename_columns(df)
4648
else:
4749
raise ValueError("Data must be either Excel content (str) or pandas DataFrame or Json")
4850

51+
def source_data_validator(self, data):
    """Return only the EM-DAT records that pass validation.

    Args:
        data: Iterable of raw EM-DAT records; the constructor passes the
            value found under ``data["data"]["public_emdat"]["data"]``.

    Returns:
        A new list containing, in their original order, the records for
        which ``EmdatDataValidator.validate_event`` returned a truthy value.
    """
    # Records that fail validation are dropped here; nothing else consumes
    # them, so no separate list of rejects is built.
    return [item for item in data if EmdatDataValidator.validate_event(item)]
61+
4962
def get_data(self) -> pd.DataFrame:
5063
return self.df
5164

Diff for: pystac_monty/sources/gdacs.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
)
2323
from pystac_monty.hazard_profiles import MontyHazardProfiles
2424
from pystac_monty.sources.common import MontyDataSource, MontyDataTransformer
25+
from pystac_monty.validators.gdacs_events import GdacsDataValidatorEvents
26+
from pystac_monty.validators.gdacs_geometry import GdacsDataValidatorGeometry
2527

2628
# Constants
2729

@@ -46,7 +48,24 @@ def __init__(self, source_url: str, data: Any, type: GDACSDataSourceType):
4648
super().__init__(source_url, data)
4749
self.type = type
4850
# all gdacs data are json
49-
self.data = json.loads(data)
51+
self.data = self.source_data_validator(json.loads(data))
52+
53+
def source_data_validator(self, data: dict):
    """Validate a decoded GDACS payload according to ``self.type``.

    Args:
        data: The JSON-decoded GDACS response.

    Returns:
        * EVENT sources: ``data`` itself when
          ``GdacsDataValidatorEvents.validate_event`` accepts it,
          otherwise ``None``.
        * GEOMETRY sources: a new dict with the same keys as ``data`` in
          which a list-valued ``"features"`` entry keeps only the features
          accepted by ``GdacsDataValidatorGeometry.validate_event``.
        * Any other source type: ``data`` unchanged.
    """
    if self.type == GDACSDataSourceType.EVENT:
        # An event payload is validated as a whole, so a rejected payload
        # leaves nothing to return.
        if GdacsDataValidatorEvents.validate_event(data):
            return data
        return None

    if self.type == GDACSDataSourceType.GEOMETRY:

        def _filtered(name, payload):
            # Only the "features" list is filtered; every other key is
            # copied over untouched.
            if name != "features" or not isinstance(payload, list):
                return payload
            return [feature for feature in payload if GdacsDataValidatorGeometry.validate_event(feature)]

        return {name: _filtered(name, payload) for name, payload in data.items()}

    # No validator exists for this source type: hand the payload back
    # instead of silently replacing it with None.
    return data
5069

5170
def get_type(self) -> GDACSDataSourceType:
5271
return self.type

Diff for: pystac_monty/sources/gfd.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
)
1616
from pystac_monty.hazard_profiles import MontyHazardProfiles
1717
from pystac_monty.sources.common import MontyDataSource, MontyDataTransformer
18+
from pystac_monty.validators.gfd import GFDSourceValidator
1819

1920
# Constants
2021

@@ -28,7 +29,20 @@ class GFDDataSource(MontyDataSource):
2829

2930
def __init__(self, source_url: str, data: Any):
3031
super().__init__(source_url, data)
31-
self.data = json.loads(data)
32+
self.data = self.source_data_validator(json.loads(data))
33+
34+
def source_data_validator(self, data: list[dict]):
    """Validate the source data and keep only the items that pass.

    Args:
        data: The JSON-decoded GFD payload, iterated item by item.

    Returns:
        A new list containing, in their original order, the items for which
        ``GFDSourceValidator.validate_event`` returned a truthy value.
    """
    # TODO: Handle the items that fail validation; for now they are dropped.
    return [item for item in data if GFDSourceValidator.validate_event(item)]
3246

3347

3448
class GFDTransformer(MontyDataTransformer[GFDDataSource]):
@@ -85,7 +99,7 @@ def make_source_event_item(self, data: dict) -> Item:
8599
enddate = pytz.utc.localize(datetime.fromtimestamp(data["system:time_end"] / 1000))
86100

87101
item = Item(
88-
id=f'{STAC_EVENT_ID_PREFIX}{data["id"]}',
102+
id=f"{STAC_EVENT_ID_PREFIX}{data['id']}",
89103
geometry=geometry,
90104
bbox=bbox,
91105
datetime=startdate,

Diff for: pystac_monty/sources/gidd.py

+16
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pystac_monty.hazard_profiles import MontyHazardProfiles
1818
from pystac_monty.sources.common import MontyDataSource, MontyDataTransformer
1919
from pystac_monty.sources.utils import IDMCUtils
20+
from pystac_monty.validators.gidd import GiddValidator
2021

2122
logger = logging.getLogger(__name__)
2223

@@ -31,6 +32,21 @@ class GIDDDataSource(MontyDataSource):
3132
def __init__(self, source_url: str, data: Any):
    """Create a GIDD data source from a JSON document.

    Args:
        source_url: Forwarded unchanged to the base-class constructor.
        data: JSON text; it is decoded with ``json.loads`` and filtered by
            ``source_data_validator`` before being stored on ``self.data``.
    """
    super().__init__(source_url, data)
    # Decode the payload once and store only the validated result.
    self.data = self.source_data_validator(json.loads(data))
36+
37+
def source_data_validator(self, data: dict):
    """Filter the entries under ``"features"`` and copy every other key as-is.

    Args:
        data: The JSON-decoded GIDD payload.

    Returns:
        A new dict with the same keys as ``data``. When the ``"features"``
        value is a list, it is replaced by a list of the features for which
        ``GiddValidator.validate_event`` returned a truthy value; all other
        values are carried over unchanged.
    """

    def _filtered(name, payload):
        # Anything that is not the "features" list passes straight through.
        if name != "features" or not isinstance(payload, list):
            return payload
        return [feature for feature in payload if GiddValidator.validate_event(feature)]

    return {name: _filtered(name, payload) for name, payload in data.items()}
3450

3551

3652
class GIDDTransformer(MontyDataTransformer[GIDDDataSource]):

Diff for: pystac_monty/sources/glide.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pystac_monty.extension import HazardDetail, MontyEstimateType, MontyExtension
1010
from pystac_monty.hazard_profiles import MontyHazardProfiles
1111
from pystac_monty.sources.common import MontyDataSource, MontyDataTransformer
12+
from pystac_monty.validators.glide import GlideSetValidator
1213

1314
STAC_EVENT_ID_PREFIX = "glide-event-"
1415
STAC_HAZARD_ID_PREFIX = "glide-hazard-"
@@ -17,7 +18,19 @@
1718
class GlideDataSource(MontyDataSource):
1819
def __init__(self, source_url: str, data: Any):
1920
super().__init__(source_url, data)
20-
self.data = json.loads(data)
21+
self.data = self.source_data_validator(json.loads(data))
22+
23+
def source_data_validator(self, data: dict[str, Any]):
    """Validate the source data and keep only the items that pass.

    Args:
        data: The JSON-decoded GLIDE payload; its ``"glideset"`` entry is
            iterated item by item.

    Returns:
        A new dict with the single key ``"glideset"`` mapping to the items
        for which ``GlideSetValidator.validate_event`` returned a truthy
        value, in their original order. Other keys of ``data`` are not
        carried over.

    Raises:
        KeyError: If ``data`` has no ``"glideset"`` key.
    """
    # Items that fail validation are dropped; nothing consumes them here.
    valid_items = [item for item in data["glideset"] if GlideSetValidator.validate_event(item)]
    return {"glideset": valid_items}
2134

2235

2336
class GlideTransformer(MontyDataTransformer[GlideDataSource]):

0 commit comments

Comments
 (0)