Skip to content

Commit

Permalink
Merge pull request #81 from worldbank/feature/ntl_metadata
Browse files Browse the repository at this point in the history
[WIP] Feature/ntl metadata
  • Loading branch information
bpstewar authored Oct 31, 2024
2 parents 234aa3f + 75d7485 commit 14b1bf6
Show file tree
Hide file tree
Showing 13 changed files with 591 additions and 125 deletions.
12 changes: 9 additions & 3 deletions space2stats_api/src/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
## space2stats

### Generating STAC files
### Generating Preliminary CATALOG, COLLECTION, and ITEM files
- Navigate to the METADATA sub-directory and run the following commands in order:
1. get_types.py
2. create_stac.py
- Note that get_types.py reads a parquet file from the following path: space2stats_api/src/local.parquet
- Here is a workflow diagram of the STAC metadata creation:
- Here is a workflow diagram of the initial STAC metadata creation:

![Create Stac](../../docs/images/create_stac_workflow.png)
![Create Stac](../../docs/images/create_stac_workflow.png)

### Adding new ITEM files
- In link_new_item.py set "Paths and metadata setup" in the main function to point towards the corresponding locally saved parquet file
- Navigate to the METADATA sub-directory and run the following commands in order:
1. get_types.py
2. link_new_item.py
Binary file not shown.
37 changes: 16 additions & 21 deletions space2stats_api/src/space2stats_ingest/METADATA/create_stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,7 @@ def load_metadata(file: str) -> Dict[str, pd.DataFrame]:


# Function to create STAC catalog
def create_stac_catalog(
overview: pd.DataFrame, nada: pd.DataFrame, catalog_dir: str
) -> Catalog:
def create_stac_catalog(overview: pd.DataFrame, nada: pd.DataFrame) -> Catalog:
catalog = Catalog(
id="space2stats-catalog",
description=overview.loc["Description Resource"].values[0],
Expand All @@ -66,8 +64,6 @@ def create_stac_catalog(
href="https://worldbank.github.io/DECAT_Space2Stats/stac/catalog.json",
)

# catalog.set_self_href(os.path.relpath("catalog.json", start=catalog_dir))

return catalog


Expand Down Expand Up @@ -112,11 +108,11 @@ def create_stac_collection(overview: pd.DataFrame) -> Collection:


# Function to create STAC Item from GeoDataFrame
def create_stac_item(
column_types: dict, feature_catalog: pd.DataFrame, item_dir: str
) -> Item:
def create_stac_item(column_types: dict, metadata: pd.DataFrame) -> Item:
data_dict = []

feature_catalog = metadata["feature_catalog"]

for column, dtype in column_types.items():
description = feature_catalog.loc[
feature_catalog["variable"] == column, "description"
Expand Down Expand Up @@ -154,34 +150,35 @@ def create_stac_item(
89.98750455101016,
]

sources = metadata["sources"]
pop_metadata = sources[sources["Name"] == "Population"].iloc[0]
item = Item(
id="space2stats_population_2020",
geometry=geom,
bbox=bbox,
datetime=datetime.now(),
properties={
"name": "Population Data",
"description": "Gridded population disaggregated by gender for the year 2020, with data available for different age groups.",
"methodological_notes": "Global raster files are processed for each hexagonal grid using zonal statistics.",
"source_data": "WorldPop gridded population, 2020, Unconstrained, UN-Adjusted",
"sci:citation": "Stevens FR, Gaughan AE, Linard C, Tatem AJ (2015) Disaggregating Census Data for Population Mapping Using Random Forests with Remotely-Sensed and Ancillary Data.",
"organization": "WorldPop, https://www.worldpop.org",
"method": "sum",
"resolution": "100 meters",
"name": pop_metadata["Name"],
"description": pop_metadata["Description"],
"methodological_notes": pop_metadata["Methodological Notes"],
"source_data": pop_metadata["Source Data"],
"sci:citation": pop_metadata["Citation source"],
"organization": pop_metadata["Organization"],
"method": pop_metadata["Method"],
"resolution": pop_metadata["Resolution"],
"table:primary_geometry": "geometry",
"table:columns": data_dict,
"vector:layers": {
"space2stats": column_types_with_geometry,
},
"themes": ["Demographics", "Population"],
"themes": pop_metadata["Theme"],
},
stac_extensions=[
"https://stac-extensions.github.io/table/v1.2.0/schema.json",
"https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
],
)

# item.set_self_href(os.path.join("items", f"{item.id}.json"))
return item


Expand Down Expand Up @@ -232,7 +229,6 @@ def main():
catalog = create_stac_catalog(
metadata["overview"],
metadata["nada"],
join(git_root, metadata_dir, "stac"),
)

# Create STAC collection
Expand All @@ -241,8 +237,7 @@ def main():
# Create STAC item
item = create_stac_item(
column_types,
metadata["feature_catalog"],
join(git_root, metadata_dir, "stac"),
metadata,
)

# Add assets to item
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def save_parquet_types_to_json(parquet_file: str, json_file: str):

if __name__ == "__main__":
git_root = get_git_root()
parquet_file = join(git_root, "space2stats_api/src/space2stats.parquet")
parquet_file = join(git_root, "space2stats_api/src/ntl2012.parquet")
json_file = join(
git_root, "space2stats_api/src/space2stats_ingest/METADATA/types.json"
)
Expand Down
151 changes: 151 additions & 0 deletions space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import ast
import json
import os
from datetime import datetime
from os.path import join
from typing import Dict

import git
import pandas as pd
from pystac import Asset, CatalogType, Collection, Item
from pystac.extensions.table import TableExtension


# Function to get the root of the git repository
def get_git_root() -> str:
    """Return the top-level directory of the enclosing git repository.

    The repository is located by searching upward from the current working
    directory; the path itself comes from ``git rev-parse --show-toplevel``.
    """
    repo = git.Repo(os.getcwd(), search_parent_directories=True)
    toplevel = repo.git.rev_parse("--show-toplevel")
    return toplevel


# Function to load metadata from the Excel file
def load_metadata(file: str) -> Dict[str, pd.DataFrame]:
    """Read the metadata workbook and return its sheets keyed by role.

    Args:
        file: Path to the Excel workbook. It is expected to contain the
            sheets "DDH Dataset", "NADA", "Feature Catalog" and "Sources".

    Returns:
        A dict with the keys "overview", "nada", "feature_catalog" and
        "sources". The first two frames are indexed by their "Field" column.
        In "sources", the "Variables" column is converted from its string
        form into Python literals with ``ast.literal_eval``.
    """

    def _parse_variables(row):
        # Each cell holds the text of a Python literal; turn it into the
        # actual object.
        return ast.literal_eval(row["Variables"])

    frames: Dict[str, pd.DataFrame] = {
        "overview": pd.read_excel(file, sheet_name="DDH Dataset", index_col="Field"),
        "nada": pd.read_excel(file, sheet_name="NADA", index_col="Field"),
        "feature_catalog": pd.read_excel(file, sheet_name="Feature Catalog"),
    }

    source_table = pd.read_excel(file, sheet_name="Sources")
    source_table["Variables"] = source_table.apply(_parse_variables, axis=1)
    frames["sources"] = source_table

    return frames


# Function to read the existing STAC collection
def load_existing_collection(collection_path: str) -> Collection:
    """Deserialize the STAC collection stored at ``collection_path``."""
    existing = Collection.from_file(collection_path)
    return existing


# Function to create a new STAC item
def create_new_item(
    sources: pd.DataFrame,
    column_types: dict,
    item_name: str,
    source_name: str = "Nighttime Lights",
) -> Item:
    """Build a STAC item describing one data source of the Space2Stats table.

    Args:
        sources: The "Sources" metadata table. It must have a "Name" column
            plus the descriptive columns read below ("Description",
            "Methodological Notes", "Source Data", "Citation source",
            "Method", "Resolution", "Theme").
        column_types: Mapping of column name to type string; emitted as the
            item's table columns.
        item_name: Identifier assigned to the new item.
        source_name: Value of the "Name" column that selects the metadata
            row. Defaults to "Nighttime Lights", the source this script was
            originally written for.

    Returns:
        The new item with its table columns set and an "api-docs" asset
        attached.

    Raises:
        IndexError: If no row of ``sources`` has ``Name == source_name``.
            This is the exception type the previous bare ``.iloc[0]`` lookup
            raised; the message now names the missing source.
    """
    # Local import: the module only imports the `datetime` class.
    from datetime import timezone

    # Look the source row up first so a bad name fails before any STAC
    # objects are built.
    matches = sources[sources["Name"] == source_name]
    if matches.empty:
        raise IndexError(
            f"No source named {source_name!r} found in the sources metadata"
        )
    src_metadata = matches.iloc[0]

    # Fixed near-global extent (you may want to customize this). The polygon
    # is derived from the bbox so the two cannot drift apart.
    bbox = [
        -179.99999561620714,
        -89.98750455101016,
        179.99999096313272,
        89.98750455101016,
    ]
    west, south, east, north = bbox
    geom = {
        "type": "Polygon",
        "coordinates": [
            [
                [west, south],
                [west, north],
                [east, north],
                [east, south],
                [west, south],
            ]
        ],
    }

    # NOTE(review): create_stac.py also writes an "organization" property for
    # its item; confirm whether it should be emitted here as well.
    item = Item(
        id=item_name,
        geometry=geom,
        bbox=bbox,
        # Timezone-aware UTC timestamp: a naive local time would be written
        # out as if it were UTC.
        datetime=datetime.now(timezone.utc),
        properties={
            "name": src_metadata["Name"],
            "description": src_metadata["Description"],
            "methodological_notes": src_metadata["Methodological Notes"],
            "source_data": src_metadata["Source Data"],
            "sci:citation": src_metadata["Citation source"],
            "method": src_metadata["Method"],
            "resolution": src_metadata["Resolution"],
            "themes": src_metadata["Theme"],
        },
        stac_extensions=[
            "https://stac-extensions.github.io/table/v1.2.0/schema.json",
            "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
        ],
    )

    # Add table columns as properties. `add_if_missing=True` registers the
    # extension when needed, so a separate `add_to` call is unnecessary.
    table_extension = TableExtension.ext(item, add_if_missing=True)
    table_extension.columns = [
        {"name": col, "type": dtype} for col, dtype in column_types.items()
    ]

    # Link the public API documentation as a metadata asset.
    item.add_asset(
        "api-docs",
        Asset(
            href="https://space2stats.ds.io/docs",
            title="API Documentation",
            media_type="text/html",
            roles=["metadata"],
        ),
    )

    return item


# Function to add the new item to the existing collection
def add_item_to_collection(collection: Collection, item: Item) -> None:
    """Attach ``item`` to ``collection``.

    Delegates to ``collection.add_item``; nothing is returned.
    """
    collection.add_item(item)


# Save the updated collection
def save_collection(collection: Collection, collection_path: str):
    """Re-anchor the collection's HREFs and write it out.

    HREFs are normalized against ``collection_path`` and the collection is
    then saved as a ``RELATIVE_PUBLISHED`` catalog.
    """
    # NOTE(review): main() passes the path of collection.json itself as the
    # normalization root; confirm the installed pystac accepts a file path
    # here rather than a directory.
    publish_type = CatalogType.RELATIVE_PUBLISHED
    collection.normalize_hrefs(collection_path)
    collection.save(catalog_type=publish_type)


# Main function
def main():
    """Create the nighttime-lights item and link it into the saved collection.

    Every path is resolved relative to the METADATA directory of the current
    git checkout. The existing collection is loaded, the new item is added to
    it, and the collection is written back.
    """
    metadata_dir = join(
        get_git_root(), "space2stats_api/src/space2stats_ingest/METADATA"
    )

    # Paths and metadata setup
    item_name = "space2stats_ntl_2013"
    collection_path = join(metadata_dir, "stac/space2stats-collection/collection.json")
    excel_path = join(metadata_dir, "Space2Stats Metadata Content.xlsx")
    column_types_file = join(metadata_dir, "types.json")

    # Inputs: workbook sheets plus the column-name -> type mapping
    metadata = load_metadata(excel_path)
    with open(column_types_file, "r") as handle:
        column_types = json.load(handle)

    # Existing collection that receives the new item
    collection = load_existing_collection(collection_path)

    # Build the item from the "sources" sheet and register it with a title
    new_item = create_new_item(metadata["sources"], column_types, item_name)
    collection.add_item(new_item, title="Space2Stats NTL 2013 Data Item")

    # Persist the updated collection
    save_collection(collection, collection_path)


# Run the item-linking workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
"href": "./space2stats_population_2020/space2stats_population_2020.json",
"type": "application/json",
"title": "Space2Stats Population Data Item"
},
{
"rel": "item",
"href": "./space2stats_ntl_2013/space2stats_ntl_2013.json",
"type": "application/json",
"title": "Space2Stats NTL 2013 Data Item"
}
],
"Title": "Space2Stats Database",
Expand All @@ -32,32 +38,6 @@
"hexagons",
"global"
],
"summaries": {
"datetime": {
"min": "2020-01-01T00:00:00Z",
"max": null
}
},
"providers": [
{
"name": "World Bank",
"roles": [
"producer",
"licensor"
],
"url": "https://www.worldbank.org/"
}
],
"assets": {
"documentation": {
"href": "https://space2stats.ds.io/docs",
"type": "text/html",
"title": "API Documentation",
"roles": [
"metadata"
]
}
},
"title": "Space2Stats Collection",
"extent": {
"spatial": {
Expand All @@ -79,5 +59,31 @@
]
}
},
"license": "CC-BY-4.0"
"license": "CC-BY-4.0",
"providers": [
{
"name": "World Bank",
"roles": [
"producer",
"licensor"
],
"url": "https://www.worldbank.org/"
}
],
"summaries": {
"datetime": {
"min": "2020-01-01T00:00:00Z",
"max": null
}
},
"assets": {
"documentation": {
"href": "https://space2stats.ds.io/docs",
"type": "text/html",
"title": "API Documentation",
"roles": [
"metadata"
]
}
}
}
Loading

0 comments on commit 14b1bf6

Please sign in to comment.