Merge pull request #81 from worldbank/feature/ntl_metadata

[WIP] Feature/ntl metadata
worldbank · Oct 31, 2024 · 14b1bf6 · 14b1bf6
2 parents 234aa3f + 75d7485
commit 14b1bf6
Show file tree

Hide file tree

Showing 13 changed files with 591 additions and 125 deletions.
diff --git a/space2stats_api/src/README.md b/space2stats_api/src/README.md
@@ -1,10 +1,16 @@
 ## space2stats
 
-### Generating STAC files
+### Generating Preliminary CATALOG, COLLECTION, and ITEM files
 - Navigate to the METADATA sub-directory and run the following commands in order:
     1. get_types.py
     2. create_stac.py
 - Note that the get types function is reading in a parquet file from the following directory: space2stats_api/src/local.parquet
-- Here is a workflow diagram of the STAC metadata creation:
+- Here is a workflow diagram of the initial STAC metadata creation:
 
-![Create Stac](../../docs/images/create_stac_workflow.png)
+![Create Stac](../../docs/images/create_stac_workflow.png)
+
+### Adding new ITEM files
+- In get_types.py set parquet_file to point towards the corresponding locally saved parquet file, and in link_new_item.py update "Paths and metadata setup" in the main function (e.g. item_name) for the new item
+- Navigate to the METADATA sub-directory and run the following commands in order:
+    1. get_types.py
+    2. link_new_item.py
diff --git a/space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx b/space2stats_api/src/space2stats_ingest/METADATA/Space2Stats Metadata Content.xlsx
diff --git a/space2stats_api/src/space2stats_ingest/METADATA/create_stac.py b/space2stats_api/src/space2stats_ingest/METADATA/create_stac.py
@@ -50,9 +50,7 @@ def load_metadata(file: str) -> Dict[str, pd.DataFrame]:
 
 
 # Function to create STAC catalog
-def create_stac_catalog(
-    overview: pd.DataFrame, nada: pd.DataFrame, catalog_dir: str
-) -> Catalog:
+def create_stac_catalog(overview: pd.DataFrame, nada: pd.DataFrame) -> Catalog:
     catalog = Catalog(
         id="space2stats-catalog",
         description=overview.loc["Description Resource"].values[0],
@@ -66,8 +64,6 @@ def create_stac_catalog(
         href="https://worldbank.github.io/DECAT_Space2Stats/stac/catalog.json",
     )
 
-    # catalog.set_self_href(os.path.relpath("catalog.json", start=catalog_dir))
-
     return catalog
 
 
@@ -112,11 +108,11 @@ def create_stac_collection(overview: pd.DataFrame) -> Collection:
 
 
 # Function to create STAC Item from GeoDataFrame
-def create_stac_item(
-    column_types: dict, feature_catalog: pd.DataFrame, item_dir: str
-) -> Item:
+def create_stac_item(column_types: dict, metadata: pd.DataFrame) -> Item:
     data_dict = []
 
+    feature_catalog = metadata["feature_catalog"]
+
     for column, dtype in column_types.items():
         description = feature_catalog.loc[
             feature_catalog["variable"] == column, "description"
@@ -154,34 +150,35 @@ def create_stac_item(
         89.98750455101016,
     ]
 
+    sources = metadata["sources"]
+    pop_metadata = sources[sources["Name"] == "Population"].iloc[0]
     item = Item(
         id="space2stats_population_2020",
         geometry=geom,
         bbox=bbox,
         datetime=datetime.now(),
         properties={
-            "name": "Population Data",
-            "description": "Gridded population disaggregated by gender for the year 2020, with data available for different age groups.",
-            "methodological_notes": "Global raster files are processed for each hexagonal grid using zonal statistics.",
-            "source_data": "WorldPop gridded population, 2020, Unconstrained, UN-Adjusted",
-            "sci:citation": "Stevens FR, Gaughan AE, Linard C, Tatem AJ (2015) Disaggregating Census Data for Population Mapping Using Random Forests with Remotely-Sensed and Ancillary Data.",
-            "organization": "WorldPop, https://www.worldpop.org",
-            "method": "sum",
-            "resolution": "100 meters",
+            "name": pop_metadata["Name"],
+            "description": pop_metadata["Description"],
+            "methodological_notes": pop_metadata["Methodological Notes"],
+            "source_data": pop_metadata["Source Data"],
+            "sci:citation": pop_metadata["Citation source"],
+            "organization": pop_metadata["Organization"],
+            "method": pop_metadata["Method"],
+            "resolution": pop_metadata["Resolution"],
             "table:primary_geometry": "geometry",
             "table:columns": data_dict,
             "vector:layers": {
                 "space2stats": column_types_with_geometry,
             },
-            "themes": ["Demographics", "Population"],
+            "themes": pop_metadata["Theme"],
         },
         stac_extensions=[
             "https://stac-extensions.github.io/table/v1.2.0/schema.json",
             "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
         ],
     )
 
-    # item.set_self_href(os.path.join("items", f"{item.id}.json"))
     return item
 
 
@@ -232,7 +229,6 @@ def main():
     catalog = create_stac_catalog(
         metadata["overview"],
         metadata["nada"],
-        join(git_root, metadata_dir, "stac"),
     )
 
     # Create STAC collection
@@ -241,8 +237,7 @@ def main():
     # Create STAC item
     item = create_stac_item(
         column_types,
-        metadata["feature_catalog"],
-        join(git_root, metadata_dir, "stac"),
+        metadata,
     )
 
     # Add assets to item

diff --git a/space2stats_api/src/space2stats_ingest/METADATA/get_types.py b/space2stats_api/src/space2stats_ingest/METADATA/get_types.py
@@ -33,7 +33,7 @@ def save_parquet_types_to_json(parquet_file: str, json_file: str):
 
 if __name__ == "__main__":
     git_root = get_git_root()
-    parquet_file = join(git_root, "space2stats_api/src/space2stats.parquet")
+    parquet_file = join(git_root, "space2stats_api/src/ntl2012.parquet")
     json_file = join(
         git_root, "space2stats_api/src/space2stats_ingest/METADATA/types.json"
     )

diff --git a/space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py b/space2stats_api/src/space2stats_ingest/METADATA/link_new_item.py
@@ -0,0 +1,151 @@
+import ast
+import json
+import os
+from datetime import datetime
+from os.path import join
+from typing import Dict
+
+import git
+import pandas as pd
+from pystac import Asset, CatalogType, Collection, Item
+from pystac.extensions.table import TableExtension
+
+
+# Function to get the root of the git repository
+def get_git_root() -> str:
+    git_repo = git.Repo(os.getcwd(), search_parent_directories=True)
+    return git_repo.git.rev_parse("--show-toplevel")
+
+
+# Function to load metadata from the Excel file
+def load_metadata(file: str) -> Dict[str, pd.DataFrame]:
+    overview = pd.read_excel(file, sheet_name="DDH Dataset", index_col="Field")
+    nada = pd.read_excel(file, sheet_name="NADA", index_col="Field")
+    feature_catalog = pd.read_excel(file, sheet_name="Feature Catalog")
+    sources = pd.read_excel(file, sheet_name="Sources")
+    sources["Variables"] = sources.apply(
+        lambda x: ast.literal_eval(x["Variables"]), axis=1
+    )
+    return {
+        "overview": overview,
+        "nada": nada,
+        "feature_catalog": feature_catalog,
+        "sources": sources,
+    }
+
+
+# Function to read the existing STAC collection
+def load_existing_collection(collection_path: str) -> Collection:
+    return Collection.from_file(collection_path)
+
+
+# Function to create a new STAC item
+def create_new_item(sources: pd.DataFrame, column_types: dict, item_name: str) -> Item:
+    # Define geometry and bounding box (you may want to customize these)
+    geom = {
+        "type": "Polygon",
+        "coordinates": [
+            [
+                [-179.99999561620714, -89.98750455101016],
+                [-179.99999561620714, 89.98750455101016],
+                [179.99999096313272, 89.98750455101016],
+                [179.99999096313272, -89.98750455101016],
+                [-179.99999561620714, -89.98750455101016],
+            ]
+        ],
+    }
+    bbox = [
+        -179.99999561620714,
+        -89.98750455101016,
+        179.99999096313272,
+        89.98750455101016,
+    ]
+
+    # Get metadata for the Nighttime Lights item
+    src_metadata = sources[sources["Name"] == "Nighttime Lights"].iloc[0]
+
+    # Define the item
+    item = Item(
+        id=item_name,
+        geometry=geom,
+        bbox=bbox,
+        datetime=datetime.now(),
+        properties={
+            "name": src_metadata["Name"],
+            "description": src_metadata["Description"],
+            "methodological_notes": src_metadata["Methodological Notes"],
+            "source_data": src_metadata["Source Data"],
+            "sci:citation": src_metadata["Citation source"],
+            "method": src_metadata["Method"],
+            "resolution": src_metadata["Resolution"],
+            "themes": src_metadata["Theme"],
+        },
+        stac_extensions=[
+            "https://stac-extensions.github.io/table/v1.2.0/schema.json",
+            "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
+        ],
+    )
+
+    # Add table columns as properties
+    TableExtension.add_to(item)
+    table_extension = TableExtension.ext(item, add_if_missing=True)
+    table_extension.columns = [
+        {"name": col, "type": dtype} for col, dtype in column_types.items()
+    ]
+
+    # Add asset
+    item.add_asset(
+        "api-docs",
+        Asset(
+            href="https://space2stats.ds.io/docs",
+            title="API Documentation",
+            media_type="text/html",
+            roles=["metadata"],
+        ),
+    )
+
+    return item
+
+
+# Function to add the new item to the existing collection
+def add_item_to_collection(collection: Collection, item: Item):
+    collection.add_item(item)
+
+
+# Save the updated collection
+def save_collection(collection: Collection, collection_path: str):
+    collection.normalize_hrefs(collection_path)
+    collection.save(catalog_type=CatalogType.RELATIVE_PUBLISHED)
+
+
+# Main function
+def main():
+    git_root = get_git_root()
+    metadata_dir = join(git_root, "space2stats_api/src/space2stats_ingest/METADATA")
+
+    # Paths and metadata setup
+    item_name = "space2stats_ntl_2013"
+    collection_path = join(metadata_dir, "stac/space2stats-collection/collection.json")
+    excel_path = join(metadata_dir, "Space2Stats Metadata Content.xlsx")
+    column_types_file = join(metadata_dir, "types.json")
+
+    # Load metadata and column types
+    metadata = load_metadata(excel_path)
+    with open(column_types_file, "r") as f:
+        column_types = json.load(f)
+
+    # Load existing collection
+    collection = load_existing_collection(collection_path)
+
+    # Create a new item
+    new_item = create_new_item(metadata["sources"], column_types, item_name)
+
+    # Add the new item to the collection
+    collection.add_item(new_item, title="Space2Stats NTL 2013 Data Item")
+
+    # Save the updated collection
+    save_collection(collection, collection_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/collection.json b/space2stats_api/src/space2stats_ingest/METADATA/stac/space2stats-collection/collection.json
@@ -21,6 +21,12 @@
       "href": "./space2stats_population_2020/space2stats_population_2020.json",
       "type": "application/json",
       "title": "Space2Stats Population Data Item"
+    },
+    {
+      "rel": "item",
+      "href": "./space2stats_ntl_2013/space2stats_ntl_2013.json",
+      "type": "application/json",
+      "title": "Space2Stats NTL 2013 Data Item"
     }
   ],
   "Title": "Space2Stats Database",
@@ -32,32 +38,6 @@
     "hexagons",
     "global"
   ],
-  "summaries": {
-    "datetime": {
-      "min": "2020-01-01T00:00:00Z",
-      "max": null
-    }
-  },
-  "providers": [
-    {
-      "name": "World Bank",
-      "roles": [
-        "producer",
-        "licensor"
-      ],
-      "url": "https://www.worldbank.org/"
-    }
-  ],
-  "assets": {
-    "documentation": {
-      "href": "https://space2stats.ds.io/docs",
-      "type": "text/html",
-      "title": "API Documentation",
-      "roles": [
-        "metadata"
-      ]
-    }
-  },
   "title": "Space2Stats Collection",
   "extent": {
     "spatial": {
@@ -79,5 +59,31 @@
       ]
     }
   },
-  "license": "CC-BY-4.0"
+  "license": "CC-BY-4.0",
+  "providers": [
+    {
+      "name": "World Bank",
+      "roles": [
+        "producer",
+        "licensor"
+      ],
+      "url": "https://www.worldbank.org/"
+    }
+  ],
+  "summaries": {
+    "datetime": {
+      "min": "2020-01-01T00:00:00Z",
+      "max": null
+    }
+  },
+  "assets": {
+    "documentation": {
+      "href": "https://space2stats.ds.io/docs",
+      "type": "text/html",
+      "title": "API Documentation",
+      "roles": [
+        "metadata"
+      ]
+    }
+  }
 }