-
Notifications
You must be signed in to change notification settings - Fork 67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/grid tiles #393
Feature/grid tiles #393
Changes from 40 commits
e0fab1e
f6cb725
fc4f415
f0eef62
94effd2
abf3f24
964cfbc
f772460
35e0fef
cfe0a04
789b170
4ee50f9
23fb47f
bb85cb3
dbaebc2
ee80140
e4588ea
f220d75
cd47656
076bdc2
ed9d80e
c3025bc
ddab8cc
43e483f
3d6b648
e2455b1
8e18317
cfacf5f
30f8b31
2a7c34c
35526c0
509cf31
bef9f04
1f886ab
ee1a4ac
41b9dae
6d4067c
9b79122
7170697
19773fe
27e5870
b9b1856
b7f856d
742045a
5e1a47b
5cb0a9e
9f462eb
b47a731
4747771
00063ce
624dba6
58e6160
3a66513
7b3c084
1b0cb9d
529c95f
76b8044
366f430
b3141d5
a50399b
093f005
9625eb0
8b53c22
c1a6e6b
c235b34
b7e0144
1c36512
9df305c
b7eaba5
3e7d421
e20c0d8
2355885
9c2d6e5
af15929
525afd5
79213a1
cdd9b0d
ccf39bb
b7dedf7
3aabd8f
74195ce
aa47d9c
d9c3b5c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,281 @@ | ||
# Databricks notebook source | ||
# MAGIC %md | ||
# MAGIC ## Install the libraries and prepare the environment | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC For this demo we will require a few spatial libraries that can be easily installed via pip install. We will be using gdal, rasterio, pystac and databricks-mosaic for data download and data manipulation. We will use planetary computer as the source of the raster data for the analysis. | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %pip install databricks-mosaic rasterio==1.3.5 --quiet gdal==3.4.3 pystac pystac_client planetary_computer tenacity rich | ||
|
||
# COMMAND ---------- | ||
|
||
import library | ||
import pystac_client | ||
import planetary_computer | ||
import mosaic as mos | ||
|
||
from pyspark.sql import functions as F | ||
|
||
mos.enable_mosaic(spark, dbutils) | ||
mos.enable_gdal(spark) | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %reload_ext autoreload | ||
# MAGIC %autoreload 2 | ||
# MAGIC %reload_ext library | ||
|
||
# COMMAND ---------- | ||
|
||
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false") | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC We will download census data from the TIGER feed for this demo. The data can be downloaded as a zip file to DBFS (or managed volumes). | ||
|
||
# COMMAND ---------- | ||
|
||
dbutils.fs.rm("/FileStore/geospatial/odin/census/", True) | ||
dbutils.fs.mkdirs("/FileStore/geospatial/odin/census/") | ||
|
||
# COMMAND ---------- | ||
|
||
import urllib.request | ||
urllib.request.urlretrieve( | ||
"https://www2.census.gov/geo/tiger/TIGER2021/COUNTY/tl_2021_us_county.zip", | ||
"/dbfs/FileStore/geospatial/odin/census/data.zip" | ||
) | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %sh ls -al /dbfs/FileStore/geospatial/odin/census/ | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC Mosaic has specialised readers for shapefiles and other GDAL-supported formats. We don't need to unzip the data zip file; we just need to pass the "vsizip" option to the reader. | ||
|
||
# COMMAND ---------- | ||
|
||
# Read the zipped TIGER county shapefile directly from DBFS with Mosaic's
# multi-file OGR reader ("vsizip" lets the reader look inside the archive).
census_df = (
    mos.read()
    .format("multi_read_ogr")
    .option("vsizip", "true")
    .option("chunkSize", "50")
    .load("dbfs:/FileStore/geospatial/odin/census/data.zip")
    # Cache the loaded data so schema inference is not repeated for each query.
    .cache()
)
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC For this example we will focus on Alaska counties. The Alaska state code is 02, so we will apply a filter to our ingested data. | ||
|
||
# COMMAND ---------- | ||
|
||
census_df.where("STATEFP == 2").display() | ||
|
||
# COMMAND ---------- | ||
|
||
# Alaska county geometries, re-tagged to SRID 4326, ready for plotting.
alaska_geom_4326 = mos.st_updatesrid("geom_0", "geom_0_srid", F.lit(4326))
to_display = (
    census_df
    .where("STATEFP == 2")
    .withColumn("geom_0", alaska_geom_4326)
    .select("geom_0")
)
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %%mosaic_kepler | ||
# MAGIC to_display geom_0 geometry 50 | ||
|
||
# COMMAND ---------- | ||
|
||
# Tessellate the Alaska county geometries into grid cells (resolution 3).
# The resulting cells are what the rest of this notebook turns into areas of
# interest for the raster catalog requests.
# NOTE(review): a reviewer asked for an explanation of why tessellation is
# done here, and pointed out that it is good practice to drop the original
# geometry after a tessellateexplode. "geom_0" is kept for now because a later
# cell still references it.
cells = (
    census_df
    .where("STATEFP == 2")
    .withColumn("geom_0", mos.st_updatesrid("geom_0", "geom_0_srid", F.lit(4326)))
    .withColumn("geom_0_srid", F.lit(4326))
    .withColumn("grid", mos.grid_tessellateexplode("geom_0", F.lit(3)))
)
|
||
# COMMAND ---------- | ||
|
||
cells.display() | ||
|
||
# COMMAND ---------- | ||
|
||
to_display = cells.select(mos.st_simplify("grid.wkb", F.lit(0.1)).alias("wkb")) | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %%mosaic_kepler | ||
# MAGIC to_display wkb geometry 100000 | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC It is fairly easy to interface with pystac_client and remote raster data catalogs. We can browse resource collections and individual assets. | ||
|
||
# COMMAND ---------- | ||
|
||
time_range = "2021-06-01/2021-06-30" | ||
|
||
# COMMAND ---------- | ||
|
||
# One row per distinct grid cell covering the area of interest, with the cell
# boundary rendered as GeoJSON for use in catalog requests.
# The previous version hashed "geom_0" into an "area_id" that the groupBy
# discarded, unioned the chip geometries only to drop the result, and dropped
# a "count" column that never existed. The output is the same two columns
# ("h3", "geojson") without that extra work.
cell_jsons = (
    cells
    .select(F.col("grid.index_id").alias("h3"))
    .distinct()
    .withColumn("geojson", mos.st_asgeojson(mos.grid_boundaryaswkb("h3")))
)
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC STAC catalogs support easy download for areas of interest provided as GeoJSON. With this in mind we will convert all our H3 cells of interest into GeoJSON and prepare STAC requests. | ||
|
||
# COMMAND ---------- | ||
|
||
cell_jsons.display() | ||
|
||
# COMMAND ---------- | ||
|
||
cell_jsons.count() | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %%mosaic_kepler | ||
# MAGIC cell_jsons h3 h3 | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC Our framework allows for easy preparation of STAC requests with only one line of code. The data is Delta-ready at this point and can easily be stored for lineage purposes. | ||
|
||
# COMMAND ---------- | ||
|
||
eod_items = library.get_assets_for_cells(cell_jsons.repartition(200), time_range ,"sentinel-2-l2a" ).cache() | ||
eod_items.display() | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %md | ||
# MAGIC From this point we can easily extract the download links for items of interest. | ||
|
||
# COMMAND ---------- | ||
|
||
dbutils.fs.rm("/FileStore/geospatial/odin/alaska/", True) | ||
dbutils.fs.mkdirs("/FileStore/geospatial/odin/alaska/") | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %sql | ||
# MAGIC DROP DATABASE IF EXISTS odin_alaska CASCADE; | ||
# MAGIC CREATE DATABASE IF NOT EXISTS odin_alaska; | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %sql | ||
# MAGIC USE odin_alaska; | ||
|
||
# COMMAND ---------- | ||
|
||
def download_band(eod_items, band_name):
    """Download all assets of one band and record them in a Delta table.

    The function keeps the rows of ``eod_items`` whose ``asset.name`` equals
    ``band_name``, collapses them to one row per ``(item_id, timestamp)``,
    downloads each ``asset.href`` into
    ``/dbfs/FileStore/geospatial/odin/alaska/<band_name>`` under a random
    ``.tif`` file name, and saves the resulting catalog (including the
    ``outputfile`` column) as the table ``alaska_<band_name>``.

    Any existing table and download directory for the band are removed first.

    Args:
        eod_items: Spark DataFrame with at least the columns ``item_id``,
            ``item_properties`` (with a ``datetime`` field) and ``asset``
            (with ``name`` and ``href`` fields).
        band_name: Asset/band key to download, e.g. ``"B08"``.

    Returns:
        None. The results are written to storage and to the Delta table.
    """
    target_dir = f"/FileStore/geospatial/odin/alaska/{band_name}"
    table_name = f"alaska_{band_name}"

    to_download = (
        eod_items
        # Filter on the band BEFORE collapsing to one row per item. Filtering
        # after first() would keep an item only when the arbitrarily chosen
        # first asset happened to be the requested band.
        # NOTE(review): assumes eod_items holds one row per asset - confirm
        # against library.get_assets_for_cells.
        .where(F.col("asset.name") == band_name)
        .withColumn("timestamp", F.col("item_properties.datetime"))
        .groupBy("item_id", "timestamp")
        .agg(
            *[F.first(cn).alias(cn) for cn in eod_items.columns if cn != "item_id"]
        )
        .withColumn("date", F.to_date("timestamp"))
        .withColumn("href", F.col("asset.href"))
    )

    # Start from a clean slate so reruns do not mix old and new downloads.
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    dbutils.fs.rm(target_dir, True)
    dbutils.fs.mkdirs(target_dir)

    # Each asset is written under a random file name (hash of rand()).
    catalog_df = to_download.withColumn(
        "outputfile",
        library.download_asset(
            "href",
            F.lit(f"/dbfs{target_dir}"),
            F.concat(F.hash(F.rand()), F.lit(".tif")),
        ),
    )

    (
        catalog_df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .format("delta")
        .saveAsTable(table_name)
    )
|
||
|
||
# COMMAND ---------- | ||
|
||
import rich.table | ||
|
||
# Use the geometry of a single Alaska county (as GeoJSON) as the search footprint.
region = (
    census_df
    .where("STATEFP == 2")
    .select(mos.st_asgeojson("geom_0").alias("geojson"))
    .limit(1)
    .collect()[0]["geojson"]
)

# Open the Planetary Computer STAC API; the modifier signs the returned items in place.
catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)

# Sentinel-2 L2A items intersecting the region within the chosen time range.
search = catalog.search(
    collections=["sentinel-2-l2a"],
    intersects=region,
    datetime=time_range,
)
items = search.item_collection()

# List the assets of the first item to see which bands are available.
table = rich.table.Table("Asset Key", "Description")
for asset_key, asset in items[0].assets.items():
    table.add_row(asset_key, asset.title)

table
|
||
# COMMAND ---------- | ||
|
||
# Asset keys that are previews or metadata rather than downloadable raster bands.
non_band_assets = {
    "visual",
    "preview",
    "safe-manifest",
    "tilejson",
    "rendered_preview",
    "granule-metadata",
    "inspire-metadata",
    "product-metadata",
    "datastrip-metadata",
}

# Keep the asset order of the first item, skipping everything that is not a band.
bands = [
    asset_key
    for asset_key in items[0].assets
    if asset_key not in non_band_assets
]
bands
|
||
# COMMAND ---------- | ||
|
||
for band in bands: | ||
download_band(eod_items, band) | ||
|
||
# COMMAND ---------- | ||
|
||
# MAGIC %fs ls /FileStore/geospatial/odin/alaska/B08 | ||
|
||
# COMMAND ---------- | ||
|
||
import rasterio | ||
from matplotlib import pyplot | ||
from rasterio.plot import show | ||
|
||
import glob

# Downloaded rasters are stored under random file names (hash of rand()), so a
# hard-coded name will not exist on the next run. Pick an available file instead.
b08_tifs = sorted(glob.glob("/dbfs/FileStore/geospatial/odin/alaska/B08/*.tif"))
if not b08_tifs:
    raise FileNotFoundError(
        "No .tif files found in /dbfs/FileStore/geospatial/odin/alaska/B08"
    )

fig, ax = pyplot.subplots(1, figsize=(12, 12))
# The context manager closes the dataset handle once the plot has been drawn.
with rasterio.open(b08_tifs[0]) as raster:
    show(raster, ax=ax, cmap='Greens')
pyplot.show()
|
||
# COMMAND ---------- | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am missing an introduction that describes the use case or goal of this analysis.
It would also be helpful to have a bullet list of the high-level steps involved here.