Merge pull request #175 from cagov/footprints-dedupe

Footprints deduplication
cagov · Aug 22, 2023 · 35ec65c · 35ec65c
2 parents 5e3ea5e + a38f96f
commit 35ec65c
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 3 deletions.
diff --git a/transform/models/marts/geo_reference/_geo_reference__models.yml b/transform/models/marts/geo_reference/_geo_reference__models.yml
@@ -39,6 +39,9 @@ models:
           and the tabulation block number
       - name: geometry
         description: The spatial component of geographic features
+    tests:
+      - dbt_utils.equal_rowcount:
+          compare_model: source('building_footprints', 'california_building_footprints')
   - name: geo_reference__building_footprints_with_places
     description: |
       This data table is a join of the TIGER shapefile

diff --git a/transform/models/marts/geo_reference/geo_reference__building_footprints_with_blocks.sql b/transform/models/marts/geo_reference/geo_reference__building_footprints_with_blocks.sql
@@ -2,7 +2,11 @@ with footprints as (
     select
         "release",
         "capture_dates_range",
-        "geometry"
+        "geometry",
+        /* Generate a temporary ID for footprints. We will need this to group/partition
+        by unique footprints further down. We could use a UUID, but integers are
+        cheaper to generate and compare. */
+        seq4() as _tmp_id
     from {{ source('building_footprints', 'california_building_footprints') }}
 ),
 
@@ -24,9 +28,32 @@ blocks as (
 footprints_and_blocks_joined as (
     select
         footprints.*,
-        blocks.* exclude "geometry"
+        blocks.* exclude "geometry",
+        /* We don't actually need the intersection for every footprint, only for the
+         ones that intersect more than one block. However, in order to establish which
+         ones intersect more than one block, we need a windowed COUNT partitioned by
+         _tmp_id. This is an expensive operation, as it likely triggers a shuffle
+         (even though it should already be sorted by _tmp_id). In testing we've found
+         that it's cheaper to just do the intersection for all the footprints. */
+        st_area(st_intersection(footprints."geometry", blocks."geometry"))
+            as _tmp_intersection
     from footprints
     left join blocks on st_intersects(footprints."geometry", blocks."geometry")
+),
+
+footprints_and_blocks_joined_dedupe as (
+    select
+        -- Snowflake doesn't support geometries in max_by. It should, but it doesn't.
+        -- Fortunately, we know that the geometries are identical when partitioned
+        -- by _tmp_id, so we can just choose any_value.
+        any_value("geometry") as "geometry",
+        max_by("county_fips", _tmp_intersection) as "county_fips",
+        max_by("tract", _tmp_intersection) as "tract",
+        max_by("block", _tmp_intersection) as "block",
+        max_by("geoid", _tmp_intersection) as "geoid",
+        max_by("name", _tmp_intersection) as "name"
+    from footprints_and_blocks_joined
+    group by _tmp_id
 )
 
-select * from footprints_and_blocks_joined
+select * from footprints_and_blocks_joined_dedupe