Skip to content

Commit

Permalink
Merge pull request #175 from cagov/footprints-dedupe
Browse files Browse the repository at this point in the history
Footprints deduplication
  • Loading branch information
britt-allen authored Aug 22, 2023
2 parents 5e3ea5e + a38f96f commit 35ec65c
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ models:
and the tabulation block number
- name: geometry
description: The spatial component of geographic features
tests:
- dbt_utils.equal_rowcount:
compare_model: source('building_footprints', 'california_building_footprints')
- name: geo_reference__building_footprints_with_places
description: |
This data table is a join of the TIGER shapefile
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ with footprints as (
select
"release",
"capture_dates_range",
"geometry"
"geometry",
/* Generate a temporary ID for footprints. We will need this to group/partition
by unique footprints further down. We could use a UUID, but integers are
cheaper to generate and compare. */
seq4() as _tmp_id
from {{ source('building_footprints', 'california_building_footprints') }}
),

Expand All @@ -24,9 +28,32 @@ blocks as (
-- Pair each footprint with every block whose geometry it intersects, and record
-- the area of each pairwise intersection in _tmp_intersection. A footprint that
-- intersects several blocks produces one row per block.
footprints_and_blocks_joined as (
select
footprints.*,
blocks.* exclude "geometry"
blocks.* exclude "geometry",
/* We don't actually need the intersection for every footprint, only for the
ones that intersect more than one block. However, in order to establish which
ones intersect more than one block, we need a windowed COUNT partitioned by
_tmp_id. This is an expensive operation, as it likely triggers a shuffle
(even though it should already be sorted by _tmp_id). In testing we've found
that it's cheaper to just do the intersection for all the footprints. */
st_area(st_intersection(footprints."geometry", blocks."geometry"))
as _tmp_intersection
from footprints
-- Left join: footprints that intersect no block are kept, with null block
-- columns and a null _tmp_intersection.
left join blocks on st_intersects(footprints."geometry", blocks."geometry")
),

-- Collapse the join back to one row per footprint (one group per _tmp_id),
-- taking the block attributes from the row with the largest intersection area.
footprints_and_blocks_joined_dedupe as (
    select
        -- Footprint attributes selected in the footprints CTE. They are identical
        -- on every row that shares a _tmp_id, so any_value is sufficient. Without
        -- these two lines the final select would silently drop both columns.
        any_value("release") as "release",
        any_value("capture_dates_range") as "capture_dates_range",
        -- Snowflake doesn't support geometries in max_by. It should, but it doesn't.
        -- Fortunately, we know that the geometries are identical when partitioned
        -- by _tmp_id, so we can just choose any_value.
        any_value("geometry") as "geometry",
        -- NOTE(review): Snowflake documents max_by as non-deterministic when
        -- several rows tie on the maximum, so on an exact tie in intersection
        -- area these columns could come from different blocks. Presumably rare
        -- for computed areas -- confirm this is acceptable.
        max_by("county_fips", _tmp_intersection) as "county_fips",
        max_by("tract", _tmp_intersection) as "tract",
        max_by("block", _tmp_intersection) as "block",
        max_by("geoid", _tmp_intersection) as "geoid",
        max_by("name", _tmp_intersection) as "name"
    from footprints_and_blocks_joined
    group by _tmp_id
)

select * from footprints_and_blocks_joined
select * from footprints_and_blocks_joined_dedupe

0 comments on commit 35ec65c

Please sign in to comment.