Update according to things learned from performance profiling.

cagov · Aug 22, 2023 · a38f96f · a38f96f
1 parent 246cc7e
commit a38f96f
Showing 1 changed file with 15 additions and 22 deletions.
diff --git a/transform/models/marts/geo_reference/geo_reference__building_footprints_with_blocks.sql b/transform/models/marts/geo_reference/geo_reference__building_footprints_with_blocks.sql
@@ -3,9 +3,9 @@ with footprints as (
         "release",
         "capture_dates_range",
         "geometry",
-        -- TODO: is seq4() superior to uuid_string()? It should be able
-        -- to improve the partitioning below based on micropartition logic.
-        -- but in practice the difference doesn't seem large.
+        /* Generate a temporary ID for footprints. We will need this to group/partition
+        by unique footprints further down. We could use a UUID, but integers are
+        cheaper to generate and compare. */
         seq4() as _tmp_id
     from {{ source('building_footprints', 'california_building_footprints') }}
 ),
@@ -29,30 +29,23 @@ footprints_and_blocks_joined as (
     select
         footprints.*,
         blocks.* exclude "geometry",
-        -- TODO: investigate how much savings there are by doing this with
-        -- a `case` statement rather than computing the area for everything.
-        -- also investigate whether there are any savings to be had doing
-        -- this with a self join instead of a window function.
-        case count(*) over (partition by footprints._tmp_id) > 1
-            when false then 1.0
-            when
-                true
-                then st_area(st_intersection(footprints."geometry", blocks."geometry"))
-        end as _tmp_intersection
+        /* We don't actually need the intersection for every footprint, only for the
+         ones that intersect more than one block. However, in order to establish which
+         ones intersect more than one block, we need a windowed COUNT partitioned by
+         _tmp_id. This is an expensive operation, as it likely triggers a shuffle
+         (even though it should already be sorted by _tmp_id). In testing we've found
+         that it's cheaper to just do the intersection for all the footprints. */
+        st_area(st_intersection(footprints."geometry", blocks."geometry"))
+            as _tmp_intersection
     from footprints
     left join blocks on st_intersects(footprints."geometry", blocks."geometry")
 ),
 
--- TODO: investigate the performance characteristics of using the window
--- function approach vs the max_by approach.
 footprints_and_blocks_joined_dedupe as (
-    select *
-    from footprints_and_blocks_joined
-    qualify row_number() over (partition by _tmp_id order by _tmp_intersection desc) = 1
-),
-
-footprints_and_blocks_joined_dedupe2 as (
     select
+        -- Snowflake doesn't support geometries in max_by. It should, but it doesn't.
+        -- Fortunately, we know that the geometries are identical when partitioned
+        -- by _tmp_id, so we can just choose any_value.
         any_value("geometry") as "geometry",
         max_by("county_fips", _tmp_intersection) as "county_fips",
         max_by("tract", _tmp_intersection) as "tract",
@@ -63,4 +56,4 @@ footprints_and_blocks_joined_dedupe2 as (
     group by _tmp_id
 )
 
-select * from footprints_and_blocks_joined_dedupe2
+select * from footprints_and_blocks_joined_dedupe