feat(pipeline): add scoring model

This implementation uses a PL/Python UDF that calls the scoring API of the data-inclusion schema package.
gip-inclusion · Sep 11, 2024 · db8769c · db8769c
1 parent bb26c7d
commit db8769c
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 0 deletions.
diff --git a/pipeline/dbt/macros/create_udfs.sql b/pipeline/dbt/macros/create_udfs.sql
@@ -14,6 +14,7 @@ Another way would be to use the `on-run-start` hook, but it does not play nicely
 {{ create_udf__service_checks() }}
 {{ create_udf__adresse_checks() }}
 {{ create_udf__structure_checks() }}
+{{ create_udf__score() }}
 
 {% endset %}
 

diff --git a/pipeline/dbt/macros/udfs/udf__score.sql b/pipeline/dbt/macros/udfs/udf__score.sql
@@ -0,0 +1,30 @@
+{% macro create_udf__score() %}
+
+{#
+    Creates the `processings.score` set-returning PL/Python function.
+
+    The function takes a service serialized as JSONB, builds a `Service` from
+    `data_inclusion.schema` out of it and scores it with `score_qualite.score`.
+    It returns one row per criterion:
+      - score_ligne: overall score of the service (same value on every row)
+      - nom_critere: name of the criterion
+      - score_critere: score of the service for that criterion
+
+    The function is schema-qualified because it is called as
+    `processings.score`; creating it unqualified would make its location
+    depend on the session `search_path`.
+    NOTE(review): assumes the `processings` schema already exists when this
+    macro runs -- confirm.
+#}
+
+-- NOTE(review): presumably dropped first so that the RETURNS TABLE signature
+-- can evolve, which CREATE OR REPLACE alone does not allow -- confirm.
+DROP FUNCTION IF EXISTS processings.score;
+CREATE OR REPLACE FUNCTION processings.score(data JSONB)
+RETURNS
+    TABLE(
+        score_ligne FLOAT,
+        nom_critere TEXT,
+        score_critere FLOAT
+    )
+AS $$
+
+import json
+
+from data_inclusion.schema import Service, score_qualite
+
+# `data` is decoded with json.loads, i.e. it is expected to arrive as JSON text.
+score, details = score_qualite.score(Service(**json.loads(data)))
+
+# One row per (criterion name, criterion score) pair found in `details`;
+# the overall score is repeated on each row.
+return [
+    {
+        "score_ligne": score,
+        "nom_critere": nom_critere,
+        "score_critere": score_critere,
+    }
+    for nom_critere, score_critere in details.items()
+]
+
+$$ LANGUAGE plpython3u;
+
+{% endmacro %}
diff --git a/pipeline/dbt/models/intermediate/_models.yml b/pipeline/dbt/models/intermediate/_models.yml
@@ -118,3 +118,31 @@ models:
                 - street
                 - locality
                 - municipality
+
+  - name: int__criteres_qualite
+    description: |
+      Quality criteria scores for all sources.
+
+      Each row holds a single criterion score for a service.
+
+      There are as many rows for a service as there are criteria.
+
+      Scoring is done by the data-inclusion-schema scoring API in PL/Python.
+    columns:
+      - name: service_id
+        description: Surrogate id (`_di_surrogate_id`) of the scored service.
+        data_tests:
+          - not_null
+          - relationships:
+              to: ref('int__union_services')
+              field: _di_surrogate_id
+      - name: nom_critere
+        description: Name of the criterion.
+        data_tests:
+          - not_null
+          - dbt_utils.not_empty_string
+      - name: score_critere
+        description: |
+          Score for the given criterion and the given service, between 0 and 1.
+      - name: score_ligne
+        description: |
+          Overall score of the service, repeated on every criterion row of that service.
+        data_tests:
+          - not_null
diff --git a/pipeline/dbt/models/intermediate/int__criteres_qualite.sql b/pipeline/dbt/models/intermediate/int__criteres_qualite.sql
@@ -0,0 +1,16 @@
+-- One row per (service, quality criterion), scored by the `processings.score` UDF.
+WITH services AS (
+    SELECT * FROM {{ ref('int__union_services__enhanced') }}
+),
+
+final AS (
+    SELECT
+        services._di_surrogate_id AS "service_id",
+        scores.score_critere      AS "score_critere",
+        scores.nom_critere        AS "nom_critere",
+        scores.score_ligne        AS "score_ligne"
+    FROM services
+    -- The whole service row is serialized to JSONB and passed to the UDF.
+    -- This is an inner (cross) lateral join: a service for which the function
+    -- returns no rows does not appear in the output.
+    CROSS JOIN LATERAL processings.score(TO_JSONB(services)) AS scores
+)
+
+SELECT * FROM final