Skip to content

Commit

Permalink
feat(pipeline): add scoring model
Browse files Browse the repository at this point in the history
This implementation uses a plpython udf that calls
the scoring api of the data inclusion schema.
  • Loading branch information
vmttn committed Sep 11, 2024
1 parent bb26c7d commit db8769c
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 0 deletions.
1 change: 1 addition & 0 deletions pipeline/dbt/macros/create_udfs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Another way would be to use the `on-run-start` hook, but it does not play nicely
{{ create_udf__service_checks() }}
{{ create_udf__adresse_checks() }}
{{ create_udf__structure_checks() }}
{{ create_udf__score() }}

{% endset %}

Expand Down
30 changes: 30 additions & 0 deletions pipeline/dbt/macros/udfs/udf__score.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{% macro create_udf__score() %}

{#
    Emits the DDL that (re)creates the `score` PL/Python function.

    The function takes one service serialized as JSONB, builds a
    `data_inclusion.schema.Service` from it and runs `score_qualite.score`
    on that object. It returns one row per entry of the details mapping:
      - score_ligne:   the overall score, repeated on every row
      - nom_critere:   the key of the details entry
      - score_critere: the value of the details entry

    The function name is left unqualified, so the schema it lands in depends
    on the caller's search_path.
#}

-- NOTE(review): presumably dropped first because CREATE OR REPLACE cannot
-- change the return type of an existing function; confirm before removing.
DROP FUNCTION IF EXISTS score;

CREATE OR REPLACE FUNCTION score(data JSONB)
RETURNS TABLE (
    score_ligne FLOAT,
    nom_critere TEXT,
    score_critere FLOAT
)
LANGUAGE plpython3u
AS $$

import json

from data_inclusion.schema import Service, score_qualite

# `data` is decoded with json.loads, i.e. it is handled as JSON text here.
service = Service(**json.loads(data))
overall_score, criterion_scores = score_qualite.score(service)

# One output row per criterion; the overall score is duplicated on each row.
rows = []
for criterion_name, criterion_score in criterion_scores.items():
    rows.append(
        {
            "score_ligne": overall_score,
            "nom_critere": criterion_name,
            "score_critere": criterion_score,
        }
    )

return rows

$$;

{% endmacro %}
28 changes: 28 additions & 0 deletions pipeline/dbt/models/intermediate/_models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,31 @@ models:
- street
- locality
- municipality

- name: int__criteres_qualite
description: |
Quality criteria scorings for all sources.
Each row holds a single criterion score for a service.
There are as many rows as there are criteria for a service.
Scoring is done by data-inclusion-schema scoring api in PL/Python.
columns:
- name: service_id
data_tests:
- not_null
- relationships:
to: ref('int__union_services')
field: _di_surrogate_id
- name: nom_critere
description: Name of the criterion.
data_tests:
- not_null
- dbt_utils.not_empty_string
- name: score_critere
description: |
Score for the given criterion and the given service, between 0 and 1.
- name: score_ligne
data_tests:
- not_null
16 changes: 16 additions & 0 deletions pipeline/dbt/models/intermediate/int__criteres_qualite.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
-- Quality criteria scores: one row per (service, criterion) pair.
-- Each service row is serialized to JSONB and passed to the
-- processings.score set-returning function; a service for which the
-- function returns no rows produces no output row.
WITH services AS (
    SELECT * FROM {{ ref('int__union_services__enhanced') }}
),

final AS (
    SELECT
        services._di_surrogate_id AS service_id,
        scores.score_critere,
        scores.nom_critere,
        scores.score_ligne
    FROM services
    -- The whole `services` row is handed to the function, hence LATERAL.
    CROSS JOIN LATERAL processings.score(TO_JSONB(services)) AS scores
)

SELECT * FROM final

0 comments on commit db8769c

Please sign in to comment.