Skip to content

Commit

Permalink
fix(dora): remove empty strings from all fields
Browse files Browse the repository at this point in the history
Also:

* rewrite casts to use standard sql `CAST`.
* remove redundant text array casts.
* remove useless thematiques column.
* enforce relationships between tables.
* move source config next to staging models.
  • Loading branch information
vmttn committed Sep 27, 2024
1 parent f22131d commit 7175d1d
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 94 deletions.
17 changes: 0 additions & 17 deletions pipeline/dbt/models/_sources.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
version: 2

sources:
- name: internal
schema: public
tables:
- name: extra__geocoded_results

- name: data_inclusion
schema: data_inclusion
meta:
Expand All @@ -27,18 +22,6 @@ sources:
- name: sirene_etablissement_historique
- name: sirene_etablissement_succession

- name: dora
schema: dora
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: services
meta:
kind: service

- name: france_travail
schema: france_travail
meta:
Expand Down
9 changes: 0 additions & 9 deletions pipeline/dbt/models/staging/sources/dora/_dora__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,6 @@ models:
- relationships:
to: ref('stg_dora__structures')
field: id
# Some services are associated to draft structures which are not published
# by dora on its api. Therefore some services are missing their structure.
config:
severity: warn
- name: telephone
data_tests:
- dbt_utils.not_constant
Expand Down Expand Up @@ -281,11 +277,6 @@ models:
data_tests:
- dbt_utils.not_constant
- dbt_utils.not_empty_string
- name: thematiques
data_tests:
# dora's thematiques are defined at the service level
- dbt_utils.expression_is_true:
expression: "IS NULL"
- name: typologie
data_tests:
- dbt_utils.not_constant
Expand Down
12 changes: 12 additions & 0 deletions pipeline/dbt/models/staging/sources/dora/_dora__sources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
sources:
- name: dora
schema: dora
meta:
is_provider: true
tables:
- name: structures
meta:
kind: structure
- name: services
meta:
kind: service
82 changes: 41 additions & 41 deletions pipeline/dbt/models/staging/sources/dora/stg_dora__services.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,47 @@ structures AS (

services AS (
SELECT
_di_source_id AS "_di_source_id",
(data ->> 'contact_public')::BOOLEAN AS "contact_public",
(data ->> 'cumulable')::BOOLEAN AS "cumulable",
(data ->> 'date_creation')::TIMESTAMP WITH TIME ZONE AS "date_creation",
(data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj",
(data ->> 'date_suspension')::TIMESTAMP WITH TIME ZONE AS "date_suspension",
(data ->> 'latitude')::FLOAT AS "latitude",
(data ->> 'longitude')::FLOAT AS "longitude",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil'))::TEXT [] AS "modes_accueil",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils'))::TEXT [] AS "profils",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques'))::TEXT [] AS "thematiques",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types'))::TEXT [] AS "types",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs'))::TEXT [] AS "justificatifs",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis'))::TEXT [] AS "pre_requis",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_accompagnateur'))::TEXT [] AS "modes_orientation_accompagnateur",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_beneficiaire'))::TEXT [] AS "modes_orientation_beneficiaire",
data ->> 'modes_orientation_accompagnateur_autres' AS "modes_orientation_accompagnateur_autres",
data ->> 'modes_orientation_beneficiaire_autres' AS "modes_orientation_beneficiaire_autres",
data ->> 'adresse' AS "adresse",
data ->> 'code_insee' AS "code_insee",
data ->> 'code_postal' AS "code_postal",
data ->> 'commune' AS "commune",
data ->> 'complement_adresse' AS "complement_adresse",
NULLIF(TRIM(data ->> 'contact_nom_prenom'), '') AS "contact_nom_prenom",
NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel",
data ->> 'formulaire_en_ligne' AS "formulaire_en_ligne",
data ->> 'frais_autres' AS "frais_autres",
data ->> 'frais' AS "frais",
data ->> 'id' AS "id",
data ->> 'lien_source' AS "lien_source",
data ->> 'nom' AS "nom",
data ->> 'presentation_resume' AS "presentation_resume",
data ->> 'presentation_detail' AS "presentation_detail",
NULLIF(TRIM(data ->> 'prise_rdv'), '') AS "prise_rdv",
data ->> 'recurrence' AS "recurrence",
data ->> 'source' AS "source",
data ->> 'structure_id' AS "structure_id",
NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone",
NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code",
NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom",
data ->> 'zone_diffusion_type' AS "zone_diffusion_type"
_di_source_id AS "_di_source_id",
CAST((data ->> 'contact_public') AS BOOLEAN) AS "contact_public",
CAST((data ->> 'cumulable') AS BOOLEAN) AS "cumulable",
CAST((data ->> 'date_creation') AS TIMESTAMP WITH TIME ZONE) AS "date_creation",
CAST((data ->> 'date_maj') AS TIMESTAMP WITH TIME ZONE) AS "date_maj",
CAST((data ->> 'date_suspension') AS TIMESTAMP WITH TIME ZONE) AS "date_suspension",
CAST((data ->> 'latitude') AS FLOAT) AS "latitude",
CAST((data ->> 'longitude') AS FLOAT) AS "longitude",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_accueil')) AS "modes_accueil",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'profils')) AS "profils",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'thematiques')) AS "thematiques",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'types')) AS "types",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'justificatifs')) AS "justificatifs",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'pre_requis')) AS "pre_requis",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_accompagnateur')) AS "modes_orientation_accompagnateur",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'modes_orientation_beneficiaire')) AS "modes_orientation_beneficiaire",
NULLIF(TRIM(data ->> 'modes_orientation_accompagnateur_autres'), '') AS "modes_orientation_accompagnateur_autres",
NULLIF(TRIM(data ->> 'modes_orientation_beneficiaire_autres'), '') AS "modes_orientation_beneficiaire_autres",
NULLIF(TRIM(data ->> 'adresse'), '') AS "adresse",
NULLIF(TRIM(data ->> 'code_insee'), '') AS "code_insee",
NULLIF(TRIM(data ->> 'code_postal'), '') AS "code_postal",
NULLIF(TRIM(data ->> 'commune'), '') AS "commune",
NULLIF(TRIM(data ->> 'complement_adresse'), '') AS "complement_adresse",
NULLIF(TRIM(data ->> 'contact_nom_prenom'), '') AS "contact_nom_prenom",
NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel",
NULLIF(TRIM(data ->> 'formulaire_en_ligne'), '') AS "formulaire_en_ligne",
NULLIF(TRIM(data ->> 'frais_autres'), '') AS "frais_autres",
NULLIF(TRIM(data ->> 'frais'), '') AS "frais",
NULLIF(TRIM(data ->> 'id'), '') AS "id",
NULLIF(TRIM(data ->> 'lien_source'), '') AS "lien_source",
NULLIF(TRIM(data ->> 'nom'), '') AS "nom",
NULLIF(TRIM(data ->> 'presentation_resume'), '') AS "presentation_resume",
NULLIF(TRIM(data ->> 'presentation_detail'), '') AS "presentation_detail",
NULLIF(TRIM(data ->> 'prise_rdv'), '') AS "prise_rdv",
NULLIF(TRIM(data ->> 'recurrence'), '') AS "recurrence",
NULLIF(TRIM(data ->> 'source'), '') AS "source",
NULLIF(TRIM(data ->> 'structure_id'), '') AS "structure_id",
NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone",
NULLIF(TRIM(data ->> 'zone_diffusion_code'), '') AS "zone_diffusion_code",
NULLIF(TRIM(data ->> 'zone_diffusion_nom'), '') AS "zone_diffusion_nom",
NULLIF(TRIM(data ->> 'zone_diffusion_type'), '') AS "zone_diffusion_type"
FROM source
),

Expand Down
53 changes: 26 additions & 27 deletions pipeline/dbt/models/staging/sources/dora/stg_dora__structures.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,32 @@ WITH source AS (

final AS (
SELECT
_di_source_id AS "_di_source_id",
(data ->> 'antenne')::BOOLEAN AS "antenne",
(data ->> 'date_maj')::TIMESTAMP WITH TIME ZONE AS "date_maj",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_autres'))::TEXT [] AS "labels_autres",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_nationaux'))::TEXT [] AS "labels_nationaux",
(data ->> 'latitude')::FLOAT AS "latitude",
(data ->> 'longitude')::FLOAT AS "longitude",
(data ->> 'thematiques')::TEXT [] AS "thematiques",
data ->> 'accessibilite' AS "accessibilite",
data ->> 'adresse' AS "adresse",
data ->> 'code_insee' AS "code_insee",
data ->> 'code_postal' AS "code_postal",
data ->> 'commune' AS "commune",
NULLIF(TRIM(data ->> 'complement_adresse'), '') AS "complement_adresse",
NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel",
data ->> 'horaires_ouverture' AS "horaires_ouverture",
data ->> 'id' AS "id",
data ->> 'lien_source' AS "lien_source",
NULLIF(TRIM(data ->> 'nom'), '') AS "nom",
data ->> 'presentation_detail' AS "presentation_detail",
data ->> 'presentation_resume' AS "presentation_resume",
data ->> 'rna' AS "rna",
data ->> 'siret' AS "siret",
data ->> 'site_web' AS "site_web",
data ->> 'source' AS "source",
data ->> 'telephone' AS "telephone",
NULLIF(TRIM(data ->> 'typologie'), '') AS "typologie"
_di_source_id AS "_di_source_id",
CAST((data ->> 'antenne') AS BOOLEAN) AS "antenne",
CAST((data ->> 'date_maj') AS TIMESTAMP WITH TIME ZONE) AS "date_maj",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_autres')) AS "labels_autres",
ARRAY(SELECT * FROM JSONB_ARRAY_ELEMENTS_TEXT(data -> 'labels_nationaux')) AS "labels_nationaux",
CAST((data ->> 'latitude') AS FLOAT) AS "latitude",
CAST((data ->> 'longitude') AS FLOAT) AS "longitude",
NULLIF(TRIM(data ->> 'accessibilite'), '') AS "accessibilite",
NULLIF(TRIM(data ->> 'adresse'), '') AS "adresse",
NULLIF(TRIM(data ->> 'code_insee'), '') AS "code_insee",
NULLIF(TRIM(data ->> 'code_postal'), '') AS "code_postal",
NULLIF(TRIM(data ->> 'commune'), '') AS "commune",
NULLIF(TRIM(data ->> 'complement_adresse'), '') AS "complement_adresse",
NULLIF(TRIM(data ->> 'courriel'), '') AS "courriel",
NULLIF(TRIM(data ->> 'horaires_ouverture'), '') AS "horaires_ouverture",
NULLIF(TRIM(data ->> 'id'), '') AS "id",
NULLIF(TRIM(data ->> 'lien_source'), '') AS "lien_source",
NULLIF(TRIM(data ->> 'nom'), '') AS "nom",
NULLIF(TRIM(data ->> 'presentation_detail'), '') AS "presentation_detail",
NULLIF(TRIM(data ->> 'presentation_resume'), '') AS "presentation_resume",
NULLIF(TRIM(data ->> 'rna'), '') AS "rna",
NULLIF(TRIM(data ->> 'siret'), '') AS "siret",
NULLIF(TRIM(data ->> 'site_web'), '') AS "site_web",
NULLIF(TRIM(data ->> 'source'), '') AS "source",
NULLIF(TRIM(data ->> 'telephone'), '') AS "telephone",
NULLIF(TRIM(data ->> 'typologie'), '') AS "typologie"
FROM source
)

Expand Down

0 comments on commit 7175d1d

Please sign in to comment.