Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented OrcaVault link satellite model between Library and SequencingRun - glims #36

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions orcavault/models/raw/sat_library_sequencing_run_glims.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{{
    config(
        materialized='incremental',
        incremental_strategy='append',
        on_schema_change='fail'
    )
}}

-- Satellite on the library <-> sequencing run link, fed from the Google LIMS
-- spreadsheet model. Each run appends rows keyed by
-- (library_sequencing_run_hk, load_datetime) carrying the descriptive LIMS
-- attributes recorded for that pairing.

with source as (

    -- Pick the LIMS columns this satellite needs. illumina_id is exposed as the
    -- sequencing run identifier; "timestamp" is reduced to a calendar date.
    select
        illumina_id as sequencing_run_id,
        library_id,
        record_source,
        cast("timestamp" as date) as "timestamp",
        run,
        override_cycles,
        secondary_analysis,
        number_fastqs,
        fastq,
        results,
        notes,
        trello
    from
        {{ ref('spreadsheet_google_lims') }}

),

cleaned as (

    -- Remove embedded CR/LF characters and surrounding whitespace from the
    -- spreadsheet cells. sequencing_run_id, record_source, "timestamp" and run
    -- are passed through untouched so the values that get hashed do not change.
    select
        sequencing_run_id,
        trim(regexp_replace(library_id, E'[\\n\\r]+', '', 'g')) as library_id,
        record_source,
        "timestamp",
        run,
        trim(regexp_replace(override_cycles, E'[\\n\\r]+', '', 'g')) as override_cycles,
        trim(regexp_replace(secondary_analysis, E'[\\n\\r]+', '', 'g')) as secondary_analysis,
        trim(regexp_replace(number_fastqs, E'[\\n\\r]+', '', 'g')) as number_fastqs,
        trim(regexp_replace(fastq, E'[\\n\\r]+', '', 'g')) as fastq,
        trim(regexp_replace(results, E'[\\n\\r]+', '', 'g')) as results,
        trim(regexp_replace(notes, E'[\\n\\r]+', '', 'g')) as notes,
        trim(regexp_replace(trello, E'[\\n\\r]+', '', 'g')) as trello
    from
        source

),

ranked as (

    -- Number the rows per (library, date) so that exactly one survives.
    -- The partition uses the CLEANED library_id, i.e. the same value that is
    -- hashed below; partitioning on the raw cell would let two spellings of one
    -- library (e.g. with a trailing newline) both survive and collide on the
    -- primary key. "timestamp" is constant within a partition, so ordering by
    -- sequencing_run_id alone is sufficient.
    select
        row_number() over (
            partition by library_id, "timestamp"
            order by sequencing_run_id desc
        ) as row_rank,
        sequencing_run_id,
        library_id,
        record_source,
        "timestamp",
        run,
        override_cycles,
        secondary_analysis,
        number_fastqs,
        fastq,
        results,
        notes,
        trello
    from
        cleaned
    where
        -- Both business keys must be present AND non-empty. (With "or" an empty
        -- string satisfies "is not null" and slips through.) library_id is
        -- checked after cleaning, so whitespace-only cells are rejected too.
        library_id is not null
        and library_id <> ''
        and sequencing_run_id is not null
        and sequencing_run_id <> ''

),

differentiated as (

    -- Keep the surviving row per (library, date). On incremental runs, keep only
    -- rows whose derived load_datetime is newer than what is already loaded.
    -- The load_datetime expression here must stay identical to the one in
    -- "transformed".
    -- NOTE(review): rows with a NULL "timestamp" are dropped by the incremental
    -- comparison but would reach the table with a NULL load_datetime on a full
    -- refresh - confirm the source never has NULL timestamps.
    select
        sequencing_run_id,
        library_id,
        record_source,
        "timestamp",
        run,
        override_cycles,
        secondary_analysis,
        number_fastqs,
        fastq,
        results,
        notes,
        trello
    from
        ranked
    where
        row_rank = 1
        {% if is_incremental() %}
        and cast("timestamp" as timestamptz) + time '11:00' > ( select coalesce(max(load_datetime), '1900-01-01') as ldts from {{ this }} )
        {% endif %}

),

encoded as (

    -- Hash each business key (SHA-256, hex) and build hash_diff over the
    -- descriptive attributes.
    -- NOTE(review): hash_diff concatenates the attributes without a delimiter,
    -- and concat() skips NULL arguments, so different attribute splits can hash
    -- alike. The expression is left unchanged so previously loaded hash_diff
    -- values stay comparable.
    select
        encode(sha256(cast(sequencing_run_id as bytea)), 'hex') as sequencing_run_hk,
        encode(sha256(cast(library_id as bytea)), 'hex') as library_hk,
        record_source,
        encode(sha256(concat("timestamp", run, override_cycles, secondary_analysis, number_fastqs, fastq, results, notes, trello)::bytea), 'hex') as hash_diff,
        "timestamp",
        run,
        override_cycles,
        secondary_analysis,
        number_fastqs,
        fastq,
        results,
        notes,
        trello
    from
        differentiated

),

transformed as (

    -- Derive the link hash key from the two hub hash keys (sequencing run first,
    -- then library) and stamp load_datetime as the LIMS date plus 11 hours.
    -- NOTE(review): casting a date to timestamptz uses the session time zone;
    -- the reason for the 11:00 offset is not visible in this model - confirm.
    select
        encode(sha256(concat(sequencing_run_hk, library_hk)::bytea), 'hex') as library_sequencing_run_hk,
        cast("timestamp" as timestamptz) + time '11:00' as load_datetime,
        record_source,
        hash_diff,
        "timestamp",
        run,
        override_cycles,
        secondary_analysis,
        number_fastqs,
        fastq,
        results,
        notes,
        trello
    from
        encoded

),

final as (

    -- Cast every column to the type declared for this model in the schema
    -- contract.
    select
        cast(library_sequencing_run_hk as char(64)) as library_sequencing_run_hk,
        cast(load_datetime as timestamptz) as load_datetime,
        cast(record_source as varchar(255)) as record_source,
        cast(hash_diff as char(64)) as hash_diff,
        cast("timestamp" as date) as "timestamp",
        cast(run as integer) as run,
        cast(override_cycles as varchar(255)) as override_cycles,
        cast(secondary_analysis as varchar(255)) as secondary_analysis,
        cast(number_fastqs as varchar(255)) as number_fastqs,
        cast(fastq as text) as fastq,
        cast(results as text) as results,
        cast(notes as text) as notes,
        cast(trello as text) as trello
    from
        transformed

)

select * from final
38 changes: 38 additions & 0 deletions orcavault/models/raw/sat_schema.yml
Original file line number Diff line number Diff line change
@@ -207,3 +207,41 @@ models:
data_type: text
- name: trello
data_type: text

# Contract for the sat_library_sequencing_run_glims model.
# NOTE(review): indentation was lost in this view; the nesting of the keys below
# must be confirmed against the real sat_schema.yml.
- name: sat_library_sequencing_run_glims
config:
# Contract enforcement is switched on for this model.
contract: { enforced: true }
constraints:
# Composite primary key: link hash key plus load timestamp.
- type: primary_key
columns: [ library_sequencing_run_hk, load_datetime ]
# The link hash key references the link_library_sequencing_run model.
- type: foreign_key
columns: [ library_sequencing_run_hk ]
to: ref('link_library_sequencing_run')
to_columns: [ library_sequencing_run_hk ]
# Column names and data types listed here match the casts in the model's final select.
columns:
- name: library_sequencing_run_hk
data_type: char(64)
- name: load_datetime
data_type: timestamptz
- name: record_source
data_type: varchar(255)
- name: hash_diff
data_type: char(64)
- name: timestamp
data_type: date
- name: run
data_type: integer
- name: override_cycles
data_type: varchar(255)
- name: secondary_analysis
data_type: varchar(255)
- name: number_fastqs
data_type: varchar(255)
- name: fastq
data_type: text
- name: results
data_type: text
- name: notes
data_type: text
- name: trello
data_type: text