Skip to content

Commit

Permalink
Merge branch 'master' into flu-genbank
Browse files Browse the repository at this point in the history
  • Loading branch information
atc3 committed Feb 24, 2024
2 parents 3e8b945 + 0b487de commit 9b2d695
Show file tree
Hide file tree
Showing 55 changed files with 1,894 additions and 262 deletions.
11 changes: 9 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ data_flu/**
data_gisaid_flu/**
data_gisaid_rsv/**
data_genbank_rsv/**
data_6month
data_6month/**

# Ignore server passwords
server/htpasswd
Expand All @@ -108,7 +110,8 @@ daily_update.sh
daily_update_new.sh
daily_update_rsv.sh
daily_update_sars2_genbank.sh
daily_update_sars2_gisaid.sh
daily_update_sars2_gisaid_full.sh
daily_update_sars2_gisaid_6month.sh
update_gisaid.sh
update_genbank.sh
filter_list.txt
Expand All @@ -133,6 +136,7 @@ example_data_genbank/*/lineage_treetime/*.pdf

data
data_genbank
data_flu_genbank
example_data_genbank/rsv/**
example_data_genbank/flu/**
example_data_genbank/sars2/**
Expand All @@ -143,4 +147,7 @@ data_flu_genbank
workflow_main/notebooks/**

# SnapGene - temp files
static_data/flu/alignments/.sglock/**
static_data/flu/alignments/.sglock/**

# MOTD
MOTD*.html
15 changes: 15 additions & 0 deletions config/config_flu_genbank.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ chunk_size: 10000
# ANALYSIS
# --------------------

# Don't process sequences prior to this date
# Leave empty to ignore
start_date_cutoff:
# Don't process sequences after this date
# Leave empty to ignore
end_date_cutoff:

# Don't process sequences prior to X days ago
# Leave empty to ignore
start_date_cutoff_days_ago:
# Don't process sequences after X days ago
# Leave empty to ignore
end_date_cutoff_days_ago:

segments: ["1", "2", "3", "4", "5", "6", "7", "8"]

# Insertions or deletions with more than this difference in bases between the
Expand Down Expand Up @@ -128,6 +142,7 @@ prod_hostname:

site_title: "Flu PathMut"
data_provider: "NCBI GenBank"
motd_url: "https://storage.googleapis.com/ve-public/MOTD.html"

# Default references for each subtype
default_references:
Expand Down
15 changes: 15 additions & 0 deletions config/config_flu_gisaid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ chunk_size: 10000
# ANALYSIS
# --------------------

# Don't process sequences prior to this date
# Leave empty to ignore
start_date_cutoff:
# Don't process sequences after this date
# Leave empty to ignore
end_date_cutoff:

# Don't process sequences prior to X days ago
# Leave empty to ignore
start_date_cutoff_days_ago:
# Don't process sequences after X days ago
# Leave empty to ignore
end_date_cutoff_days_ago:

segments: ["1", "2", "3", "4", "5", "6", "7", "8"]

# Insertions or deletions with more than this difference in bases between the
Expand Down Expand Up @@ -157,6 +171,7 @@ prod_hostname:

site_title: "Flu PathMut"
data_provider: "GISAID"
motd_url: "https://storage.googleapis.com/ve-public/MOTD_FLU_GISAID.html"

# Default references for each subtype
default_references:
Expand Down
214 changes: 214 additions & 0 deletions config/config_flu_gisaid_dev.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
# Development configuration for the flu (GISAID) site.
# NOTE(review): nesting below was reconstructed from context — confirm against
# the sibling config_flu_gisaid.yaml before relying on it.

# ------------------
# GLOBAL
# ------------------

# Virus this config is written for
virus: "flu"

# Path to folder with downloaded and processed data
# This path is relative to the project root
data_folder: "data_flu_small"

# Path to folder with genome information (reference.fasta, genes.json, proteins.json)
# This path is relative to the project root
static_data_folder: "static_data/flu"

# Path to folder with data to use in development
# This path is relative to the project root
# Only used for database seeding in development
example_data_folder: "data_flu_small"

# Database for this virus
postgres_db: "flu_gisaid_dev"

# ------------------
# INGEST
# ------------------

# Number of genomes to load into memory before flushing to disk
chunk_size: 10000

# --------------------
# ANALYSIS
# --------------------

# Don't process sequences prior to this date
# Leave empty to ignore
start_date_cutoff:
# Don't process sequences after this date
# Leave empty to ignore
end_date_cutoff:

# Don't process sequences prior to X days ago
# Leave empty to ignore
start_date_cutoff_days_ago:
# Don't process sequences after X days ago
# Leave empty to ignore
end_date_cutoff_days_ago:

segments: ["1", "2", "3", "4", "5", "6", "7", "8"]

# Insertions or deletions with more than this difference in bases between the
# ref and the alt will be discarded (NT level only)
max_indel_length: 100

# Mutations with less than this number of global occurrences will be ignored
mutation_count_threshold: 3

# Threshold of prevalence to report a mutation as being a consensus
# mutation for a group (e.g., clade, lineage)
consensus_fraction: 0.9

# Threshold of prevalence to report a mutation as being associated
# with a group (e.g., clade, lineage)
min_reporting_fraction: 0.05

# Metadata columns, each with its display title
metadata_cols:
  clade:
    title: "Clade"
  lineage:
    title: "Lineage"
  passage:
    title: "Passage"
  host:
    title: "Host"
  isolate_submitter:
    title: "Isolate Submitter"
  submitting_lab:
    title: "Submitting Lab"
  originating_lab:
    title: "Originating Lab"
  authors:
    title: "Authors"
  publication:
    title: "Publication"
  adamantanes_resistance_geno:
    title: "Adamantanes Resistance Genotype"
  oseltamivir_resistance_geno:
    title: "Oseltamivir Resistance Genotype"
  zanamivir_resistance_geno:
    title: "Zanamivir Resistance Genotype"
  peramivir_resistance_geno:
    title: "Peramivir Resistance Genotype"
  other_resistance_geno:
    title: "Other Resistance Genotype"
  adamantanes_resistance_pheno:
    title: "Adamantanes Resistance Phenotype"
  oseltamivir_resistance_pheno:
    title: "Oseltamivir Resistance Phenotype"
  zanamivir_resistance_pheno:
    title: "Zanamivir Resistance Phenotype"
  peramivir_resistance_pheno:
    title: "Peramivir Resistance Phenotype"
  other_resistance_pheno:
    title: "Other Resistance Phenotype"
  gender:
    title: "Gender"
  patient_status:
    title: "Patient Status"

# Grouping columns, keyed by column name
group_cols:
  serotype:
    name: "serotype"
    title: "Serotype"
    description: ""
    show_collapse_options: false

# AZ report options
report_gene: HA
report_group_col: serotype
report_group_references:
  B-vic: B-Austria-1359417-2021
  B-yam: B-Phuket-3073-2013
  H1N1: A-Wisconsin-67-2022
  H3N2: A-Darwin-6-2021
  H5NX: A-Goose-Guangdong-1-96
  H7NX: A-Shanghai-02-2013
  H9NX: A-Hong-Kong-1073-99

# Surveillance plot options
# see: workflow_main/scripts/surveillance.py
surv_group_col: "serotype"
surv_start_date: "1956-01-01"
surv_period: "Y"
surv_min_combo_count: 50
surv_min_single_count: 50
surv_start_date_days_ago: 90
surv_end_date_days_ago: 30
surv_group_references:
  B-vic: B-Austria-1359417-2021
  B-yam: B-Phuket-3073-2013
  H1N1: A-Wisconsin-67-2022
  H3N2: A-Darwin-6-2021
  H5NX: A-Goose-Guangdong-1-96
  H7NX: A-Shanghai-02-2013
  H9NX: A-Hong-Kong-1073-99

# ---------------
# DATABASE
# ---------------

# Split mutation table partitions into periods of this length
# See: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
# Common options:
#   'D' calendar day frequency
#   'W' weekly frequency
#   'M' month end frequency
mutation_partition_break: "Y"

# ---------------
# SERVER
# ---------------

# Require a login for accessing the website
# Users are provided to the app via the "LOGINS" environment variable,
# which is structured as "user1:pass1,user2:pass2,..."
login_required: true

dev_hostname: "http://localhost:5003"
prod_hostname:
  - "https://flu.pathmut.org"
  - "https://flu.gisaid.pathmut.org"

# ----------------------
# VISUALIZATION
# ----------------------

site_title: "Flu PathMut"
data_provider: "GISAID"
motd_url: "https://storage.googleapis.com/ve-public/MOTD_FLU_GISAID.html"

# Default references for each subtype
default_references:
  B-vic: B-Austria-1359417-2021
  B-yam: B-Phuket-3073-2013
  H1N1: A-Wisconsin-67-2022
  H3N2: A-Darwin-6-2021
  H5NX: A-Goose-Guangdong-1-96
  H7NX: A-Shanghai-02-2013
  H9NX: A-Hong-Kong-1073-99

# Home page
show_home_banner: false
show_walkthroughs: false
show_surveillance: true
show_global_seq_plot: false

show_reports_tab: false
show_global_sequencing_tab: false
show_methods_tab: false
show_related_projects_tab: false

default_gene: HA
default_protein: HA

min_date: "1956-01-01"

show_logos:
  GISAID: true
  GenBank: false

# Allow downloads of sequence metadata (before aggregation)
allow_metadata_download: true
# Allow downloads of raw genomes
allow_genome_download: true
15 changes: 15 additions & 0 deletions config/config_rsv_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ chunk_size: 100000
# ANALYSIS
# --------------------

# Don't process sequences prior to this date
# Leave empty to ignore
start_date_cutoff:
# Don't process sequences after this date
# Leave empty to ignore
end_date_cutoff:

# Don't process sequences prior to X days ago
# Leave empty to ignore
start_date_cutoff_days_ago:
# Don't process sequences after X days ago
# Leave empty to ignore
end_date_cutoff_days_ago:

segments: ["1"]

# Insertions or deletions with more than this difference in bases between the
Expand Down Expand Up @@ -111,6 +125,7 @@ prod_hostname:

site_title: "RSV PathMut"
data_provider: "Custom"
motd_url: "https://storage.googleapis.com/ve-public/MOTD.html"

# Default references for each subtype
default_references:
Expand Down
15 changes: 15 additions & 0 deletions config/config_rsv_genbank.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ chunk_size: 100000
# ANALYSIS
# --------------------

# Don't process sequences prior to this date
# Leave empty to ignore
start_date_cutoff:
# Don't process sequences after this date
# Leave empty to ignore
end_date_cutoff:

# Don't process sequences prior to X days ago
# Leave empty to ignore
start_date_cutoff_days_ago:
# Don't process sequences after X days ago
# Leave empty to ignore
end_date_cutoff_days_ago:

segments: ["1"]

# Insertions or deletions with more than this difference in bases between the
Expand Down Expand Up @@ -122,6 +136,7 @@ prod_hostname:

site_title: "RSV PathMut"
data_provider: "NCBI GenBank"
motd_url: "https://storage.googleapis.com/ve-public/MOTD.html"

# Default references for each subtype
default_references:
Expand Down
Loading

0 comments on commit 9b2d695

Please sign in to comment.