Merge pull request #357 from theislab/scrinshot_probe_designer

adapter tutorials to new framework
theislab · Nov 23, 2024 · 1ec08e6 · 1ec08e6
2 parents 1ea8b70 + 3c6ffe2
commit 1ec08e6
Show file tree

Hide file tree

Showing 4 changed files with 1,481 additions and 345 deletions.
diff --git a/docs/_tutorials/data/genomic_region_generator_ncbi.yaml b/docs/_tutorials/data/genomic_region_generator_ncbi.yaml
@@ -0,0 +1,26 @@
+#######################
+### BASIC PARAMETERS ###
+#######################
+
+### General parameters
+dir_output: output_genomic_region_generator_ncbi # name of the directory where the output files will be written
+
+### Parameters for genome and gene annotation
+source: ncbi # required: indicate that ncbi annotation should be used
+source_params:
+  taxon: vertebrate_mammalian # required: taxon of the species, valid taxa are: archaea, bacteria, fungi, invertebrate, mitochondrion, plant, plasmid, plastid, protozoa, vertebrate_mammalian, vertebrate_other, viral
+  species: Homo_sapiens # required: species name in NCBI download format, e.g. 'Homo_sapiens' for human; see https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ for available species name
+  annotation_release: 110 # required: release number of annotation e.g. '109' or '109.20211119'  or 'current' to use most recent annotation release. Check out release numbers for NCBI at ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/
+
+### Parameters for sequences generation
+# List of genomic regions that should be generated, set the genomic regions you want to generate to True
+genomic_regions:
+  gene: false
+  exon: true
+  exon_exon_junction: true
+  cds: false
+  intron: false
+
+# If exon_exon_junction is ste to true, specify the block size, i.e. +/- "block_size" bp around the junction
+# Hint: it does not make sense to set the block size larger than the maximum oligo length
+exon_exon_junction_block_size: 50
diff --git a/docs/_tutorials/data/scrinshot_probe_designer.yaml b/docs/_tutorials/data/scrinshot_probe_designer.yaml
@@ -0,0 +1,165 @@
+#######################
+### USER PARAMETERS ###
+#######################
+
+### General parameters
+### -----------------------------------------------
+n_jobs: 4 # number of cores used to run the pipeline and 2*n_jobs +1 of regions that should be stored in cache. If memory consumption of pipeline is too high reduce this number, if a lot of RAM is available increase this number to decrease runtime
+dir_output: output_scrinshot_probe_designer # name of the directory where the output files will be written
+write_intermediate_steps: true # if true, writes the oligo sequences after each step of the pipeline into a csv file
+
+### Parameters for probe sequences generation
+### -----------------------------------------------
+file_regions: my_genes.txt # file with a list the genes used to generate the oligos sequences, leave empty if all the genes are used
+files_fasta_probe_database: # fasta file with sequences form which the oligos should be generated. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest
+  - output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
+  - output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
+probe_length_min: 40 #min length of oligos
+probe_length_max: 45 #max length of oligos
+
+### Parameters for the property filers, i.e. properties that the sequences should fulfill
+### -----------------------------------------------
+## target probe sequence
+probe_GC_content_min: 40 # minimum GC content of oligos
+probe_GC_content_max: 60 # maximum GC content of oligos
+probe_Tm_min: 65 # minimum melting temperature of oligos
+probe_Tm_max: 75 # maximum melting temperature of oligos
+homopolymeric_base_n: # minimum number of nucleotides to consider it a homopolymeric run per base
+  A: 5
+  T: 5
+  C: 5
+  G: 5
+## padlock arms
+arm_Tm_dif_max: 2 # maximum melting temperature difference of both arms (difference shouldn't be higher than 5! But range is not super important, the lower the better)
+arm_length_min: 10 # minimum length of each arm
+arm_Tm_min: 50 # minimum melting temperature of each arm 
+arm_Tm_max: 60 # maximum melting temperature of each arm
+## detection oligos
+min_thymines: 2 # minimal number of Thymines in detection oligo.
+detect_oligo_length_min: 15 # minimum length of detection probe
+detect_oligo_length_max: 40 # maximum length of detection probe
+
+### Parameters for the specificity filters
+### -----------------------------------------------
+files_fasta_reference_database: # fasta file with sequences used as reference for the specificity filters. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest
+  - output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
+  - output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
+ligation_region_size: 5 # size of the seed region around the ligation site for blast seed region filter; set to 0 if ligation region should not be considered for blast search
+
+### Parameters for set selection
+### -----------------------------------------------
+probe_isoform_weight: 2 # weight of the isoform consensus of the probe in the efficiency score
+probe_GC_content_opt: 50 # max and min values are defiend above
+probe_GC_weight: 1 # weight of the GC content of the probe in the efficiency score
+probe_Tm_opt: 70 # max and min values are defiend above
+probe_Tm_weight: 1 # weight of the Tm of the probe in the efficiency score
+
+probeset_size_min: 3 # minimum size of probe sets (in case there exist no set of the optimal size) -> genes with less oligos will be filtered out and stored in regions_with_insufficient_oligos_for_db_probes
+probeset_size_opt: 5 # optimal size of probe sets
+distance_between_probes: 0 # how much overlap should be allowed between oligos, e.g. if oligos can overlpap x bases choose -x, if oligos can be next to one another choose 0, if oligos should be x bases apart choose x
+n_sets: 100 # maximum number of sets to generate
+
+### Parameters for final sequence design
+### -----------------------------------------------
+U_distance: 5 # preferred minimal distance between U(racils)
+detect_oligo_Tm_opt: 56 # optimal melting temperature of detection probe
+top_n_sets: 3 #maximum number of sets to report in padlock_probes.yaml and "padlock_probes_order.yaml"
+
+############################
+### DEVELOPER PARAMETERS ###
+############################
+
+### Parameters for the specificity filters
+### -----------------------------------------------
+# Specificity filter with BlastN
+specificity_blastn_search_parameters:
+  perc_identity: 80
+  strand: "minus" # this parameter is fixed, if reference is whole genome, consider using "both"
+  word_size: 10
+  dust: "no"
+  soft_masking: "false"
+  max_target_seqs: 10
+  max_hsps: 1000
+specificity_blastn_hit_parameters:
+  coverage: 50 # can be turned into min_alignment_length
+
+# Crosshybridization filter with BlastN
+cross_hybridization_blastn_search_parameters:
+  perc_identity: 80
+  strand: "minus" # this parameter is fixed
+  word_size: 10
+  dust: "no"
+  soft_masking: "false"
+  max_target_seqs: 10
+cross_hybridization_blastn_hit_parameters:
+  coverage: 80 # can be turned into min_alignment_length
+
+
+### Parameters for the Oligo set selection
+### -----------------------------------------------
+max_graph_size: 5000 # maximum number of oligos that are taken into consisderation in the last step (5000 -> ~5GB, 2500 -> ~1GB)
+
+
+### Parameters for Melting Temperature
+### -----------------------------------------------
+# The melting temperature is used in 2 different stages (property filters and padlock detection probe design), where a few parameters are shared and the others differ.
+# parameters for melting temperature -> for more information on parameters, see: https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN
+## target probe
+Tm_parameters_probe:
+    check: true #default
+    strict: true #default
+    c_seq: null #default
+    shift: 0 #default
+    nn_table: DNA_NN3 # Allawi & SantaLucia (1997)
+    tmm_table: DNA_TMM1 #default
+    imm_table: DNA_IMM1 #default
+    de_table: DNA_DE1 #default
+    dnac1: 50 #[nM]
+    dnac2: 0 #[nM]
+    selfcomp: false #default
+    saltcorr: 7 # Owczarzy et al. (2008)
+    Na: 39 #[mM]
+    K: 75 #[mM]
+    Tris: 20 #[mM]
+    Mg: 10 #[mM]
+    dNTPs: 0 #[mM] default
+
+Tm_chem_correction_param_probe:
+    DMSO: 0 #default
+    fmd: 20
+    DMSOfactor: 0.75 #default
+    fmdfactor: 0.65 #default
+    fmdmethod: 1 #default
+    GC: null #default
+
+Tm_salt_correction_param_probe: null # if salt correction desired, please add parameters below
+
+## detection oligo
+Tm_parameters_detection_oligo:
+    check: true #default
+    strict: true #default
+    c_seq: null #default
+    shift: 0 #default
+    nn_table: DNA_NN3 # Allawi & SantaLucia (1997)
+    tmm_table: DNA_TMM1 #default
+    imm_table: DNA_IMM1 #default
+    de_table: DNA_DE1 #default
+    dnac1: 50 #[nM]
+    dnac2: 0 #[nM]
+    selfcomp: false #default
+    saltcorr: 7 # Owczarzy et al. (2008)
+    Na: 39 #[mM]
+    K: 0 #[mM] default
+    Tris: 0 #[mM] default
+    Mg: 0 #[mM] default
+    dNTPs: 0 #[mM] default
+
+Tm_chem_correction_param_detection_oligo:
+    DMSO: 0 #default
+    fmd: 30
+    DMSOfactor: 0.75 #default
+    fmdfactor: 0.65 #default
+    fmdmethod: 1 #default
+    GC: null #default
+
+Tm_salt_correction_param_detection_oligo: null # if salt correction desired, please add parameters below