Skip to content

Commit

Permalink
Merge pull request #357 from theislab/scrinshot_probe_designer
Browse files Browse the repository at this point in the history
adapter tutorials to new framework
  • Loading branch information
LouisK92 authored Nov 23, 2024
2 parents 1ea8b70 + 3c6ffe2 commit 1ec08e6
Show file tree
Hide file tree
Showing 4 changed files with 1,481 additions and 345 deletions.
26 changes: 26 additions & 0 deletions docs/_tutorials/data/genomic_region_generator_ncbi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#######################
### BASIC PARAMETERS ###
#######################

### General parameters
dir_output: output_genomic_region_generator_ncbi # name of the directory where the output files will be written

### Parameters for genome and gene annotation
source: ncbi # required: indicate that ncbi annotation should be used
source_params:
taxon: vertebrate_mammalian # required: taxon of the species, valid taxa are: archaea, bacteria, fungi, invertebrate, mitochondrion, plant, plasmid, plastid, protozoa, vertebrate_mammalian, vertebrate_other, viral
species: Homo_sapiens # required: species name in NCBI download format, e.g. 'Homo_sapiens' for human; see https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ for available species names
annotation_release: 110 # required: release number of annotation e.g. '109' or '109.20211119' or 'current' to use most recent annotation release. Check out release numbers for NCBI at ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/

### Parameters for sequences generation
# List of genomic regions that should be generated, set the genomic regions you want to generate to True
genomic_regions:
gene: false
exon: true
exon_exon_junction: true
cds: false
intron: false

# If exon_exon_junction is set to true, specify the block size, i.e. +/- "block_size" bp around the junction
# Hint: it does not make sense to set the block size larger than the maximum oligo length
exon_exon_junction_block_size: 50
165 changes: 165 additions & 0 deletions docs/_tutorials/data/scrinshot_probe_designer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#######################
### USER PARAMETERS ###
#######################

### General parameters
### -----------------------------------------------
n_jobs: 4 # number of cores used to run the pipeline; also determines the number of regions that are stored in cache (2*n_jobs + 1). If the memory consumption of the pipeline is too high, reduce this number; if a lot of RAM is available, increase this number to decrease runtime
dir_output: output_scrinshot_probe_designer # name of the directory where the output files will be written
write_intermediate_steps: true # if true, writes the oligo sequences after each step of the pipeline into a csv file

### Parameters for probe sequences generation
### -----------------------------------------------
file_regions: my_genes.txt # file with a list of the genes used to generate the oligo sequences; leave empty if all genes should be used
files_fasta_probe_database: # fasta files with sequences from which the oligos should be generated. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest
- output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
- output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
probe_length_min: 40 #min length of oligos
probe_length_max: 45 #max length of oligos

### Parameters for the property filters, i.e. properties that the sequences should fulfill
### -----------------------------------------------
## target probe sequence
probe_GC_content_min: 40 # minimum GC content of oligos
probe_GC_content_max: 60 # maximum GC content of oligos
probe_Tm_min: 65 # minimum melting temperature of oligos
probe_Tm_max: 75 # maximum melting temperature of oligos
homopolymeric_base_n: # minimum number of nucleotides to consider it a homopolymeric run per base
A: 5
T: 5
C: 5
G: 5
## padlock arms
arm_Tm_dif_max: 2 # maximum melting temperature difference of both arms (difference shouldn't be higher than 5! But range is not super important, the lower the better)
arm_length_min: 10 # minimum length of each arm
arm_Tm_min: 50 # minimum melting temperature of each arm
arm_Tm_max: 60 # maximum melting temperature of each arm
## detection oligos
min_thymines: 2 # minimal number of Thymines in detection oligo.
detect_oligo_length_min: 15 # minimum length of detection probe
detect_oligo_length_max: 40 # maximum length of detection probe

### Parameters for the specificity filters
### -----------------------------------------------
files_fasta_reference_database: # fasta file with sequences used as reference for the specificity filters. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest
- output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
- output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna
ligation_region_size: 5 # size of the seed region around the ligation site for blast seed region filter; set to 0 if ligation region should not be considered for blast search

### Parameters for set selection
### -----------------------------------------------
probe_isoform_weight: 2 # weight of the isoform consensus of the probe in the efficiency score
probe_GC_content_opt: 50 # optimal GC content of the probe; max and min values are defined above
probe_GC_weight: 1 # weight of the GC content of the probe in the efficiency score
probe_Tm_opt: 70 # optimal melting temperature of the probe; max and min values are defined above
probe_Tm_weight: 1 # weight of the Tm of the probe in the efficiency score

probeset_size_min: 3 # minimum size of probe sets (in case there exists no set of the optimal size) -> genes with fewer oligos will be filtered out and stored in regions_with_insufficient_oligos_for_db_probes
probeset_size_opt: 5 # optimal size of probe sets
distance_between_probes: 0 # how much overlap should be allowed between oligos, e.g. if oligos can overlap by x bases choose -x, if oligos can be next to one another choose 0, if oligos should be x bases apart choose x
n_sets: 100 # maximum number of sets to generate

### Parameters for final sequence design
### -----------------------------------------------
U_distance: 5 # preferred minimal distance between U(racils)
detect_oligo_Tm_opt: 56 # optimal melting temperature of detection probe
top_n_sets: 3 #maximum number of sets to report in padlock_probes.yaml and "padlock_probes_order.yaml"

############################
### DEVELOPER PARAMETERS ###
############################

### Parameters for the specificity filters
### -----------------------------------------------
# Specificity filter with BlastN
specificity_blastn_search_parameters:
perc_identity: 80
strand: "minus" # this parameter is fixed, if reference is whole genome, consider using "both"
word_size: 10
dust: "no"
soft_masking: "false"
max_target_seqs: 10
max_hsps: 1000
specificity_blastn_hit_parameters:
coverage: 50 # can be turned into min_alignment_length

# Crosshybridization filter with BlastN
cross_hybridization_blastn_search_parameters:
perc_identity: 80
strand: "minus" # this parameter is fixed
word_size: 10
dust: "no"
soft_masking: "false"
max_target_seqs: 10
cross_hybridization_blastn_hit_parameters:
coverage: 80 # can be turned into min_alignment_length


### Parameters for the Oligo set selection
### -----------------------------------------------
max_graph_size: 5000 # maximum number of oligos that are taken into consideration in the last step (5000 -> ~5GB, 2500 -> ~1GB)


### Parameters for Melting Temperature
### -----------------------------------------------
# The melting temperature is used in 2 different stages (property filters and padlock detection probe design), where a few parameters are shared and the others differ.
# parameters for melting temperature -> for more information on parameters, see: https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN
## target probe
Tm_parameters_probe:
check: true #default
strict: true #default
c_seq: null #default
shift: 0 #default
nn_table: DNA_NN3 # Allawi & SantaLucia (1997)
tmm_table: DNA_TMM1 #default
imm_table: DNA_IMM1 #default
de_table: DNA_DE1 #default
dnac1: 50 #[nM]
dnac2: 0 #[nM]
selfcomp: false #default
saltcorr: 7 # Owczarzy et al. (2008)
Na: 39 #[mM]
K: 75 #[mM]
Tris: 20 #[mM]
Mg: 10 #[mM]
dNTPs: 0 #[mM] default

Tm_chem_correction_param_probe:
DMSO: 0 #default
fmd: 20
DMSOfactor: 0.75 #default
fmdfactor: 0.65 #default
fmdmethod: 1 #default
GC: null #default

Tm_salt_correction_param_probe: null # if salt correction desired, please add parameters below

## detection oligo
Tm_parameters_detection_oligo:
check: true #default
strict: true #default
c_seq: null #default
shift: 0 #default
nn_table: DNA_NN3 # Allawi & SantaLucia (1997)
tmm_table: DNA_TMM1 #default
imm_table: DNA_IMM1 #default
de_table: DNA_DE1 #default
dnac1: 50 #[nM]
dnac2: 0 #[nM]
selfcomp: false #default
saltcorr: 7 # Owczarzy et al. (2008)
Na: 39 #[mM]
K: 0 #[mM] default
Tris: 0 #[mM] default
Mg: 0 #[mM] default
dNTPs: 0 #[mM] default

Tm_chem_correction_param_detection_oligo:
DMSO: 0 #default
fmd: 30
DMSOfactor: 0.75 #default
fmdfactor: 0.65 #default
fmdmethod: 1 #default
GC: null #default

Tm_salt_correction_param_detection_oligo: null # if salt correction desired, please add parameters below
Loading

0 comments on commit 1ec08e6

Please sign in to comment.