From 600d1b15c41d8de27acca322c9be619779279ed6 Mon Sep 17 00:00:00 2001 From: Lisa Sousa Date: Sat, 22 Jun 2024 07:27:19 +0200 Subject: [PATCH 1/3] adapter tutorials to new framework --- .../data/genomic_region_generator_ncbi.yaml | 26 + .../data/scrinshot_probe_designer.yaml | 150 ++ ...papros_tutorial_end_to_end_selection.ipynb | 1276 +++++++++++++---- ..._tutorial_end_to_end_selection_short.ipynb | 419 +++++- 4 files changed, 1526 insertions(+), 345 deletions(-) create mode 100644 docs/_tutorials/data/genomic_region_generator_ncbi.yaml create mode 100644 docs/_tutorials/data/scrinshot_probe_designer.yaml diff --git a/docs/_tutorials/data/genomic_region_generator_ncbi.yaml b/docs/_tutorials/data/genomic_region_generator_ncbi.yaml new file mode 100644 index 0000000..4fab1ac --- /dev/null +++ b/docs/_tutorials/data/genomic_region_generator_ncbi.yaml @@ -0,0 +1,26 @@ +####################### +### BASIC PARAMETERS ### +####################### + +### General parameters +dir_output: output_genomic_region_generator_ncbi # name of the directory where the output files will be written + +### Parameters for genome and gene annotation +source: ncbi # required: indicate that ncbi annotation should be used +source_params: + taxon: vertebrate_mammalian # required: taxon of the species, valid taxa are: archaea, bacteria, fungi, invertebrate, mitochondrion, plant, plasmid, plastid, protozoa, vertebrate_mammalian, vertebrate_other, viral + species: Homo_sapiens # required: species name in NCBI download format, e.g. 'Homo_sapiens' for human; see https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ for available species name + annotation_release: 110 # required: release number of annotation e.g. '109' or '109.20211119' or 'current' to use most recent annotation release. 
Check out release numbers for NCBI at ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/ + +### Parameters for sequences generation +# List of genomic regions that should be generated, set the genomic regions you want to generate to True +genomic_regions: + gene: false + exon: true + exon_exon_junction: true + cds: false + intron: false + +# If exon_exon_junction is set to true, specify the block size, i.e. +/- "block_size" bp around the junction +# Hint: it does not make sense to set the block size larger than the maximum oligo length +exon_exon_junction_block_size: 50 diff --git a/docs/_tutorials/data/scrinshot_probe_designer.yaml b/docs/_tutorials/data/scrinshot_probe_designer.yaml new file mode 100644 index 0000000..2254482 --- /dev/null +++ b/docs/_tutorials/data/scrinshot_probe_designer.yaml @@ -0,0 +1,150 @@ +####################### +### USER PARAMETERS ### +####################### + +### General parameters +n_jobs: 4 # number of cores used to run the pipeline +dir_output: output_scrinshot_probe_designer # name of the directory where the output files will be written +write_intermediate_steps: true # if true, writes the oligo sequences after each step of the pipeline into a csv file + +### Parameters for probe sequences generation +file_regions: my_genes.txt # file with a list of the genes used to generate the oligo sequences, leave empty if all the genes are used +files_fasta_probe_database: # fasta file with sequences from which the oligos should be generated. 
Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest + - output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna + - output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna +probe_length_min: 40 #min length of oligos +probe_length_max: 45 #max length of oligos + +### Parameters for the property filters, i.e. properties that the sequences should fulfill +# probe sequence +probe_GC_content_min: 40 # minimum GC content of oligos +probe_GC_content_max: 60 # maximum GC content of oligos +probe_Tm_min: 65 # minimum melting temperature of oligos +probe_Tm_max: 75 # maximum melting temperature of oligos +homopolymeric_base_n: # minimum number of nucleotides to consider it a homopolymeric run per base + A: 5 + T: 5 + C: 5 + G: 5 +# padlock arms +arm_Tm_dif_max: 2 # maximum melting temperature difference of both arms (difference shouldn't be higher than 5! But range is not super important, the lower the better) +arm_length_min: 10 # minimum length of each arm +arm_Tm_min: 50 # minimum melting temperature of each arm +arm_Tm_max: 60 # maximum melting temperature of each arm +# detection oligos +min_thymines: 2 # minimal number of Thymines in detection oligo. +detect_oligo_length_min: 15 # minimum length of detection probe +detect_oligo_length_max: 40 # maximum length of detection probe + +### Parameters for the specificity filters +files_fasta_reference_database: # fasta file with sequences used as reference for the specificity filters. 
Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest + - output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna + - output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna +ligation_region_size: 5 # size of the seed region around the ligation site for blast seed region filter; set to 0 if ligation region should not be considered for blast search + +### Parameters for the Oligo set selection +probe_isoform_weight: 2 # weight of the isoform consensus of the probe in the efficiency score +probe_GC_content_opt: 50 # max and min values are defined above +probe_GC_weight: 1 # weight of the GC content of the probe in the efficiency score +probe_Tm_opt: 70 # max and min values are defined above +probe_Tm_weight: 1 # weight of the Tm of the probe in the efficiency score + +probeset_size_min: 3 # minimum size of probe sets (in case there exists no set of the optimal size) -> genes with fewer oligos will be filtered out and stored in regions_with_insufficient_oligos_for_db_probes +probeset_size_opt: 5 # optimal size of probe sets +distance_between_probes: 0 # how much overlap should be allowed between oligos, e.g. 
if oligos can overlap x bases choose -x, if oligos can be next to one another choose 0, if oligos should be x bases apart choose x +n_sets: 100 # maximum number of sets to generate + +### Parameters for creation of final probe sequence +U_distance: 5 # preferred minimal distance between U(racils) +detect_oligo_Tm_opt: 56 # optimal melting temperature of detection probe +top_n_sets: 3 #maximum number of sets to report in padlock_probes.yaml and "padlock_probes_order.yaml" + +############################ +### DEVELOPER PARAMETERS ### +############################ + +### Parameters for the specificity filters +# Specificity filter with BlastN +specificity_blastn_search_parameters: + perc_identity: 80 + strand: "minus" # this parameter is fixed, if reference is whole genome, consider using "both" + word_size: 10 + dust: "no" + soft_masking: "false" + max_target_seqs: 10 + max_hsps: 1000 +specificity_blastn_hit_parameters: + coverage: 50 # can be turned into min_alignment_length + +# Crosshybridization filter with BlastN +cross_hybridization_blastn_search_parameters: + perc_identity: 80 + strand: "minus" # this parameter is fixed + word_size: 10 + dust: "no" + soft_masking: "false" + max_target_seqs: 10 +cross_hybridization_blastn_hit_parameters: + coverage: 80 # can be turned into min_alignment_length + + +### Parameters for the Oligo set selection +max_graph_size: 5000 # maximum number of oligos that are taken into consideration in the last step (5000 -> ~5GB, 2500 -> ~1GB) + + +### Parameters for Melting Temperature +# The melting temperature is used in 2 different stages (property filters and padlock detection probe design), where a few parameters are shared and the others differ. 
+# parameters for melting temperature -> for more information on parameters, see: https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN +Tm_parameters_probe: + check: true #default + strict: true #default + c_seq: null #default + shift: 0 #default + nn_table: DNA_NN3 # Allawi & SantaLucia (1997) + tmm_table: DNA_TMM1 #default + imm_table: DNA_IMM1 #default + de_table: DNA_DE1 #default + dnac1: 50 #[nM] + dnac2: 0 #[nM] + selfcomp: false #default + saltcorr: 7 # Owczarzy et al. (2008) + Na: 39 #[mM] + K: 75 #[mM] + Tris: 20 #[mM] + Mg: 10 #[mM] + dNTPs: 0 #[mM] default + +Tm_chem_correction_param_probe: + DMSO: 0 #default + fmd: 20 + DMSOfactor: 0.75 #default + fmdfactor: 0.65 #default + fmdmethod: 1 #default + GC: null #default + +Tm_parameters_detection_oligo: + check: true #default + strict: true #default + c_seq: null #default + shift: 0 #default + nn_table: DNA_NN3 # Allawi & SantaLucia (1997) + tmm_table: DNA_TMM1 #default + imm_table: DNA_IMM1 #default + de_table: DNA_DE1 #default + dnac1: 50 #[nM] + dnac2: 0 #[nM] + selfcomp: false #default + saltcorr: 7 # Owczarzy et al. 
(2008) + Na: 39 #[mM] + K: 0 #[mM] default + Tris: 0 #[mM] default + Mg: 0 #[mM] default + dNTPs: 0 #[mM] default + +Tm_chem_correction_param_detection_oligo: + DMSO: 0 #default + fmd: 30 + DMSOfactor: 0.75 #default + fmdfactor: 0.65 #default + fmdmethod: 1 #default + GC: null #default diff --git a/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb b/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb index f02abe5..c8e70df 100644 --- a/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb +++ b/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "nbsphinx": "hidden", "tags": [ @@ -14,7 +14,8 @@ "from IPython.display import HTML, display, Image\n", "\n", "import warnings\n", - "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter('ignore')\n", + "warnings.filterwarnings(\"ignore\")\n", "\n", "import pandas as pd\n", "#pd.set_option(\"max_columns\", None) # show all cols\n", @@ -80,33 +81,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Besides `spapros` also install `oligo_designer_toolsuite` if not done already. First we need to install some dependencies:\n", + "Besides `spapros` also install `oligo_designer_toolsuite`. Therefore, first setup a conda environment (packages is tested for Python 3.9 - 3.10), e.g.:\n", + "\n", + "```bash\n", + "conda create -n odt python=3.10\n", + "conda activate odt\n", + "```\n", + "\n", + "Then, install the required dependencies, i.e. **Blast** (2.15 or higher), **BedTools** (2.30 or higher), **Bowtie** (1.3 or higher) and **Bowtie2** (2.5 or higher), that need to be installed independently. 
To install those tools via conda, please activate the Bioconda and conda-forge channels in your conda environment with and update conda and all packages in your environment:\n", "\n", "```bash\n", - "conda config --add channels bioconda\n", "conda config --add channels conda-forge\n", - "conda update conda\n", + "conda config --add channels bioconda \n", "conda update --all\n", "\n", - "conda install \"blast>=2.12\"\n", + "conda install \"blast>=2.15.0\"\n", "conda install \"bedtools>=2.30\"\n", "conda install \"bowtie>=1.3.1\"\n", "conda install \"bowtie2>=2.5\"\n", "```\n", "\n", - "\n", - "To run the code below we need to install the current dev version of the oligo designer:\n", + "All other required packages are automatically installed during the `pip` installation:\n", "\n", "```bash\n", "git clone https://github.com/HelmholtzAI-Consultants-Munich/oligo-designer-toolsuite.git\n", "cd oligo-designer-toolsuite\n", "git switch pipelines\n", "pip install -e .\n", - "```\n", - "\n", - "Otherwise, if that didn't work, try:\n", - "```bash\n", - "pip install oligo_designer_toolsuite\n", "```" ] }, @@ -119,12 +120,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "scanpy==1.9.3 anndata==0.9.2 umap==0.5.3 numpy==1.24.4 scipy==1.11.1 pandas==1.5.3 scikit-learn==1.3.0 statsmodels==0.14.0 python-igraph==0.9.11 pynndescent==0.5.10\n", - "spapros==0.1.3\n" + "scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.13.1 pandas==1.5.3 scikit-learn==1.5.0 statsmodels==0.14.2 igraph==0.11.5 pynndescent==0.5.12\n", + "spapros==0.1.5\n" ] } ], "source": [ + "import os\n", "import scanpy as sc\n", "sc.settings.verbosity = 0\n", "sc.logging.print_header()\n", @@ -132,7 +134,9 @@ "import spapros as sp\n", "print(f\"spapros=={sp.__version__}\")\n", "\n", - "from oligo_designer_toolsuite.pipelines import ScrinshotProbeDesigner #, MerfishProbeDesigner, SeqfishPlusProbeDesigner" + "from Bio.SeqUtils import MeltingTemp as mt\n", + "from 
oligo_designer_toolsuite.database import OligoDatabase \n", + "from oligo_designer_toolsuite.pipelines import GenomicRegionGenerator, ScrinshotProbeDesigner" ] }, { @@ -151,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -175,109 +179,175 @@ "source": [ "## Probeset Design\n", "\n", - "Before choosing a gene panel, we design probesets for our given set of 1000 highly variable genes that fulfill certain experiment-specific criteria. \n", - "Therefore, we first create an instance of a ProbeDesigner class, where we can choose from ```ScrinshotProbeDesigner```, ```MerfishProbeDesigner``` an ```SeqfishPlusProbeDesigner``` (see our [resource table](https://www.google.com/url?q=https://docs.google.com/spreadsheets/d/1NCDLscdmzn32U7_IKy6OKYHqfECn76x0pAD3KFhuJgQ/edit%23gid%3D0&sa=D&source=docs&ust=1692975800552487&usg=AOvVaw20CRwiObnVkWVS54CcqXMq) for an overview of differences between the technologies).\n", - "For each of those classes, we need to define an output directory and set the parameters ```write_removed_genes``` (if true, save gene with insufficient probes in a file) and\n", - "```write_intermediate_steps``` (if true, save the probe database after each processing step, such that the pipline can resume from a certain step onwards).\n", + "Before choosing a gene panel, we design probesets for our given set of 1000 highly variable genes that fulfill certain experiment-specific criteria. Therefore, we first have to create the input fasta files for the probe design pipeline. We can create those files from custom annotation (GFF and fasta) files or download those annotation files directly from the NCBI or Ensembl FTP server. 
For generating the probe design pipeline input fasta files, we use the ```GenomicRegionGenerator```class for which we need to set the parameters:\n", "\n", - "Here, we showcase the probe design of padlock probes.\n" + "- `dir_output`: name of the directory where the output files will be written" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "## Define parameters\n", + "dir_output = \"output_genomic_region_generator_ncbi\" " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-08-22 17:25:41,843 [INFO] Parameters Init:\n", - "2023-08-22 17:25:41,844 [INFO] dir_output = ./output\n", - "2023-08-22 17:25:41,845 [INFO] write_removed_genes = True\n", - "2023-08-22 17:25:41,846 [INFO] write_intermediate_steps = True\n" - ] - } - ], + "outputs": [], "source": [ - "probe_designer = ScrinshotProbeDesigner(dir_output=\"./output\")" + "## Setup pipeline\n", + "pipeline = GenomicRegionGenerator(dir_output=dir_output)" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "After instatiating a ProbeDesigner class, we need to load the annotation we are using. Our example dataset uses the NCBI gene annotation. Hence, we define *ncbi* as source and define the NCBI-specific parameters *taxon*, *species* and *annotation_release*.\n", - "Apart from *NCBI* annotation, we can also choose an *Ensembl* annotation. If ```source=\"ncbi\"``` or ```source=\"ensembl\"``` is choosen, the annotation files are automatically downloaded from their servers.\n", - "In addition, we can provide a custom annotation when specifying ```source=\"custom\"```. \n", + "### Generate Genomic Region Files\n", + "\n", + "Our example dataset uses a custom annotation and we create input fasta files for the transcriptome, consisting of exons and exon-exon junctions. 
Hence, we define *custom* as source and define the custom-specific parameters listed below.\n", + "Apart from *custom* annotation, we can also choose an *NCBI* or *Ensembl* annotation. If *source=\"ncbi\"* or *source=\"ensembl\"* is choosen, the annotation files are automatically downloaded from their servers.\n", "\n", - "### Parameters for annotation loader\n", + "**Parameters for annotation loader** \n", "- `source`: define annotation source -> currently supported: ncbi, ensembl and custom\n", "\n", - "*NCBI annnotation parameters:*\n", - "- `taxon`: taxon of the species, valid taxa are: archaea, bacteria, fungi, invertebrate, mitochondrion, plant, plasmid, plastid, protozoa, vertebrate_mammalian, vertebrate_other, viral\n", - "- `species`: species name in NCBI download format, e.g. 'Homo_sapiens' for human; see [here](https://ftp.ncbi.nlm.nih.gov/genomes/refseq/) for available species name\n", - "- `annotation_release`: release number (e.g. 109 or 109.20211119 for ncbi) of annotation or 'current' to use most recent annotation release. Check out release numbers for NCBI at ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/\n", + "*Custom annnotation parameters:*\n", + "- `source_params`:\n", + " - `file_annotation`: required: GTF file with gene annotation\n", + " - `file_sequence`: required: FASTA file with genome sequence\n", + " - `files_source`: optional: original source of the genomic files\n", + " - `species`: optional: species of provided annotation, leave empty if unknown\n", + " - `annotation_release`: optional: release number of provided annotation, leave empty if unknown\n", + " - `genome_assembly`: optional: genome assembly of provided annotation, leave empty if unknown\n", + "\n", + "*For NCBI or Ensembl parameters, see examples in the code below*\n", "\n", - "*Ensembl annotation parameters:*\n", - "- `species`: species name in ensembl download format, e.g. 
'homo_sapiens' for human; see http://ftp.ensembl.org/pub/release-108/gtf/ for available species names\n", - "- `annotation_release`: release number of annotation, e.g. 'release-108' or 'current' to use most recent annotation release. Check out release numbers for ensemble at ftp.ensembl.org/pub/\n", + "**Parameters for sequences generation** \n", + "Set the genomic regions that should be generated to True. For each region one fasta file is generated.\n", + "- `genomic_regions`:\n", + " - `gene`: create fasta file from gene regions in GFF annotation\n", + " - `exon`: create fasta file from exon regions in GFF annotation and merge same exons coming from different transcripts into one sequence entry while preserving the transcript information\n", + " - `exon_exon_junction`: create fasta file from exon exon junctions in GFF annotation and merge same junctions coming from different transcripts into one sequence entry while preserving the transcript information\n", + " - `cds`: create fasta file from cds regions in GFF annotation and merge same cds coming from different transcripts into one sequence entry while preserving the transcript information\n", + " - `intron`: create fasta file from regions between exons\n", "\n", - "*Custom annotation parameters:*\n", - "- `file_annotation`: GTF file with gene annotation\n", - "- `file_sequence`: FASTA file with genome sequence\n", - "- `files_source`: original source of the genomic files -> optional\n", - "- `species`: species of provided annotation, leave empty if unknown -> optional\n", - "- `annotation_release`: release number of provided annotation, leave empty if unknown -> optional\n", - "- `genome_assembly`: genome assembly of provided annotation, leave empty if unknown -> optional" + "`exon_exon_junction_block_size`: If exon_exon_junction is set to true, specify the block size, i.e. +/- \"block_size\" bp around the junction. 
Hint: it does not make sense to set the block size larger than the maximum probe length\n", + "\n", + "*Note: if an error occurs for the unzipping of files, this might be due to a faulty download of files from the ftp server. In this case, try to download the files manually from the ftp server and use those files as input for the pipeline with custom input files.*" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-05-26 17:13:37,570 [INFO] Parameters Load Annotations:\n", - "2023-05-26 17:13:37,572 [INFO] source = custom\n", - "2023-05-26 17:13:37,573 [INFO] source_params = {'file_annotation': './output/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf', 'file_sequence': './output/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna', 'files_source': 'NCBI', 'species': 'Homo_sapiens', 'annotation_release': '110', 'genome_assembly': 'GRCh38.p14'}\n", - "2023-05-26 17:16:31,649 [INFO] The following annotation files are used for GTF annotation of regions: ./output/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf and for fasta sequence file: ./output/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna .\n", - "2023-05-26 17:16:31,653 [INFO] The annotations are from NCBI source, for the species: Homo_sapiens, release number: 110 and genome assembly: GRCh38.p14\n" - ] - } - ], + "outputs": [], "source": [ - "# example for ncbi annotation loader\n", + "## Define parameters for NCBI source\n", "source = \"ncbi\"\n", - "params = {\n", - " \"taxon\": \"vertebrate_mammalian\",\n", - " \"species\": \"Homo_sapiens\",\n", - " \"annotation_release\": \"110\",\n", + "source_params = {\n", + " \"taxon\": \"vertebrate_mammalian\", # required: taxon of the species, valid taxa are: archaea, bacteria, fungi, invertebrate, mitochondrion, plant, plasmid, plastid, protozoa, vertebrate_mammalian, vertebrate_other, viral\n", + " \"species\": \"Homo_sapiens\", # 
required: species name in NCBI download format, e.g. 'Homo_sapiens' for human; see https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ for available species name\n", + " \"annotation_release\": 110 # required: release number of annotation e.g. '109' or '109.20211119' or 'current' to use most recent annotation release. Check out release numbers for NCBI at ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/\n", "}\n", "\n", - "# example for ensembl annotation loader\n", + "## Define parameters for ensembl source\n", "# source = \"ensembl\"\n", - "# params = {\n", - "# \"species\": \"homo_sapiens\",\n", - "# \"annotation_release\": \"109\",\n", + "# source_params = {\n", + "# \"species\": \"homo_sapiens\", # required: species name in ensemble download format, e.g. 'homo_sapiens' for human; see http://ftp.ensembl.org/pub/release-108/gtf/ for available species names\n", + "# \"annotation_release\": \"109\", # required: release number of annotation, e.g. 'release-108' or 'current' to use most recent annotation release. 
Check out release numbers for ensemble at ftp.ensembl.org/pub/\n", "# }\n", "\n", - "# example for custom annotation loader\n", + "## Define parameters for custom source\n", "# source = \"custom\"\n", - "# params = {\n", - "# \"file_annotation\": \"./output/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf\",\n", - "# \"file_sequence\": \"./output/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna\",\n", + "# source_params = {\n", + "# \"file_annotation\": \"data/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.gtf\",\n", + "# \"file_sequence\": \"data/custom_GCF_000001405.40_GRCh38.p14_genomic_chr16.fna\",\n", "# \"files_source\": \"NCBI\",\n", "# \"species\": \"Homo_sapiens\",\n", - "# \"annotation_release\": \"110\",\n", - "# \"genome_assembly\": \"GRCh38.p14\",\n", + "# \"annotation_release\": 110,\n", + "# \"genome_assembly\": \"GRCh38\",\n", "# }\n", "\n", - "probe_designer.load_annotations(source=source, source_params=params)" + "genomic_regions = {\"gene\": False, \"exon\": True, \"exon_exon_junction\": True, \"cds\": False, \"intron\": False}\n", + "block_size = 39 # min probe size - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the genomic regions\n", + "region_generator = pipeline.load_annotations(\n", + " source=source,\n", + " source_params=source_params,\n", + ")\n", + "\n", + "fasta_files = pipeline.generate_genomic_regions(\n", + " region_generator=region_generator,\n", + " genomic_regions=genomic_regions,\n", + " block_size=block_size,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "fasta_files = [\"output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna\",\n", + " 
\"output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Probe Database\n", + "\n", + "After generating the input fasta files, we can start with the probe design process. Therefore, we first create an instance of a ProbeDesigner class, where we can choose from ```ScrinshotProbeDesigner```, ```MerfishProbeDesigner``` an ```SeqfishPlusProbeDesigner``` (see our [resource table](https://www.google.com/url?q=https://docs.google.com/spreadsheets/d/1NCDLscdmzn32U7_IKy6OKYHqfECn76x0pAD3KFhuJgQ/edit%23gid%3D0&sa=D&source=docs&ust=1692975800552487&usg=AOvVaw20CRwiObnVkWVS54CcqXMq) for an overview of differences between the technologies).\n", + "For each of those classes, we need to set the parameter:\n", + "\n", + "- `gene_ids`: a list the genes used to generate the oligos sequences, leave empty list if all the genes should be used\n", + "- `write_intermediate_steps`: if true, writes the oligo sequences after each step of the pipeline into a tsv file\n", + "- `dir_output`: name of the directory where the output files will be written\n", + "- `n_jobs`: number of cores used to run the pipeline and number of regions that should be stored in cache. 
If memory consumption of pipeline is too high reduce this number, if a lot of RAM is available increase this number to decrease runtime\n", + "\n", + "Here, we showcase the probe design of padlock probes for 1000 highly variable genes.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "## Define parameters\n", + "gene_ids = highly_variable_genes # here we use the list of genes of highly variable genes we stored in step 1 \n", + "write_intermediate_steps = True\n", + "dir_output = \"output_scrinshot_probe_designer\"\n", + "n_jobs = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "## Setup pipeline\n", + "pipeline = ScrinshotProbeDesigner(\n", + " write_intermediate_steps=write_intermediate_steps,\n", + " dir_output=dir_output,\n", + " n_jobs=n_jobs,\n", + ")" ] }, { @@ -285,47 +355,146 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After downloading the annotations, we have to create the oligo database. Running the function below, will automatically create a transcriptome from the given annotation (therefore, the provided GTF file must contain transcript and exon information) and use this transcriptome to create all possible probes for each gene, that is provided in the *gene* list. \n", + "After instantiating our ScrinshotProbeDesigner class, we have to create the oligo database. Running the function below will automatically create all possible probes from the input fasta files for each gene that is provided in the *file_regions* file. 
\n", "\n", - "### Parameters for Probe Sequences Database\n", + "**Parameters for Probe Database**\n", "- `probe_length_min`: minimum length of probes\n", "- `probe_length_max`: maximum length of probes\n", - "- `min_probes_per_gene`: minimum number of probes that a gene must have before it gets deleted\n", - "- `region`: Target sequence type for which probes are designed (choose from: \"transcript\", \"genome\", \"cds\")\n", + "- `probes_per_gene_min`: minimum number of probes that a gene must have before it gets deleted\n", + "- `files_fasta_oligo_database`: fasta file with sequences form which the oligos should be generated. Hint: use the GenomicRegionGenerator class to create fasta files of genomic regions of interest\n", + "\n", "\n", "*Note: Instead of creating a new probe database, we can also load an existing databases.* \n", - "Loading a database can be useful when starting the pipeline from a certain step, e.g. load a database which was already filtered by probe properties and continue immediately with the specificity filter step. \n", - "We can load an existing database by calling ```load_probe_database()```. See example code in the cells below (commented)." + "Loading a database can be useful when starting the pipeline from a certain step, e.g. load a database which was already filtered by probe properties and continue immediately with the specificity filter step. We can load an existing database by calling ```load_database()```. See example code in the cells below (commented)." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "## Define parameters\n", + "probe_length_min = 40\n", + "probe_length_max = 45\n", + "min_probes_per_gene = 3 # should be at least \"min_probeset_size\" probes per gene to create one set" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-05-26 17:16:31,766 [INFO] Parameters Create Database:\n", - "2023-05-26 17:16:31,767 [INFO] genes = ['AAGAB', 'AATF', 'ABCC10', 'ABHD12', 'ABHD17B', 'ABHD5', 'ABRACL', 'ABT1', 'AC005082.12', 'AC074138.3', 'AC093323.3', 'ACAP1', 'ACBD3', 'ACD', 'ACOT13', 'ACP1', 'ACRBP', 'ACTL6A', 'ACTR6', 'ACVR2A', 'ADAL', 'ADAM10', 'ADAM28', 'ADD1', 'ADIPOR2', 'ADPRM', 'ADSL', 'AEBP1', 'AGPAT1', 'AHSA1', 'AIF1', 'AIM2', 'AKTIP', 'AL928768.3', 'ALKBH7', 'ANAPC13', 'ANKAR', 'ANKEF1', 'ANKRD27', 'ANKRD54', 'AP001462.6', 'AP003419.16', 'AP3M2', 'AP4B1-AS1', 'AP4S1', 'APOBEC3A', 'APOBEC3B', 'APOBEC3G', 'AQP3', 'ARHGAP11A', 'ARHGAP19', 'ARHGAP24', 'ARHGAP33', 'ARHGAP6', 'ARID4A', 'ARIH2OS', 'ARL2', 'ARL2BP', 'ARL4A', 'ARL6IP5', 'ARMC7', 'ARMCX5', 'ARRDC3', 'ARRDC4', 'ARSD', 'ARSG', 'ARVCF', 'ASB8', 'ASXL2', 'ATAD3C', 'ATF7IP2', 'ATG16L1', 'ATP10A', 'ATP5H', 'ATP5O', 'ATP5SL', 'ATP6V0E2', 'ATXN1L', 'ATXN3', 'AURKC', 'BABAM1', 'BACE2', 'BAZ2A', 'BBX', 'BCDIN3D', 'BET1', 'BEX4', 'BGLAP', 'BLNK', 'BLZF1', 'BMPR2', 'BNIP2', 'BOLA1', 'BOLA3', 'BRAT1', 'BRWD1', 'BTN3A1', 'BTN3A2', 'BUB3', 'C10orf32', 'C12orf45', 'C14orf1', 'C14orf166', 'C14orf80', 'C15orf57', 'C16orf13', 'C16orf52', 'C16orf54', 'C16orf58', 'C16orf74', 'C16orf80', 'C17orf59', 'C17orf62', 'C19orf33', 'C19orf52', 'C1QA', 'C1QB', 'C1QC', 'C1orf162', 'C1orf35', 'C21orf33', 'C2CD4D', 'C2orf76', 'C2orf88', 'C3orf18', 'C5orf15', 'C5orf42', 'C8orf44', 'C9orf142', 'C9orf16', 'C9orf37', 'CAMK1D', 'CAMK2G', 'CAMK2N1', 'CAPN12', 'CARHSP1', 'CARS', 'CASC4', 'CBX5', 'CCDC115', 
'CCDC122', 'CCDC66', 'CCDC91', 'CCL3', 'CCL4', 'CCL5', 'CCND2', 'CCNG1', 'CCP110', 'CCT4', 'CCT7', 'CD160', 'CD19', 'CD2', 'CD247', 'CD274', 'CD2AP', 'CD320', 'CD72', 'CD79A', 'CD79B', 'CD82', 'CD9', 'CD96', 'CDC123', 'CDC16', 'CDC37', 'CDC40', 'CDK19', 'CDKN2A', 'CEACAM4', 'CEBPB', 'CECR5', 'CEP120', 'CEP68', 'CEP85L', 'CEPT1', 'CES4A', 'CGRRF1', 'CHD2', 'CHD7', 'CHERP', 'CHI3L2', 'CHPF2', 'CIAPIN1', 'CISD1', 'CISH', 'CITED4', 'CKS1B', 'CLDN5', 'CLEC2B', 'CLIC3', 'CLNS1A', 'CLPX', 'CLU', 'CLYBL', 'CMTM5', 'CNEP1R1', 'COMMD10', 'COQ7', 'CORO1B', 'COTL1', 'CPNE2', 'CPQ', 'CPSF3L', 'CR1', 'CRIP3', 'CRTC2', 'CST3', 'CST7', 'CTA-29F11.1', 'CTB-113I20.2', 'CTB-152G17.6', 'CTC-444N24.11', 'CTD-2015H6.3', 'CTD-2302E22.4', 'CTD-2368P22.1', 'CTD-2537I9.12', 'CTSS', 'CTSW', 'CWC15', 'CWC27', 'CXCL10', 'CXCL3', 'CYB5B', 'CYTH2', 'DAGLB', 'DCAF5', 'DDI2', 'DDT', 'DDX1', 'DDX17', 'DDX46', 'DDX56', 'DENND1C', 'DENND2D', 'DENND5B', 'DENND6A', 'DERL1', 'DEXI', 'DHX34', 'DHX9', 'DIDO1', 'DIMT1', 'DIS3', 'DISP1', 'DLST', 'DMTN', 'DNAJA3', 'DNAJB14', 'DNAJC10', 'DNAJC15', 'DNAJC2', 'DNAJC27', 'DNASE1L3', 'DNMT3A', 'DOK3', 'DPH6', 'DPY19L4', 'DRAXIN', 'DSCR3', 'DTX3', 'DUS3L', 'DUSP10', 'EAF2', 'EARS2', 'ECHDC1', 'EDC3', 'EID2', 'EIF1AY', 'EIF1B', 'EIF2B1', 'EIF3D', 'ELANE', 'ELOF1', 'ELOVL4', 'ELP6', 'EMB', 'EMG1', 'EML6', 'ENTPD3-AS1', 'EOGT', 'ERH', 'ERV3-1', 'EVA1B', 'EWSR1', 'EXOC6', 'F5', 'FADS1', 'FAM107B', 'FAM173A', 'FAM210B', 'FAM96A', 'FAM98A', 'FBXL14', 'FBXO21', 'FBXO33', 'FBXO4', 'FBXW4', 'FCER1A', 'FCER1G', 'FCGR2B', 'FCGR3A', 'FCN1', 'FCRLA', 'FEM1A', 'FERMT3', 'FGFBP2', 'FH', 'FHL1', 'FKBP3', 'FKBP5', 'FLOT1', 'FMO4', 'FN3KRP', 'FNBP4', 'FNTA', 'FOPNL', 'FRY-AS1', 'FUS', 'FXN', 'FYB', 'G0S2', 'GADD45B', 'GALT', 'GBGT1', 'GBP1', 'GDF11', 'GFER', 'GGA3', 'GGNBP2', 'GIMAP2', 'GIMAP4', 'GIMAP5', 'GIMAP7', 'GIT2', 'GMPPA', 'GNE', 'GNG11', 'GNG3', 'GNLY', 'GNPAT', 'GOLGB1', 'GP9', 'GPATCH4', 'GPKOW', 'GPR171', 'GPR183', 'GPR35', 'GPS1', 'GPX1', 'GRAP', 'GRN', 'GSTP1', 
'GTPBP6', 'GUSB', 'GYS1', 'GZMA', 'GZMB', 'GZMH', 'GZMK', 'HAGH', 'HBA1', 'HBP1', 'HCFC2', 'HDAC1', 'HDAC5', 'HDAC9', 'HELQ', 'HEMK1', 'HERPUD2', 'HIST1H1B', 'HIST1H2AC', 'HIST1H2AH', 'HLA-DMA', 'HLA-DMB', 'HLA-DOB', 'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB1', 'HMBOX1', 'HMGCL', 'HMGXB4', 'HNRNPH3', 'HOOK2', 'HOPX', 'HSPB11', 'HVCN1', 'ICAM2', 'ICOS', 'ICOSLG', 'ID2', 'IDUA', 'IFFO1', 'IFI27', 'IFIT1', 'IFIT2', 'IFITM3', 'IGFBP7', 'IGJ', 'IGLL5', 'IL1B', 'IL1RAP', 'IL23A', 'IL24', 'IL27RA', 'IL32', 'IL6', 'IL8', 'ILF3', 'ILF3-AS1', 'ING5', 'INSL3', 'INTS12', 'INTS2', 'IP6K1', 'IQCE', 'IRF8', 'IRF9', 'ISCA2', 'ISOC1', 'ITGA2B', 'ITGB7', 'ITM2A', 'ITSN2', 'JAKMIP1', 'JUND', 'KARS', 'KCNG1', 'KCNQ1OT1', 'KIAA0040', 'KIAA0125', 'KIAA0196', 'KIAA1430', 'KIF3A', 'KIF3C', 'KIF5B', 'KLHL24', 'KLRB1', 'KLRG1', 'KRBOX4', 'LAMP3', 'LARS', 'LAT2', 'LBR', 'LDLRAP1', 'LGALS1', 'LGALS2', 'LGALS3', 'LILRA4', 'LIN52', 'LINC00494', 'LINC00662', 'LINC00886', 'LINC00926', 'LINC00936', 'LINC01013', 'LIX1L', 'LONRF1', 'LPIN1', 'LRBA', 'LRRIQ3', 'LSM14A', 'LST1', 'LTB', 'LTV1', 'LUC7L', 'LUC7L3', 'LYAR', 'LYPD2', 'LYPLA1', 'LYRM4', 'LYSMD4', 'LZTS2', 'MADD', 'MAEA', 'MAGEH1', 'MAL', 'MALT1', 'MAP2K7', 'MARCKSL1', 'MCF2L', 'MCM3', 'MDS2', 'MED30', 'MED9', 'METTL21A', 'METTL3', 'METTL8', 'MFF', 'MFSD10', 'MIS18A', 'MKKS', 'MLLT11', 'MLLT6', 'MMADHC', 'MMP9', 'MNAT1', 'MOCS2', 'MORF4L2', 'MPHOSPH10', 'MRM1', 'MRPL1', 'MRPL19', 'MRPL42', 'MRPS12', 'MRPS33', 'MS4A1', 'MS4A6A', 'MTERFD2', 'MTIF2', 'MTRF1', 'MUM1', 'MYADM', 'MYCBP2', 'MYL9', 'MYO1E', 'MYOM2', 'MZB1', 'MZT1', 'NAA20', 'NAP1L4', 'NAPA-AS1', 'NARG2', 'NAT9', 'NBR1', 'NCOR2', 'NCR3', 'NDUFA10', 'NDUFA12', 'NECAB3', 'NEFH', 'NEK8', 'NELFB', 'NEMF', 'NFAT5', 'NFE2L2', 'NFIC', 'NFU1', 'NIT2', 'NKAP', 'NKG7', 'NKTR', 'NME3', 'NME6', 'NMNAT3', 'NNT-AS1', 'NOC4L', 'NOG', 'NOL11', 'NONO', 'NOP58', 'NPC2', 'NPHP3', 'NPRL2', 'NR2C1', 'NR3C1', 'NSA2', 'NT5C', 'NT5C3A', 'NUDCD1', 'NUDT16L1', 'NUP54', 'NXT2', 'OARD1', 'OAT', 
'OBSCN', 'ODC1', 'ORAI1', 'ORC2', 'OSBPL1A', 'OSBPL7', 'OXLD1', 'P2RX5', 'P2RY10', 'PACS1', 'PACSIN2', 'PAICS', 'PARP1', 'PARS2', 'PASK', 'PAWR', 'PAXIP1-AS1', 'PBLD', 'PBRM1', 'PCNA', 'PCSK7', 'PDCD1', 'PDCD2L', 'PDE6B', 'PDIA3', 'PDIK1L', 'PDK2', 'PDXDC1', 'PDZD4', 'PEMT', 'PEX16', 'PEX26', 'PF4', 'PGM1', 'PGM2L1', 'PHACTR4', 'PHF12', 'PHF14', 'PHF3', 'PIGF', 'PIGU', 'PIGX', 'PIK3R1', 'PITHD1', 'PITPNA-AS1', 'PJA1', 'PKIG', 'PLA2G12A', 'PLCL1', 'PLD6', 'PLEKHA1', 'PLEKHA3', 'PLRG1', 'PMEPA1', 'PNOC', 'POLR2I', 'POLR2K', 'POLR3E', 'POMT1', 'PPA2', 'PPBP', 'PPIE', 'PPIG', 'PPIL2', 'PPIL4', 'PPP1R14A', 'PPP1R2', 'PPP2R1B', 'PPP6C', 'PPT2-EGFL8', 'PQBP1', 'PRAF2', 'PRDX1', 'PRELID2', 'PRF1', 'PRICKLE1', 'PRKACB', 'PRKCB', 'PRKD2', 'PRMT2', 'PRNP', 'PRPF31', 'PRPS2', 'PRR5', 'PSMD14', 'PTCRA', 'PTGDR', 'PTGDS', 'PTGES2', 'PTPN7', 'PURA', 'PWP1', 'PXMP4', 'PYCARD', 'R3HDM1', 'R3HDM2', 'RAB40C', 'RABEP2', 'RABL6', 'RAD51B', 'RALBP1', 'RALY', 'RASD1', 'RASGRP2', 'RBM25', 'RBM26-AS1', 'RBM39', 'RBM4', 'RBM48', 'RBM5', 'RBM7', 'RBPJ', 'RCE1', 'RCHY1', 'RCL1', 'RCN2', 'RDH14', 'RELB', 'REXO2', 'RFC1', 'RFC5', 'RFNG', 'RFPL2', 'RGS14', 'RIC3', 'RIOK1', 'RIOK2', 'RNF113A', 'RNF125', 'RNF139', 'RNF14', 'RNF168', 'RNF187', 'RNF213', 'RNF25', 'RNF26', 'RORA', 'RP1-28O10.1', 'RP11-1055B8.7', 'RP11-138A9.2', 'RP11-141B14.1', 'RP11-142C4.6', 'RP11-162G10.5', 'RP11-164H13.1', 'RP11-178G16.4', 'RP11-18H21.1', 'RP11-211G3.2', 'RP11-219B17.1', 'RP11-219B4.7', 'RP11-252A24.3', 'RP11-291B21.2', 'RP11-314N13.3', 'RP11-324I22.4', 'RP11-349A22.5', 'RP11-378J18.3', 'RP11-390B4.5', 'RP11-398C13.6', 'RP11-400F19.6', 'RP11-421L21.3', 'RP11-428G5.5', 'RP11-432I5.1', 'RP11-468E2.4', 'RP11-488C13.5', 'RP11-493L12.4', 'RP11-527L4.5', 'RP11-545I5.3', 'RP11-589C21.6', 'RP11-5C23.1', 'RP11-701P16.5', 'RP11-706O15.1', 'RP11-70P17.1', 'RP11-727F15.9', 'RP11-798G7.6', 'RP11-879F14.2', 'RP11-950C14.3', 'RP3-325F22.5', 'RP5-1073O3.7', 'RP5-827C21.4', 'RP5-887A10.1', 'RPH3A', 'RPL39L', 'RPL7L1', 'RPN2', 
'RPS6KL1', 'RPUSD2', 'RRAGC', 'RRS1', 'RUNDC1', 'S100A11', 'S100A12', 'S100A8', 'S100B', 'SAFB2', 'SAMD1', 'SAMD3', 'SAMSN1', 'SARDH', 'SARS', 'SAT1', 'SCAI', 'SCAPER', 'SCGB3A1', 'SCPEP1', 'SDCCAG8', 'SDPR', 'SEC61A2', 'SELL', 'SEPT11', 'SERAC1', 'SETD1B', 'SF3B1', 'SF3B5', 'SH3GLB1', 'SH3KBP1', 'SHOC2', 'SHPK-1', 'SIAH2', 'SIRPG', 'SIRT1', 'SIVA1', 'SLA', 'SLBP', 'SLC22A4', 'SLC25A11', 'SLC25A12', 'SLC25A14', 'SLC27A1', 'SLC2A13', 'SLC35A2', 'SLC48A1', 'SLFN5', 'SMARCA4', 'SMARCC2', 'SMC2', 'SMCHD1', 'SMDT1', 'SMIM14', 'SMIM7', 'SNAP47', 'SNHG12', 'SNHG8', 'SNTA1', 'SNX29P2', 'SOX13', 'SPARC', 'SPATA7', 'SPG7', 'SPIB', 'SPIN1', 'SPOCD1', 'SPON2', 'SPSB2', 'SREBF1', 'SRM', 'SRP9', 'SRSF6', 'SSBP1', 'ST3GAL2', 'STAMBP', 'STAU2', 'STK17A', 'STK38', 'STMN1', 'STOML2', 'STUB1', 'STX16', 'STX18', 'SUCLG2', 'SUOX', 'SURF1', 'SURF6', 'SWAP70', 'SYCE1', 'SYP', 'SYVN1', 'TACR2', 'TADA2A', 'TAF10', 'TAF12', 'TAF1D', 'TAL1', 'TALDO1', 'TAPBP', 'TARSL2', 'TASP1', 'TBC1D15', 'TBCK', 'TBXA2R', 'TCEAL4', 'TCEAL8', 'TCL1A', 'TCL1B', 'TCP1', 'TDG', 'TERF2IP', 'TGFBRAP1', 'THAP2', 'THEM4', 'THOC7', 'THUMPD3', 'THYN1', 'TIGIT', 'TIMM10B', 'TMEM116', 'TMEM138', 'TMEM140', 'TMEM14B', 'TMEM165', 'TMEM177', 'TMEM194A', 'TMEM219', 'TMEM242', 'TMEM40', 'TMEM60', 'TMEM80', 'TMEM87A', 'TMEM87B', 'TMEM91', 'TMTC2', 'TMX2', 'TMX3', 'TNFRSF17', 'TNFRSF25', 'TNFRSF4', 'TNFRSF9', 'TNFSF10', 'TOP1MT', 'TOP2B', 'TRABD2A', 'TRAF3IP3', 'TRAPPC12-AS1', 'TRAPPC3', 'TREML1', 'TRIM23', 'TRIP12', 'TRIT1', 'TRMT61A', 'TRPM4', 'TSC22D1', 'TSPAN15', 'TSSC1', 'TTC1', 'TTC14', 'TTC3', 'TTC8', 'TTN-AS1', 'TUBB1', 'TUBG2', 'TYMP', 'TYROBP', 'U2SURP', 'UBA5', 'UBAC2', 'UBE2D2', 'UBE2D4', 'UBE2K', 'UBE2Q1', 'UBE3A', 'UBIAD1', 'UBLCP1', 'UBXN4', 'UCK1', 'UNC45A', 'UQCC1', 'URB2', 'URGCP', 'USP30', 'USP33', 'USP36', 'USP38', 'USP5', 'USP7', 'VAMP5', 'VDAC3', 'VIPR1', 'VPS13A', 'VPS13C', 'VPS25', 'VPS26B', 'VPS28', 'VTI1A', 'VTI1B', 'WARS2', 'WBP2NL', 'WDR55', 'WDR91', 'WDYHV1', 'WNK1', 'WTAP', 'XCL2', 'XPOT', 
'XRRA1', 'XXbac-BPG299F13.17', 'YEATS2', 'YES1', 'YPEL2', 'YPEL3', 'YTHDF2', 'ZAP70', 'ZBED5-AS1', 'ZBP1', 'ZC3H15', 'ZCCHC11', 'ZCCHC9', 'ZFAND4', 'ZNF175', 'ZNF232', 'ZNF256', 'ZNF263', 'ZNF276', 'ZNF32', 'ZNF350', 'ZNF436', 'ZNF45', 'ZNF493', 'ZNF503', 'ZNF528', 'ZNF559', 'ZNF561', 'ZNF587B', 'ZNF594', 'ZNF653', 'ZNF682', 'ZNF688', 'ZNF718', 'ZNF747', 'ZNF799', 'ZNF836', 'ZNF92', 'ZRANB3', 'ZSWIM6', 'ZUFSP']\n", - "2023-05-26 17:16:31,768 [INFO] probe_length_min = 38\n", - "2023-05-26 17:16:31,769 [INFO] probe_length_max = 45\n", - "2023-05-26 17:16:31,771 [INFO] min_probes_per_gene = 3\n", - "2023-05-26 17:16:31,772 [INFO] n_jobs = 4\n", - "2023-05-26 17:56:26,665 [INFO] Genes with <= 3 probes will be removed from the probe database and their names will be stored in './output/regions_with_insufficient_oligos.txt'.\n", - "2023-05-26 17:56:26,862 [INFO] Step - Generate Probes: the database contains 35957196 probes from 887 genes.\n" + "2024-06-21 18:43:04,735 [INFO] Parameters Create Database:\n", + "2024-06-21 18:43:04,739 [INFO] Function: create_probe_database\n", + "2024-06-21 18:43:04,741 [INFO] Parameter: gene_ids = ['AAGAB', 'AATF', 'ABCC10', 'ABHD12', 'ABHD17B', 'ABHD5', 'ABRACL', 'ABT1', 'AC005082.12', 'AC074138.3', 'AC093323.3', 'ACAP1', 'ACBD3', 'ACD', 'ACOT13', 'ACP1', 'ACRBP', 'ACTL6A', 'ACTR6', 'ACVR2A', 'ADAL', 'ADAM10', 'ADAM28', 'ADD1', 'ADIPOR2', 'ADPRM', 'ADSL', 'AEBP1', 'AGPAT1', 'AHSA1', 'AIF1', 'AIM2', 'AKTIP', 'AL928768.3', 'ALKBH7', 'ANAPC13', 'ANKAR', 'ANKEF1', 'ANKRD27', 'ANKRD54', 'AP001462.6', 'AP003419.16', 'AP3M2', 'AP4B1-AS1', 'AP4S1', 'APOBEC3A', 'APOBEC3B', 'APOBEC3G', 'AQP3', 'ARHGAP11A', 'ARHGAP19', 'ARHGAP24', 'ARHGAP33', 'ARHGAP6', 'ARID4A', 'ARIH2OS', 'ARL2', 'ARL2BP', 'ARL4A', 'ARL6IP5', 'ARMC7', 'ARMCX5', 'ARRDC3', 'ARRDC4', 'ARSD', 'ARSG', 'ARVCF', 'ASB8', 'ASXL2', 'ATAD3C', 'ATF7IP2', 'ATG16L1', 'ATP10A', 'ATP5H', 'ATP5O', 'ATP5SL', 'ATP6V0E2', 'ATXN1L', 'ATXN3', 'AURKC', 'BABAM1', 'BACE2', 'BAZ2A', 'BBX', 'BCDIN3D', 
'BET1', 'BEX4', 'BGLAP', 'BLNK', 'BLZF1', 'BMPR2', 'BNIP2', 'BOLA1', 'BOLA3', 'BRAT1', 'BRWD1', 'BTN3A1', 'BTN3A2', 'BUB3', 'C10orf32', 'C12orf45', 'C14orf1', 'C14orf166', 'C14orf80', 'C15orf57', 'C16orf13', 'C16orf52', 'C16orf54', 'C16orf58', 'C16orf74', 'C16orf80', 'C17orf59', 'C17orf62', 'C19orf33', 'C19orf52', 'C1QA', 'C1QB', 'C1QC', 'C1orf162', 'C1orf35', 'C21orf33', 'C2CD4D', 'C2orf76', 'C2orf88', 'C3orf18', 'C5orf15', 'C5orf42', 'C8orf44', 'C9orf142', 'C9orf16', 'C9orf37', 'CAMK1D', 'CAMK2G', 'CAMK2N1', 'CAPN12', 'CARHSP1', 'CARS', 'CASC4', 'CBX5', 'CCDC115', 'CCDC122', 'CCDC66', 'CCDC91', 'CCL3', 'CCL4', 'CCL5', 'CCND2', 'CCNG1', 'CCP110', 'CCT4', 'CCT7', 'CD160', 'CD19', 'CD2', 'CD247', 'CD274', 'CD2AP', 'CD320', 'CD72', 'CD79A', 'CD79B', 'CD82', 'CD9', 'CD96', 'CDC123', 'CDC16', 'CDC37', 'CDC40', 'CDK19', 'CDKN2A', 'CEACAM4', 'CEBPB', 'CECR5', 'CEP120', 'CEP68', 'CEP85L', 'CEPT1', 'CES4A', 'CGRRF1', 'CHD2', 'CHD7', 'CHERP', 'CHI3L2', 'CHPF2', 'CIAPIN1', 'CISD1', 'CISH', 'CITED4', 'CKS1B', 'CLDN5', 'CLEC2B', 'CLIC3', 'CLNS1A', 'CLPX', 'CLU', 'CLYBL', 'CMTM5', 'CNEP1R1', 'COMMD10', 'COQ7', 'CORO1B', 'COTL1', 'CPNE2', 'CPQ', 'CPSF3L', 'CR1', 'CRIP3', 'CRTC2', 'CST3', 'CST7', 'CTA-29F11.1', 'CTB-113I20.2', 'CTB-152G17.6', 'CTC-444N24.11', 'CTD-2015H6.3', 'CTD-2302E22.4', 'CTD-2368P22.1', 'CTD-2537I9.12', 'CTSS', 'CTSW', 'CWC15', 'CWC27', 'CXCL10', 'CXCL3', 'CYB5B', 'CYTH2', 'DAGLB', 'DCAF5', 'DDI2', 'DDT', 'DDX1', 'DDX17', 'DDX46', 'DDX56', 'DENND1C', 'DENND2D', 'DENND5B', 'DENND6A', 'DERL1', 'DEXI', 'DHX34', 'DHX9', 'DIDO1', 'DIMT1', 'DIS3', 'DISP1', 'DLST', 'DMTN', 'DNAJA3', 'DNAJB14', 'DNAJC10', 'DNAJC15', 'DNAJC2', 'DNAJC27', 'DNASE1L3', 'DNMT3A', 'DOK3', 'DPH6', 'DPY19L4', 'DRAXIN', 'DSCR3', 'DTX3', 'DUS3L', 'DUSP10', 'EAF2', 'EARS2', 'ECHDC1', 'EDC3', 'EID2', 'EIF1AY', 'EIF1B', 'EIF2B1', 'EIF3D', 'ELANE', 'ELOF1', 'ELOVL4', 'ELP6', 'EMB', 'EMG1', 'EML6', 'ENTPD3-AS1', 'EOGT', 'ERH', 'ERV3-1', 'EVA1B', 'EWSR1', 'EXOC6', 'F5', 'FADS1', 'FAM107B', 
'FAM173A', 'FAM210B', 'FAM96A', 'FAM98A', 'FBXL14', 'FBXO21', 'FBXO33', 'FBXO4', 'FBXW4', 'FCER1A', 'FCER1G', 'FCGR2B', 'FCGR3A', 'FCN1', 'FCRLA', 'FEM1A', 'FERMT3', 'FGFBP2', 'FH', 'FHL1', 'FKBP3', 'FKBP5', 'FLOT1', 'FMO4', 'FN3KRP', 'FNBP4', 'FNTA', 'FOPNL', 'FRY-AS1', 'FUS', 'FXN', 'FYB', 'G0S2', 'GADD45B', 'GALT', 'GBGT1', 'GBP1', 'GDF11', 'GFER', 'GGA3', 'GGNBP2', 'GIMAP2', 'GIMAP4', 'GIMAP5', 'GIMAP7', 'GIT2', 'GMPPA', 'GNE', 'GNG11', 'GNG3', 'GNLY', 'GNPAT', 'GOLGB1', 'GP9', 'GPATCH4', 'GPKOW', 'GPR171', 'GPR183', 'GPR35', 'GPS1', 'GPX1', 'GRAP', 'GRN', 'GSTP1', 'GTPBP6', 'GUSB', 'GYS1', 'GZMA', 'GZMB', 'GZMH', 'GZMK', 'HAGH', 'HBA1', 'HBP1', 'HCFC2', 'HDAC1', 'HDAC5', 'HDAC9', 'HELQ', 'HEMK1', 'HERPUD2', 'HIST1H1B', 'HIST1H2AC', 'HIST1H2AH', 'HLA-DMA', 'HLA-DMB', 'HLA-DOB', 'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB1', 'HMBOX1', 'HMGCL', 'HMGXB4', 'HNRNPH3', 'HOOK2', 'HOPX', 'HSPB11', 'HVCN1', 'ICAM2', 'ICOS', 'ICOSLG', 'ID2', 'IDUA', 'IFFO1', 'IFI27', 'IFIT1', 'IFIT2', 'IFITM3', 'IGFBP7', 'IGJ', 'IGLL5', 'IL1B', 'IL1RAP', 'IL23A', 'IL24', 'IL27RA', 'IL32', 'IL6', 'IL8', 'ILF3', 'ILF3-AS1', 'ING5', 'INSL3', 'INTS12', 'INTS2', 'IP6K1', 'IQCE', 'IRF8', 'IRF9', 'ISCA2', 'ISOC1', 'ITGA2B', 'ITGB7', 'ITM2A', 'ITSN2', 'JAKMIP1', 'JUND', 'KARS', 'KCNG1', 'KCNQ1OT1', 'KIAA0040', 'KIAA0125', 'KIAA0196', 'KIAA1430', 'KIF3A', 'KIF3C', 'KIF5B', 'KLHL24', 'KLRB1', 'KLRG1', 'KRBOX4', 'LAMP3', 'LARS', 'LAT2', 'LBR', 'LDLRAP1', 'LGALS1', 'LGALS2', 'LGALS3', 'LILRA4', 'LIN52', 'LINC00494', 'LINC00662', 'LINC00886', 'LINC00926', 'LINC00936', 'LINC01013', 'LIX1L', 'LONRF1', 'LPIN1', 'LRBA', 'LRRIQ3', 'LSM14A', 'LST1', 'LTB', 'LTV1', 'LUC7L', 'LUC7L3', 'LYAR', 'LYPD2', 'LYPLA1', 'LYRM4', 'LYSMD4', 'LZTS2', 'MADD', 'MAEA', 'MAGEH1', 'MAL', 'MALT1', 'MAP2K7', 'MARCKSL1', 'MCF2L', 'MCM3', 'MDS2', 'MED30', 'MED9', 'METTL21A', 'METTL3', 'METTL8', 'MFF', 'MFSD10', 'MIS18A', 'MKKS', 'MLLT11', 'MLLT6', 'MMADHC', 'MMP9', 'MNAT1', 'MOCS2', 'MORF4L2', 'MPHOSPH10', 'MRM1', 
'MRPL1', 'MRPL19', 'MRPL42', 'MRPS12', 'MRPS33', 'MS4A1', 'MS4A6A', 'MTERFD2', 'MTIF2', 'MTRF1', 'MUM1', 'MYADM', 'MYCBP2', 'MYL9', 'MYO1E', 'MYOM2', 'MZB1', 'MZT1', 'NAA20', 'NAP1L4', 'NAPA-AS1', 'NARG2', 'NAT9', 'NBR1', 'NCOR2', 'NCR3', 'NDUFA10', 'NDUFA12', 'NECAB3', 'NEFH', 'NEK8', 'NELFB', 'NEMF', 'NFAT5', 'NFE2L2', 'NFIC', 'NFU1', 'NIT2', 'NKAP', 'NKG7', 'NKTR', 'NME3', 'NME6', 'NMNAT3', 'NNT-AS1', 'NOC4L', 'NOG', 'NOL11', 'NONO', 'NOP58', 'NPC2', 'NPHP3', 'NPRL2', 'NR2C1', 'NR3C1', 'NSA2', 'NT5C', 'NT5C3A', 'NUDCD1', 'NUDT16L1', 'NUP54', 'NXT2', 'OARD1', 'OAT', 'OBSCN', 'ODC1', 'ORAI1', 'ORC2', 'OSBPL1A', 'OSBPL7', 'OXLD1', 'P2RX5', 'P2RY10', 'PACS1', 'PACSIN2', 'PAICS', 'PARP1', 'PARS2', 'PASK', 'PAWR', 'PAXIP1-AS1', 'PBLD', 'PBRM1', 'PCNA', 'PCSK7', 'PDCD1', 'PDCD2L', 'PDE6B', 'PDIA3', 'PDIK1L', 'PDK2', 'PDXDC1', 'PDZD4', 'PEMT', 'PEX16', 'PEX26', 'PF4', 'PGM1', 'PGM2L1', 'PHACTR4', 'PHF12', 'PHF14', 'PHF3', 'PIGF', 'PIGU', 'PIGX', 'PIK3R1', 'PITHD1', 'PITPNA-AS1', 'PJA1', 'PKIG', 'PLA2G12A', 'PLCL1', 'PLD6', 'PLEKHA1', 'PLEKHA3', 'PLRG1', 'PMEPA1', 'PNOC', 'POLR2I', 'POLR2K', 'POLR3E', 'POMT1', 'PPA2', 'PPBP', 'PPIE', 'PPIG', 'PPIL2', 'PPIL4', 'PPP1R14A', 'PPP1R2', 'PPP2R1B', 'PPP6C', 'PPT2-EGFL8', 'PQBP1', 'PRAF2', 'PRDX1', 'PRELID2', 'PRF1', 'PRICKLE1', 'PRKACB', 'PRKCB', 'PRKD2', 'PRMT2', 'PRNP', 'PRPF31', 'PRPS2', 'PRR5', 'PSMD14', 'PTCRA', 'PTGDR', 'PTGDS', 'PTGES2', 'PTPN7', 'PURA', 'PWP1', 'PXMP4', 'PYCARD', 'R3HDM1', 'R3HDM2', 'RAB40C', 'RABEP2', 'RABL6', 'RAD51B', 'RALBP1', 'RALY', 'RASD1', 'RASGRP2', 'RBM25', 'RBM26-AS1', 'RBM39', 'RBM4', 'RBM48', 'RBM5', 'RBM7', 'RBPJ', 'RCE1', 'RCHY1', 'RCL1', 'RCN2', 'RDH14', 'RELB', 'REXO2', 'RFC1', 'RFC5', 'RFNG', 'RFPL2', 'RGS14', 'RIC3', 'RIOK1', 'RIOK2', 'RNF113A', 'RNF125', 'RNF139', 'RNF14', 'RNF168', 'RNF187', 'RNF213', 'RNF25', 'RNF26', 'RORA', 'RP1-28O10.1', 'RP11-1055B8.7', 'RP11-138A9.2', 'RP11-141B14.1', 'RP11-142C4.6', 'RP11-162G10.5', 'RP11-164H13.1', 'RP11-178G16.4', 'RP11-18H21.1', 
'RP11-211G3.2', 'RP11-219B17.1', 'RP11-219B4.7', 'RP11-252A24.3', 'RP11-291B21.2', 'RP11-314N13.3', 'RP11-324I22.4', 'RP11-349A22.5', 'RP11-378J18.3', 'RP11-390B4.5', 'RP11-398C13.6', 'RP11-400F19.6', 'RP11-421L21.3', 'RP11-428G5.5', 'RP11-432I5.1', 'RP11-468E2.4', 'RP11-488C13.5', 'RP11-493L12.4', 'RP11-527L4.5', 'RP11-545I5.3', 'RP11-589C21.6', 'RP11-5C23.1', 'RP11-701P16.5', 'RP11-706O15.1', 'RP11-70P17.1', 'RP11-727F15.9', 'RP11-798G7.6', 'RP11-879F14.2', 'RP11-950C14.3', 'RP3-325F22.5', 'RP5-1073O3.7', 'RP5-827C21.4', 'RP5-887A10.1', 'RPH3A', 'RPL39L', 'RPL7L1', 'RPN2', 'RPS6KL1', 'RPUSD2', 'RRAGC', 'RRS1', 'RUNDC1', 'S100A11', 'S100A12', 'S100A8', 'S100B', 'SAFB2', 'SAMD1', 'SAMD3', 'SAMSN1', 'SARDH', 'SARS', 'SAT1', 'SCAI', 'SCAPER', 'SCGB3A1', 'SCPEP1', 'SDCCAG8', 'SDPR', 'SEC61A2', 'SELL', 'SEPT11', 'SERAC1', 'SETD1B', 'SF3B1', 'SF3B5', 'SH3GLB1', 'SH3KBP1', 'SHOC2', 'SHPK-1', 'SIAH2', 'SIRPG', 'SIRT1', 'SIVA1', 'SLA', 'SLBP', 'SLC22A4', 'SLC25A11', 'SLC25A12', 'SLC25A14', 'SLC27A1', 'SLC2A13', 'SLC35A2', 'SLC48A1', 'SLFN5', 'SMARCA4', 'SMARCC2', 'SMC2', 'SMCHD1', 'SMDT1', 'SMIM14', 'SMIM7', 'SNAP47', 'SNHG12', 'SNHG8', 'SNTA1', 'SNX29P2', 'SOX13', 'SPARC', 'SPATA7', 'SPG7', 'SPIB', 'SPIN1', 'SPOCD1', 'SPON2', 'SPSB2', 'SREBF1', 'SRM', 'SRP9', 'SRSF6', 'SSBP1', 'ST3GAL2', 'STAMBP', 'STAU2', 'STK17A', 'STK38', 'STMN1', 'STOML2', 'STUB1', 'STX16', 'STX18', 'SUCLG2', 'SUOX', 'SURF1', 'SURF6', 'SWAP70', 'SYCE1', 'SYP', 'SYVN1', 'TACR2', 'TADA2A', 'TAF10', 'TAF12', 'TAF1D', 'TAL1', 'TALDO1', 'TAPBP', 'TARSL2', 'TASP1', 'TBC1D15', 'TBCK', 'TBXA2R', 'TCEAL4', 'TCEAL8', 'TCL1A', 'TCL1B', 'TCP1', 'TDG', 'TERF2IP', 'TGFBRAP1', 'THAP2', 'THEM4', 'THOC7', 'THUMPD3', 'THYN1', 'TIGIT', 'TIMM10B', 'TMEM116', 'TMEM138', 'TMEM140', 'TMEM14B', 'TMEM165', 'TMEM177', 'TMEM194A', 'TMEM219', 'TMEM242', 'TMEM40', 'TMEM60', 'TMEM80', 'TMEM87A', 'TMEM87B', 'TMEM91', 'TMTC2', 'TMX2', 'TMX3', 'TNFRSF17', 'TNFRSF25', 'TNFRSF4', 'TNFRSF9', 'TNFSF10', 'TOP1MT', 'TOP2B', 'TRABD2A', 
'TRAF3IP3', 'TRAPPC12-AS1', 'TRAPPC3', 'TREML1', 'TRIM23', 'TRIP12', 'TRIT1', 'TRMT61A', 'TRPM4', 'TSC22D1', 'TSPAN15', 'TSSC1', 'TTC1', 'TTC14', 'TTC3', 'TTC8', 'TTN-AS1', 'TUBB1', 'TUBG2', 'TYMP', 'TYROBP', 'U2SURP', 'UBA5', 'UBAC2', 'UBE2D2', 'UBE2D4', 'UBE2K', 'UBE2Q1', 'UBE3A', 'UBIAD1', 'UBLCP1', 'UBXN4', 'UCK1', 'UNC45A', 'UQCC1', 'URB2', 'URGCP', 'USP30', 'USP33', 'USP36', 'USP38', 'USP5', 'USP7', 'VAMP5', 'VDAC3', 'VIPR1', 'VPS13A', 'VPS13C', 'VPS25', 'VPS26B', 'VPS28', 'VTI1A', 'VTI1B', 'WARS2', 'WBP2NL', 'WDR55', 'WDR91', 'WDYHV1', 'WNK1', 'WTAP', 'XCL2', 'XPOT', 'XRRA1', 'XXbac-BPG299F13.17', 'YEATS2', 'YES1', 'YPEL2', 'YPEL3', 'YTHDF2', 'ZAP70', 'ZBED5-AS1', 'ZBP1', 'ZC3H15', 'ZCCHC11', 'ZCCHC9', 'ZFAND4', 'ZNF175', 'ZNF232', 'ZNF256', 'ZNF263', 'ZNF276', 'ZNF32', 'ZNF350', 'ZNF436', 'ZNF45', 'ZNF493', 'ZNF503', 'ZNF528', 'ZNF559', 'ZNF561', 'ZNF587B', 'ZNF594', 'ZNF653', 'ZNF682', 'ZNF688', 'ZNF718', 'ZNF747', 'ZNF799', 'ZNF836', 'ZNF92', 'ZRANB3', 'ZSWIM6', 'ZUFSP']\n", + "2024-06-21 18:43:04,743 [INFO] Parameter: probe_length_min = 40\n", + "2024-06-21 18:43:04,744 [INFO] Parameter: probe_length_max = 45\n", + "2024-06-21 18:43:04,745 [INFO] Parameter: files_fasta_oligo_database = ['output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna', 'output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna']\n", + "2024-06-21 18:43:04,746 [INFO] Parameter: min_probes_per_gene = 3\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove 
these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "be10edafcab84490b40411a24194a401", + "version_major": 2, + 
"version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-21 20:53:12,552 [INFO] Step - Create Database: database contains 26663811 oligos from 887 genes.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-21 20:53:12,618 [DEBUG] handle_msg[be10edafcab84490b40411a24194a401]({'header': {'date': datetime.datetime(2024, 6, 21, 18, 53, 12, 601000, tzinfo=tzutc()), 'msg_id': 'd60c5994-b6ae-4aeb-b38d-782e4fadc32e', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': 'd60c5994-b6ae-4aeb-b38d-782e4fadc32e', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'be10edafcab84490b40411a24194a401', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mDatabase Loading\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m1:08:39\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Database Loading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 1:08:39 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ - "probe_length_min = 38\n", - "probe_length_max = 45\n", - "min_probes_per_gene = 3\n", - "region = \"transcript\"\n", - "\n", - "# highly_variable_genes = highly_variable_genes[100:]\n", - "probe_database, file_database = probe_designer.create_probe_database(genes=highly_variable_genes, probe_length_min=probe_length_min, probe_length_max=probe_length_max, region=region, min_probes_per_gene=min_probes_per_gene, n_jobs=4)" + "## Create initial database\n", + "probe_database, file_database = pipeline.create_probe_database(\n", + " gene_ids=gene_ids,\n", + " probe_length_min=probe_length_min,\n", + " probe_length_max=probe_length_max,\n", + " files_fasta_oligo_database=fasta_files,\n", + " min_probes_per_gene=min_probes_per_gene,\n", + ")" ] }, { @@ -333,73 +502,193 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In order to create experiment-specific probes, we have to apply several filter to each probe, e.g. melting temperature or GC content filters. \n", + "### Refine Database with Property Filters\n", "\n", - "### Parameters for Property Filters\n", + "In order to create experiment-specific probes, we have to apply several filters to each probe, e.g. melting temperature or GC content filters. 
\n", "\n", - "*Parameters for Probe Sequence:*\n", - "- GC_content_min: minimum GC content of probes\n", - "- GC_content_max: maximum GC content of probes\n", - "- Tm_min: minimum melting temperature of probes\n", - "- Tm_max: maximum melting temperature of probes\n", + "**Parameters for Probe Sequences:**\n", + "- `probe_GC_content_min`: minimum GC content of oligos\n", + "- `probe_GC_content_max`: maximum GC content of oligos\n", + "- `probe_Tm_min`: minimum melting temperature of oligos\n", + "- `probe_Tm_max`: maximum melting temperature of oligos\n", + "- `homopolymeric_base_n`: per-base minimum number of consecutive identical nucleotides that counts as a homopolymeric run; all probes containing such a run are filtered out, e.g. if `A: 5` is set, the sequence \"CTGGTAAAAACTGGA\" is filtered out, but the sequence \"CTGGTAAAACTGGA\" is kept\n", "\n", - "*Parameters for Padlock Arms:*\n", - "- min_arm_length: minimum length of each arm\n", - "- max_arm_Tm_dif: maximum melting temperature difference of both arms\n", - "- arm_Tm_min: minimum melting temperature of each arm (difference shouldn't be higher than 5! But range is not super important, the lower the better)\n", - "- arm_Tm_max: maximum melting temperature of each arm\n", + "**Parameters for Padlock Arms:**\n", + "- `arm_Tm_dif_max`: maximum melting temperature difference of both arms (difference shouldn't be higher than 5! 
But range is not super important, the lower the better)\n", + "- `arm_length_min`: minimum length of each arm\n", + "- `arm_Tm_min`: minimum melting temperature of each arm \n", + "- `arm_Tm_max`: maximum melting temperature of each arm\n", "\n", - "*Parameters for Melting Temperature:*\n", - "- Tm_parameters_probe: melting temperature parameters for probe design\n", - "- Tm_chem_correction_param_pobe: parameters for chemical correction of melting temperature for probe design\n", + "**Parameters for Detection Oligos:**\n", + "- `min_thymines`: minimal number of Thymines in detection oligo.\n", + "- `detect_oligo_length_min`: minimum length of detection probe\n", + "- `detect_oligo_length_max`: maximum length of detection probe\n", "\n", "*Note: The melting temperature is used in 2 different stages (probe and detection oligo design), where a few parameters are shared and the others differ. Parameters for melting temperature -> for more information on parameters, see: [here](https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN)*" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "## Define parameters\n", + "# probe sequence\n", + "probe_GC_content_min = 40 \n", + "probe_GC_content_max = 60 \n", + "probe_Tm_min = 65 \n", + "probe_Tm_max = 75 \n", + "homopolymeric_base_n = {\"A\": 5, \"T\": 5, \"C\": 5, \"G\": 5} \n", + "\n", + "# padlock arms\n", + "arm_Tm_dif_max = 2 \n", + "arm_length_min = 10 \n", + "arm_Tm_min = 50 \n", + "arm_Tm_max = 60 \n", + "\n", + "# detection oligos\n", + "min_thymines = 2 \n", + "U_distance = 5 \n", + "detect_oligo_length_min = 15 \n", + "detect_oligo_length_max = 40 \n", + "detect_oligo_Tm_opt = 56 \n", + "\n", + "\n", + "Tm_parameters_probe = {\n", + " \"check\": True, # default\n", + " \"strict\": True, # default\n", + " \"c_seq\": None, # default\n", + " \"shift\": 0, # default\n", + " \"nn_table\": getattr(mt, 
\"DNA_NN3\"), # Allawi & SantaLucia (1997)\n", + " \"tmm_table\": getattr(mt, \"DNA_TMM1\"), #default\n", + " \"imm_table\": getattr(mt, \"DNA_IMM1\"), #default\n", + " \"de_table\": getattr(mt, \"DNA_DE1\"), #default\n", + " \"dnac1\": 50, # [nM]\n", + " \"dnac2\": 0, # [nM]\n", + " \"selfcomp\": False, # default\n", + " \"saltcorr\": 7, # Owczarzy et al. (2008)\n", + " \"Na\": 39, # [mM]\n", + " \"K\": 75, # [mM]\n", + " \"Tris\": 20, # [mM]\n", + " \"Mg\": 10, # [mM]\n", + " \"dNTPs\": 0, # [mM] default\n", + "}\n", + "\n", + "Tm_chem_correction_param_probe = {\n", + " \"DMSO\": 0, # default\n", + " \"fmd\": 20,\n", + " \"DMSOfactor\": 0.75, # default\n", + " \"fmdfactor\": 0.65, # default\n", + " \"fmdmethod\": 1, # default\n", + " \"GC\": None, # default\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-05-26 18:41:05,992 [INFO] Parameters Property Filters:\n", - "2023-05-26 18:41:06,004 [INFO] probe_database = \n", - "2023-05-26 18:41:06,010 [INFO] GC_content_min = 40\n", - "2023-05-26 18:41:06,012 [INFO] GC_content_max = 60\n", - "2023-05-26 18:41:06,015 [INFO] Tm_min = 52\n", - "2023-05-26 18:41:06,016 [INFO] Tm_max = 67\n", - "2023-05-26 18:41:06,017 [INFO] min_arm_length = 10\n", - "2023-05-26 18:41:06,019 [INFO] max_arm_Tm_dif = 2\n", - "2023-05-26 18:41:06,025 [INFO] arm_Tm_min = 38\n", - "2023-05-26 18:41:06,026 [INFO] arm_Tm_max = 49\n", - "2023-05-26 18:41:06,029 [INFO] Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': 'DNA_NN3', 'tmm_table': 'DNA_TMM1', 'imm_table': 'DNA_IMM1', 'de_table': 'DNA_DE1', 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'dNTPs': 0, 'saltcorr': 7, 'Na': 1.25, 'K': 75, 'Tris': 20, 'Mg': 10}\n", - "2023-05-26 18:41:06,030 [INFO] Tm_chem_correction_param_probe = {'DMSO': 0, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None, 'fmd': 20}\n", - "2023-05-26 
18:41:06,031 [INFO] n_jobs = 4\n", - "2023-05-26 21:18:45,849 [INFO] Step - Filter Probes by Sequence Property: the database contains 3732914 probes from 882 genes, while 32224282 probes and 5 genes have been deleted in this step.\n" + "2024-06-21 20:53:35,992 [INFO] Parameters Property Filters:\n", + "2024-06-21 20:53:35,994 [INFO] Function: filter_by_property\n", + "2024-06-21 20:53:35,996 [INFO] Parameter: oligo_database = \n", + "2024-06-21 20:53:35,998 [INFO] Parameter: probe_GC_content_min = 40\n", + "2024-06-21 20:53:35,999 [INFO] Parameter: probe_GC_content_max = 60\n", + "2024-06-21 20:53:36,001 [INFO] Parameter: probe_Tm_min = 65\n", + "2024-06-21 20:53:36,002 [INFO] Parameter: probe_Tm_max = 75\n", + "2024-06-21 20:53:36,004 [INFO] Parameter: detect_oligo_length_min = 15\n", + "2024-06-21 20:53:36,006 [INFO] Parameter: detect_oligo_length_max = 40\n", + "2024-06-21 20:53:36,007 [INFO] Parameter: min_thymines = 2\n", + "2024-06-21 20:53:36,008 [INFO] Parameter: arm_Tm_dif_max = 2\n", + "2024-06-21 20:53:36,011 [INFO] Parameter: arm_length_min = 10\n", + "2024-06-21 20:53:36,012 [INFO] Parameter: arm_Tm_min = 50\n", + "2024-06-21 20:53:36,014 [INFO] Parameter: arm_Tm_max = 60\n", + "2024-06-21 20:53:36,016 [INFO] Parameter: homopolymeric_base_n = {'A': 5, 'T': 5, 'C': 5, 'G': 5}\n", + "2024-06-21 20:53:36,017 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 
'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 
'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", + "2024-06-21 20:53:36,019 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 
'fmdmethod': 1, 'GC': None}\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1b6e1bbfde05498394497b2d64da341f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-21 22:01:23,977 [INFO] Step - Property Filters: database contains 3468382 oligos from 887 genes.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-21 22:01:24,010 [DEBUG] handle_msg[1b6e1bbfde05498394497b2d64da341f]({'header': {'date': datetime.datetime(2024, 6, 21, 20, 1, 24, 3000, tzinfo=tzutc()), 'msg_id': 'a8137585-25fd-4c5c-aa27-eec9590e2742', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': 'a8137585-25fd-4c5c-aa27-eec9590e2742', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '1b6e1bbfde05498394497b2d64da341f', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mProperty Filter\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m0:56:32\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Property Filter ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 0:56:32 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ "####### Load existing database #######\n", - "# file_database = \"./output/oligo_database/probe_database_initial.txt\"\n", - "# min_probes_per_gene = 3\n", - "# probe_database = probe_designer.load_probe_database(file_database=file_database, min_probes_per_gene=min_probes_per_gene)\n", - "\n", - "####### Apply Property Filter #######\n", - "GC_content_min=40\n", - "GC_content_max=60\n", - "Tm_min=52\n", - "Tm_max=67\n", - "min_arm_length=10\n", - "max_arm_Tm_dif=2\n", - "arm_Tm_min=38\n", - "arm_Tm_max=49\n", + "# dir_database = os.path.join(dir_output, \"db_probes/1_db_probes_initial\")\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", "\n", - "probe_database, file_database = probe_designer.filter_probes_by_property(probe_database, GC_content_min=GC_content_min, GC_content_max=GC_content_max,\n", - " Tm_min=Tm_min, Tm_max=Tm_max, min_arm_length=min_arm_length, max_arm_Tm_dif=max_arm_Tm_dif, arm_Tm_min=arm_Tm_min, arm_Tm_max=arm_Tm_max, n_jobs=4)" + "## Apply property filters\n", + "probe_database, file_database = pipeline.filter_by_property(\n", + " oligo_database=probe_database,\n", + " probe_GC_content_min=probe_GC_content_min,\n", + " probe_GC_content_max=probe_GC_content_max,\n", + " probe_Tm_min=probe_Tm_min,\n", + " probe_Tm_max=probe_Tm_max,\n", + " detect_oligo_length_min=detect_oligo_length_min,\n", + " detect_oligo_length_max=detect_oligo_length_max,\n", + " min_thymines=min_thymines,\n", + " arm_Tm_dif_max=arm_Tm_dif_max,\n", + " arm_length_min=arm_length_min,\n", + " arm_Tm_min=arm_Tm_min,\n", + " arm_Tm_max=arm_Tm_max,\n", + " homopolymeric_base_n=homopolymeric_base_n,\n", 
+ " Tm_parameters_probe=Tm_parameters_probe,\n", + " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + ")" ] }, { @@ -407,66 +696,233 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Parameters for Specificity Filters\n", + "### Refine Database with Specificity Filters\n", "\n", - "*BlastN Similarity Filter:*\n", - "- blast_word_size: word size for the blastn seed (exact match to target)\n", - "- blast_percent_identity: maximum similarity between oligos and target sequences, ranging from 0 to 100% (no missmatch)\n", - "- blast_coverage: minimum coverage between oligos and target sequence, ranging from 0 to 100% (full coverage)\n", + "In order to create target-specific probes, we have to apply several filters to each probe to lower the probability of off-target effects and cross-hybridization. \n", "\n", - "*Bowtie Ligation Region filter:*\n", - "- ligation_region_size: size of the seed region around the ligation site for bowtie seed region filter\n", + "**Parameters for BlastN Cross-hybridization filter** \n", + "In this filter, the probes are aligned against the probe database, to detect possible cross-hybridization events. \n", + "As filter policy, we remove the probe coming from the region with more probes, to avoid deleting too many probes in this step.\n", + "- `cross_hybridization_blastn_search_parameters`: parameters for the BlastN search\n", + "- `cross_hybridization_blastn_hit_parameters`: parameters for the evaluation of BlastN hits\n", + "\n", + "**Parameters for BlastN Specificity filter** \n", + "In this filter, the probes are aligned against a reference fasta file, to detect possible off-target events. \n", + "In our case, we use the transcriptome that we generated before with the Genomic Region Generator, but this can be changed as needed.\n", + "- `files_fasta_reference_database`: fasta file with sequences used as reference for the specificity filters. Multiple fasta files can be used as reference. 
Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest\n", + "- `ligation_region_size`: size of the seed region around the ligation site for blast seed region filter; set to 0 if ligation region should not be considered for blast search\n", + "- `specificity_blastn_search_parameters`: parameters for the BlastN search\n", + "- `specificity_blastn_hit_parameters`: parameters for the evaluation of BlastN hits\n", "\n", "*Note: Depending on the number of genes, this step might be time and memory consuming. For high number of genes, you might want to run this step on a bigger machine!*" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "## Define parameters\n", + "# Cross-hybridization filter\n", + "cross_hybridization_blastn_search_parameters = {\n", + " \"perc_identity\": 80,\n", + " \"strand\": \"minus\", # this parameter is fixed\n", + " \"word_size\": 10,\n", + " \"dust\": \"no\",\n", + " \"soft_masking\": \"false\",\n", + " \"max_target_seqs\": 10,\n", + "}\n", + "\n", + "cross_hybridization_blastn_hit_parameters = {\n", + " \"coverage\": 80 # can be turned into min_alignment_length\n", + "}\n", + "\n", + "\n", + "# Specificity Filter\n", + "files_fasta_reference_database = fasta_files\n", + "ligation_region_size = 5\n", + "specificity_blastn_search_parameters = {\n", + " \"perc_identity\": 80,\n", + " \"strand\": \"minus\", # this parameter is fixed, however if reference is whole genome, consider using \"both\"\n", + " \"word_size\": 10,\n", + " \"dust\": \"no\",\n", + " \"soft_masking\": \"false\",\n", + " \"max_target_seqs\": 10,\n", + " \"max_hsps\": 1000,\n", + "}\n", + "specificity_blastn_hit_parameters = {\n", + " \"coverage\": 50 # can be turned into min_alignment_length if more meaningful\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", 
"text": [ - "2023-05-26 21:18:46,642 [INFO] Parameters Specificity Filters:\n", - "2023-05-26 21:18:46,647 [INFO] probe_database = \n", - "2023-05-26 21:18:46,651 [INFO] ligation_region_size = 5\n", - "2023-05-26 21:18:46,652 [INFO] blast_word_size = 10\n", - "2023-05-26 21:18:46,653 [INFO] blast_percent_identity = 80\n", - "2023-05-26 21:18:46,654 [INFO] blast_coverage = 50\n", - "2023-05-26 21:18:46,654 [INFO] n_jobs = 2\n", - "2023-05-27 01:19:33,100 [INFO] Step - Filter Probes by Specificity: the database contains 570410 probes from 796 genes, while 3162504 probes and 86 genes have been deleted in this step.\n" + "2024-06-21 22:01:38,485 [INFO] Parameters Specificity Filters:\n", + "2024-06-21 22:01:38,488 [INFO] Function: filter_by_specificity\n", + "2024-06-21 22:01:38,489 [INFO] Parameter: oligo_database = \n", + "2024-06-21 22:01:38,493 [INFO] Parameter: files_fasta_reference_database = ['output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna', 'output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna']\n", + "2024-06-21 22:01:38,496 [INFO] Parameter: specificity_blastn_search_parameters = {'perc_identity': 80, 'strand': 'minus', 'word_size': 10, 'dust': 'no', 'soft_masking': 'false', 'max_target_seqs': 10, 'max_hsps': 1000}\n", + "2024-06-21 22:01:38,497 [INFO] Parameter: specificity_blastn_hit_parameters = {'coverage': 50}\n", + "2024-06-21 22:01:38,499 [INFO] Parameter: cross_hybridization_blastn_search_parameters = {'perc_identity': 80, 'strand': 'minus', 'word_size': 10, 'dust': 'no', 'soft_masking': 'false', 'max_target_seqs': 10}\n", + "2024-06-21 22:01:38,500 [INFO] Parameter: cross_hybridization_blastn_hit_parameters = {'coverage': 80}\n", + "2024-06-21 22:01:38,501 [INFO] Parameter: ligation_region_size = 5\n", + "2024-06-21 22:01:38,504 [INFO] 
Parameter: arm_Tm_dif_max = 2\n", + "2024-06-21 22:01:38,505 [INFO] Parameter: arm_length_min = 10\n", + "2024-06-21 22:01:38,506 [INFO] Parameter: arm_Tm_min = 50\n", + "2024-06-21 22:01:38,507 [INFO] Parameter: arm_Tm_max = 60\n", + "2024-06-21 22:01:38,508 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, 
-8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 
'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", + "2024-06-21 22:01:38,509 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3dd15df04e304ca9af5b3d1df0c0eacb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c6cb3f3871b2419c83ed6fc48edfc84b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b77e785a3f7e477193c239dc971a7268", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-22 01:35:54,325 [INFO] Step - Specificity Filters: database contains 408735 oligos from 806 genes.\n" ] } ], "source": [ "####### Load existing database #######\n", - "# load annotation files for Reference Database\n", - "# source = \"custom\"\n", - "# custom_params = {\n", - "# \"file_annotation\": \"./output/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf\",\n", - "# \"file_sequence\": \"./output/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna\",\n", - "# \"files_source\": \"NCBI\",\n", - "# \"species\": \"Homo_sapiens\",\n", - "# \"annotation_release\": \"110\",\n", - "# \"genome_assembly\": \"GRCh38.p14\", \n", - "# }\n", - "# probe_designer.load_annotations(source=source, source_params=custom_params)\n", + "# dir_database = os.path.join(dir_output, \"db_probes/2_db_probes_property_filter\")\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", "\n", - "# # load existing database\n", - "# file_database = \"./output/oligo_database/probe_database_property_filter.txt\"\n", - "# min_probes_per_gene = 3\n", - "# probe_database = probe_designer.load_probe_database(file_database=file_database, min_probes_per_gene=min_probes_per_gene)\n", - "\n", - "####### Apply Specificity Filter #######\n", - "ligation_region_size=5\n", - "blast_word_size=10\n", - "blast_percent_identity=80\n", - "blast_coverage=50\n", - "\n", - "probe_database, file_database = probe_designer.filter_probes_by_specificity(probe_database, ligation_region_size=ligation_region_size, \n", - " blast_word_size=blast_word_size, 
blast_percent_identity=blast_percent_identity, blast_coverage=blast_coverage, n_jobs=2)" + "## Apply specificity filters\n", + "probe_database, file_database = pipeline.filter_by_specificity(\n", + " oligo_database=probe_database,\n", + " files_fasta_reference_database=files_fasta_reference_database,\n", + " specificity_blastn_search_parameters=specificity_blastn_search_parameters,\n", + " specificity_blastn_hit_parameters=specificity_blastn_hit_parameters,\n", + " cross_hybridization_blastn_search_parameters=cross_hybridization_blastn_search_parameters,\n", + " cross_hybridization_blastn_hit_parameters=cross_hybridization_blastn_hit_parameters,\n", + " ligation_region_size=ligation_region_size,\n", + " arm_Tm_dif_max=arm_Tm_dif_max,\n", + " arm_length_min=arm_length_min,\n", + " arm_Tm_min=arm_Tm_min,\n", + " arm_Tm_max=arm_Tm_max,\n", + " Tm_parameters_probe=Tm_parameters_probe,\n", + " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + ")" ] }, { @@ -474,83 +930,151 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After applying different sets of filters to the probe database, we will create probesets for each gene, which are sets of probes that do not overlap and have a high efficiency score (calculated from melting temperature and GC content).\n", + "### Create Probe Sets of Non-Overlapping Probes\n", "\n", - "### Parameters for Oligo Efficiency Score\n", - "- Tm_min: minimum melting temperature of probes\n", - "- Tm_max: maximum melting temperature of probes\n", - "- Tm_opt: optimal melting temperature of probes\n", - "- Tm_weight: weight of the Tm of the probe in the efficiency score\n", - "- GC_content_min: minimum GC content of probes\n", - "- GC_content_max: maximum GC content of probes\n", - "- GC_content_opt: optimal GC content of probes\n", - "- GC_weight: weight of the GC content of the probe in the efficiency score\n", + "After applying different sets of filters to the probe database, we will create probesets for each 
gene, which are sets of probes that do not overlap and have a high efficiency score (calculated from melting temperature, GC content and isoform consensus).\n", "\n", - "### Parameters for Oligosets Generation\n", - "- probeset_size_opt: ideal number of oligos per probeset\n", - "- probeset_size_min: minimum number of oligos per probeset\n", - "- n_sets: maximum number of sets per gene" + "**Parameters for the Oligo Set Selection**\n", + "- `probe_isoform_weight`: weight of the isoform consensus of the probe in the efficiency score\n", + "- `probe_GC_content_opt`: optimal GC content of the probe; max and min values are defined above\n", + "- `probe_GC_weight`: weight of the GC content of the probe in the efficiency score\n", + "- `probe_Tm_opt`: optimal melting temperature of the probe; max and min values are defined above\n", + "- `probe_Tm_weight`: weight of the Tm of the probe in the efficiency score\n", + "- `probeset_size_min`: minimum size of probe sets (in case there exists no set of the optimal size) -> genes with fewer oligos will be filtered out and stored in regions_with_insufficient_oligos_for_db_probes\n", + "- `probeset_size_opt`: optimal size of probe sets\n", + "- `distance_between_probes`: how much overlap should be allowed between oligos, e.g. 
if oligos can overlap x bases choose -x, if oligos can be next to one another choose 0, if oligos should be x bases apart choose x\n", + "- `n_sets`: maximum number of sets to generate\n", + "- `max_graph_size`: maximum number of oligos that are taken into consideration in the last step (5000 -> ~5GB, 2500 -> ~1GB)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "## Define parameters\n", + "probe_isoform_weight = 2 \n", + "probe_GC_weight = 1 \n", + "probe_GC_content_opt = 50 \n", + "probe_Tm_weight = 1 \n", + "probe_Tm_opt = 70 \n", + "\n", + "probeset_size_min = 3 \n", + "probeset_size_opt = 5 \n", + "distance_between_probes = 0 \n", + "n_sets = 100 \n", + "max_graph_size = 5000 " + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-05-27 01:19:33,698 [INFO] Parameters Probesets:\n", - "2023-05-27 01:19:33,702 [INFO] probe_database = \n", - "2023-05-27 01:19:33,703 [INFO] probeset_size_opt = 5\n", - "2023-05-27 01:19:33,705 [INFO] probeset_size_min = 2\n", - "2023-05-27 01:19:33,706 [INFO] n_sets = 100\n", - "2023-05-27 01:19:33,706 [INFO] Tm_min = 52\n", - "2023-05-27 01:19:33,707 [INFO] Tm_max = 67\n", - "2023-05-27 01:19:33,708 [INFO] Tm_opt = 60\n", - "2023-05-27 01:19:33,709 [INFO] Tm_weight = 1\n", - "2023-05-27 01:19:33,710 [INFO] GC_content_min = 40\n", - "2023-05-27 01:19:33,710 [INFO] GC_content_max = 60\n", - "2023-05-27 01:19:33,711 [INFO] GC_content_opt = 50\n", - "2023-05-27 01:19:33,712 [INFO] GC_weight = 1\n", - "2023-05-27 01:19:33,713 [INFO] n_jobs = 2\n", - "2023-05-27 03:19:32,175 [INFO] Step - Generate Oligosets: the database contains 11525 probes from 756 genes, while 558885 probes and 40 genes have been deleted in this step.\n" + "2024-06-22 07:10:20,798 [INFO] Parameters Set Selection:\n", + "2024-06-22 07:10:20,800 [INFO] Function: create_probe_sets\n", + 
"2024-06-22 07:10:20,801 [INFO] Parameter: oligo_database = \n", + "2024-06-22 07:10:20,802 [INFO] Parameter: probe_isoform_weight = 2\n", + "2024-06-22 07:10:20,804 [INFO] Parameter: probe_Tm_weight = 1\n", + "2024-06-22 07:10:20,807 [INFO] Parameter: probe_Tm_min = 65\n", + "2024-06-22 07:10:20,808 [INFO] Parameter: probe_Tm_opt = 70\n", + "2024-06-22 07:10:20,809 [INFO] Parameter: probe_Tm_max = 75\n", + "2024-06-22 07:10:20,811 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, 
-15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, 
-11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", + "2024-06-22 07:10:20,818 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n", + "2024-06-22 07:10:20,819 [INFO] Parameter: probe_GC_weight = 1\n", + "2024-06-22 07:10:20,821 [INFO] Parameter: probe_GC_content_min = 40\n", + "2024-06-22 07:10:20,822 [INFO] Parameter: probe_GC_content_opt = 50\n", + "2024-06-22 07:10:20,823 [INFO] Parameter: probe_GC_content_max = 60\n", + "2024-06-22 07:10:20,825 [INFO] Parameter: probeset_size_opt = 5\n", + "2024-06-22 07:10:20,826 [INFO] Parameter: probeset_size_min = 3\n", + "2024-06-22 07:10:20,826 [INFO] Parameter: max_graph_size = 5000\n", + "2024-06-22 07:10:20,828 [INFO] Parameter: n_sets = 100\n", + "2024-06-22 07:10:20,829 [INFO] Parameter: distance_between_probes = 0\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "71e53103727a41f2b4c8db238482f478", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-22 07:16:26,067 [INFO] Step - Set Selection: database contains 15896 oligos from 729 genes.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-22 07:16:26,097 [DEBUG] handle_msg[71e53103727a41f2b4c8db238482f478]({'header': {'date': datetime.datetime(2024, 6, 22, 5, 16, 26, 92000, tzinfo=tzutc()), 'msg_id': 'ee0cb3c5-8ede-47d7-ae82-ec91e847eaa1', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': 'ee0cb3c5-8ede-47d7-ae82-ec91e847eaa1', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '71e53103727a41f2b4c8db238482f478', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mFind Oligosets\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m735/735\\x1b[0m \\x1b[33m0:05:57\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Find Oligosets ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 735/735 0:05:57 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ "####### Load existing database #######\n", - "# file_database = \"./output/oligo_database/oligo_database_specificity_filters.txt\"\n", - "# min_probes_per_gene = 3\n", - "# probe_database = probe_designer.load_probe_database(file_database=file_database, min_probes_per_gene=min_probes_per_gene)\n", - "\n", - "####### Apply Probe Set Selection #######\n", - "probeset_size_opt=5\n", - "probeset_size_min=2\n", - "n_sets=100\n", - "Tm_min=52\n", - "Tm_max=67\n", - "Tm_opt=60\n", - "Tm_weight=1\n", - "GC_content_min=40\n", - "GC_content_max=60\n", - "GC_content_opt=50\n", - "GC_weight=1\n", - "\n", - "probe_database, file_database, dir_oligosets = probe_designer.create_probe_sets(probe_database, \n", - " probeset_size_opt=probeset_size_opt, \n", - " probeset_size_min=probeset_size_min, \n", - " n_sets=n_sets, \n", - " Tm_min=Tm_min, \n", - " Tm_max=Tm_max, \n", - " Tm_opt=Tm_opt, \n", - " Tm_weight=Tm_weight, \n", - " GC_content_min=GC_content_min, \n", - " GC_content_max=GC_content_max, \n", - " GC_content_opt=GC_content_opt, \n", - " GC_weight=GC_weight, \n", - " n_jobs=2)" + "# dir_database = os.path.join(dir_output, \"db_probes/3_db_probes_specificity_filter\")\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", + "\n", + "## Apply probe selection\n", + "probe_database, file_database, dir_probesets = pipeline.create_probe_sets(\n", + " oligo_database=probe_database,\n", + " probe_isoform_weight=probe_isoform_weight,\n", + " probe_Tm_weight=probe_Tm_weight,\n", + " probe_Tm_min=probe_Tm_min,\n", + " probe_Tm_opt=probe_Tm_opt,\n", + " probe_Tm_max=probe_Tm_max,\n", + " 
Tm_parameters_probe=Tm_parameters_probe,\n", + " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + " probe_GC_weight=probe_GC_weight,\n", + " probe_GC_content_min=probe_GC_content_min,\n", + " probe_GC_content_opt=probe_GC_content_opt,\n", + " probe_GC_content_max=probe_GC_content_max,\n", + " probeset_size_opt=probeset_size_opt,\n", + " probeset_size_min=probeset_size_min,\n", + " max_graph_size=max_graph_size,\n", + " n_sets=n_sets,\n", + " distance_between_probes=distance_between_probes,\n", + ")" ] }, { @@ -558,17 +1082,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the probe database, the gene names are the keys of the database. All genes that do not have sufficient probes were removed from the database. \n", + "## Select Gene Panel\n", + "\n", + "In the probe database, the gene names are the keys of the database. All genes that do not have sufficient probes were removed from the database. \n", "Once we hve all genes with sufficient probes, we can run the gene set selection step. \n", "Therefore, we include additional metadata information to our adata object, i.e. the genes that have sufficient probes and the genes that fulfill both constraint (*highly variable* and *sufficient probes*)." 
] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ + "####### Load existing database #######\n", + "# dir_database = os.path.join(dir_output, \"db_probes/4_db_probes_probesets\")\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", + "\n", "# get gene names of genes with sufficient number of probes to proceed with next step\n", "genes_with_sufficient_probes = probe_database.database.keys()\n", "\n", @@ -597,13 +1128,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8670f070af094472be824ecf093d55a3", + "model_id": "14ca412bd28b44558d6a3140101a5ed8", "version_major": 2, "version_minor": 0 }, @@ -624,6 +1155,92 @@ "The genes selected for those cell types potentially don't generalize well. Find the genes for each of those cell types in self.genes_of_primary_trees after running self.select_probeset().\n" ] }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. 
For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. 
For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. 
For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n" + ] + }, { "data": { "text/html": [ @@ -649,76 +1266,157 @@ } ], "source": [ - "##### Select genes for gene panel #####\n", + "## Select genes for gene panel\n", "selector = sp.se.ProbesetSelector(pbmc_data, n=20, genes_key=\"pass_constraints\", celltype_key=\"celltype\", verbosity=1, save_dir=None)\n", "selector.select_probeset()\n", "selected_genes = selector.probeset.index[selector.probeset.selection]" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "## Remove all genes from the database that are not selected for the gene panel \n", + "probe_database.database = {key: value for key, value in probe_database.database.items() if key in selected_genes}\n", + "probe_database.oligosets = {key: value for key, value in probe_database.oligosets.items() if key in selected_genes}" + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Once we have all selected genes, we create the final \"read to order\" probe sequences. 
\n", - "Calling the fuction below will produce two files, *[padlock, merfish, seqfish]_probes* and *[padlock, merfish, seqfish]_probes_order*.\n", + "## Get Final Padlock Probe Sequences\n", + "\n", + "Once we have all selected genes, we create the final \"ready to order\" probe sequences. \n", + "Calling the function below will produce two files, *[padlock, merfish, seqfish]_probes.yaml* and *[padlock, merfish, seqfish]_probes_order.yaml*.\n", "The latter file contains the ready to order probe sequences for each gene.\n", "\n", - "### Parameters for Padlock Final Sequence Design\n", - "- detect_oligo_length_min: minimum length of detection oligo\n", - "- detect_oligo_length_max: maximum length of detection oligo\n", - "- detect_oligo_Tm_opt: optimal melting temperature of detection oligo\n", - "- Tm_parameters_detection_oligo: melting temperature parameters for detection oligo design\n", - "- Tm_chem_correction_param_detection_oligo: parameters for chemical correction of melting temperature for detection oligo design\n", + "**Parameters for Padlock Sequence Design**\n", + "- `U_distance`: preferred minimal distance between U(racils)\n", + "- `detect_oligo_Tm_opt`: optimal melting temperature of detection probe\n", + "- `top_n_sets`: maximum number of sets to report in *[padlock, merfish, seqfish]_probes.yaml* and *[padlock, merfish, seqfish]_probes_order.yaml*\n", "\n", - "*Note: The melting temperature is used in 2 different stages (probe and detection oligo design), where a few parameters are shared and the others differ. Parameters for melting temperature -> for more information on parameters, see: [here](https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN)*" + "*Note: The melting temperature is used in 2 different stages (probe and detection oligo design), where a few parameters are shared and the others differ. 
Parameters for melting temperature -> for more information on parameters, see: [here](https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN)*\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "##### Remove all genes from the database that are not selected for the gene panel ####\n", - "probe_database.database = {key: value for key, value in probe_database.database.items() if key in selected_genes}\n", - "probe_database.oligosets = {key: value for key, value in probe_database.oligosets.items() if key in selected_genes}" + "## Define parameters\n", + "U_distance = 5 \n", + "detect_oligo_Tm_opt = 56\n", + "top_n_sets = 3 \n", + "\n", + "Tm_parameters_detection_oligo = {\n", + " \"check\": True, # default\n", + " \"strict\": True, # default\n", + " \"c_seq\": None, # default\n", + " \"shift\": 0, # default\n", + " \"nn_table\": getattr(mt, \"DNA_NN3\"), # Allawi & SantaLucia (1997)\n", + " \"tmm_table\": getattr(mt, \"DNA_TMM1\"), #default\n", + " \"imm_table\": getattr(mt, \"DNA_IMM1\"), #default\n", + " \"de_table\": getattr(mt, \"DNA_DE1\"), #default\n", + " \"dnac1\": 50, # [nM]\n", + " \"dnac2\": 0, # [nM]\n", + " \"selfcomp\": False, # default\n", + " \"saltcorr\": 7, # Owczarzy et al. 
(2008)\n", + " \"Na\": 39, # [mM]\n", + " \"K\": 0, # [mM] default\n", + " \"Tris\": 0, # [mM] default\n", + " \"Mg\": 0, # [mM] default\n", + " \"dNTPs\": 0, # [mM] default\n", + "}\n", + "\n", + "Tm_chem_correction_param_detection_oligo = {\n", + " \"DMSO\": 0, # default\n", + " \"fmd\": 30,\n", + " \"DMSOfactor\": 0.75, # default\n", + " \"fmdfactor\": 0.65, # default\n", + " \"fmdmethod\": 1, # default\n", + " \"GC\": None, # default\n", + "}" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-05-27 03:23:40,007 [DEBUG] handle_msg[8670f070af094472be824ecf093d55a3]({'header': {'date': datetime.datetime(2023, 5, 27, 1, 23, 39, 934000, tzinfo=tzutc()), 'msg_id': '5a26b004-efe4-482a-a30b-7bed1068077b', 'msg_type': 'comm_msg', 'session': 'f80bbb6a-f083-4dbc-9c24-00aa83c84915', 'username': '16c5b64d-425a-4c50-b860-381f99d9c018', 'version': '5.2'}, 'msg_id': '5a26b004-efe4-482a-a30b-7bed1068077b', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '8670f070af094472be824ecf093d55a3', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': '\\x1b[1;30mSPAPROS PROBESET SELECTION: \\x1b[0m \\x1b[33m0:04:06\\x1b[0m\\n\\x1b[1;34mSelect pca genes..........................................\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m100%\\x1b[0m \\x1b[33m0:00:00\\x1b[0m\\n\\x1b[1;34mTrain baseline forest based on DE genes...................\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 4/4\\x1b[0m \\x1b[33m0:03:12\\x1b[0m\\n \\x1b[1;2;36mSelect DE genes.........................................\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 8/8\\x1b[0m \\x1b[33m0:00:00\\x1b[0m\\n \\x1b[1;2;36mTrain prior forest for DE_baseline 
forest...............\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 3/3\\x1b[0m \\x1b[33m0:00:40\\x1b[0m\\n \\x1b[1;2;36mIteratively add DE genes to DE_baseline forest..........\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 3/3\\x1b[0m \\x1b[33m0:01:41\\x1b[0m\\n \\x1b[1;2;36mTrain final baseline forest on all celltypes............\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 3/3\\x1b[0m \\x1b[33m0:00:49\\x1b[0m\\n\\x1b[1;34mTrain final forests.......................................\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 3/3\\x1b[0m \\x1b[33m0:00:53\\x1b[0m\\n \\x1b[1;2;36mTrain forest on pre/prior/pca selected genes............\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m 3/3\\x1b[0m \\x1b[33m0:00:53\\x1b[0m\\n \\x1b[1;2;36mInitial results are good enough. No genes are added.......................................\\x1b[0m \\n\\x1b[1;34mCompile probeset list.....................................\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[35m100%\\x1b[0m \\x1b[33m0:00:00\\x1b[0m\\n\\x1b[1;30mFINISHED\\x1b[0m \\n \\n', 'text/html': '
SPAPROS PROBESET SELECTION:                                                                      0:04:06\\nSelect pca genes.......................................... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  100% 0:00:00\\nTrain baseline forest based on DE genes................... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   4/4 0:03:12\\n  Select DE genes......................................... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   8/8 0:00:00\\n  Train prior forest for DE_baseline forest............... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   3/3 0:00:40\\n  Iteratively add DE genes to DE_baseline forest.......... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   3/3 0:01:41\\n  Train final baseline forest on all celltypes............ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   3/3 0:00:49\\nTrain final forests....................................... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   3/3 0:00:53\\n  Train forest on pre/prior/pca selected genes............ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━   3/3 0:00:53\\n  Initial results are good enough. No genes are added.......................................  \\nCompile probeset list..................................... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  100% 0:00:00\\nFINISHED  \\n          \\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2d76b817415d4832a9e5157a18286605", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ - "2023-05-27 03:23:40,069 [INFO] Parameters Final Sequence Design:\n", - "2023-05-27 03:23:40,070 [INFO] probe_database = \n", - "2023-05-27 03:23:40,072 [INFO] detect_oligo_length_min = 18\n", - "2023-05-27 03:23:40,073 [INFO] detect_oligo_length_max = 25\n", - "2023-05-27 03:23:40,074 [INFO] detect_oligo_Tm_opt = 32\n", - "2023-05-27 03:23:40,076 [INFO] Tm_parameters_detection_oligo = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': 'DNA_NN3', 'tmm_table': 'DNA_TMM1', 'imm_table': 'DNA_IMM1', 'de_table': 'DNA_DE1', 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'dNTPs': 0, 'saltcorr': 7, 'Na': 39, 'K': 0, 'Tris': 0, 'Mg': 0}\n", - "2023-05-27 03:23:40,077 [INFO] Tm_chem_correction_param_detection_oligo = {'DMSO': 0, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None, 'fmd': 30}\n", - "2023-05-27 03:23:40,347 [INFO] Step - Design Final Padlock Sequences: padlock sequences are stored in './output/padlock_sequences/padlock_sequences' directory.\n" + "2024-06-22 07:24:22,481 [DEBUG] handle_msg[2d76b817415d4832a9e5157a18286605]({'header': {'date': datetime.datetime(2024, 6, 22, 5, 24, 22, 468000, tzinfo=tzutc()), 'msg_id': '579a2685-fd7f-4de7-95b6-e0219bda2ea0', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': '579a2685-fd7f-4de7-95b6-e0219bda2ea0', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '2d76b817415d4832a9e5157a18286605', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mDesign Final Padlock Sequence\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m20/20\\x1b[0m \\x1b[33m0:00:22\\x1b[0m < 
\\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Design Final Padlock Sequence ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20/20 0:00:22 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ - "##### Design final sequences #####\n", - "detect_oligo_length_min = 18\n", - "detect_oligo_length_max = 25\n", - "detect_oligo_Tm_opt = 32\n", + "## Design final sequences \n", + "probe_database = pipeline.design_final_padlock_sequence(\n", + " oligo_database=probe_database,\n", + " min_thymines=min_thymines,\n", + " U_distance=U_distance,\n", + " detect_oligo_length_min=detect_oligo_length_min,\n", + " detect_oligo_length_max=detect_oligo_length_max,\n", + " detect_oligo_Tm_opt=detect_oligo_Tm_opt,\n", + " Tm_parameters_detection_oligo=Tm_parameters_detection_oligo,\n", + " Tm_chem_correction_param_detection_oligo=Tm_chem_correction_param_detection_oligo,\n", + " Tm_parameters_probe=Tm_parameters_probe,\n", + " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + ")\n", + "\n", + "## Compute all required probe attributes for output\n", + "probe_database = pipeline.compute_probe_attributes(\n", + " oligo_database=probe_database,\n", + " Tm_parameters_probe=Tm_parameters_probe,\n", + " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + ")\n", "\n", - "probe_designer.create_final_sequences(probe_database, detect_oligo_length_min, detect_oligo_length_max, detect_oligo_Tm_opt)" + "## Write output to files\n", + "pipeline.generate_output(oligo_database=probe_database, top_n_sets=top_n_sets)" ] } ], @@ -738,7 +1436,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.14" }, "vscode": { "interpreter": { diff --git a/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb b/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb index 0cf9b08..20b9f8f 100644 --- a/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb +++ b/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb @@ -8,19 +8,33 @@ "hide-cell" ] }, - "outputs": [], + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.13.1 pandas==1.5.3 scikit-learn==1.5.0 statsmodels==0.14.2 igraph==0.11.5 pynndescent==0.5.12\n" + ] + } + ], "source": [ "from IPython.display import HTML, display, Image\n", "\n", + "import yaml\n", "import warnings\n", - "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter('ignore')\n", + "\n", + "import subprocess\n", + "import scanpy as sc\n", + "sc.settings.verbosity = 0\n", + "sc.logging.print_header()\n", "\n", "import pandas as pd\n", "#pd.set_option(\"max_columns\", None) # show all cols\n", "pd.set_option('max_colwidth', None) # show full width of showing cols\n", "pd.set_option(\"expand_frame_repr\", False) # print cols side by side as it's supposed to be\n", "pd.options.display.max_seq_items = 200000\n", - "pd.options.display.max_rows = 400000\n" + "pd.options.display.max_rows = 400000" ] }, { @@ -40,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -50,8 +64,13 @@ "" ] }, - "metadata": {}, - "output_type": "display_data" + "execution_count": 2, + "metadata": { + "image/png": { + "width": 900 + } + }, + "output_type": "execute_result" } ], "source": [ @@ -69,57 +88,50 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Besides `spapros` also install `oligo_designer_toolsuite` if not done already. First we need to install some dependencies:\n", + "Besides `spapros` also install `oligo_designer_toolsuite`. Therefore, first setup a conda environment (packages is tested for Python 3.9 - 3.10), e.g.:\n", + "\n", + "```bash\n", + "conda create -n odt python=3.10\n", + "conda activate odt\n", + "```\n", + "\n", + "Then, install the required dependencies, i.e. **Blast** (2.15 or higher), **BedTools** (2.30 or higher), **Bowtie** (1.3 or higher) and **Bowtie2** (2.5 or higher), that need to be installed independently. 
To install those tools via conda, please add the conda-forge and Bioconda channels to your conda configuration and update all packages in your environment:\n", "\n", "```bash\n", - "conda config --add channels bioconda\n", "conda config --add channels conda-forge\n", - "conda update conda\n", + "conda config --add channels bioconda \n", "conda update --all\n", "\n", - "conda install \"blast>=2.12\"\n", + "conda install \"blast>=2.15.0\"\n", "conda install \"bedtools>=2.30\"\n", "conda install \"bowtie>=1.3.1\"\n", "conda install \"bowtie2>=2.5\"\n", "```\n", "\n", - "\n", - "To run the code below we need to install the current dev version of the oligo designer:\n", + "All other required packages are automatically installed during the `pip` installation:\n", "\n", "```bash\n", "git clone https://github.com/HelmholtzAI-Consultants-Munich/oligo-designer-toolsuite.git\n", "cd oligo-designer-toolsuite\n", "git switch pipelines\n", "pip install -e .\n", - "```\n", - "\n", - "Otherwise, if that didn't work, try:\n", - "```bash\n", - "pip install oligo_designer_toolsuite\n", "```" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "scanpy==1.9.3 anndata==0.9.2 umap==0.5.3 numpy==1.24.4 scipy==1.11.2 pandas==1.5.3 scikit-learn==1.3.0 statsmodels==0.14.0 python-igraph==0.9.11 pynndescent==0.5.10\n", - "spapros==0.1.3\n" + "spapros==0.1.5\n" ] } ], "source": [ - "import yaml\n", - "import pandas as pd\n", - "import scanpy as sc\n", - "sc.settings.verbosity = 0\n", - "sc.logging.print_header()\n", - "\n", "import spapros as sp\n", "print(f\"spapros=={sp.__version__}\")" ] @@ -139,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -158,11 +170,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# Save genes as 
a list in ./my_genes.txt\n", + "# Save genes as a list in ./my_genes.txt -> if the file is renamed it needs to be changed in the config file as well\n", + "pd.DataFrame(highly_variable_genes).to_csv('./my_genes.txt', index=False, header=False)" ] }, @@ -172,20 +184,139 @@ "source": [ "## Probe Design\n", "\n", - "To conduct the probe design, run the `_probe_designer` cli command of the oligo-designer-toolsuite from the terminal:\n", + "Before we start with the probe design we have to create genomic region fasta files from which the probe database is created and which are used as background database for the specificity filter. Therefore, run the *genomic_region_generator* cli command of the oligo-designer-toolsuite from the terminal:\n", "\n", "```bash\n", - "scrinshot_probe_designer --file_genes ./my_genes.txt -o ./output --n_jobs 2 --min_probes_per_gene 3 --source ncbi --taxon vertebrate_mammalian --species Homo_sapiens --annotation_release 110\n", + "genomic_region_generator -c genomic_region_generator_ncbi.yaml\n", "```\n", "\n", - "Alternativ pipelines: `merfish_probe_designer`, `seqfish_probe_designer` (see our [resource table](https://www.google.com/url?q=https://docs.google.com/spreadsheets/d/1NCDLscdmzn32U7_IKy6OKYHqfECn76x0pAD3KFhuJgQ/edit%23gid%3D0&sa=D&source=docs&ust=1692975800552487&usg=AOvVaw20CRwiObnVkWVS54CcqXMq) for an overview of differences between the technologies)\n", + "A default configuration file for the genomic region generator with NCBI annotations can be downloaded from [here](https://github.com/HelmholtzAI-Consultants-Munich/oligo-designer-toolsuite/blob/pipelines/data/configs/genomic_region_generator_ncbi.yaml).\n", + "\n", + "*Note: if an error occurs for the unzipping of files, this might be due to a faulty download of files from the ftp server. In this case, try to download the files manually from the ftp server and use those files as input for the pipeline with custom input files. 
See `spapros_tutorial_end_to_end_selection.ipynb` tutorial for more information.*\n", + "\n", + "To conduct the probe design, run the *\\_probe_designer* cli command of the oligo-designer-toolsuite from the terminal:\n", "\n", - "More info can be obtained with \n", "```bash\n", - "scrinshot_probe_designer --help\n", + "scrinshot_probe_designer -c scrinshot_probe_designer.yaml\n", "```\n", "\n", - "Also note that when running the pipeline, a file `./output/config_scrinshot_ncbi.yaml` is created. It contains all parameters that define the pipeline run. Except of the paramters that define the species and annotation release it makes sense to keep the default parameters. In the [long version](spapros_tutorial_end_to_end_selection.ipynb) of this tutorial the parameters for each pipeline step are described." + "A default configuration file for the scrinshot probe designer can be downloaded from [here](https://github.com/HelmholtzAI-Consultants-Munich/oligo-designer-toolsuite/blob/pipelines/data/configs/scrinshot_probe_designer.yaml). \n", + "**Important:** add the filename of the gene list and the file names of the fasta files to the configuration file of the pipeline. From the logging file of the previous step you can retrieve the names of the generated fasta files. 
\n", + "\n", + "Alternative pipelines: *merfish_probe_designer*, *seqfish_probe_designer* (see our [resource table](https://www.google.com/url?q=https://docs.google.com/spreadsheets/d/1NCDLscdmzn32U7_IKy6OKYHqfECn76x0pAD3KFhuJgQ/edit%23gid%3D0&sa=D&source=docs&ust=1692975800552487&usg=AOvVaw20CRwiObnVkWVS54CcqXMq) for an overview of differences between the technologies)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "2024-06-19 21:20:21,234 [INFO] Parameters Load Annotations:\n", + "2024-06-19 21:20:21,235 [INFO] source = ncbi\n", + "2024-06-19 21:20:21,235 [INFO] source_params = {'taxon': 'vertebrate_mammalian', 'species': 'Homo_sapiens', 'annotation_release': 110}\n", + "2024-06-19 21:28:20,814 [WARNING] /Users/lisa.barros/Desktop/oligo-designer-toolsuite/oligo_designer_toolsuite/utils/_sequence_parser.py:104: DtypeWarning: Columns (0) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " csv_df = pd.read_csv(csv_file, sep=\"\\t\", names=self.GFF_HEADER, header=None)\n", + "\n", + "2024-06-19 21:28:56,907 [INFO] The following annotation files are used for GTF annotation of regions: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf and for fasta sequence file: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna .\n", + "2024-06-19 21:28:56,908 [INFO] The annotations are from NCBI source, for the species: Homo_sapiens, release number: 110 and genome assembly: GRCh38.p14\n", + "2024-06-19 21:32:27,987 [INFO] The genomic region 'exon' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n", + "2024-06-19 21:44:45,118 [INFO] The genomic region 'exon_exon_junction' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n" + ] + } + ], + "source": [ + "cmd = \"genomic_region_generator -c data/genomic_region_generator_ncbi.yaml\"\n", + "process = subprocess.Popen(cmd, shell=True).wait()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to 
deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: 
BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", + "\n", + "Due to the on going maintenance burden of keeping command line application\n", + "wrappers up to date, we have decided to deprecate and eventually remove these\n", + "modules.\n", + "\n", + "We instead now recommend building your command line and invoking it directly\n", + "with the subprocess module.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mDatabase Loading\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:09:27\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:04\u001b[0m03:37\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mProperty Filter\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:00:48\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:04\u001b[0m01:38\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Exact Matches\u001b[0m \u001b[90m━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m0:04:27\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:01\u001b[0m00:21\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Blastn Specificity\u001b[0m \u001b[90m━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:13:44\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:05\u001b[0m06:41\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Blastn Crosshybridization\u001b[0m \u001b[90m━━━━━\u001b[0m \u001b[32m887/…\u001b[0m \u001b[33m0:02:…\u001b[0m < \u001b[36m0:00:…\u001b[0m < \u001b[36m0:00:…\u001b[0m:00:…\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mFind Oligosets\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m810/810\u001b[0m \u001b[33m2:42:58\u001b[0m < \u001b[36m0:00:00\u001b[0m< 
\u001b[36m0:00:04\u001b[0m02:54\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mDesign Final Padlock Sequence\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m741/741\u001b[0m \u001b[33m0:11:26\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:01\u001b[0m00:38\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "cmd = \"scrinshot_probe_designer -c data/scrinshot_probe_designer.yaml\"\n", + "process = subprocess.Popen(cmd, shell=True).wait()" ] }, { @@ -197,53 +328,229 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "##### Load probe design filter #####\n", - "genes_without_enough_probes = pd.read_csv('./output/regions_with_insufficient_oligos.txt', index_col=0, sep=\"\\t\").index.tolist()\n", + "genes_without_enough_probes = pd.read_csv('output_scrinshot_probe_designer/db_probes/regions_with_insufficient_oligos_for_db_probes.txt', index_col=0, sep=\"\\t\").index.tolist()\n", "adata.var[\"has_enough_probes\"] = [g not in genes_without_enough_probes for g in adata.var_names]\n", - "adata.var[\"pass_constraints\"] = adata.var[\"has_enough_probes\"] & adata.var[\"highly_variable\"]\n", - "\n", + "adata.var[\"pass_constraints\"] = adata.var[\"has_enough_probes\"] & adata.var[\"highly_variable\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "856cfa65e91446989bda6ed9bffa59e2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: The following celltypes' test set sizes for forest training are below min_test_n (=20):\n", + "\t Dendritic cells : 9\n", + "\t Megakaryocytes : 3\n", + "The genes selected for those cell types potentially don't generalize well. 
Find the genes for each of those cell types in self.genes_of_primary_trees after running self.select_probeset().\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. 
For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. 
For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n", + "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/threadpoolctl.py:1214: RuntimeWarning: \n", + "Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at\n", + "the same time. Both libraries are known to be incompatible and this\n", + "can cause random crashes or deadlocks on Linux when loaded in the\n", + "same Python program.\n", + "Using threadpoolctl may cause crashes or deadlocks. For more\n", + "information and possible workarounds, please see\n", + " https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md\n", + "\n", + " warnings.warn(msg, RuntimeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ "##### Select genes for gene panel #####\n", "selector = sp.se.ProbesetSelector(adata, n=20, genes_key=\"pass_constraints\", celltype_key=\"celltype\", verbosity=1, save_dir=None)\n", "selector.select_probeset()\n", - "selected_genes = selector.probeset.index[selector.probeset.selection]\n", - "\n", + "selected_genes = selector.probeset.index[selector.probeset.selection]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ "##### Get probes of genes #####\n", - "with open('./output/padlock_sequences/padlock_probes_order.yml', 'r') as file:\n", + "with open('output_scrinshot_probe_designer/padlock_probes_order.yml', 'r') as file:\n", " all_probes = yaml.safe_load(file)\n", " probes = {g: all_probes[g] for g in selected_genes}" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected genes: ['CD79A', 'CST3', 'CTSS', 'FCER1G', 'FCGR3A', 'FCN1', 'FGFBP2', 'GNLY', 'GPX1', 'GRN', 'GZMA', 'GZMH', 'GZMK', 'IL32', 'LYAR', 'MAL', 'MS4A1', 'PF4', 'PRF1', 'TYROBP']\n" + ] + }, { "data": { "text/plain": [ - "{'AAGAB_oligo1': {'padlock_probe_full_sequence': 'ATCATCACCTCAGGTAACCTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCAAACCTATCTTCTTTCACTCTATCGCAGACCAAG',\n", - " 'detection_probe_sequence': 'GCAGACCAAGATCAUCACCU[fluorophore]'},\n", - " 'AAGAB_oligo2': {'padlock_probe_full_sequence': 'GTGTGAGCCTTATTTCTTTCTGTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCAAACCTATCTTCTTTGGTAGTGTCTGGAGGG',\n", - " 'detection_probe_sequence': '[fluorophore]UCTGGAGGGGUGTGAGCC'},\n", - " 'AAGAB_oligo3': {'padlock_probe_full_sequence': 'GAGAGGGATTCAGTACTATCTGTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCAAACCTATCTTCTTTACCACCCCGATGATCA',\n", - " 'detection_probe_sequence': 'CCGATGAUCAGAGAGGGATUC[fluorophore]'},\n", - " 
'AAGAB_oligo4': {'padlock_probe_full_sequence': 'CTAGAAGAGGCCAATATCCCTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCAAACCTATCTTCTTTGACCTTCATCCCATATGTCC',\n", - " 'detection_probe_sequence': '[fluorophore]CCAUATGTCCCUAGAAGAGGC'},\n", - " 'AAGAB_oligo5': {'padlock_probe_full_sequence': 'ATCACAGTGAGGGCTAATGTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCAAACCTATCTTCTTTCAGAGCCACAGCAGTAATG',\n", - " 'detection_probe_sequence': 'ACAGCAGTAATGAUCACAGUGAG[fluorophore]'}}" + "{'oligoset_1': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", + " 'sequence_detection_oligo': 'CCGTTTCATTGTGATAGAACCAGCAGTTGCUCCCACAGU[fluorophore]'},\n", + " 'CTSS::3064': {'sequence_padlock_probe': 'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", + " 'sequence_detection_oligo': 'CAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGAC[fluorophore]'},\n", + " 'CTSS::22579': {'sequence_padlock_probe': 'GCGTCTGAGTCGATGCCCTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTATCCATGGCTTTGTAGGGATAGGAA',\n", + " 'sequence_detection_oligo': 'TGGCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCU[fluorophore]'},\n", + " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", + " 'sequence_detection_oligo': 'GCCCAGATCGTATGAGTGCATTCCCATUGAATGCUCCAGG[fluorophore]'},\n", + " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", + " 'sequence_detection_oligo': 'ACAGCACUGAAAGCCCAGCAAGCACCACAAGAACCCATGU[fluorophore]'}},\n", + " 'oligoset_2': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", + " 'sequence_detection_oligo': 'CCGTTTCATTGTGATAGAACCAGCAGTTGCUCCCACAGU[fluorophore]'},\n", + " 'CTSS::3064': {'sequence_padlock_probe': 
'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", + " 'sequence_detection_oligo': 'CAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGAC[fluorophore]'},\n", + " 'CTSS::2173': {'sequence_padlock_probe': 'GTCTGAGTCGATGCCCTTGTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTATGGCTTTGTAGGGATAGGAAGC',\n", + " 'sequence_detection_oligo': 'GCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCTTGU[fluorophore]'},\n", + " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", + " 'sequence_detection_oligo': 'GCCCAGATCGTATGAGTGCATTCCCATUGAATGCUCCAGG[fluorophore]'},\n", + " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", + " 'sequence_detection_oligo': 'ACAGCACUGAAAGCCCAGCAAGCACCACAAGAACCCATGU[fluorophore]'}},\n", + " 'oligoset_3': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", + " 'sequence_detection_oligo': 'CCGTTTCATTGTGATAGAACCAGCAGTTGCUCCCACAGU[fluorophore]'},\n", + " 'CTSS::3064': {'sequence_padlock_probe': 'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", + " 'sequence_detection_oligo': 'CAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGAC[fluorophore]'},\n", + " 'CTSS::22579': {'sequence_padlock_probe': 'GCGTCTGAGTCGATGCCCTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTATCCATGGCTTTGTAGGGATAGGAA',\n", + " 'sequence_detection_oligo': 'TGGCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCU[fluorophore]'},\n", + " 'CTSS::1139': {'sequence_padlock_probe': 'CCCATTGAATGCTCCAGGTTGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTGCCCAGATCGTATGAGTGCATT',\n", + " 'sequence_detection_oligo': 'CCAGATCGTATGAGTGCATTCCCATTGAATGCUCCAGGU[fluorophore]'},\n", + " 'CTSS::22236': {'sequence_padlock_probe': 
'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", + " 'sequence_detection_oligo': 'ACAGCACUGAAAGCCCAGCAAGCACCACAAGAACCCATGU[fluorophore]'}}}" ] }, - "execution_count": 22, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "probes[\"AAGAB\"]" + "print(f\"Selected genes: {sorted(list(probes.keys()))}\")\n", + "probes[\"CTSS\"]" ] } ], @@ -263,7 +570,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.14" }, "orig_nbformat": 4 }, From d58eacdfd21231f392feb6207a9c2f3d98071a5d Mon Sep 17 00:00:00 2001 From: Lisa Sousa Date: Thu, 11 Jul 2024 21:59:37 +0200 Subject: [PATCH 2/3] adapt to new oligo-designer toolsuite package --- .../data/scrinshot_probe_designer.yaml | 27 +- ...papros_tutorial_end_to_end_selection.ipynb | 270 +++++++++--------- 2 files changed, 156 insertions(+), 141 deletions(-) diff --git a/docs/_tutorials/data/scrinshot_probe_designer.yaml b/docs/_tutorials/data/scrinshot_probe_designer.yaml index 2254482..87325cd 100644 --- a/docs/_tutorials/data/scrinshot_probe_designer.yaml +++ b/docs/_tutorials/data/scrinshot_probe_designer.yaml @@ -3,11 +3,13 @@ ####################### ### General parameters -n_jobs: 4 # number of cores used to run the pipeline +### ----------------------------------------------- +n_jobs: 4 # number of cores used to run the pipeline and 2*n_jobs +1 of regions that should be stored in cache. 
If memory consumption of the pipeline is too high, reduce this number; if a lot of RAM is available, increase this number to decrease runtime dir_output: output_scrinshot_probe_designer # name of the directory where the output files will be written write_intermediate_steps: true # if true, writes the oligo sequences after each step of the pipeline into a csv file ### Parameters for probe sequences generation +### ----------------------------------------------- file_regions: my_genes.txt # file with a list the genes used to generate the oligos sequences, leave empty if all the genes are used files_fasta_probe_database: # fasta file with sequences form which the oligos should be generated. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest - output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna @@ -16,7 +18,8 @@ probe_length_min: 40 #min length of oligos probe_length_max: 45 #max length of oligos ### Parameters for the property filers, i.e. properties that the sequences should fulfill -# probe sequence +### ----------------------------------------------- +## target probe sequence probe_GC_content_min: 40 # minimum GC content of oligos probe_GC_content_max: 60 # maximum GC content of oligos probe_Tm_min: 65 # minimum melting temperature of oligos @@ -26,23 +29,25 @@ homopolymeric_base_n: # minimum number of nucleotides to consider it a homopolym T: 5 C: 5 G: 5 -# padlock arms +## padlock arms arm_Tm_dif_max: 2 # maximum melting temperature difference of both arms (difference shouldn't be higher than 5! But range is not super important, the lower the better) arm_length_min: 10 # minimum length of each arm arm_Tm_min: 50 # minimum melting temperature of each arm arm_Tm_max: 60 # maximum melting temperature of each arm -# detection oligos +## detection oligos min_thymines: 2 # minimal number of Thymines in detection oligo. 
detect_oligo_length_min: 15 # minimum length of detection probe detect_oligo_length_max: 40 # maximum length of detection probe ### Parameters for the specificity filters +### ----------------------------------------------- files_fasta_reference_database: # fasta file with sequences used as reference for the specificity filters. Hint: use the genomic_region_generator pipeline to create fasta files of genomic regions of interest - output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna - output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna ligation_region_size: 5 # size of the seed region around the ligation site for blast seed region filter; set to 0 if ligation region should not be considered for blast search -### Parameters for the Oligo set selection +### Parameters for set selection +### ----------------------------------------------- probe_isoform_weight: 2 # weight of the isoform consensus of the probe in the efficiency score probe_GC_content_opt: 50 # max and min values are defiend above probe_GC_weight: 1 # weight of the GC content of the probe in the efficiency score @@ -54,7 +59,8 @@ probeset_size_opt: 5 # optimal size of probe sets distance_between_probes: 0 # how much overlap should be allowed between oligos, e.g. 
if oligos can overlpap x bases choose -x, if oligos can be next to one another choose 0, if oligos should be x bases apart choose x n_sets: 100 # maximum number of sets to generate -### Parameters for creation of final probe sequence +### Parameters for final sequence design +### ----------------------------------------------- U_distance: 5 # preferred minimal distance between U(racils) detect_oligo_Tm_opt: 56 # optimal melting temperature of detection probe top_n_sets: 3 #maximum number of sets to report in padlock_probes.yaml and "padlock_probes_order.yaml" @@ -64,6 +70,7 @@ top_n_sets: 3 #maximum number of sets to report in padlock_probes.yaml and "padl ############################ ### Parameters for the specificity filters +### ----------------------------------------------- # Specificity filter with BlastN specificity_blastn_search_parameters: perc_identity: 80 @@ -89,12 +96,15 @@ cross_hybridization_blastn_hit_parameters: ### Parameters for the Oligo set selection +### ----------------------------------------------- max_graph_size: 5000 # maximum number of oligos that are taken into consisderation in the last step (5000 -> ~5GB, 2500 -> ~1GB) ### Parameters for Melting Temperature +### ----------------------------------------------- # The melting temperature is used in 2 different stages (property filters and padlock detection probe design), where a few parameters are shared and the others differ. 
# parameters for melting temperature -> for more information on parameters, see: https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html#Bio.SeqUtils.MeltingTemp.Tm_NN +## target probe Tm_parameters_probe: check: true #default strict: true #default @@ -122,6 +132,9 @@ Tm_chem_correction_param_probe: fmdmethod: 1 #default GC: null #default +Tm_salt_correction_param_probe: null # if salt correction desired, please add parameters below + +## detection oligo Tm_parameters_detection_oligo: check: true #default strict: true #default @@ -148,3 +161,5 @@ Tm_chem_correction_param_detection_oligo: fmdfactor: 0.65 #default fmdmethod: 1 #default GC: null #default + +Tm_salt_correction_param_detection_oligo: null # if salt correction desired, please add parameters below \ No newline at end of file diff --git a/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb b/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb index c8e70df..d06a764 100644 --- a/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb +++ b/docs/_tutorials/spapros_tutorial_end_to_end_selection.ipynb @@ -279,9 +279,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-08 14:35:32,314 [INFO] Parameters Load Annotations:\n", + "2024-07-08 14:35:32,316 [INFO] source = ncbi\n", + "2024-07-08 14:35:32,317 [INFO] source_params = {'taxon': 'vertebrate_mammalian', 'species': 'Homo_sapiens', 'annotation_release': 110}\n", + "2024-07-08 14:44:42,316 [INFO] The following annotation files are used for GTF annotation of regions: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf and for fasta sequence file: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna .\n", + 
"2024-07-08 14:44:42,381 [INFO] The annotations are from NCBI source, for the species: Homo_sapiens, release number: 110 and genome assembly: GRCh38.p14\n", + "2024-07-08 14:48:17,407 [INFO] The genomic region 'exon' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n", + "2024-07-08 15:00:11,519 [INFO] The genomic region 'exon_exon_junction' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n" + ] + } + ], "source": [ "# Generate the genomic regions\n", "region_generator = pipeline.load_annotations(\n", @@ -298,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -325,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -338,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -370,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -382,62 +396,26 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 18:43:04,735 [INFO] Parameters Create Database:\n", - "2024-06-21 18:43:04,739 [INFO] Function: create_probe_database\n", - "2024-06-21 18:43:04,741 [INFO] Parameter: gene_ids = ['AAGAB', 'AATF', 'ABCC10', 'ABHD12', 'ABHD17B', 'ABHD5', 'ABRACL', 'ABT1', 'AC005082.12', 'AC074138.3', 'AC093323.3', 'ACAP1', 'ACBD3', 'ACD', 'ACOT13', 'ACP1', 'ACRBP', 'ACTL6A', 'ACTR6', 'ACVR2A', 'ADAL', 
'ADAM10', 'ADAM28', 'ADD1', 'ADIPOR2', 'ADPRM', 'ADSL', 'AEBP1', 'AGPAT1', 'AHSA1', 'AIF1', 'AIM2', 'AKTIP', 'AL928768.3', 'ALKBH7', 'ANAPC13', 'ANKAR', 'ANKEF1', 'ANKRD27', 'ANKRD54', 'AP001462.6', 'AP003419.16', 'AP3M2', 'AP4B1-AS1', 'AP4S1', 'APOBEC3A', 'APOBEC3B', 'APOBEC3G', 'AQP3', 'ARHGAP11A', 'ARHGAP19', 'ARHGAP24', 'ARHGAP33', 'ARHGAP6', 'ARID4A', 'ARIH2OS', 'ARL2', 'ARL2BP', 'ARL4A', 'ARL6IP5', 'ARMC7', 'ARMCX5', 'ARRDC3', 'ARRDC4', 'ARSD', 'ARSG', 'ARVCF', 'ASB8', 'ASXL2', 'ATAD3C', 'ATF7IP2', 'ATG16L1', 'ATP10A', 'ATP5H', 'ATP5O', 'ATP5SL', 'ATP6V0E2', 'ATXN1L', 'ATXN3', 'AURKC', 'BABAM1', 'BACE2', 'BAZ2A', 'BBX', 'BCDIN3D', 'BET1', 'BEX4', 'BGLAP', 'BLNK', 'BLZF1', 'BMPR2', 'BNIP2', 'BOLA1', 'BOLA3', 'BRAT1', 'BRWD1', 'BTN3A1', 'BTN3A2', 'BUB3', 'C10orf32', 'C12orf45', 'C14orf1', 'C14orf166', 'C14orf80', 'C15orf57', 'C16orf13', 'C16orf52', 'C16orf54', 'C16orf58', 'C16orf74', 'C16orf80', 'C17orf59', 'C17orf62', 'C19orf33', 'C19orf52', 'C1QA', 'C1QB', 'C1QC', 'C1orf162', 'C1orf35', 'C21orf33', 'C2CD4D', 'C2orf76', 'C2orf88', 'C3orf18', 'C5orf15', 'C5orf42', 'C8orf44', 'C9orf142', 'C9orf16', 'C9orf37', 'CAMK1D', 'CAMK2G', 'CAMK2N1', 'CAPN12', 'CARHSP1', 'CARS', 'CASC4', 'CBX5', 'CCDC115', 'CCDC122', 'CCDC66', 'CCDC91', 'CCL3', 'CCL4', 'CCL5', 'CCND2', 'CCNG1', 'CCP110', 'CCT4', 'CCT7', 'CD160', 'CD19', 'CD2', 'CD247', 'CD274', 'CD2AP', 'CD320', 'CD72', 'CD79A', 'CD79B', 'CD82', 'CD9', 'CD96', 'CDC123', 'CDC16', 'CDC37', 'CDC40', 'CDK19', 'CDKN2A', 'CEACAM4', 'CEBPB', 'CECR5', 'CEP120', 'CEP68', 'CEP85L', 'CEPT1', 'CES4A', 'CGRRF1', 'CHD2', 'CHD7', 'CHERP', 'CHI3L2', 'CHPF2', 'CIAPIN1', 'CISD1', 'CISH', 'CITED4', 'CKS1B', 'CLDN5', 'CLEC2B', 'CLIC3', 'CLNS1A', 'CLPX', 'CLU', 'CLYBL', 'CMTM5', 'CNEP1R1', 'COMMD10', 'COQ7', 'CORO1B', 'COTL1', 'CPNE2', 'CPQ', 'CPSF3L', 'CR1', 'CRIP3', 'CRTC2', 'CST3', 'CST7', 'CTA-29F11.1', 'CTB-113I20.2', 'CTB-152G17.6', 'CTC-444N24.11', 'CTD-2015H6.3', 'CTD-2302E22.4', 'CTD-2368P22.1', 'CTD-2537I9.12', 'CTSS', 'CTSW', 
'CWC15', 'CWC27', 'CXCL10', 'CXCL3', 'CYB5B', 'CYTH2', 'DAGLB', 'DCAF5', 'DDI2', 'DDT', 'DDX1', 'DDX17', 'DDX46', 'DDX56', 'DENND1C', 'DENND2D', 'DENND5B', 'DENND6A', 'DERL1', 'DEXI', 'DHX34', 'DHX9', 'DIDO1', 'DIMT1', 'DIS3', 'DISP1', 'DLST', 'DMTN', 'DNAJA3', 'DNAJB14', 'DNAJC10', 'DNAJC15', 'DNAJC2', 'DNAJC27', 'DNASE1L3', 'DNMT3A', 'DOK3', 'DPH6', 'DPY19L4', 'DRAXIN', 'DSCR3', 'DTX3', 'DUS3L', 'DUSP10', 'EAF2', 'EARS2', 'ECHDC1', 'EDC3', 'EID2', 'EIF1AY', 'EIF1B', 'EIF2B1', 'EIF3D', 'ELANE', 'ELOF1', 'ELOVL4', 'ELP6', 'EMB', 'EMG1', 'EML6', 'ENTPD3-AS1', 'EOGT', 'ERH', 'ERV3-1', 'EVA1B', 'EWSR1', 'EXOC6', 'F5', 'FADS1', 'FAM107B', 'FAM173A', 'FAM210B', 'FAM96A', 'FAM98A', 'FBXL14', 'FBXO21', 'FBXO33', 'FBXO4', 'FBXW4', 'FCER1A', 'FCER1G', 'FCGR2B', 'FCGR3A', 'FCN1', 'FCRLA', 'FEM1A', 'FERMT3', 'FGFBP2', 'FH', 'FHL1', 'FKBP3', 'FKBP5', 'FLOT1', 'FMO4', 'FN3KRP', 'FNBP4', 'FNTA', 'FOPNL', 'FRY-AS1', 'FUS', 'FXN', 'FYB', 'G0S2', 'GADD45B', 'GALT', 'GBGT1', 'GBP1', 'GDF11', 'GFER', 'GGA3', 'GGNBP2', 'GIMAP2', 'GIMAP4', 'GIMAP5', 'GIMAP7', 'GIT2', 'GMPPA', 'GNE', 'GNG11', 'GNG3', 'GNLY', 'GNPAT', 'GOLGB1', 'GP9', 'GPATCH4', 'GPKOW', 'GPR171', 'GPR183', 'GPR35', 'GPS1', 'GPX1', 'GRAP', 'GRN', 'GSTP1', 'GTPBP6', 'GUSB', 'GYS1', 'GZMA', 'GZMB', 'GZMH', 'GZMK', 'HAGH', 'HBA1', 'HBP1', 'HCFC2', 'HDAC1', 'HDAC5', 'HDAC9', 'HELQ', 'HEMK1', 'HERPUD2', 'HIST1H1B', 'HIST1H2AC', 'HIST1H2AH', 'HLA-DMA', 'HLA-DMB', 'HLA-DOB', 'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB1', 'HMBOX1', 'HMGCL', 'HMGXB4', 'HNRNPH3', 'HOOK2', 'HOPX', 'HSPB11', 'HVCN1', 'ICAM2', 'ICOS', 'ICOSLG', 'ID2', 'IDUA', 'IFFO1', 'IFI27', 'IFIT1', 'IFIT2', 'IFITM3', 'IGFBP7', 'IGJ', 'IGLL5', 'IL1B', 'IL1RAP', 'IL23A', 'IL24', 'IL27RA', 'IL32', 'IL6', 'IL8', 'ILF3', 'ILF3-AS1', 'ING5', 'INSL3', 'INTS12', 'INTS2', 'IP6K1', 'IQCE', 'IRF8', 'IRF9', 'ISCA2', 'ISOC1', 'ITGA2B', 'ITGB7', 'ITM2A', 'ITSN2', 'JAKMIP1', 'JUND', 'KARS', 'KCNG1', 'KCNQ1OT1', 'KIAA0040', 'KIAA0125', 'KIAA0196', 'KIAA1430', 
'KIF3A', 'KIF3C', 'KIF5B', 'KLHL24', 'KLRB1', 'KLRG1', 'KRBOX4', 'LAMP3', 'LARS', 'LAT2', 'LBR', 'LDLRAP1', 'LGALS1', 'LGALS2', 'LGALS3', 'LILRA4', 'LIN52', 'LINC00494', 'LINC00662', 'LINC00886', 'LINC00926', 'LINC00936', 'LINC01013', 'LIX1L', 'LONRF1', 'LPIN1', 'LRBA', 'LRRIQ3', 'LSM14A', 'LST1', 'LTB', 'LTV1', 'LUC7L', 'LUC7L3', 'LYAR', 'LYPD2', 'LYPLA1', 'LYRM4', 'LYSMD4', 'LZTS2', 'MADD', 'MAEA', 'MAGEH1', 'MAL', 'MALT1', 'MAP2K7', 'MARCKSL1', 'MCF2L', 'MCM3', 'MDS2', 'MED30', 'MED9', 'METTL21A', 'METTL3', 'METTL8', 'MFF', 'MFSD10', 'MIS18A', 'MKKS', 'MLLT11', 'MLLT6', 'MMADHC', 'MMP9', 'MNAT1', 'MOCS2', 'MORF4L2', 'MPHOSPH10', 'MRM1', 'MRPL1', 'MRPL19', 'MRPL42', 'MRPS12', 'MRPS33', 'MS4A1', 'MS4A6A', 'MTERFD2', 'MTIF2', 'MTRF1', 'MUM1', 'MYADM', 'MYCBP2', 'MYL9', 'MYO1E', 'MYOM2', 'MZB1', 'MZT1', 'NAA20', 'NAP1L4', 'NAPA-AS1', 'NARG2', 'NAT9', 'NBR1', 'NCOR2', 'NCR3', 'NDUFA10', 'NDUFA12', 'NECAB3', 'NEFH', 'NEK8', 'NELFB', 'NEMF', 'NFAT5', 'NFE2L2', 'NFIC', 'NFU1', 'NIT2', 'NKAP', 'NKG7', 'NKTR', 'NME3', 'NME6', 'NMNAT3', 'NNT-AS1', 'NOC4L', 'NOG', 'NOL11', 'NONO', 'NOP58', 'NPC2', 'NPHP3', 'NPRL2', 'NR2C1', 'NR3C1', 'NSA2', 'NT5C', 'NT5C3A', 'NUDCD1', 'NUDT16L1', 'NUP54', 'NXT2', 'OARD1', 'OAT', 'OBSCN', 'ODC1', 'ORAI1', 'ORC2', 'OSBPL1A', 'OSBPL7', 'OXLD1', 'P2RX5', 'P2RY10', 'PACS1', 'PACSIN2', 'PAICS', 'PARP1', 'PARS2', 'PASK', 'PAWR', 'PAXIP1-AS1', 'PBLD', 'PBRM1', 'PCNA', 'PCSK7', 'PDCD1', 'PDCD2L', 'PDE6B', 'PDIA3', 'PDIK1L', 'PDK2', 'PDXDC1', 'PDZD4', 'PEMT', 'PEX16', 'PEX26', 'PF4', 'PGM1', 'PGM2L1', 'PHACTR4', 'PHF12', 'PHF14', 'PHF3', 'PIGF', 'PIGU', 'PIGX', 'PIK3R1', 'PITHD1', 'PITPNA-AS1', 'PJA1', 'PKIG', 'PLA2G12A', 'PLCL1', 'PLD6', 'PLEKHA1', 'PLEKHA3', 'PLRG1', 'PMEPA1', 'PNOC', 'POLR2I', 'POLR2K', 'POLR3E', 'POMT1', 'PPA2', 'PPBP', 'PPIE', 'PPIG', 'PPIL2', 'PPIL4', 'PPP1R14A', 'PPP1R2', 'PPP2R1B', 'PPP6C', 'PPT2-EGFL8', 'PQBP1', 'PRAF2', 'PRDX1', 'PRELID2', 'PRF1', 'PRICKLE1', 'PRKACB', 'PRKCB', 'PRKD2', 'PRMT2', 'PRNP', 'PRPF31', 'PRPS2', 
'PRR5', 'PSMD14', 'PTCRA', 'PTGDR', 'PTGDS', 'PTGES2', 'PTPN7', 'PURA', 'PWP1', 'PXMP4', 'PYCARD', 'R3HDM1', 'R3HDM2', 'RAB40C', 'RABEP2', 'RABL6', 'RAD51B', 'RALBP1', 'RALY', 'RASD1', 'RASGRP2', 'RBM25', 'RBM26-AS1', 'RBM39', 'RBM4', 'RBM48', 'RBM5', 'RBM7', 'RBPJ', 'RCE1', 'RCHY1', 'RCL1', 'RCN2', 'RDH14', 'RELB', 'REXO2', 'RFC1', 'RFC5', 'RFNG', 'RFPL2', 'RGS14', 'RIC3', 'RIOK1', 'RIOK2', 'RNF113A', 'RNF125', 'RNF139', 'RNF14', 'RNF168', 'RNF187', 'RNF213', 'RNF25', 'RNF26', 'RORA', 'RP1-28O10.1', 'RP11-1055B8.7', 'RP11-138A9.2', 'RP11-141B14.1', 'RP11-142C4.6', 'RP11-162G10.5', 'RP11-164H13.1', 'RP11-178G16.4', 'RP11-18H21.1', 'RP11-211G3.2', 'RP11-219B17.1', 'RP11-219B4.7', 'RP11-252A24.3', 'RP11-291B21.2', 'RP11-314N13.3', 'RP11-324I22.4', 'RP11-349A22.5', 'RP11-378J18.3', 'RP11-390B4.5', 'RP11-398C13.6', 'RP11-400F19.6', 'RP11-421L21.3', 'RP11-428G5.5', 'RP11-432I5.1', 'RP11-468E2.4', 'RP11-488C13.5', 'RP11-493L12.4', 'RP11-527L4.5', 'RP11-545I5.3', 'RP11-589C21.6', 'RP11-5C23.1', 'RP11-701P16.5', 'RP11-706O15.1', 'RP11-70P17.1', 'RP11-727F15.9', 'RP11-798G7.6', 'RP11-879F14.2', 'RP11-950C14.3', 'RP3-325F22.5', 'RP5-1073O3.7', 'RP5-827C21.4', 'RP5-887A10.1', 'RPH3A', 'RPL39L', 'RPL7L1', 'RPN2', 'RPS6KL1', 'RPUSD2', 'RRAGC', 'RRS1', 'RUNDC1', 'S100A11', 'S100A12', 'S100A8', 'S100B', 'SAFB2', 'SAMD1', 'SAMD3', 'SAMSN1', 'SARDH', 'SARS', 'SAT1', 'SCAI', 'SCAPER', 'SCGB3A1', 'SCPEP1', 'SDCCAG8', 'SDPR', 'SEC61A2', 'SELL', 'SEPT11', 'SERAC1', 'SETD1B', 'SF3B1', 'SF3B5', 'SH3GLB1', 'SH3KBP1', 'SHOC2', 'SHPK-1', 'SIAH2', 'SIRPG', 'SIRT1', 'SIVA1', 'SLA', 'SLBP', 'SLC22A4', 'SLC25A11', 'SLC25A12', 'SLC25A14', 'SLC27A1', 'SLC2A13', 'SLC35A2', 'SLC48A1', 'SLFN5', 'SMARCA4', 'SMARCC2', 'SMC2', 'SMCHD1', 'SMDT1', 'SMIM14', 'SMIM7', 'SNAP47', 'SNHG12', 'SNHG8', 'SNTA1', 'SNX29P2', 'SOX13', 'SPARC', 'SPATA7', 'SPG7', 'SPIB', 'SPIN1', 'SPOCD1', 'SPON2', 'SPSB2', 'SREBF1', 'SRM', 'SRP9', 'SRSF6', 'SSBP1', 'ST3GAL2', 'STAMBP', 'STAU2', 'STK17A', 'STK38', 'STMN1', 'STOML2', 
'STUB1', 'STX16', 'STX18', 'SUCLG2', 'SUOX', 'SURF1', 'SURF6', 'SWAP70', 'SYCE1', 'SYP', 'SYVN1', 'TACR2', 'TADA2A', 'TAF10', 'TAF12', 'TAF1D', 'TAL1', 'TALDO1', 'TAPBP', 'TARSL2', 'TASP1', 'TBC1D15', 'TBCK', 'TBXA2R', 'TCEAL4', 'TCEAL8', 'TCL1A', 'TCL1B', 'TCP1', 'TDG', 'TERF2IP', 'TGFBRAP1', 'THAP2', 'THEM4', 'THOC7', 'THUMPD3', 'THYN1', 'TIGIT', 'TIMM10B', 'TMEM116', 'TMEM138', 'TMEM140', 'TMEM14B', 'TMEM165', 'TMEM177', 'TMEM194A', 'TMEM219', 'TMEM242', 'TMEM40', 'TMEM60', 'TMEM80', 'TMEM87A', 'TMEM87B', 'TMEM91', 'TMTC2', 'TMX2', 'TMX3', 'TNFRSF17', 'TNFRSF25', 'TNFRSF4', 'TNFRSF9', 'TNFSF10', 'TOP1MT', 'TOP2B', 'TRABD2A', 'TRAF3IP3', 'TRAPPC12-AS1', 'TRAPPC3', 'TREML1', 'TRIM23', 'TRIP12', 'TRIT1', 'TRMT61A', 'TRPM4', 'TSC22D1', 'TSPAN15', 'TSSC1', 'TTC1', 'TTC14', 'TTC3', 'TTC8', 'TTN-AS1', 'TUBB1', 'TUBG2', 'TYMP', 'TYROBP', 'U2SURP', 'UBA5', 'UBAC2', 'UBE2D2', 'UBE2D4', 'UBE2K', 'UBE2Q1', 'UBE3A', 'UBIAD1', 'UBLCP1', 'UBXN4', 'UCK1', 'UNC45A', 'UQCC1', 'URB2', 'URGCP', 'USP30', 'USP33', 'USP36', 'USP38', 'USP5', 'USP7', 'VAMP5', 'VDAC3', 'VIPR1', 'VPS13A', 'VPS13C', 'VPS25', 'VPS26B', 'VPS28', 'VTI1A', 'VTI1B', 'WARS2', 'WBP2NL', 'WDR55', 'WDR91', 'WDYHV1', 'WNK1', 'WTAP', 'XCL2', 'XPOT', 'XRRA1', 'XXbac-BPG299F13.17', 'YEATS2', 'YES1', 'YPEL2', 'YPEL3', 'YTHDF2', 'ZAP70', 'ZBED5-AS1', 'ZBP1', 'ZC3H15', 'ZCCHC11', 'ZCCHC9', 'ZFAND4', 'ZNF175', 'ZNF232', 'ZNF256', 'ZNF263', 'ZNF276', 'ZNF32', 'ZNF350', 'ZNF436', 'ZNF45', 'ZNF493', 'ZNF503', 'ZNF528', 'ZNF559', 'ZNF561', 'ZNF587B', 'ZNF594', 'ZNF653', 'ZNF682', 'ZNF688', 'ZNF718', 'ZNF747', 'ZNF799', 'ZNF836', 'ZNF92', 'ZRANB3', 'ZSWIM6', 'ZUFSP']\n", - "2024-06-21 18:43:04,743 [INFO] Parameter: probe_length_min = 40\n", - "2024-06-21 18:43:04,744 [INFO] Parameter: probe_length_max = 45\n", - "2024-06-21 18:43:04,745 [INFO] Parameter: files_fasta_oligo_database = 
['output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna', 'output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna']\n", - "2024-06-21 18:43:04,746 [INFO] Parameter: min_probes_per_gene = 3\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it 
directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n" + "2024-07-08 15:00:47,199 [INFO] Parameters Create Database:\n", + "2024-07-08 15:00:47,204 [INFO] Function: create_probe_database\n", + "2024-07-08 15:00:47,205 [INFO] Parameter: gene_ids = ['AAGAB', 'AATF', 'ABCC10', 'ABHD12', 'ABHD17B', 'ABHD5', 'ABRACL', 'ABT1', 'AC005082.12', 'AC074138.3', 'AC093323.3', 'ACAP1', 'ACBD3', 'ACD', 'ACOT13', 'ACP1', 'ACRBP', 'ACTL6A', 'ACTR6', 'ACVR2A', 'ADAL', 'ADAM10', 'ADAM28', 'ADD1', 'ADIPOR2', 'ADPRM', 'ADSL', 'AEBP1', 'AGPAT1', 'AHSA1', 'AIF1', 'AIM2', 'AKTIP', 'AL928768.3', 'ALKBH7', 'ANAPC13', 'ANKAR', 'ANKEF1', 'ANKRD27', 'ANKRD54', 'AP001462.6', 'AP003419.16', 'AP3M2', 'AP4B1-AS1', 'AP4S1', 'APOBEC3A', 'APOBEC3B', 'APOBEC3G', 'AQP3', 'ARHGAP11A', 'ARHGAP19', 'ARHGAP24', 'ARHGAP33', 'ARHGAP6', 'ARID4A', 'ARIH2OS', 'ARL2', 'ARL2BP', 'ARL4A', 'ARL6IP5', 'ARMC7', 'ARMCX5', 'ARRDC3', 'ARRDC4', 'ARSD', 'ARSG', 'ARVCF', 'ASB8', 'ASXL2', 'ATAD3C', 'ATF7IP2', 'ATG16L1', 'ATP10A', 'ATP5H', 'ATP5O', 'ATP5SL', 'ATP6V0E2', 'ATXN1L', 'ATXN3', 'AURKC', 'BABAM1', 'BACE2', 'BAZ2A', 'BBX', 'BCDIN3D', 'BET1', 'BEX4', 'BGLAP', 'BLNK', 'BLZF1', 'BMPR2', 'BNIP2', 'BOLA1', 'BOLA3', 'BRAT1', 'BRWD1', 'BTN3A1', 'BTN3A2', 'BUB3', 'C10orf32', 'C12orf45', 'C14orf1', 'C14orf166', 'C14orf80', 'C15orf57', 'C16orf13', 'C16orf52', 'C16orf54', 'C16orf58', 'C16orf74', 'C16orf80', 'C17orf59', 'C17orf62', 'C19orf33', 'C19orf52', 'C1QA', 
'C1QB', 'C1QC', 'C1orf162', 'C1orf35', 'C21orf33', 'C2CD4D', 'C2orf76', 'C2orf88', 'C3orf18', 'C5orf15', 'C5orf42', 'C8orf44', 'C9orf142', 'C9orf16', 'C9orf37', 'CAMK1D', 'CAMK2G', 'CAMK2N1', 'CAPN12', 'CARHSP1', 'CARS', 'CASC4', 'CBX5', 'CCDC115', 'CCDC122', 'CCDC66', 'CCDC91', 'CCL3', 'CCL4', 'CCL5', 'CCND2', 'CCNG1', 'CCP110', 'CCT4', 'CCT7', 'CD160', 'CD19', 'CD2', 'CD247', 'CD274', 'CD2AP', 'CD320', 'CD72', 'CD79A', 'CD79B', 'CD82', 'CD9', 'CD96', 'CDC123', 'CDC16', 'CDC37', 'CDC40', 'CDK19', 'CDKN2A', 'CEACAM4', 'CEBPB', 'CECR5', 'CEP120', 'CEP68', 'CEP85L', 'CEPT1', 'CES4A', 'CGRRF1', 'CHD2', 'CHD7', 'CHERP', 'CHI3L2', 'CHPF2', 'CIAPIN1', 'CISD1', 'CISH', 'CITED4', 'CKS1B', 'CLDN5', 'CLEC2B', 'CLIC3', 'CLNS1A', 'CLPX', 'CLU', 'CLYBL', 'CMTM5', 'CNEP1R1', 'COMMD10', 'COQ7', 'CORO1B', 'COTL1', 'CPNE2', 'CPQ', 'CPSF3L', 'CR1', 'CRIP3', 'CRTC2', 'CST3', 'CST7', 'CTA-29F11.1', 'CTB-113I20.2', 'CTB-152G17.6', 'CTC-444N24.11', 'CTD-2015H6.3', 'CTD-2302E22.4', 'CTD-2368P22.1', 'CTD-2537I9.12', 'CTSS', 'CTSW', 'CWC15', 'CWC27', 'CXCL10', 'CXCL3', 'CYB5B', 'CYTH2', 'DAGLB', 'DCAF5', 'DDI2', 'DDT', 'DDX1', 'DDX17', 'DDX46', 'DDX56', 'DENND1C', 'DENND2D', 'DENND5B', 'DENND6A', 'DERL1', 'DEXI', 'DHX34', 'DHX9', 'DIDO1', 'DIMT1', 'DIS3', 'DISP1', 'DLST', 'DMTN', 'DNAJA3', 'DNAJB14', 'DNAJC10', 'DNAJC15', 'DNAJC2', 'DNAJC27', 'DNASE1L3', 'DNMT3A', 'DOK3', 'DPH6', 'DPY19L4', 'DRAXIN', 'DSCR3', 'DTX3', 'DUS3L', 'DUSP10', 'EAF2', 'EARS2', 'ECHDC1', 'EDC3', 'EID2', 'EIF1AY', 'EIF1B', 'EIF2B1', 'EIF3D', 'ELANE', 'ELOF1', 'ELOVL4', 'ELP6', 'EMB', 'EMG1', 'EML6', 'ENTPD3-AS1', 'EOGT', 'ERH', 'ERV3-1', 'EVA1B', 'EWSR1', 'EXOC6', 'F5', 'FADS1', 'FAM107B', 'FAM173A', 'FAM210B', 'FAM96A', 'FAM98A', 'FBXL14', 'FBXO21', 'FBXO33', 'FBXO4', 'FBXW4', 'FCER1A', 'FCER1G', 'FCGR2B', 'FCGR3A', 'FCN1', 'FCRLA', 'FEM1A', 'FERMT3', 'FGFBP2', 'FH', 'FHL1', 'FKBP3', 'FKBP5', 'FLOT1', 'FMO4', 'FN3KRP', 'FNBP4', 'FNTA', 'FOPNL', 'FRY-AS1', 'FUS', 'FXN', 'FYB', 'G0S2', 'GADD45B', 'GALT', 'GBGT1', 
'GBP1', 'GDF11', 'GFER', 'GGA3', 'GGNBP2', 'GIMAP2', 'GIMAP4', 'GIMAP5', 'GIMAP7', 'GIT2', 'GMPPA', 'GNE', 'GNG11', 'GNG3', 'GNLY', 'GNPAT', 'GOLGB1', 'GP9', 'GPATCH4', 'GPKOW', 'GPR171', 'GPR183', 'GPR35', 'GPS1', 'GPX1', 'GRAP', 'GRN', 'GSTP1', 'GTPBP6', 'GUSB', 'GYS1', 'GZMA', 'GZMB', 'GZMH', 'GZMK', 'HAGH', 'HBA1', 'HBP1', 'HCFC2', 'HDAC1', 'HDAC5', 'HDAC9', 'HELQ', 'HEMK1', 'HERPUD2', 'HIST1H1B', 'HIST1H2AC', 'HIST1H2AH', 'HLA-DMA', 'HLA-DMB', 'HLA-DOB', 'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB1', 'HMBOX1', 'HMGCL', 'HMGXB4', 'HNRNPH3', 'HOOK2', 'HOPX', 'HSPB11', 'HVCN1', 'ICAM2', 'ICOS', 'ICOSLG', 'ID2', 'IDUA', 'IFFO1', 'IFI27', 'IFIT1', 'IFIT2', 'IFITM3', 'IGFBP7', 'IGJ', 'IGLL5', 'IL1B', 'IL1RAP', 'IL23A', 'IL24', 'IL27RA', 'IL32', 'IL6', 'IL8', 'ILF3', 'ILF3-AS1', 'ING5', 'INSL3', 'INTS12', 'INTS2', 'IP6K1', 'IQCE', 'IRF8', 'IRF9', 'ISCA2', 'ISOC1', 'ITGA2B', 'ITGB7', 'ITM2A', 'ITSN2', 'JAKMIP1', 'JUND', 'KARS', 'KCNG1', 'KCNQ1OT1', 'KIAA0040', 'KIAA0125', 'KIAA0196', 'KIAA1430', 'KIF3A', 'KIF3C', 'KIF5B', 'KLHL24', 'KLRB1', 'KLRG1', 'KRBOX4', 'LAMP3', 'LARS', 'LAT2', 'LBR', 'LDLRAP1', 'LGALS1', 'LGALS2', 'LGALS3', 'LILRA4', 'LIN52', 'LINC00494', 'LINC00662', 'LINC00886', 'LINC00926', 'LINC00936', 'LINC01013', 'LIX1L', 'LONRF1', 'LPIN1', 'LRBA', 'LRRIQ3', 'LSM14A', 'LST1', 'LTB', 'LTV1', 'LUC7L', 'LUC7L3', 'LYAR', 'LYPD2', 'LYPLA1', 'LYRM4', 'LYSMD4', 'LZTS2', 'MADD', 'MAEA', 'MAGEH1', 'MAL', 'MALT1', 'MAP2K7', 'MARCKSL1', 'MCF2L', 'MCM3', 'MDS2', 'MED30', 'MED9', 'METTL21A', 'METTL3', 'METTL8', 'MFF', 'MFSD10', 'MIS18A', 'MKKS', 'MLLT11', 'MLLT6', 'MMADHC', 'MMP9', 'MNAT1', 'MOCS2', 'MORF4L2', 'MPHOSPH10', 'MRM1', 'MRPL1', 'MRPL19', 'MRPL42', 'MRPS12', 'MRPS33', 'MS4A1', 'MS4A6A', 'MTERFD2', 'MTIF2', 'MTRF1', 'MUM1', 'MYADM', 'MYCBP2', 'MYL9', 'MYO1E', 'MYOM2', 'MZB1', 'MZT1', 'NAA20', 'NAP1L4', 'NAPA-AS1', 'NARG2', 'NAT9', 'NBR1', 'NCOR2', 'NCR3', 'NDUFA10', 'NDUFA12', 'NECAB3', 'NEFH', 'NEK8', 'NELFB', 'NEMF', 'NFAT5', 'NFE2L2', 'NFIC', 
'NFU1', 'NIT2', 'NKAP', 'NKG7', 'NKTR', 'NME3', 'NME6', 'NMNAT3', 'NNT-AS1', 'NOC4L', 'NOG', 'NOL11', 'NONO', 'NOP58', 'NPC2', 'NPHP3', 'NPRL2', 'NR2C1', 'NR3C1', 'NSA2', 'NT5C', 'NT5C3A', 'NUDCD1', 'NUDT16L1', 'NUP54', 'NXT2', 'OARD1', 'OAT', 'OBSCN', 'ODC1', 'ORAI1', 'ORC2', 'OSBPL1A', 'OSBPL7', 'OXLD1', 'P2RX5', 'P2RY10', 'PACS1', 'PACSIN2', 'PAICS', 'PARP1', 'PARS2', 'PASK', 'PAWR', 'PAXIP1-AS1', 'PBLD', 'PBRM1', 'PCNA', 'PCSK7', 'PDCD1', 'PDCD2L', 'PDE6B', 'PDIA3', 'PDIK1L', 'PDK2', 'PDXDC1', 'PDZD4', 'PEMT', 'PEX16', 'PEX26', 'PF4', 'PGM1', 'PGM2L1', 'PHACTR4', 'PHF12', 'PHF14', 'PHF3', 'PIGF', 'PIGU', 'PIGX', 'PIK3R1', 'PITHD1', 'PITPNA-AS1', 'PJA1', 'PKIG', 'PLA2G12A', 'PLCL1', 'PLD6', 'PLEKHA1', 'PLEKHA3', 'PLRG1', 'PMEPA1', 'PNOC', 'POLR2I', 'POLR2K', 'POLR3E', 'POMT1', 'PPA2', 'PPBP', 'PPIE', 'PPIG', 'PPIL2', 'PPIL4', 'PPP1R14A', 'PPP1R2', 'PPP2R1B', 'PPP6C', 'PPT2-EGFL8', 'PQBP1', 'PRAF2', 'PRDX1', 'PRELID2', 'PRF1', 'PRICKLE1', 'PRKACB', 'PRKCB', 'PRKD2', 'PRMT2', 'PRNP', 'PRPF31', 'PRPS2', 'PRR5', 'PSMD14', 'PTCRA', 'PTGDR', 'PTGDS', 'PTGES2', 'PTPN7', 'PURA', 'PWP1', 'PXMP4', 'PYCARD', 'R3HDM1', 'R3HDM2', 'RAB40C', 'RABEP2', 'RABL6', 'RAD51B', 'RALBP1', 'RALY', 'RASD1', 'RASGRP2', 'RBM25', 'RBM26-AS1', 'RBM39', 'RBM4', 'RBM48', 'RBM5', 'RBM7', 'RBPJ', 'RCE1', 'RCHY1', 'RCL1', 'RCN2', 'RDH14', 'RELB', 'REXO2', 'RFC1', 'RFC5', 'RFNG', 'RFPL2', 'RGS14', 'RIC3', 'RIOK1', 'RIOK2', 'RNF113A', 'RNF125', 'RNF139', 'RNF14', 'RNF168', 'RNF187', 'RNF213', 'RNF25', 'RNF26', 'RORA', 'RP1-28O10.1', 'RP11-1055B8.7', 'RP11-138A9.2', 'RP11-141B14.1', 'RP11-142C4.6', 'RP11-162G10.5', 'RP11-164H13.1', 'RP11-178G16.4', 'RP11-18H21.1', 'RP11-211G3.2', 'RP11-219B17.1', 'RP11-219B4.7', 'RP11-252A24.3', 'RP11-291B21.2', 'RP11-314N13.3', 'RP11-324I22.4', 'RP11-349A22.5', 'RP11-378J18.3', 'RP11-390B4.5', 'RP11-398C13.6', 'RP11-400F19.6', 'RP11-421L21.3', 'RP11-428G5.5', 'RP11-432I5.1', 'RP11-468E2.4', 'RP11-488C13.5', 'RP11-493L12.4', 'RP11-527L4.5', 'RP11-545I5.3', 
'RP11-589C21.6', 'RP11-5C23.1', 'RP11-701P16.5', 'RP11-706O15.1', 'RP11-70P17.1', 'RP11-727F15.9', 'RP11-798G7.6', 'RP11-879F14.2', 'RP11-950C14.3', 'RP3-325F22.5', 'RP5-1073O3.7', 'RP5-827C21.4', 'RP5-887A10.1', 'RPH3A', 'RPL39L', 'RPL7L1', 'RPN2', 'RPS6KL1', 'RPUSD2', 'RRAGC', 'RRS1', 'RUNDC1', 'S100A11', 'S100A12', 'S100A8', 'S100B', 'SAFB2', 'SAMD1', 'SAMD3', 'SAMSN1', 'SARDH', 'SARS', 'SAT1', 'SCAI', 'SCAPER', 'SCGB3A1', 'SCPEP1', 'SDCCAG8', 'SDPR', 'SEC61A2', 'SELL', 'SEPT11', 'SERAC1', 'SETD1B', 'SF3B1', 'SF3B5', 'SH3GLB1', 'SH3KBP1', 'SHOC2', 'SHPK-1', 'SIAH2', 'SIRPG', 'SIRT1', 'SIVA1', 'SLA', 'SLBP', 'SLC22A4', 'SLC25A11', 'SLC25A12', 'SLC25A14', 'SLC27A1', 'SLC2A13', 'SLC35A2', 'SLC48A1', 'SLFN5', 'SMARCA4', 'SMARCC2', 'SMC2', 'SMCHD1', 'SMDT1', 'SMIM14', 'SMIM7', 'SNAP47', 'SNHG12', 'SNHG8', 'SNTA1', 'SNX29P2', 'SOX13', 'SPARC', 'SPATA7', 'SPG7', 'SPIB', 'SPIN1', 'SPOCD1', 'SPON2', 'SPSB2', 'SREBF1', 'SRM', 'SRP9', 'SRSF6', 'SSBP1', 'ST3GAL2', 'STAMBP', 'STAU2', 'STK17A', 'STK38', 'STMN1', 'STOML2', 'STUB1', 'STX16', 'STX18', 'SUCLG2', 'SUOX', 'SURF1', 'SURF6', 'SWAP70', 'SYCE1', 'SYP', 'SYVN1', 'TACR2', 'TADA2A', 'TAF10', 'TAF12', 'TAF1D', 'TAL1', 'TALDO1', 'TAPBP', 'TARSL2', 'TASP1', 'TBC1D15', 'TBCK', 'TBXA2R', 'TCEAL4', 'TCEAL8', 'TCL1A', 'TCL1B', 'TCP1', 'TDG', 'TERF2IP', 'TGFBRAP1', 'THAP2', 'THEM4', 'THOC7', 'THUMPD3', 'THYN1', 'TIGIT', 'TIMM10B', 'TMEM116', 'TMEM138', 'TMEM140', 'TMEM14B', 'TMEM165', 'TMEM177', 'TMEM194A', 'TMEM219', 'TMEM242', 'TMEM40', 'TMEM60', 'TMEM80', 'TMEM87A', 'TMEM87B', 'TMEM91', 'TMTC2', 'TMX2', 'TMX3', 'TNFRSF17', 'TNFRSF25', 'TNFRSF4', 'TNFRSF9', 'TNFSF10', 'TOP1MT', 'TOP2B', 'TRABD2A', 'TRAF3IP3', 'TRAPPC12-AS1', 'TRAPPC3', 'TREML1', 'TRIM23', 'TRIP12', 'TRIT1', 'TRMT61A', 'TRPM4', 'TSC22D1', 'TSPAN15', 'TSSC1', 'TTC1', 'TTC14', 'TTC3', 'TTC8', 'TTN-AS1', 'TUBB1', 'TUBG2', 'TYMP', 'TYROBP', 'U2SURP', 'UBA5', 'UBAC2', 'UBE2D2', 'UBE2D4', 'UBE2K', 'UBE2Q1', 'UBE3A', 'UBIAD1', 'UBLCP1', 'UBXN4', 'UCK1', 'UNC45A', 
'UQCC1', 'URB2', 'URGCP', 'USP30', 'USP33', 'USP36', 'USP38', 'USP5', 'USP7', 'VAMP5', 'VDAC3', 'VIPR1', 'VPS13A', 'VPS13C', 'VPS25', 'VPS26B', 'VPS28', 'VTI1A', 'VTI1B', 'WARS2', 'WBP2NL', 'WDR55', 'WDR91', 'WDYHV1', 'WNK1', 'WTAP', 'XCL2', 'XPOT', 'XRRA1', 'XXbac-BPG299F13.17', 'YEATS2', 'YES1', 'YPEL2', 'YPEL3', 'YTHDF2', 'ZAP70', 'ZBED5-AS1', 'ZBP1', 'ZC3H15', 'ZCCHC11', 'ZCCHC9', 'ZFAND4', 'ZNF175', 'ZNF232', 'ZNF256', 'ZNF263', 'ZNF276', 'ZNF32', 'ZNF350', 'ZNF436', 'ZNF45', 'ZNF493', 'ZNF503', 'ZNF528', 'ZNF559', 'ZNF561', 'ZNF587B', 'ZNF594', 'ZNF653', 'ZNF682', 'ZNF688', 'ZNF718', 'ZNF747', 'ZNF799', 'ZNF836', 'ZNF92', 'ZRANB3', 'ZSWIM6', 'ZUFSP']\n", + "2024-07-08 15:00:47,207 [INFO] Parameter: probe_length_min = 40\n", + "2024-07-08 15:00:47,208 [INFO] Parameter: probe_length_max = 45\n", + "2024-07-08 15:00:47,209 [INFO] Parameter: files_fasta_oligo_database = ['output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna', 'output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna']\n", + "2024-07-08 15:00:47,210 [INFO] Parameter: min_probes_per_gene = 3\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "be10edafcab84490b40411a24194a401", + "model_id": "456d1e2d0cc8415e90c2a28419ea6040", "version_major": 2, "version_minor": 0 }, @@ -475,14 +453,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 20:53:12,552 [INFO] Step - Create Database: database contains 26663811 oligos from 887 genes.\n" + "2024-07-08 17:17:26,931 [INFO] Step - Create Database: database contains 26663811 oligos from 887 regions.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 20:53:12,618 [DEBUG] handle_msg[be10edafcab84490b40411a24194a401]({'header': {'date': datetime.datetime(2024, 6, 21, 18, 53, 12, 601000, 
tzinfo=tzutc()), 'msg_id': 'd60c5994-b6ae-4aeb-b38d-782e4fadc32e', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': 'd60c5994-b6ae-4aeb-b38d-782e4fadc32e', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'be10edafcab84490b40411a24194a401', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mDatabase Loading\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m1:08:39\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Database Loading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 1:08:39 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" + "2024-07-08 17:17:27,020 [DEBUG] handle_msg[456d1e2d0cc8415e90c2a28419ea6040]({'header': {'date': datetime.datetime(2024, 7, 8, 15, 17, 27, 15000, tzinfo=tzutc()), 'msg_id': 'bffef3a3-1674-47de-8b8f-9180581b259b', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': 'bffef3a3-1674-47de-8b8f-9180581b259b', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '456d1e2d0cc8415e90c2a28419ea6040', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mDatabase Loading\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m1:12:17\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Database Loading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 1:12:17 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], @@ -529,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -582,41 +560,44 @@ " \"fmdfactor\": 0.65, # default\n", " \"fmdmethod\": 1, # default\n", " \"GC\": None, # default\n", - "}" + "}\n", + "\n", + "Tm_salt_correction_param_probe = None" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 20:53:35,992 [INFO] Parameters Property Filters:\n", - "2024-06-21 20:53:35,994 [INFO] Function: filter_by_property\n", - "2024-06-21 20:53:35,996 [INFO] Parameter: oligo_database = \n", - "2024-06-21 20:53:35,998 [INFO] Parameter: probe_GC_content_min = 40\n", - "2024-06-21 20:53:35,999 [INFO] Parameter: probe_GC_content_max = 60\n", - "2024-06-21 20:53:36,001 [INFO] Parameter: probe_Tm_min = 65\n", - "2024-06-21 20:53:36,002 [INFO] Parameter: probe_Tm_max = 75\n", - "2024-06-21 20:53:36,004 [INFO] Parameter: detect_oligo_length_min = 15\n", - "2024-06-21 20:53:36,006 [INFO] Parameter: detect_oligo_length_max = 40\n", - "2024-06-21 20:53:36,007 [INFO] Parameter: min_thymines = 2\n", - "2024-06-21 20:53:36,008 [INFO] Parameter: arm_Tm_dif_max = 2\n", - "2024-06-21 20:53:36,011 [INFO] Parameter: arm_length_min = 10\n", - "2024-06-21 20:53:36,012 [INFO] Parameter: arm_Tm_min = 50\n", - "2024-06-21 20:53:36,014 [INFO] Parameter: arm_Tm_max = 60\n", - "2024-06-21 20:53:36,016 [INFO] Parameter: homopolymeric_base_n = {'A': 5, 'T': 5, 'C': 5, 'G': 5}\n", - "2024-06-21 20:53:36,017 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, 
-21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 
'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), 
'.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", - "2024-06-21 20:53:36,019 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n" + "2024-07-08 17:41:57,263 [INFO] Parameters Property Filters:\n", + "2024-07-08 17:41:57,266 [INFO] Function: filter_by_property\n", + "2024-07-08 17:41:57,268 [INFO] Parameter: oligo_database = \n", + "2024-07-08 17:41:57,270 [INFO] Parameter: probe_GC_content_min = 40\n", + "2024-07-08 17:41:57,272 [INFO] Parameter: probe_GC_content_max = 60\n", + "2024-07-08 17:41:57,273 [INFO] Parameter: probe_Tm_min = 65\n", + "2024-07-08 17:41:57,274 [INFO] Parameter: probe_Tm_max = 75\n", + "2024-07-08 17:41:57,275 [INFO] Parameter: detect_oligo_length_min = 15\n", + "2024-07-08 17:41:57,276 [INFO] Parameter: detect_oligo_length_max = 40\n", + "2024-07-08 17:41:57,278 [INFO] Parameter: min_thymines = 2\n", + "2024-07-08 17:41:57,279 [INFO] Parameter: arm_Tm_dif_max = 2\n", + "2024-07-08 17:41:57,279 [INFO] Parameter: arm_length_min = 10\n", + "2024-07-08 17:41:57,280 [INFO] Parameter: arm_Tm_min = 50\n", + "2024-07-08 17:41:57,282 [INFO] Parameter: arm_Tm_max = 60\n", + "2024-07-08 17:41:57,284 [INFO] Parameter: homopolymeric_base_n = {'A': 5, 'T': 5, 'C': 5, 'G': 5}\n", + "2024-07-08 17:41:57,286 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, 
-7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, 
-4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", + "2024-07-08 17:41:57,288 
[INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n", + "2024-07-08 17:41:57,288 [INFO] Parameter: Tm_salt_correction_param_probe = None\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1b6e1bbfde05498394497b2d64da341f", + "model_id": "af295b15415d4d37bd0c05502a7e2966", "version_major": 2, "version_minor": 0 }, @@ -654,21 +635,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 22:01:23,977 [INFO] Step - Property Filters: database contains 3468382 oligos from 887 genes.\n" + "2024-07-08 18:52:48,260 [INFO] Step - Property Filters: database contains 3468382 oligos from 887 regions.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 22:01:24,010 [DEBUG] handle_msg[1b6e1bbfde05498394497b2d64da341f]({'header': {'date': datetime.datetime(2024, 6, 21, 20, 1, 24, 3000, tzinfo=tzutc()), 'msg_id': 'a8137585-25fd-4c5c-aa27-eec9590e2742', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': 'a8137585-25fd-4c5c-aa27-eec9590e2742', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '1b6e1bbfde05498394497b2d64da341f', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mProperty Filter\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m0:56:32\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Property Filter ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 0:56:32 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" + "2024-07-08 18:52:48,357 [DEBUG] handle_msg[af295b15415d4d37bd0c05502a7e2966]({'header': {'date': datetime.datetime(2024, 7, 8, 16, 52, 48, 323000, tzinfo=tzutc()), 'msg_id': '94375a4a-c46b-4ea8-aa8f-b1e5092c97ae', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': '94375a4a-c46b-4ea8-aa8f-b1e5092c97ae', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'af295b15415d4d37bd0c05502a7e2966', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mProperty Filter\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m0:59:00\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Property Filter ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 0:59:00 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ "####### Load existing database #######\n", "# dir_database = os.path.join(dir_output, \"db_probes/1_db_probes_initial\")\n", - "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=n_jobs*2+2, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", "\n", "## Apply property filters\n", @@ -688,6 +669,7 @@ " homopolymeric_base_n=homopolymeric_base_n,\n", " Tm_parameters_probe=Tm_parameters_probe,\n", " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + " Tm_salt_correction_param_probe=Tm_salt_correction_param_probe,\n", ")" ] }, @@ -719,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -758,34 +740,35 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-21 22:01:38,485 [INFO] Parameters Specificity Filters:\n", - "2024-06-21 22:01:38,488 [INFO] Function: filter_by_specificity\n", - "2024-06-21 22:01:38,489 [INFO] Parameter: oligo_database = \n", - "2024-06-21 22:01:38,493 [INFO] Parameter: files_fasta_reference_database = ['output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna', 
'output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna']\n", - "2024-06-21 22:01:38,496 [INFO] Parameter: specificity_blastn_search_parameters = {'perc_identity': 80, 'strand': 'minus', 'word_size': 10, 'dust': 'no', 'soft_masking': 'false', 'max_target_seqs': 10, 'max_hsps': 1000}\n", - "2024-06-21 22:01:38,497 [INFO] Parameter: specificity_blastn_hit_parameters = {'coverage': 50}\n", - "2024-06-21 22:01:38,499 [INFO] Parameter: cross_hybridization_blastn_search_parameters = {'perc_identity': 80, 'strand': 'minus', 'word_size': 10, 'dust': 'no', 'soft_masking': 'false', 'max_target_seqs': 10}\n", - "2024-06-21 22:01:38,500 [INFO] Parameter: cross_hybridization_blastn_hit_parameters = {'coverage': 80}\n", - "2024-06-21 22:01:38,501 [INFO] Parameter: ligation_region_size = 5\n", - "2024-06-21 22:01:38,504 [INFO] Parameter: arm_Tm_dif_max = 2\n", - "2024-06-21 22:01:38,505 [INFO] Parameter: arm_length_min = 10\n", - "2024-06-21 22:01:38,506 [INFO] Parameter: arm_Tm_min = 50\n", - "2024-06-21 22:01:38,507 [INFO] Parameter: arm_Tm_max = 60\n", - "2024-06-21 22:01:38,508 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': 
(-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': 
(-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", - "2024-06-21 22:01:38,509 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n" + "2024-07-08 19:46:51,968 [INFO] Parameters Specificity Filters:\n", + "2024-07-08 19:46:51,969 [INFO] Function: 
filter_by_specificity\n", + "2024-07-08 19:46:51,972 [INFO] Parameter: oligo_database = \n", + "2024-07-08 19:46:51,973 [INFO] Parameter: files_fasta_reference_database = ['output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna', 'output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna']\n", + "2024-07-08 19:46:51,975 [INFO] Parameter: specificity_blastn_search_parameters = {'perc_identity': 80, 'strand': 'minus', 'word_size': 10, 'dust': 'no', 'soft_masking': 'false', 'max_target_seqs': 10, 'max_hsps': 1000}\n", + "2024-07-08 19:46:51,977 [INFO] Parameter: specificity_blastn_hit_parameters = {'coverage': 50}\n", + "2024-07-08 19:46:51,978 [INFO] Parameter: cross_hybridization_blastn_search_parameters = {'perc_identity': 80, 'strand': 'minus', 'word_size': 10, 'dust': 'no', 'soft_masking': 'false', 'max_target_seqs': 10}\n", + "2024-07-08 19:46:51,979 [INFO] Parameter: cross_hybridization_blastn_hit_parameters = {'coverage': 80}\n", + "2024-07-08 19:46:51,980 [INFO] Parameter: ligation_region_size = 5\n", + "2024-07-08 19:46:51,984 [INFO] Parameter: arm_Tm_dif_max = 2\n", + "2024-07-08 19:46:51,986 [INFO] Parameter: arm_length_min = 10\n", + "2024-07-08 19:46:51,988 [INFO] Parameter: arm_Tm_min = 50\n", + "2024-07-08 19:46:51,990 [INFO] Parameter: arm_Tm_max = 60\n", + "2024-07-08 19:46:51,992 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 
'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 
'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 
'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", + "2024-07-08 19:46:51,996 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n", + "2024-07-08 19:46:51,997 [INFO] Parameter: Tm_salt_correction_param_probe = None\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3dd15df04e304ca9af5b3d1df0c0eacb", + "model_id": "ecdf5993189c4082b846e0bd4d424bc1", "version_major": 2, "version_minor": 0 }, @@ -822,7 +805,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c6cb3f3871b2419c83ed6fc48edfc84b", + "model_id": "f78d8fb19a4845f5bb559e02ee379443", "version_major": 2, "version_minor": 0 }, @@ -859,7 +842,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b77e785a3f7e477193c239dc971a7268", + "model_id": "086807ee363b4bbea301fac1c93c5444", "version_major": 2, "version_minor": 0 }, @@ -897,14 +880,23 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-22 01:35:54,325 [INFO] Step - Specificity Filters: database contains 408735 oligos from 806 genes.\n" + "2024-07-08 21:42:02,456 [INFO] Step - Specificity Filters: database contains 408735 oligos from 806 regions.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-08 21:42:02,542 [DEBUG] handle_msg[ecdf5993189c4082b846e0bd4d424bc1]({'header': {'date': datetime.datetime(2024, 7, 8, 19, 42, 2, 520000, tzinfo=tzutc()), 'msg_id': '3d0f9311-1134-4051-a004-bce437d0daed', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': '3d0f9311-1134-4051-a004-bce437d0daed', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'ecdf5993189c4082b846e0bd4d424bc1', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mSpecificity 
Filter: Exact Match\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m0:04:30\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Specificity Filter: Exact Match ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 0:04:30 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n", + "2024-07-08 21:42:02,547 [DEBUG] handle_msg[f78d8fb19a4845f5bb559e02ee379443]({'header': {'date': datetime.datetime(2024, 7, 8, 19, 42, 2, 521000, tzinfo=tzutc()), 'msg_id': '00a4381b-1e14-42e5-a94a-246c14c0bee4', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': '00a4381b-1e14-42e5-a94a-246c14c0bee4', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'f78d8fb19a4845f5bb559e02ee379443', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mSpecificity Filter: Blastn Specificity\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m887/887\\x1b[0m \\x1b[33m1:08:54\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Specificity Filter: Blastn Specificity ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887/887 1:08:54 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n", + "2024-07-08 21:42:02,552 [DEBUG] handle_msg[086807ee363b4bbea301fac1c93c5444]({'header': {'date': datetime.datetime(2024, 7, 8, 19, 42, 2, 521000, tzinfo=tzutc()), 'msg_id': '4d438cb6-e1cd-4b07-9718-cb0949ab2096', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': '4d438cb6-e1cd-4b07-9718-cb0949ab2096', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '086807ee363b4bbea301fac1c93c5444', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mSpecificity Filter: Blastn Crosshybridization\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m806/806\\x1b[0m \\x1b[33m0:02:35\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Specificity Filter: Blastn Crosshybridization ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806/806 0:02:35 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ "####### Load existing database #######\n", "# dir_database = os.path.join(dir_output, \"db_probes/2_db_probes_property_filter\")\n", - "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=n_jobs*2+2, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", "\n", "## Apply specificity filters\n", @@ -922,6 +914,7 @@ " arm_Tm_max=arm_Tm_max,\n", " Tm_parameters_probe=Tm_parameters_probe,\n", " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + " Tm_salt_correction_param_probe=Tm_salt_correction_param_probe,\n", ")" ] }, @@ -949,7 +942,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -969,38 +962,39 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-22 07:10:20,798 [INFO] Parameters Set Selection:\n", - "2024-06-22 07:10:20,800 [INFO] Function: create_probe_sets\n", - "2024-06-22 07:10:20,801 [INFO] Parameter: oligo_database = \n", - "2024-06-22 07:10:20,802 [INFO] Parameter: probe_isoform_weight = 2\n", - "2024-06-22 07:10:20,804 [INFO] Parameter: probe_Tm_weight = 1\n", - "2024-06-22 07:10:20,807 [INFO] Parameter: probe_Tm_min = 65\n", - "2024-06-22 07:10:20,808 [INFO] Parameter: probe_Tm_opt = 70\n", - "2024-06-22 07:10:20,809 [INFO] Parameter: probe_Tm_max = 75\n", - "2024-06-22 07:10:20,811 [INFO] Parameter: 
Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 
'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), 
'.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': (4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", - "2024-06-22 07:10:20,818 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n", - "2024-06-22 07:10:20,819 [INFO] Parameter: probe_GC_weight = 1\n", - "2024-06-22 07:10:20,821 [INFO] Parameter: probe_GC_content_min = 40\n", - "2024-06-22 07:10:20,822 [INFO] Parameter: probe_GC_content_opt = 50\n", - "2024-06-22 07:10:20,823 [INFO] Parameter: probe_GC_content_max = 60\n", - "2024-06-22 07:10:20,825 [INFO] Parameter: probeset_size_opt = 5\n", - "2024-06-22 07:10:20,826 [INFO] Parameter: probeset_size_min = 3\n", - "2024-06-22 07:10:20,826 [INFO] Parameter: max_graph_size = 5000\n", - "2024-06-22 07:10:20,828 [INFO] Parameter: n_sets = 100\n", - "2024-06-22 07:10:20,829 [INFO] Parameter: distance_between_probes = 0\n" + "2024-07-09 00:36:23,067 [INFO] Parameters Set Selection:\n", + "2024-07-09 00:36:23,070 [INFO] Function: create_probe_sets\n", + "2024-07-09 00:36:23,072 [INFO] Parameter: oligo_database = \n", + "2024-07-09 00:36:23,075 [INFO] Parameter: probe_isoform_weight = 2\n", + "2024-07-09 00:36:23,077 [INFO] Parameter: probe_Tm_weight = 1\n", + "2024-07-09 00:36:23,078 [INFO] Parameter: probe_Tm_min = 65\n", + "2024-07-09 00:36:23,081 [INFO] Parameter: probe_Tm_opt = 70\n", + "2024-07-09 00:36:23,084 [INFO] Parameter: probe_Tm_max = 75\n", + "2024-07-09 00:36:23,086 [INFO] Parameter: Tm_parameters_probe = {'check': True, 'strict': True, 'c_seq': None, 'shift': 0, 'nn_table': {'init': (0, 
0), 'init_A/T': (2.3, 4.1), 'init_G/C': (0.1, -2.8), 'init_oneG/C': (0, 0), 'init_allA/T': (0, 0), 'init_5T/A': (0, 0), 'sym': (0, -1.4), 'AA/TT': (-7.9, -22.2), 'AT/TA': (-7.2, -20.4), 'TA/AT': (-7.2, -21.3), 'CA/GT': (-8.5, -22.7), 'GT/CA': (-8.4, -22.4), 'CT/GA': (-7.8, -21.0), 'GA/CT': (-8.2, -22.2), 'CG/GC': (-10.6, -27.2), 'GC/CG': (-9.8, -24.4), 'GG/CC': (-8.0, -19.9)}, 'tmm_table': {'AA/TA': (-3.1, -7.8), 'TA/AA': (-2.5, -6.3), 'CA/GA': (-4.3, -10.7), 'GA/CA': (-8.0, -22.5), 'AC/TC': (-0.1, 0.5), 'TC/AC': (-0.7, -1.3), 'CC/GC': (-2.1, -5.1), 'GC/CC': (-3.9, -10.6), 'AG/TG': (-1.1, -2.1), 'TG/AG': (-1.1, -2.7), 'CG/GG': (-3.8, -9.5), 'GG/CG': (-0.7, -19.2), 'AT/TT': (-2.4, -6.5), 'TT/AT': (-3.2, -8.9), 'CT/GT': (-6.1, -16.9), 'GT/CT': (-7.4, -21.2), 'AA/TC': (-1.6, -4.0), 'AC/TA': (-1.8, -3.8), 'CA/GC': (-2.6, -5.9), 'CC/GA': (-2.7, -6.0), 'GA/CC': (-5.0, -13.8), 'GC/CA': (-3.2, -7.1), 'TA/AC': (-2.3, -5.9), 'TC/AA': (-2.7, -7.0), 'AC/TT': (-0.9, -1.7), 'AT/TC': (-2.3, -6.3), 'CC/GT': (-3.2, -8.0), 'CT/GC': (-3.9, -10.6), 'GC/CT': (-4.9, -13.5), 'GT/CC': (-3.0, -7.8), 'TC/AT': (-2.5, -6.3), 'TT/AC': (-0.7, -1.2), 'AA/TG': (-1.9, -4.4), 'AG/TA': (-2.5, -5.9), 'CA/GG': (-3.9, -9.6), 'CG/GA': (-6.0, -15.5), 'GA/CG': (-4.3, -11.1), 'GG/CA': (-4.6, -11.4), 'TA/AG': (-2.0, -4.7), 'TG/AA': (-2.4, -5.8), 'AG/TT': (-3.2, -8.7), 'AT/TG': (-3.5, -9.4), 'CG/GT': (-3.8, -9.0), 'CT/GG': (-6.6, -18.7), 'GG/CT': (-5.7, -15.9), 'GT/CG': (-5.9, -16.1), 'TG/AT': (-3.9, -10.5), 'TT/AG': (-3.6, -9.8)}, 'imm_table': {'AG/TT': (1.0, 0.9), 'AT/TG': (-2.5, -8.3), 'CG/GT': (-4.1, -11.7), 'CT/GG': (-2.8, -8.0), 'GG/CT': (3.3, 10.4), 'GG/TT': (5.8, 16.3), 'GT/CG': (-4.4, -12.3), 'GT/TG': (4.1, 9.5), 'TG/AT': (-0.1, -1.7), 'TG/GT': (-1.4, -6.2), 'TT/AG': (-1.3, -5.3), 'AA/TG': (-0.6, -2.3), 'AG/TA': (-0.7, -2.3), 'CA/GG': (-0.7, -2.3), 'CG/GA': (-4.0, -13.2), 'GA/CG': (-0.6, -1.0), 'GG/CA': (0.5, 3.2), 'TA/AG': (0.7, 0.7), 'TG/AA': (3.0, 7.4), 'AC/TT': (0.7, 0.2), 'AT/TC': (-1.2, -6.2), 
'CC/GT': (-0.8, -4.5), 'CT/GC': (-1.5, -6.1), 'GC/CT': (2.3, 5.4), 'GT/CC': (5.2, 13.5), 'TC/AT': (1.2, 0.7), 'TT/AC': (1.0, 0.7), 'AA/TC': (2.3, 4.6), 'AC/TA': (5.3, 14.6), 'CA/GC': (1.9, 3.7), 'CC/GA': (0.6, -0.6), 'GA/CC': (5.2, 14.2), 'GC/CA': (-0.7, -3.8), 'TA/AC': (3.4, 8.0), 'TC/AA': (7.6, 20.2), 'AA/TA': (1.2, 1.7), 'CA/GA': (-0.9, -4.2), 'GA/CA': (-2.9, -9.8), 'TA/AA': (4.7, 12.9), 'AC/TC': (0.0, -4.4), 'CC/GC': (-1.5, -7.2), 'GC/CC': (3.6, 8.9), 'TC/AC': (6.1, 16.4), 'AG/TG': (-3.1, -9.5), 'CG/GG': (-4.9, -15.3), 'GG/CG': (-6.0, -15.8), 'TG/AG': (1.6, 3.6), 'AT/TT': (-2.7, -10.8), 'CT/GT': (-5.0, -15.8), 'GT/CT': (-2.2, -8.4), 'TT/AT': (0.2, -1.5), 'AI/TC': (-8.9, -25.5), 'TI/AC': (-5.9, -17.4), 'AC/TI': (-8.8, -25.4), 'TC/AI': (-4.9, -13.9), 'CI/GC': (-5.4, -13.7), 'GI/CC': (-6.8, -19.1), 'CC/GI': (-8.3, -23.8), 'GC/CI': (-5.0, -12.6), 'AI/TA': (-8.3, -25.0), 'TI/AA': (-3.4, -11.2), 'AA/TI': (-0.7, -2.6), 'TA/AI': (-1.3, -4.6), 'CI/GA': (2.6, 8.9), 'GI/CA': (-7.8, -21.1), 'CA/GI': (-7.0, -20.0), 'GA/CI': (-7.6, -20.2), 'AI/TT': (0.49, -0.7), 'TI/AT': (-6.5, -22.0), 'AT/TI': (-5.6, -18.7), 'TT/AI': (-0.8, -4.3), 'CI/GT': (-1.0, -2.4), 'GI/CT': (-3.5, -10.6), 'CT/GI': (0.1, -1.0), 'GT/CI': (-4.3, -12.1), 'AI/TG': (-4.9, -15.8), 'TI/AG': (-1.9, -8.5), 'AG/TI': (0.1, -1.8), 'TG/AI': (1.0, 1.0), 'CI/GG': (7.1, 21.3), 'GI/CG': (-1.1, -3.2), 'CG/GI': (5.8, 16.9), 'GG/CI': (-7.6, -22.0), 'AI/TI': (-3.3, -11.9), 'TI/AI': (0.1, -2.3), 'CI/GI': (1.3, 3.0), 'GI/CI': (-0.5, -1.3)}, 'de_table': {'AA/.T': (0.2, 2.3), 'AC/.G': (-6.3, -17.1), 'AG/.C': (-3.7, -10.0), 'AT/.A': (-2.9, -7.6), 'CA/.T': (0.6, 3.3), 'CC/.G': (-4.4, -12.6), 'CG/.C': (-4.0, -11.9), 'CT/.A': (-4.1, -13.0), 'GA/.T': (-1.1, -1.6), 'GC/.G': (-5.1, -14.0), 'GG/.C': (-3.9, -10.9), 'GT/.A': (-4.2, -15.0), 'TA/.T': (-6.9, -20.0), 'TC/.G': (-4.0, -10.9), 'TG/.C': (-4.9, -13.8), 'TT/.A': (-0.2, -0.5), '.A/AT': (-0.7, -0.8), '.C/AG': (-2.1, -3.9), '.G/AC': (-5.9, -16.5), '.T/AA': (-0.5, -1.1), '.A/CT': 
(4.4, 14.9), '.C/CG': (-0.2, -0.1), '.G/CC': (-2.6, -7.4), '.T/CA': (4.7, 14.2), '.A/GT': (-1.6, -3.6), '.C/GG': (-3.9, -11.2), '.G/GC': (-3.2, -10.4), '.T/GA': (-4.1, -13.1), '.A/TT': (2.9, 10.4), '.C/TG': (-4.4, -13.1), '.G/TC': (-5.2, -15.0), '.T/TA': (-3.8, -12.6)}, 'dnac1': 50, 'dnac2': 0, 'selfcomp': False, 'saltcorr': 7, 'Na': 39, 'K': 75, 'Tris': 20, 'Mg': 10, 'dNTPs': 0}\n", + "2024-07-09 00:36:23,089 [INFO] Parameter: Tm_chem_correction_param_probe = {'DMSO': 0, 'fmd': 20, 'DMSOfactor': 0.75, 'fmdfactor': 0.65, 'fmdmethod': 1, 'GC': None}\n", + "2024-07-09 00:36:23,090 [INFO] Parameter: Tm_salt_correction_param_probe = None\n", + "2024-07-09 00:36:23,091 [INFO] Parameter: probe_GC_weight = 1\n", + "2024-07-09 00:36:23,094 [INFO] Parameter: probe_GC_content_min = 40\n", + "2024-07-09 00:36:23,096 [INFO] Parameter: probe_GC_content_opt = 50\n", + "2024-07-09 00:36:23,097 [INFO] Parameter: probe_GC_content_max = 60\n", + "2024-07-09 00:36:23,098 [INFO] Parameter: probeset_size_opt = 5\n", + "2024-07-09 00:36:23,099 [INFO] Parameter: probeset_size_min = 3\n", + "2024-07-09 00:36:23,100 [INFO] Parameter: max_graph_size = 5000\n", + "2024-07-09 00:36:23,101 [INFO] Parameter: n_sets = 100\n", + "2024-07-09 00:36:23,103 [INFO] Parameter: distance_between_probes = 0\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "71e53103727a41f2b4c8db238482f478", + "model_id": "b011dd7910384d038ec20102cf492c08", "version_major": 2, "version_minor": 0 }, @@ -1038,21 +1032,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-22 07:16:26,067 [INFO] Step - Set Selection: database contains 15896 oligos from 729 genes.\n" + "2024-07-09 03:04:22,408 [INFO] Step - Set Selection: database contains 16129 oligos from 735 regions.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024-06-22 07:16:26,097 [DEBUG] handle_msg[71e53103727a41f2b4c8db238482f478]({'header': {'date': datetime.datetime(2024, 6, 22, 5, 16, 26, 92000, 
tzinfo=tzutc()), 'msg_id': 'ee0cb3c5-8ede-47d7-ae82-ec91e847eaa1', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': 'ee0cb3c5-8ede-47d7-ae82-ec91e847eaa1', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '71e53103727a41f2b4c8db238482f478', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mFind Oligosets\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m735/735\\x1b[0m \\x1b[33m0:05:57\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Find Oligosets ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 735/735 0:05:57 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" + "2024-07-09 03:04:22,449 [DEBUG] handle_msg[b011dd7910384d038ec20102cf492c08]({'header': {'date': datetime.datetime(2024, 7, 9, 1, 4, 22, 445000, tzinfo=tzutc()), 'msg_id': '75f170fe-5fbb-46a3-9620-3f145fbff019', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': '75f170fe-5fbb-46a3-9620-3f145fbff019', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'b011dd7910384d038ec20102cf492c08', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mFind Oligosets\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m806/806\\x1b[0m \\x1b[33m2:27:50\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Find Oligosets ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806/806 2:27:50 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ "####### Load existing database #######\n", "# dir_database = os.path.join(dir_output, \"db_probes/3_db_probes_specificity_filter\")\n", - "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=n_jobs*2+2, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", "# probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", "\n", "## Apply probe selection\n", @@ -1065,6 +1059,7 @@ " probe_Tm_max=probe_Tm_max,\n", " Tm_parameters_probe=Tm_parameters_probe,\n", " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + " Tm_salt_correction_param_probe=Tm_salt_correction_param_probe,\n", " probe_GC_weight=probe_GC_weight,\n", " probe_GC_content_min=probe_GC_content_min,\n", " probe_GC_content_opt=probe_GC_content_opt,\n", @@ -1091,13 +1086,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "####### Load existing database #######\n", "# dir_database = os.path.join(dir_output, \"db_probes/4_db_probes_probesets\")\n", - "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=db_max_in_memory, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", + "# probe_database = OligoDatabase(min_oligos_per_region=min_probes_per_gene, write_regions_with_insufficient_oligos=True, lru_db_max_in_memory=n_jobs*2+2, database_name=\"db_probes\", dir_output=dir_output, n_jobs=n_jobs)\n", "# 
probe_database.load_database(dir_database=dir_database, region_ids=gene_ids, database_overwrite=True)\n", "\n", "# get gene names of genes with sufficient number of probes to proceed with next step\n", @@ -1128,13 +1123,13 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "14ca412bd28b44558d6a3140101a5ed8", + "model_id": "74aab958e5b54cb698e62612fa9e03f8", "version_major": 2, "version_minor": 0 }, @@ -1274,7 +1269,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1304,7 +1299,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1340,18 +1335,20 @@ " \"fmdfactor\": 0.65, # default\n", " \"fmdmethod\": 1, # default\n", " \"GC\": None, # default\n", - "}" + "}\n", + "\n", + "Tm_salt_correction_param_detection_oligo = None" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2d76b817415d4832a9e5157a18286605", + "model_id": "1a931c4c89ec48e6be4917b9a4ac2f8f", "version_major": 2, "version_minor": 0 }, @@ -1389,13 +1386,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-22 07:24:22,481 [DEBUG] handle_msg[2d76b817415d4832a9e5157a18286605]({'header': {'date': datetime.datetime(2024, 6, 22, 5, 24, 22, 468000, tzinfo=tzutc()), 'msg_id': '579a2685-fd7f-4de7-95b6-e0219bda2ea0', 'msg_type': 'comm_msg', 'session': 'f5badd4b-0344-47de-95b0-5b394769c48b', 'username': '14d27083-0f27-4302-83da-06a8ed8f694f', 'version': '5.2'}, 'msg_id': '579a2685-fd7f-4de7-95b6-e0219bda2ea0', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '2d76b817415d4832a9e5157a18286605', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 
'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mDesign Final Padlock Sequence\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m20/20\\x1b[0m \\x1b[33m0:00:22\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Design Final Padlock Sequence ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20/20 0:00:22 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" + "2024-07-09 10:12:10,657 [DEBUG] handle_msg[1a931c4c89ec48e6be4917b9a4ac2f8f]({'header': {'date': datetime.datetime(2024, 7, 9, 8, 12, 10, 653000, tzinfo=tzutc()), 'msg_id': '394388fb-c687-478d-b79b-49e5fecd638c', 'msg_type': 'comm_msg', 'session': 'f15cb79e-4700-4ef8-aed8-65224aba1be6', 'username': '5bb12fba-c395-4a0a-afbc-31e5784dd153', 'version': '5.2'}, 'msg_id': '394388fb-c687-478d-b79b-49e5fecd638c', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '1a931c4c89ec48e6be4917b9a4ac2f8f', 'data': {'method': 'update', 'state': {'outputs': [{'output_type': 'display_data', 'data': {'text/plain': ' \\x1b[35m100%\\x1b[0m \\x1b[36mDesign Final Padlock Sequence\\x1b[0m \\x1b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\\x1b[0m \\x1b[32m20/20\\x1b[0m \\x1b[33m0:00:22\\x1b[0m < \\x1b[36m0:00:00\\x1b[0m\\n', 'text/html': '
  100% Design Final Padlock Sequence ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20/20 0:00:22 < 0:00:00\\n
\\n'}, 'metadata': {}}]}, 'buffer_paths': []}}, 'buffers': []})\n" ] } ], "source": [ "## Design final sequences \n", - "probe_database = pipeline.design_final_padlock_sequence(\n", + "probe_database = pipeline.design_final_probe_sequence(\n", " oligo_database=probe_database,\n", " min_thymines=min_thymines,\n", " U_distance=U_distance,\n", @@ -1404,8 +1401,10 @@ " detect_oligo_Tm_opt=detect_oligo_Tm_opt,\n", " Tm_parameters_detection_oligo=Tm_parameters_detection_oligo,\n", " Tm_chem_correction_param_detection_oligo=Tm_chem_correction_param_detection_oligo,\n", + " Tm_salt_correction_param_detection_oligo=Tm_salt_correction_param_detection_oligo,\n", " Tm_parameters_probe=Tm_parameters_probe,\n", " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + " Tm_salt_correction_param_probe=Tm_salt_correction_param_probe,\n", ")\n", "\n", "## Compute all required probe attributes for output\n", @@ -1413,6 +1412,7 @@ " oligo_database=probe_database,\n", " Tm_parameters_probe=Tm_parameters_probe,\n", " Tm_chem_correction_param_probe=Tm_chem_correction_param_probe,\n", + " Tm_salt_correction_param_probe=Tm_salt_correction_param_probe,\n", ")\n", "\n", "## Write output to files\n", From d9eb37799b6a261a120fba28fd66f695afdea83c Mon Sep 17 00:00:00 2001 From: Lisa Sousa Date: Wed, 17 Jul 2024 09:03:32 +0200 Subject: [PATCH 3/3] update notebook to pipeline changes --- ..._tutorial_end_to_end_selection_short.ipynb | 134 +++++------------- 1 file changed, 37 insertions(+), 97 deletions(-) diff --git a/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb b/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb index 20b9f8f..4a8537d 100644 --- a/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb +++ b/docs/_tutorials/spapros_tutorial_end_to_end_selection_short.ipynb @@ -216,25 +216,16 @@ "name": "stderr", "output_type": "stream", "text": [ - 
"/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "2024-06-19 21:20:21,234 [INFO] Parameters Load Annotations:\n", - "2024-06-19 21:20:21,235 [INFO] source = ncbi\n", - "2024-06-19 21:20:21,235 [INFO] source_params = {'taxon': 'vertebrate_mammalian', 'species': 'Homo_sapiens', 'annotation_release': 110}\n", - "2024-06-19 21:28:20,814 [WARNING] /Users/lisa.barros/Desktop/oligo-designer-toolsuite/oligo_designer_toolsuite/utils/_sequence_parser.py:104: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "2024-07-16 17:15:53,511 [INFO] Parameters Load Annotations:\n", + "2024-07-16 17:15:53,511 [INFO] source = ncbi\n", + "2024-07-16 17:15:53,511 [INFO] source_params = {'taxon': 'vertebrate_mammalian', 'species': 'Homo_sapiens', 'annotation_release': 110}\n", + "2024-07-16 17:23:33,702 [WARNING] /Users/lisa.barros/Desktop/oligo-designer-toolsuite/oligo_designer_toolsuite/utils/_sequence_parser.py:104: DtypeWarning: Columns (0) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " csv_df = pd.read_csv(csv_file, sep=\"\\t\", names=self.GFF_HEADER, header=None)\n", "\n", - "2024-06-19 21:28:56,907 [INFO] The following annotation files are used for GTF annotation of regions: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf and for fasta sequence file: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna .\n", - "2024-06-19 21:28:56,908 [INFO] The annotations are from NCBI source, for the species: Homo_sapiens, release number: 110 and genome assembly: GRCh38.p14\n", - "2024-06-19 21:32:27,987 [INFO] The genomic region 'exon' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n", - "2024-06-19 21:44:45,118 [INFO] The genomic region 'exon_exon_junction' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n" + "2024-07-16 17:24:16,761 [INFO] The following annotation files are used for GTF annotation of regions: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.gtf and for fasta sequence file: /Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/GCF_000001405.40_GRCh38.p14_genomic.fna .\n", + "2024-07-16 17:24:16,763 [INFO] The annotations are from NCBI source, for the species: Homo_sapiens, release number: 110 and genome assembly: GRCh38.p14\n", + "2024-07-16 17:28:18,217 [INFO] The genomic region 'exon' was stored in 
:/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n", + "2024-07-16 17:40:49,655 [INFO] The genomic region 'exon_exon_junction' was stored in :/Users/lisa.barros/Desktop/spapros/docs/_tutorials/output_genomic_region_generator_ncbi/annotation/exon_exon_junction_annotation_source-NCBI_species-Homo_sapiens_annotation_release-110_genome_assemly-GRCh38.p14.fna.\n" ] } ], @@ -248,68 +239,17 @@ "execution_count": 7, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - 
"Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n", - "/Users/lisa.barros/anaconda3/envs/odt_test/lib/python3.10/site-packages/Bio/Application/__init__.py:40: BiopythonDeprecationWarning: The Bio.Application modules and modules relying on it have been deprecated.\n", - "\n", - "Due to the on going maintenance burden of keeping command line application\n", - "wrappers up to date, we have decided to deprecate and eventually remove these\n", - "modules.\n", - "\n", - "We instead now recommend building your command line and invoking it directly\n", - "with the subprocess module.\n", - " warnings.warn(\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mDatabase Loading\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:09:27\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:04\u001b[0m03:37\u001b[0m\n", - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mProperty Filter\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:00:48\u001b[0m < \u001b[36m0:00:00\u001b[0m< 
\u001b[36m0:00:04\u001b[0m01:38\u001b[0m\n", - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Exact Matches\u001b[0m \u001b[90m━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m0:04:27\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:01\u001b[0m00:21\u001b[0m\n", - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Blastn Specificity\u001b[0m \u001b[90m━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:13:44\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:05\u001b[0m06:41\u001b[0m\n", - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Blastn Crosshybridization\u001b[0m \u001b[90m━━━━━\u001b[0m \u001b[32m887/…\u001b[0m \u001b[33m0:02:…\u001b[0m < \u001b[36m0:00:…\u001b[0m < \u001b[36m0:00:…\u001b[0m:00:…\u001b[0m\n", - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mFind Oligosets\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m810/810\u001b[0m \u001b[33m2:42:58\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:04\u001b[0m02:54\u001b[0m\n", - "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mDesign Final Padlock Sequence\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m741/741\u001b[0m \u001b[33m0:11:26\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:01\u001b[0m00:38\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mDatabase Loading\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:09:20\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:05\u001b[0m04:28\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mProperty Filter\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:00:18\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:04\u001b[0m04:50\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Exact Match\u001b[0m \u001b[90m━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m0:04:31\u001b[0m < 
\u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:01\u001b[0m00:26\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Blastn Specificity\u001b[0m \u001b[90m━━━━━━━━\u001b[0m \u001b[32m887/887\u001b[0m \u001b[33m1:09:06\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:11\u001b[0m15:04\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mSpecificity Filter: Blastn Crosshybridization\u001b[0m \u001b[90m━━━━━\u001b[0m \u001b[32m810/…\u001b[0m \u001b[33m0:02:…\u001b[0m < \u001b[36m0:00:…\u001b[0m < \u001b[36m0:00:…\u001b[0m:00:…\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mFind Oligosets\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m810/810\u001b[0m \u001b[33m2:44:14\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:04\u001b[0m01:51\u001b[0m\n", + "\u001b[2K \u001b[35m100%\u001b[0m \u001b[36mDesign Final Padlock Sequence\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m741/741\u001b[0m \u001b[33m0:11:25\u001b[0m < \u001b[36m0:00:00\u001b[0m< \u001b[36m0:00:01\u001b[0m00:49\u001b[0m\n", "\u001b[?25h" ] } @@ -335,7 +275,7 @@ "##### Load probe design filter #####\n", "genes_without_enough_probes = pd.read_csv('output_scrinshot_probe_designer/db_probes/regions_with_insufficient_oligos_for_db_probes.txt', index_col=0, sep=\"\\t\").index.tolist()\n", "adata.var[\"has_enough_probes\"] = [g not in genes_without_enough_probes for g in adata.var_names]\n", - "adata.var[\"pass_constraints\"] = adata.var[\"has_enough_probes\"] & adata.var[\"highly_variable\"]\n" + "adata.var[\"pass_constraints\"] = adata.var[\"has_enough_probes\"] & adata.var[\"highly_variable\"]" ] }, { @@ -346,7 +286,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "856cfa65e91446989bda6ed9bffa59e2", + "model_id": "5b1ed0e8aaaa4bd2a046ec85e95e1562", "version_major": 2, "version_minor": 0 }, @@ -511,35 +451,35 @@ { "data": { "text/plain": [ - "{'oligoset_1': {'CTSS::21801': {'sequence_padlock_probe': 
'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", + "{'oligoset_1': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", " 'sequence_detection_oligo': 'CCGTTTCATTGTGATAGAACCAGCAGTTGCUCCCACAGU[fluorophore]'},\n", - " 'CTSS::3064': {'sequence_padlock_probe': 'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", + " 'CTSS::3064': {'sequence_padlock_probe': 'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", " 'sequence_detection_oligo': 'CAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGAC[fluorophore]'},\n", - " 'CTSS::22579': {'sequence_padlock_probe': 'GCGTCTGAGTCGATGCCCTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTATCCATGGCTTTGTAGGGATAGGAA',\n", + " 'CTSS::22579': {'sequence_padlock_probe': 'GCGTCTGAGTCGATGCCCTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTATCCATGGCTTTGTAGGGATAGGAA',\n", " 'sequence_detection_oligo': 'TGGCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCU[fluorophore]'},\n", - " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", + " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", " 'sequence_detection_oligo': 'GCCCAGATCGTATGAGTGCATTCCCATUGAATGCUCCAGG[fluorophore]'},\n", - " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", + " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", " 'sequence_detection_oligo': 'ACAGCACUGAAAGCCCAGCAAGCACCACAAGAACCCATGU[fluorophore]'}},\n", - " 'oligoset_2': {'CTSS::21801': {'sequence_padlock_probe': 
'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", + " 'oligoset_2': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", " 'sequence_detection_oligo': 'CCGTTTCATTGTGATAGAACCAGCAGTTGCUCCCACAGU[fluorophore]'},\n", - " 'CTSS::3064': {'sequence_padlock_probe': 'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", - " 'sequence_detection_oligo': 'CAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGAC[fluorophore]'},\n", - " 'CTSS::2173': {'sequence_padlock_probe': 'GTCTGAGTCGATGCCCTTGTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTATGGCTTTGTAGGGATAGGAAGC',\n", - " 'sequence_detection_oligo': 'GCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCTTGU[fluorophore]'},\n", - " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", + " 'CTSS::22579': {'sequence_padlock_probe': 'GCGTCTGAGTCGATGCCCTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTATCCATGGCTTTGTAGGGATAGGAA',\n", + " 'sequence_detection_oligo': 'TGGCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCU[fluorophore]'},\n", + " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", " 'sequence_detection_oligo': 'GCCCAGATCGTATGAGTGCATTCCCATUGAATGCUCCAGG[fluorophore]'},\n", - " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", + " 'CTSS::2942': {'sequence_padlock_probe': 'GGCCACAGCTTCTTTCAGGACTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTCCAACAGACACTGGGCCTTTATT',\n", + " 'sequence_detection_oligo': 'ACAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGA[fluorophore]'},\n", + " 'CTSS::22236': {'sequence_padlock_probe': 
'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", " 'sequence_detection_oligo': 'ACAGCACUGAAAGCCCAGCAAGCACCACAAGAACCCATGU[fluorophore]'}},\n", - " 'oligoset_3': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", + " 'oligoset_3': {'CTSS::21801': {'sequence_padlock_probe': 'CAGCAGTTGCTCCCACAGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTACCAGCCGTTTCATTGTGATAGAAC',\n", " 'sequence_detection_oligo': 'CCGTTTCATTGTGATAGAACCAGCAGTTGCUCCCACAGU[fluorophore]'},\n", - " 'CTSS::3064': {'sequence_padlock_probe': 'GCCACAGCTTCTTTCAGGACATCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTCCAACAGACACTGGGCCTTTATTG',\n", - " 'sequence_detection_oligo': 'CAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGAC[fluorophore]'},\n", - " 'CTSS::22579': {'sequence_padlock_probe': 'GCGTCTGAGTCGATGCCCTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTATCCATGGCTTTGTAGGGATAGGAA',\n", - " 'sequence_detection_oligo': 'TGGCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCU[fluorophore]'},\n", - " 'CTSS::1139': {'sequence_padlock_probe': 'CCCATTGAATGCTCCAGGTTGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTGCCCAGATCGTATGAGTGCATT',\n", - " 'sequence_detection_oligo': 'CCAGATCGTATGAGTGCATTCCCATTGAATGCUCCAGGU[fluorophore]'},\n", - " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTGAATCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", + " 'CTSS::2173': {'sequence_padlock_probe': 'GTCTGAGTCGATGCCCTTGTTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTATGGCTTTGTAGGGATAGGAAGC',\n", + " 'sequence_detection_oligo': 'GCTTTGTAGGGATAGGAAGCGTCTGAGTCGAUGCCCTTGU[fluorophore]'},\n", + " 'CTSS::979': {'sequence_padlock_probe': 'TTCCCATTGAATGCTCCAGGTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTTGCCCAGATCGTATGAGTGCA',\n", + " 'sequence_detection_oligo': 'GCCCAGATCGTATGAGTGCATTCCCATUGAATGCUCCAGG[fluorophore]'},\n", + " 
'CTSS::2942': {'sequence_padlock_probe': 'GGCCACAGCTTCTTTCAGGACTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTCCAACAGACACTGGGCCTTTATT',\n", + " 'sequence_detection_oligo': 'ACAGACACTGGGCCTTTATTGGCCACAGCUTCTTUCAGGA[fluorophore]'},\n", + " 'CTSS::22236': {'sequence_padlock_probe': 'AGCACCACAAGAACCCATGTCTTCCTCTATGATTACTGACTGCGTCTATTTAGTGGAGCCTTCTCCTATCTTCTTTACAGCACTGAAAGCCCAGCA',\n", " 'sequence_detection_oligo': 'ACAGCACUGAAAGCCCAGCAAGCACCACAAGAACCCATGU[fluorophore]'}}}" ] },