Merge branch 'develop' into 'master'

Beta release v1.1.0b1 See merge request tron/addannot!231
TRON-Bioinformatics · Feb 8, 2023 · 6f9772a · 6f9772a
2 parents d5452a6 + be15532
commit 6f9772a
Show file tree

Hide file tree

Showing 26 changed files with 104 additions and 124 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -67,4 +67,5 @@ publish_package:
     - python3 setup.py sdist bdist_wheel
     - TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python -m twine upload --repository-url https://gitlab.rlp.net/api/v4/projects/${CI_PROJECT_ID}/packages/pypi dist/*
   only:
+    # only the develop branch is deployed to the private GitLab package registry; the master branch is published to PyPI
     - develop
diff --git a/docs/source/02_installation.md b/docs/source/02_installation.md
@@ -219,7 +219,7 @@ The test data can be downloaded here:
 * [test_patients.tsv](_static/test_patients.tsv)
 
 ````commandline
-neofox --candidate-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder  /path/to/outputfolder --with-table --with-json --output-prefix test
+neofox --input-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder  /path/to/outputfolder --output-prefix test
 ````
 
 The resulting output files can be compared to the following test output files:

diff --git a/docs/source/03_03_usage.md b/docs/source/03_03_usage.md
@@ -15,7 +15,7 @@ neofox --input-file neoantigens_candidates.tsv \
     [--output-prefix out_prefix]  \
     [--organism human|mouse]  \
     [--rank-mhci-threshold 2.0] \
-    [--rank-mhcii-threshold 4.0] \
+    [--rank-mhcii-threshold 5.0] \
     [--num-cpus] \
     [--config] \
     [--patient-id] \
@@ -41,7 +41,7 @@ where:
 - if all expression values related to a patient are NA or `rnaExpression` is not given in the input file but the tumor type has been provided in the patient file, imputed expression will be used for the relevant features
 
 **EXAMPLE**  
-This is an example to call NeoFox with a candidate-file and obtaining the annotated neoantigen candidates in [tabular](03_02_output_data.md#tabular-format) format:  
+This is an example of calling NeoFox with a candidate file and obtaining the annotated neoantigen candidates in [tabular](03_02_output_data.md#tabular-format) format:
 
 ````commandline
 neofox --input-file neoantigens_candidates.tsv \
@@ -287,7 +287,7 @@ patients_json = ModelConverter.objects2json(model_objects=patients)
 ```
 
 - instead of creating neoantigen or patient models, tabular or json files containing this information can be passed:  
-  The neoantigen candidates can be provided in [candidate-file format](03_01_input_data.md#tabular-file-format)
+  The neoantigen candidates can be provided in [candidate file format](03_01_input_data.md#tabular-file-format)
 
 ```python
 model_file = "/path/to/neoantigen_candidates.tab"

diff --git a/docs/source/05_models.md b/docs/source/05_models.md
@@ -195,7 +195,6 @@ The metadata required for analysis for a given patient + its patient identifier
 | Field | Type | Label | Description |
 | ----- | ---- | ----- | ----------- |
 | identifier | [string](#string) |  | Patient identifier |
-| isRnaAvailable | [bool](#bool) |  | Is RNA expression available? |
 | tumorType | [string](#string) |  | Tumor entity in TCGA study abbreviation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations |
 | mhc1 | [Mhc1](#neoantigen.Mhc1) | repeated | MHC I classic molecules |
 | mhc2 | [Mhc2](#neoantigen.Mhc2) | repeated | MHC II classic molecules |

diff --git a/neofox/__init__.py b/neofox/__init__.py
@@ -18,8 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.#
 
 
-VERSION = "1.0.2"
-
+VERSION = "1.1.0b1"
 
 REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER"
 NEOFOX_BLASTP_ENV = "NEOFOX_BLASTP"

diff --git a/neofox/annotator/abstract_annotator.py b/neofox/annotator/abstract_annotator.py
@@ -53,7 +53,7 @@ def __init__(
         self.priority_score_calculator = PriorityScore()
         self.iedb_immunogenicity = IEDBimmunogenicity()
         self.amplitude = Amplitude()
-        self.hex = Hex(runner=self.runner, configuration=configuration, references=references)
+        self.hex = Hex(references=references)
 
     def get_additional_annotations_neoepitope_mhci(
             self, epitope: PredictedEpitope, neoantigen: Neoantigen = None) -> PredictedEpitope:

diff --git a/neofox/command_line.py b/neofox/command_line.py
@@ -213,16 +213,6 @@ def _read_data(input_file, patients_data, mhc_database: MhcDatabase) -> Tuple[Li
     else:
         raise ValueError('Not supported input file extension: {}'.format(input_file))
 
-    patients_dict : Dict[str, Patient]
-    patients_dict = {p.identifier: p for p in patients}
-
-    for n in neoantigens:
-        patient = patients_dict.get(n.patient_identifier)
-        if not patient.is_rna_available:
-            # removes RNA vaf if indicated in patient that this information is no good
-            # iCam legacy
-            n.rna_variant_allele_frequency = None
-
     return neoantigens, patients
 
 
@@ -386,16 +376,6 @@ def _read_data_epitopes(
     else:
         raise ValueError('Not supported input file extension: {}'.format(input_file))
 
-    patients_dict : Dict[str, Patient]
-    patients_dict = {p.identifier: p for p in patients}
-
-    for n in neoepitopes:
-        patient = patients_dict.get(n.patient_identifier)
-        if patient is not None and not patient.is_rna_available:
-            # removes RNA vaf if indicated in patient that this information is no good
-            # iCam legacy
-            n.rna_variant_allele_frequency = None
-
     return neoepitopes, patients
 
 

diff --git a/neofox/model/conversion.py b/neofox/model/conversion.py
@@ -111,7 +111,6 @@ def parse_patients_file(patients_file: str, mhc_database: MhcDatabase) -> List[P
             patient_dict = row.to_dict()
             patient = PatientFactory.build_patient(
                 identifier=patient_dict.get("identifier"),
-                is_rna_available=patient_dict.get("isRnaAvailable", False),
                 tumor_type=patient_dict.get("tumorType"),
                 mhc_alleles=patient_dict.get("mhcIAlleles", []),
                 mhc2_alleles=patient_dict.get("mhcIIAlleles", []),

diff --git a/neofox/model/factories.py b/neofox/model/factories.py
@@ -173,11 +173,10 @@ def build_neoepitope(mutated_peptide=None, wild_type_peptide=None, patient_ident
 
 class PatientFactory(object):
     @staticmethod
-    def build_patient(identifier, is_rna_available=False, tumor_type=None, mhc_alleles: List[str] = [],
+    def build_patient(identifier, tumor_type=None, mhc_alleles: List[str] = [],
                       mhc2_alleles: List[str] = [], mhc_database: MhcDatabase =None):
         patient = Patient(
             identifier=identifier,
-            is_rna_available=is_rna_available,
             tumor_type=tumor_type,
             mhc1=MhcFactory.build_mhc1_alleles(mhc_alleles, mhc_database),
             mhc2=MhcFactory.build_mhc2_alleles(mhc2_alleles, mhc_database)

diff --git a/neofox/model/models.md b/neofox/model/models.md
@@ -195,7 +195,6 @@ The metadata required for analysis for a given patient + its patient identifier
 | Field | Type | Label | Description |
 | ----- | ---- | ----- | ----------- |
 | identifier | [string](#string) |  | Patient identifier |
-| isRnaAvailable | [bool](#bool) |  | Is RNA expression available? |
 | tumorType | [string](#string) |  | Tumor entity in TCGA study abbreviation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations |
 | mhc1 | [Mhc1](#neoantigen.Mhc1) | repeated | MHC I classic molecules |
 | mhc2 | [Mhc2](#neoantigen.Mhc2) | repeated | MHC II classic molecules |

diff --git a/neofox/model/neoantigen.proto b/neofox/model/neoantigen.proto
@@ -78,21 +78,17 @@ message Patient {
 	*/
 	string identifier = 1;
 	/**
-	Is RNA expression available?
-	*/
-	bool isRnaAvailable = 2;
-	/**
 	Tumor entity in TCGA study abbreviation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
 	*/
-	string tumorType = 3;
+	string tumorType = 2;
 	/**
 	MHC I classic molecules
 	*/
-	repeated Mhc1 mhc1 = 4;
+	repeated Mhc1 mhc1 = 3;
 	/**
 	MHC II classic molecules
 	*/
-	repeated Mhc2 mhc2 = 5;
+	repeated Mhc2 mhc2 = 4;
 }
 
 /**

diff --git a/neofox/model/neoantigen.py b/neofox/model/neoantigen.py
diff --git a/neofox/neofox.py b/neofox/neofox.py
@@ -110,10 +110,6 @@ def __init__(
         for neoantigen in self.neoantigens:
             expression_per_patient[neoantigen.patient_identifier].append(neoantigen.rna_expression)
 
-        for patient in self.patients:
-            self.patients[patient].is_rna_available = all(e is not None for e in
-                                                          expression_per_patient[self.patients[patient].identifier])
-
         # only performs the expression imputation for humans
         if self.reference_folder.organism == ORGANISM_HOMO_SAPIENS:
             # impute expression from TCGA, ONLY if the expression value is missing and a tumor type is given,
@@ -137,7 +133,7 @@ def _conditional_expression_imputation(self) -> List[Neoantigen]:
             gene_expression = expression_annotator.get_gene_expression_annotation(
                 gene_name=neoantigen.gene, tcga_cohort=patient.tumor_type
             )
-            if not patient.is_rna_available and patient.tumor_type is not None and patient.tumor_type != "":
+            if expression_value is None and patient.tumor_type is not None and patient.tumor_type != "":
                 expression_value = gene_expression
             neoantigen_transformed.rna_expression = expression_value
             neoantigen.imputed_gene_expression = gene_expression

diff --git a/neofox/neofox_epitope.py b/neofox/neofox_epitope.py
@@ -211,8 +211,6 @@ def _conditional_expression_imputation(self) -> List[PredictedEpitope]:
                 neoepitope_transformed = neoepitope
                 gene_expression = expression_annotator.get_gene_expression_annotation(
                     gene_name=neoepitope.gene, tcga_cohort=patient.tumor_type)
-                if not patient.is_rna_available and patient.tumor_type is not None and patient.tumor_type != "":
-                    neoepitope_transformed.rna_expression = gene_expression
                 neoepitope.imputed_gene_expression = gene_expression
                 neoepitopes_transformed.append(neoepitope_transformed)
             else:

diff --git a/neofox/published_features/hex/BLOSUM62.rda b/neofox/published_features/hex/BLOSUM62.rda
diff --git a/neofox/published_features/hex/hex.py b/neofox/published_features/hex/hex.py
@@ -18,33 +18,22 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.#
 from typing import List
-import os
 from neofox.model.neoantigen import Annotation, PredictedEpitope
 from neofox.model.factories import AnnotationFactory
+from neofox.published_features.hex.pyhex import PyHex
 from neofox.references.references import ReferenceFolder
 
 
 class Hex(object):
 
-    def __init__(self, references: ReferenceFolder, runner, configuration):
-        """
-        :type runner: neofox.helpers.runner.Runner
-        :type configuration: neofox.references.DependenciesConfiguration
-        """
-        self.runner = runner
-        self.configuration = configuration
+    def __init__(self, references: ReferenceFolder):
         self.iedb_fasta = references.get_iedb_fasta()
+        self.pyhex = PyHex(self.iedb_fasta)
 
     def apply_hex(self, mut_peptide):
         """this function calls hex tool. this tool analyses the neoepitope candidate sequence for molecular mimicry to viral epitopes
         """
-        my_path = os.path.abspath(os.path.dirname(__file__))
-        tool_path = os.path.join(my_path, "hex.R")
-        cmd = [self.configuration.rscript, tool_path, mut_peptide, self.iedb_fasta, my_path]
-        output, _ = self.runner.run_command(cmd)
-        if output == "":
-            output = None
-        return output
+        return self.pyhex.run(mut_peptide)
 
     def get_annotation(
             self, mutated_peptide_mhci: PredictedEpitope, mutated_peptide_mhcii: PredictedEpitope) -> List[Annotation]:

diff --git a/neofox/published_features/hex/pyhex.py b/neofox/published_features/hex/pyhex.py
@@ -0,0 +1,51 @@
+from math import ceil, floor
+
+from Bio import SeqIO
+from Bio.Align import substitution_matrices
+from Bio.Alphabet.IUPAC import ExtendedIUPACProtein
+
+
+class PyHex:
+
+    def __init__(self, iedb_fasta, magic_number=4):
+        self.iedb_sequences = self._read_fasta(iedb_fasta)
+        self.magic_number = magic_number
+        self.blosum = substitution_matrices.load("BLOSUM62")
+
+    @staticmethod
+    def _read_fasta(fasta_file):
+        sequences = []
+        # read fasta
+        with open(fasta_file, "r") as handle:
+            for record in SeqIO.parse(handle, "fasta"):
+                # include only records that do not contain non-standard amino acids
+                if not any([aa not in ExtendedIUPACProtein.letters for aa in record.seq]):
+                    sequences.append(record)
+        return sequences
+
+    def _align(self, sequence, mutated_sequence):
+        weights = self._get_sequence_weights(mutated_sequence)
+        score = sum([self.blosum[q, t] * w for q, t, w in zip(sequence, mutated_sequence, weights)])
+        return score
+
+    def _get_sequence_weights(self, mutated_sequence):
+        length_mutated_sequence = len(mutated_sequence)
+        mid_score = ceil(length_mutated_sequence / 2) * self.magic_number
+        weights = list(range(1, mid_score, self.magic_number))
+        weights.extend(reversed(weights[0:floor(length_mutated_sequence / 2)]))
+
+        top_floor = floor(length_mutated_sequence / 3)
+        weights[0:top_floor] = list(range(1, top_floor + 1))
+        tail = length_mutated_sequence - top_floor
+        weights[tail:length_mutated_sequence] = list(reversed(range(1, top_floor + 1)))
+
+        return weights
+
+    def run(self, mutated_sequence):
+        # excludes sequences that have a different length than the mutated sequence
+        sequences = [s for s in self.iedb_sequences if len(s.seq) == len(mutated_sequence)]
+        # align each of the sequences
+        alignment_scores = [self._align(s.seq, mutated_sequence) for s in sequences]
+        # gets the best score of all the alignments
+        best_score = max(alignment_scores)
+        return best_score
diff --git a/neofox/references/install_r_dependencies.R b/neofox/references/install_r_dependencies.R
@@ -1,9 +1,3 @@
-install.packages("lattice", repo="http://cran.rstudio.com/")
-install.packages("ggplot2", repo="http://cran.rstudio.com/")
 install.packages("caret", repo="http://cran.rstudio.com/")
 install.packages("Peptides", repo="http://cran.rstudio.com/")
 install.packages("doParallel", repo="http://cran.rstudio.com/")
-install.packages("gbm", repo="http://cran.rstudio.com/")
-if (!requireNamespace("BiocManager", quietly = TRUE))
-    install.packages("BiocManager")
-BiocManager::install("Biostrings")
diff --git a/neofox/tests/integration_tests/test_hex.py b/neofox/tests/integration_tests/test_hex.py
@@ -22,23 +22,32 @@
 from neofox.helpers.runner import Runner
 
 import neofox.tests.integration_tests.integration_test_tools as integration_test_tools
-
+from neofox.published_features.hex.pyhex import PyHex
 
 
 class TestHex(TestCase):
     def setUp(self):
         self.references, self.configuration = integration_test_tools.load_references()
         self.runner = Runner()
 
-
     def test_hex(self):
-        res = Hex(
-            runner=self.runner, configuration=self.configuration, references=self.references
-        ).apply_hex(
-            mut_peptide="FGLAIDVDD"
-        )
-        logger.info(res)
-        self.assertEqual(float(res), 148)
+        res = Hex(references=self.references).apply_hex(mut_peptide="FGLAIDVDD")
+        self.assertEqual(int(res), 148)
+
+    def test_pyhex(self):
+        pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta())
+        res = pyhex.run("FGLAIDVDD")
+        self.assertEqual(res, 148)
+
+    def test_comparison(self):
+        for i in range(10):
+            for k in range(9, 30):
+                peptide = integration_test_tools.get_random_kmer(k=k)
+                logger.info(peptide)
+                res = Hex(references=self.references).apply_hex(mut_peptide=peptide)
+                pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta())
+                res_pyhex = pyhex.run(peptide)
+                self.assertEqual(float(res), res_pyhex, "Peptide: {}".format(peptide))
 
 
 
diff --git a/neofox/tests/integration_tests/test_neofox.py b/neofox/tests/integration_tests/test_neofox.py
@@ -345,8 +345,6 @@ def test_neofox_without_mhc1(self):
 
     def test_gene_expression_imputation(self):
         neoantigens, patients = self._get_test_data()
-        for p in patients:
-            p.is_rna_available = False
         neofox = NeoFox(
             neoantigens=neoantigens,
             patients=patients,

diff --git a/neofox/tests/synthetic_data/factories.py b/neofox/tests/synthetic_data/factories.py
@@ -78,7 +78,6 @@ def patient(self) -> Patient:
             try:
                 patient = Patient(
                     identifier=self.generator.unique.uuid4(),
-                    is_rna_available=True,
                     tumor_type=self.random_elements(self.available_tumor_types, length=1)[0],
                     # by setting unique=True we enforce that all patients are heterozygous
                     mhc1=MhcFactory.build_mhc1_alleles(