Skip to content

Commit

Permalink
Merge branch 'develop' into 'master'
Browse files Browse the repository at this point in the history
Beta release v1.1.0b1

See merge request tron/addannot!231
  • Loading branch information
Pablo Riesgo Ferreiro committed Feb 8, 2023
2 parents d5452a6 + be15532 commit 6f9772a
Show file tree
Hide file tree
Showing 26 changed files with 104 additions and 124 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,5 @@ publish_package:
- python3 setup.py sdist bdist_wheel
- TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python -m twine upload --repository-url https://gitlab.rlp.net/api/v4/projects/${CI_PROJECT_ID}/packages/pypi dist/*
only:
# deploys only the develop branch to the private GitLab package repository; the master branch is published to PyPI
- develop
2 changes: 1 addition & 1 deletion docs/source/02_installation.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ The test data can be downloaded here:
* [test_patients.tsv](_static/test_patients.tsv)

````commandline
neofox --candidate-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder /path/to/outputfolder --with-table --with-json --output-prefix test
neofox --input-file /path/to/test_data.txt --patient-data /path/to/test_patients.txt --output-folder /path/to/outputfolder --output-prefix test
````

The resulting output files can be compared to the following test output files:
Expand Down
6 changes: 3 additions & 3 deletions docs/source/03_03_usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ neofox --input-file neoantigens_candidates.tsv \
[--output-prefix out_prefix] \
[--organism human|mouse] \
[--rank-mhci-threshold 2.0] \
[--rank-mhcii-threshold 4.0] \
[--rank-mhcii-threshold 5.0] \
[--num-cpus] \
[--config] \
[--patient-id] \
Expand All @@ -41,7 +41,7 @@ where:
- if all expression values related to a patient are NA or `rnaExpression` is not given in the input file but the tumor type has been provided in the patient file, imputed expression will be used for the relevant features

**EXAMPLE**
This is an example to call NeoFox with a candidate-file and obtaining the annotated neoantigen candidates in [tabular](03_02_output_data.md#tabular-format) format:
This is an example of calling NeoFox with a candidate file and obtaining the annotated neoantigen candidates in [tabular](03_02_output_data.md#tabular-format) format:

````commandline
neofox --input-file neoantigens_candidates.tsv \
Expand Down Expand Up @@ -287,7 +287,7 @@ patients_json = ModelConverter.objects2json(model_objects=patients)
```

- instead of creating neoantigen or patient models, tabular or json files containing this information can be passed:
The neoantigen candidates can be provided in [candidate-file format](03_01_input_data.md#tabular-file-format)
The neoantigen candidates can be provided in [candidate file format](03_01_input_data.md#tabular-file-format)

```python
model_file = "/path/to/neoantigen_candidates.tab"
Expand Down
1 change: 0 additions & 1 deletion docs/source/05_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ The metadata required for analysis for a given patient + its patient identifier
| Field | Type | Label | Description |
| ----- | ---- | ----- | ----------- |
| identifier | [string](#string) | | Patient identifier |
| isRnaAvailable | [bool](#bool) | | Is RNA expression available? |
| tumorType | [string](#string) | | Tumor entity in TCGA study abbreviation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations |
| mhc1 | [Mhc1](#neoantigen.Mhc1) | repeated | MHC I classic molecules |
| mhc2 | [Mhc2](#neoantigen.Mhc2) | repeated | MHC II classic molecules |
Expand Down
3 changes: 1 addition & 2 deletions neofox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.#


VERSION = "1.0.2"

VERSION = "1.1.0b1"

REFERENCE_FOLDER_ENV = "NEOFOX_REFERENCE_FOLDER"
NEOFOX_BLASTP_ENV = "NEOFOX_BLASTP"
Expand Down
2 changes: 1 addition & 1 deletion neofox/annotator/abstract_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(
self.priority_score_calculator = PriorityScore()
self.iedb_immunogenicity = IEDBimmunogenicity()
self.amplitude = Amplitude()
self.hex = Hex(runner=self.runner, configuration=configuration, references=references)
self.hex = Hex(references=references)

def get_additional_annotations_neoepitope_mhci(
self, epitope: PredictedEpitope, neoantigen: Neoantigen = None) -> PredictedEpitope:
Expand Down
20 changes: 0 additions & 20 deletions neofox/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,16 +213,6 @@ def _read_data(input_file, patients_data, mhc_database: MhcDatabase) -> Tuple[Li
else:
raise ValueError('Not supported input file extension: {}'.format(input_file))

patients_dict : Dict[str, Patient]
patients_dict = {p.identifier: p for p in patients}

for n in neoantigens:
patient = patients_dict.get(n.patient_identifier)
if not patient.is_rna_available:
# removes RNA vaf if indicated in patient that this information is no good
# iCam legacy
n.rna_variant_allele_frequency = None

return neoantigens, patients


Expand Down Expand Up @@ -386,16 +376,6 @@ def _read_data_epitopes(
else:
raise ValueError('Not supported input file extension: {}'.format(input_file))

patients_dict : Dict[str, Patient]
patients_dict = {p.identifier: p for p in patients}

for n in neoepitopes:
patient = patients_dict.get(n.patient_identifier)
if patient is not None and not patient.is_rna_available:
# removes RNA vaf if indicated in patient that this information is no good
# iCam legacy
n.rna_variant_allele_frequency = None

return neoepitopes, patients


Expand Down
1 change: 0 additions & 1 deletion neofox/model/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def parse_patients_file(patients_file: str, mhc_database: MhcDatabase) -> List[P
patient_dict = row.to_dict()
patient = PatientFactory.build_patient(
identifier=patient_dict.get("identifier"),
is_rna_available=patient_dict.get("isRnaAvailable", False),
tumor_type=patient_dict.get("tumorType"),
mhc_alleles=patient_dict.get("mhcIAlleles", []),
mhc2_alleles=patient_dict.get("mhcIIAlleles", []),
Expand Down
3 changes: 1 addition & 2 deletions neofox/model/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,10 @@ def build_neoepitope(mutated_peptide=None, wild_type_peptide=None, patient_ident

class PatientFactory(object):
@staticmethod
def build_patient(identifier, is_rna_available=False, tumor_type=None, mhc_alleles: List[str] = [],
def build_patient(identifier, tumor_type=None, mhc_alleles: List[str] = [],
mhc2_alleles: List[str] = [], mhc_database: MhcDatabase =None):
patient = Patient(
identifier=identifier,
is_rna_available=is_rna_available,
tumor_type=tumor_type,
mhc1=MhcFactory.build_mhc1_alleles(mhc_alleles, mhc_database),
mhc2=MhcFactory.build_mhc2_alleles(mhc2_alleles, mhc_database)
Expand Down
1 change: 0 additions & 1 deletion neofox/model/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ The metadata required for analysis for a given patient + its patient identifier
| Field | Type | Label | Description |
| ----- | ---- | ----- | ----------- |
| identifier | [string](#string) | | Patient identifier |
| isRnaAvailable | [bool](#bool) | | Is RNA expression available? |
| tumorType | [string](#string) | | Tumor entity in TCGA study abbreviation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations |
| mhc1 | [Mhc1](#neoantigen.Mhc1) | repeated | MHC I classic molecules |
| mhc2 | [Mhc2](#neoantigen.Mhc2) | repeated | MHC II classic molecules |
Expand Down
10 changes: 3 additions & 7 deletions neofox/model/neoantigen.proto
Original file line number Diff line number Diff line change
Expand Up @@ -78,21 +78,17 @@ message Patient {
*/
string identifier = 1;
/**
Is RNA expression available?
*/
bool isRnaAvailable = 2;
/**
Tumor entity in TCGA study abbreviation style as described here: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
*/
string tumorType = 3;
string tumorType = 2;
/**
MHC I classic molecules
*/
repeated Mhc1 mhc1 = 4;
repeated Mhc1 mhc1 = 3;
/**
MHC II classic molecules
*/
repeated Mhc2 mhc2 = 5;
repeated Mhc2 mhc2 = 4;
}

/**
Expand Down
8 changes: 3 additions & 5 deletions neofox/model/neoantigen.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 1 addition & 5 deletions neofox/neofox.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ def __init__(
for neoantigen in self.neoantigens:
expression_per_patient[neoantigen.patient_identifier].append(neoantigen.rna_expression)

for patient in self.patients:
self.patients[patient].is_rna_available = all(e is not None for e in
expression_per_patient[self.patients[patient].identifier])

# only performs the expression imputation for humans
if self.reference_folder.organism == ORGANISM_HOMO_SAPIENS:
# impute expression from TCGA, ONLY if isRNAavailable = False for given patient,
Expand All @@ -137,7 +133,7 @@ def _conditional_expression_imputation(self) -> List[Neoantigen]:
gene_expression = expression_annotator.get_gene_expression_annotation(
gene_name=neoantigen.gene, tcga_cohort=patient.tumor_type
)
if not patient.is_rna_available and patient.tumor_type is not None and patient.tumor_type != "":
if expression_value is None and patient.tumor_type is not None and patient.tumor_type != "":
expression_value = gene_expression
neoantigen_transformed.rna_expression = expression_value
neoantigen.imputed_gene_expression = gene_expression
Expand Down
2 changes: 0 additions & 2 deletions neofox/neofox_epitope.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,6 @@ def _conditional_expression_imputation(self) -> List[PredictedEpitope]:
neoepitope_transformed = neoepitope
gene_expression = expression_annotator.get_gene_expression_annotation(
gene_name=neoepitope.gene, tcga_cohort=patient.tumor_type)
if not patient.is_rna_available and patient.tumor_type is not None and patient.tumor_type != "":
neoepitope_transformed.rna_expression = gene_expression
neoepitope.imputed_gene_expression = gene_expression
neoepitopes_transformed.append(neoepitope_transformed)
else:
Expand Down
Binary file removed neofox/published_features/hex/BLOSUM62.rda
Binary file not shown.
19 changes: 4 additions & 15 deletions neofox/published_features/hex/hex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,33 +18,22 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.#
from typing import List
import os
from neofox.model.neoantigen import Annotation, PredictedEpitope
from neofox.model.factories import AnnotationFactory
from neofox.published_features.hex.pyhex import PyHex
from neofox.references.references import ReferenceFolder


class Hex(object):

def __init__(self, references: ReferenceFolder, runner, configuration):
"""
:type runner: neofox.helpers.runner.Runner
:type configuration: neofox.references.DependenciesConfiguration
"""
self.runner = runner
self.configuration = configuration
def __init__(self, references: ReferenceFolder):
self.iedb_fasta = references.get_iedb_fasta()
self.pyhex = PyHex(self.iedb_fasta)

def apply_hex(self, mut_peptide):
"""this function calls hex tool. this tool analyses the neoepitope candidate sequence for molecular mimicry to viral epitopes
"""
my_path = os.path.abspath(os.path.dirname(__file__))
tool_path = os.path.join(my_path, "hex.R")
cmd = [self.configuration.rscript, tool_path, mut_peptide, self.iedb_fasta, my_path]
output, _ = self.runner.run_command(cmd)
if output == "":
output = None
return output
return self.pyhex.run(mut_peptide)

def get_annotation(
self, mutated_peptide_mhci: PredictedEpitope, mutated_peptide_mhcii: PredictedEpitope) -> List[Annotation]:
Expand Down
51 changes: 51 additions & 0 deletions neofox/published_features/hex/pyhex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from math import ceil, floor

from Bio import SeqIO
from Bio.Align import substitution_matrices
from Bio.Alphabet.IUPAC import ExtendedIUPACProtein


class PyHex:
    """Score a peptide against IEDB sequences with a position-weighted BLOSUM62 sum.

    Each IEDB sequence of the same length as the query peptide is compared
    position by position (no gaps); every substitution score is multiplied by a
    positional weight that peaks in the middle of the peptide. The best score
    over all compared sequences is reported.
    """

    def __init__(self, iedb_fasta, magic_number=4):
        """Load the IEDB sequences and the BLOSUM62 matrix.

        :param iedb_fasta: path to the FASTA file holding the IEDB sequences
        :param magic_number: step between consecutive weights in the central
            part of the peptide (see ``_get_sequence_weights``)
        """
        self.iedb_sequences = self._read_fasta(iedb_fasta)
        self.magic_number = magic_number
        self.blosum = substitution_matrices.load("BLOSUM62")

    @staticmethod
    def _read_fasta(fasta_file):
        """Return the FASTA records made only of ExtendedIUPACProtein letters.

        :param fasta_file: path to a FASTA file
        :return: list of the records parsed by ``SeqIO`` that passed the filter
        """
        # NOTE(review): Bio.Alphabet was removed in Biopython 1.78 -- confirm the
        # pinned Biopython version still provides ExtendedIUPACProtein.
        allowed_letters = set(ExtendedIUPACProtein.letters)
        with open(fasta_file, "r") as handle:
            # records containing any letter outside the alphabet are dropped
            return [
                record
                for record in SeqIO.parse(handle, "fasta")
                if all(aa in allowed_letters for aa in record.seq)
            ]

    def _align(self, sequence, mutated_sequence, weights=None):
        """Return the weighted BLOSUM score of two sequences compared in lockstep.

        :param sequence: database sequence
        :param mutated_sequence: query peptide
        :param weights: optional precomputed positional weights for
            ``mutated_sequence``; computed here when omitted
        :return: sum over positions of ``blosum[q, t] * weight``
        """
        if weights is None:
            weights = self._get_sequence_weights(mutated_sequence)
        return sum(
            self.blosum[q, t] * w
            for q, t, w in zip(sequence, mutated_sequence, weights)
        )

    def _get_sequence_weights(self, mutated_sequence):
        """Build the list of positional weights for a peptide.

        The weights first form a symmetric ramp 1, 1 + magic_number, ... up to
        the middle position and back down; then the first and last third of the
        positions are overwritten with 1, 2, 3, ... and its mirror image.
        For a 9-mer and magic_number=4 this gives
        ``[1, 2, 3, 13, 17, 13, 3, 2, 1]``.

        :param mutated_sequence: query peptide (only its length is used)
        :return: list of integer weights, one per position
        """
        length = len(mutated_sequence)
        half_floor = floor(length / 2)

        # symmetric ramp centred on the middle position
        mid_score = ceil(length / 2) * self.magic_number
        weights = list(range(1, mid_score, self.magic_number))
        weights.extend(reversed(weights[0:half_floor]))

        # flatten both flanks (one third of the peptide each) to 1, 2, 3, ...
        top_floor = floor(length / 3)
        flank = list(range(1, top_floor + 1))
        weights[0:top_floor] = flank
        weights[length - top_floor:length] = flank[::-1]

        return weights

    def run(self, mutated_sequence):
        """Return the best weighted score of the peptide over the IEDB sequences.

        Only IEDB sequences with exactly the same length as ``mutated_sequence``
        are compared.

        :param mutated_sequence: query peptide
        :return: the highest score, or ``None`` when no IEDB sequence has the
            same length as the peptide (previously this raised ``ValueError``
            from ``max()`` on an empty list)
        """
        target_length = len(mutated_sequence)
        # the weights depend only on the query, so compute them once
        weights = self._get_sequence_weights(mutated_sequence)
        alignment_scores = [
            self._align(record.seq, mutated_sequence, weights)
            for record in self.iedb_sequences
            if len(record.seq) == target_length
        ]
        return max(alignment_scores, default=None)
6 changes: 0 additions & 6 deletions neofox/references/install_r_dependencies.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
install.packages("lattice", repo="http://cran.rstudio.com/")
install.packages("ggplot2", repo="http://cran.rstudio.com/")
install.packages("caret", repo="http://cran.rstudio.com/")
install.packages("Peptides", repo="http://cran.rstudio.com/")
install.packages("doParallel", repo="http://cran.rstudio.com/")
install.packages("gbm", repo="http://cran.rstudio.com/")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
27 changes: 18 additions & 9 deletions neofox/tests/integration_tests/test_hex.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,32 @@
from neofox.helpers.runner import Runner

import neofox.tests.integration_tests.integration_test_tools as integration_test_tools

from neofox.published_features.hex.pyhex import PyHex


class TestHex(TestCase):
def setUp(self):
self.references, self.configuration = integration_test_tools.load_references()
self.runner = Runner()


def test_hex(self):
res = Hex(
runner=self.runner, configuration=self.configuration, references=self.references
).apply_hex(
mut_peptide="FGLAIDVDD"
)
logger.info(res)
self.assertEqual(float(res), 148)
res = Hex(references=self.references).apply_hex(mut_peptide="FGLAIDVDD")
self.assertEqual(int(res), 148)

def test_pyhex(self):
pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta())
res = pyhex.run("FGLAIDVDD")
self.assertEqual(res, 148)

def test_comparison(self):
for i in range(10):
for k in range(9, 30):
peptide = integration_test_tools.get_random_kmer(k=k)
logger.info(peptide)
res = Hex(references=self.references).apply_hex(mut_peptide=peptide)
pyhex = PyHex(iedb_fasta=self.references.get_iedb_fasta())
res_pyhex = pyhex.run(peptide)
self.assertEqual(float(res), res_pyhex, "Peptide: {}".format(peptide))



2 changes: 0 additions & 2 deletions neofox/tests/integration_tests/test_neofox.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,8 +345,6 @@ def test_neofox_without_mhc1(self):

def test_gene_expression_imputation(self):
neoantigens, patients = self._get_test_data()
for p in patients:
p.is_rna_available = False
neofox = NeoFox(
neoantigens=neoantigens,
patients=patients,
Expand Down
1 change: 0 additions & 1 deletion neofox/tests/synthetic_data/factories.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def patient(self) -> Patient:
try:
patient = Patient(
identifier=self.generator.unique.uuid4(),
is_rna_available=True,
tumor_type=self.random_elements(self.available_tumor_types, length=1)[0],
# by setting unique=True we enforce that all patients are heterozygous
mhc1=MhcFactory.build_mhc1_alleles(
Expand Down
Loading

0 comments on commit 6f9772a

Please sign in to comment.