Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
Merging master and development branch for version 1.1.2
  • Loading branch information
ambj committed Apr 3, 2017
2 parents 8fa0b54 + 2bc5da0 commit ad0edc9
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 38 deletions.
86 changes: 65 additions & 21 deletions MuPeXI.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def main(args):

# Read in data
print_ifnot_webserver('\nReading in data', input_.webserver)
expression = build_expression(input_.expression_file, input_.webserver)
expression = build_expression(input_.expression_file, input_.webserver, input_.expression_type)
proteome_reference, sequence_count = build_proteome_reference(paths.proteome_ref_file, input_.webserver)
genome_reference = build_genome_reference(paths.genome_ref_file, input_.webserver)
cancer_genes = build_cancer_genes(paths.cosmic_file, input_.webserver)
Expand All @@ -58,7 +58,7 @@ def main(args):
vcf_sorted_file = create_vep_compatible_vcf(vcf_file, input_.webserver, tmp_dir, input_.hg19)
allele_fractions = extract_allele_frequency(vcf_sorted_file, input_.webserver, variant_caller)
vep_file = run_vep(vcf_sorted_file, input_.webserver, tmp_dir, paths.vep_path, paths.vep_dir, input_.keep_temp, input_.prefix, input_.outdir)
vep_info, vep_counters, transcript_info = build_vep_info(vep_file, input_.webserver)
vep_info, vep_counters, transcript_info, protein_positions = build_vep_info(vep_file, input_.webserver)

end_time_vep = datetime.now()

Expand Down Expand Up @@ -94,10 +94,10 @@ def main(args):
# run netMHCpan
unique_mutant_peptide_count, peptide_file = write_peptide_file(peptide_info, tmp_dir, input_.webserver, input_.keep_temp, input_.prefix, input_.outdir)
netMHCpan_runtime, unique_alleles, netMHC_file = run_netMHCpan(input_.HLA_alleles, paths.netMHC, peptide_file, tmp_dir, input_.webserver, input_.keep_temp, input_.prefix, input_.outdir)
net_mhc = build_netMCH(netMHC_file, input_.webserver)
net_mhc = build_netMHC(netMHC_file, input_.webserver)

# write files
output_file = write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_genes, tmp_dir, input_.webserver, input_.print_mismatch, allele_fractions, input_.expression_type, transcript_info, reference_peptides)
output_file = write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_genes, tmp_dir, input_.webserver, input_.print_mismatch, allele_fractions, input_.expression_type, transcript_info, reference_peptides, proteome_reference, protein_positions)
log_file = write_log_file(sys.argv, peptide_length, sequence_count, reference_peptide_counters, vep_counters, peptide_counters, start_time_mupex, start_time_mupei, start_time, end_time_mupex, input_.HLA_alleles, netMHCpan_runtime, unique_mutant_peptide_count, unique_alleles, tmp_dir, input_.webserver)

# clean up
Expand Down Expand Up @@ -280,7 +280,7 @@ def build_genome_reference(genome_ref_file, webserver):



def build_expression(expression_file, webserver):
def build_expression(expression_file, webserver, expression_type):
if not expression_file == None :
print_ifnot_webserver('\tCreating expression file dictionary', webserver)
expression = defaultdict(dict) # empty dictionary
Expand All @@ -289,21 +289,32 @@ def build_expression(expression_file, webserver):
for line in f.readlines():
line = line.split()
if not line[0] == 'target_id':
check_expression_file_type(expression_type, line)
# save line information
if '.' in line[0] : # example: ENST00000415118.3
trans_id = line[0].split('.')[0]
ensembl_id = line[0].split('.')[0]
else: # example: ENST00000415118
trans_id = line[0]
ensembl_id = line[0]
mean = line[1]
# fill dictionary
expression[trans_id] = mean
expression[ensembl_id] = mean
else :
expression = None

return expression



def check_expression_file_type(expression_type, line):
if expression_type == 'gene' :
if 'ENST' in line[0]:
usage(); sys.exit('ERROR:\tEnsembl transcript id detected: {}\n\tWhile expression type option "gene" was used\n'.format(line[0]))
if expression_type == 'transcript' :
if 'ENSG' in line[0]:
usage(); sys.exit('ERROR:\tEnsembl gene id detected: {}\n\tWhile expression type option "transcript" was used\n'.format(line[0]))



def build_cancer_genes(cosmic_file, webserver):
if not cosmic_file == None:
print_ifnot_webserver('\tCreating cancer genes list', webserver)
Expand Down Expand Up @@ -482,6 +493,7 @@ def build_vep_info(vep_file, webserver):
# Creating named tuple
Mutation_Info = namedtuple('mutation_info', ['gene_id', 'trans_id', 'mutation_consequence','chr', 'pos', 'cdna_pos', 'prot_pos', 'prot_pos_to', 'aa_normal', 'aa_mut', 'codon_normal', 'codon_mut', 'alt_allele', 'symbol'])
transcript_info = defaultdict(dict)
protein_positions = defaultdict(lambda: defaultdict(dict))

non_used_mutation_count, misssense_variant_count, inframe_insertion_count, inframe_deletion_count, frameshift_variant_count = 0, 0, 0 ,0, 0

Expand Down Expand Up @@ -514,8 +526,11 @@ def build_vep_info(vep_file, webserver):
prot_pos, prot_pos_to = line[9].strip(), None
mutation_id = '{}_{}_{}'.format(chr_, genome_pos, alt_allele)

# set the default value of the key to be a list and append the transcript id
transcript_info.setdefault(mutation_id,[]).append(transID)
# Generate dict of dicts (dependent on both mutation ID and gene ID)
# then set the default value of the key to be a list and append the transcript id
transcript_info[mutation_id].setdefault(geneID,[]).append(transID)
# ad protein position
protein_positions[mutation_id][geneID][transID] = prot_pos
# append information from the line to the list of named tuples - fill tuple
vep_info.append(Mutation_Info(geneID, transID, mutation_consequence, chr_, genome_pos, cdna_pos, int(prot_pos), prot_pos_to, aa_normal, aa_mutation, codon_normal, codon_mut, alt_allele, symbol))

Expand Down Expand Up @@ -543,7 +558,7 @@ def build_vep_info(vep_file, webserver):
VEPCounters = namedtuple('vep_counters', ['non_used_mutation_count', 'misssense_variant_count', 'gene_count', 'transcript_Count', 'inframe_insertion_count', 'inframe_deletion_count', 'frameshift_variant_count'])
vep_counters = VEPCounters(non_used_mutation_count, misssense_variant_count, gene_count, transcript_Count, inframe_insertion_count, inframe_deletion_count, frameshift_variant_count)

return vep_info, vep_counters, transcript_info
return vep_info, vep_counters, transcript_info, protein_positions



Expand Down Expand Up @@ -1015,7 +1030,7 @@ def run_netMHCpan(HLA_alleles, netMHCpan3_0, peptide_file, tmp_dir, webserver, k



def build_netMCH(netMHC_file, webserver):
def build_netMHC(netMHC_file, webserver):
print_ifnot_webserver ('\tCreating NetMHCpan file dictionary', webserver)
net_mhc = defaultdict(dict) # empty dictionary
NetMHCInfo = namedtuple('NetMHCInfo', ['affinity', 'rank'])
Expand All @@ -1040,7 +1055,7 @@ def build_netMCH(netMHC_file, webserver):



def write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_genes, tmp_dir, webserver, print_mismatch, allele_fractions, expression_file_type, transcript_info, reference_peptides):
def write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_genes, tmp_dir, webserver, print_mismatch, allele_fractions, expression_file_type, transcript_info, reference_peptides, proteome_reference, protein_positions):
print_ifnot_webserver('\tWriting output file', webserver)
printed_ids = set()
row = 0
Expand Down Expand Up @@ -1094,8 +1109,12 @@ def write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_
mismatches = pep_match_info.mismatch if not pep_match_info == None else 1
# Print normal peptide or normal peptide only showing mis matched (...X...XX...)
print_normal_peptide = mismatch_snv_normal_peptide_conversion(normal_peptide, peptide_position, peptide_sequence_info.consequence, pep_match_info) if not print_mismatch == None else normal_peptide
# Extract transcript IDs including the peptide
transcript_ids = extract_transcript_ids(mutation_info.gene_id, transcript_info[mutation_id][mutation_info.gene_id], proteome_reference, normal_peptide, peptide_sequence_info.consequence)
# Extract protein position
protein_positions_extracted = extract_protein_position(transcript_ids, mutation_id, mutation_info.gene_id, protein_positions)
# Extract expression value if file is given
expression_sum = extract_expression_value(expression_file_type, expression, mutation_info.gene_id, webserver, transcript_info[mutation_id], printed_ids)
expression_sum = extract_expression_value(expression_file_type, expression, mutation_info.gene_id, webserver, transcript_info[mutation_id][mutation_info.gene_id], printed_ids)
# Extract cancer genes if file is given
if not cancer_genes == None:
cancer_gene = 'Yes' if mutation_info.symbol in cancer_genes else 'No'
Expand All @@ -1107,6 +1126,7 @@ def write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_
# calculate priority score
priority_score, scores = score_creation(normal_netmhc_info.rank, mutant_netmhc_info.rank, expression_sum, mutation_info.symbol, mismatches, allele_frequency, reference_peptides, mutant_petide)


# Add row to data frame
df.loc[row] = [
hla,
Expand All @@ -1117,14 +1137,14 @@ def write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_
mutant_netmhc_info.affinity,
mutant_netmhc_info.rank,
mutation_info.gene_id,
','.join(transcript_info[mutation_id]),
','.join(transcript_ids),
'{}/{}'.format(mutation_info.aa_normal, mutation_info.aa_mut),
'-' if allele_fractions == None else allele_frequency,
mismatches,
peptide_position,
mutation_info.chr,
mutation_info.pos,
mutation_info.prot_pos,
','.join(protein_positions_extracted),
peptide_sequence_info.consequence,
'-' if mutation_info.symbol == None else mutation_info.symbol,
cancer_gene,
Expand All @@ -1142,7 +1162,6 @@ def write_output_file(peptide_info, expression, net_mhc, unique_alleles, cancer_
df_sorted = df.sort(columns = ('priority_Score'), ascending=False)
df_sorted.loc[:,'priority_Score'] = df_sorted.priority_Score.multiply(100).round().astype(int)
df_sorted.loc[:,'Mismatches'] = df_sorted.Mismatches.astype(int)
df_sorted.loc[:,'Protein_position'] = df_sorted.Protein_position.astype(int)

# Print data frame to intermediate file
df_file = NamedTemporaryFile(delete = False, dir = tmp_dir)
Expand Down Expand Up @@ -1217,6 +1236,31 @@ def extract_expression_value(expression_file_type, expression, gene_id, webserve



def extract_transcript_ids(gene_id, trans_ids, proteome_reference, normal_peptide, consequence) :
peptide_trans_ids = []

# Find transcript id's including the entire peptide.
if not consequence == 'M' :
peptide_trans_ids = trans_ids
else:
for transID in trans_ids:
aa_sequence = proteome_reference[gene_id][transID]
if normal_peptide in aa_sequence :
peptide_trans_ids.append(transID)

return peptide_trans_ids



def extract_protein_position(transcript_ids, mutation_id, gene_id, protein_positions) :
protein_positions_extracted = []
for trans_id in transcript_ids :
protein_positions_extracted.append(protein_positions[mutation_id][gene_id][trans_id])

return protein_positions_extracted



def score_creation(rank_normal, rank_mutant, expression_sum, gene_symbol, mismatches, allele_frequency, reference_peptides, mutant_petide):
expression = 0 if expression_sum == None else float(expression_sum)
normal_match = 0 if mutant_petide in reference_peptides else 1
Expand Down Expand Up @@ -1390,7 +1434,7 @@ def webserver_print_output(webserver, www_tmp_dir, output, logfile, fasta_file_n
def usage():
usage = """
MuPeXI - Mutant Peptide Extractor and Informer
version 2016-08-11
version 2017-03-28
The current version of this program is available from
https://github.com/ambj/MuPeXI
Expand All @@ -1413,6 +1457,9 @@ def usage():
range (9-11) or comma separated (9,10,11).
-e, --expression-file Expression file, tab separated
((ENST*/ENSG*) \t mean)
-E, --expression-type Are the expression values in the expression transcript
files determined on transcript or gene level.
(transcript/gene)
Optional arguments affecting output files:
-o, --output-file Output file name. <VEP-file>.mupexi
Expand All @@ -1435,9 +1482,6 @@ def usage():
-g, --hg19 Perform liftover HG19 to GRCh38.
Requires local picard installation with paths
stated in the config file
-E, --expression-type Setting if the expression values in the expression transcript
files are determined on transcript or gene level.
(transcript/gene)
-h, --help Print this help information
REMEMBER to state references in the config.ini file
Expand Down
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@ MuPeXI.py extracts peptides of user-defined lengths around missense variant muta

## Dependencies

MuPeXI currently runs only on x86_64 machines running Linux or Darwin.

To run MuPeXI the following software and packages must be installed:

#### Required software:
* [Python 2.7](https://www.python.org/download/releases/2.7/)
* [netMHCpan 3.0](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCpan)
* [Varaint Effect Predictor (VEP)](http://www.ensembl.org/info/docs/tools/vep/index.html)
* [NetMHCpan 3.0](http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCpan)
* [Variant Effect Predictor (VEP)](http://www.ensembl.org/info/docs/tools/vep/index.html)

#### Python modules:
#### Required Python modules:
* [Biopython](http://biopython.org/wiki/Download)
* [numpy](http://www.numpy.org/)
* [pandas](http://pandas.pydata.org/)
Expand All @@ -48,7 +49,7 @@ These packages are included if downloading python through [Anaconda](https://www


1. Install all software listed above.
2. Install Biopython, pandas and nympy with the following command:
2. Install Biopython, pandas and numpy with the following command:

pip install biopython
pip install numpy
Expand All @@ -59,16 +60,16 @@ These packages are included if downloading python through [Anaconda](https://www

git clone https://github.com/ambj/MuPeXI.git

4. If not already on your system, reference files from GRCh38 should be obtained. These include cDNA, peptide and cosmic references, look under references in the user manual for a detailed description.
4. If not already on your system, reference files from GRCh38 should be obtained. These include cDNA, peptide and COSMIC references; see the References section in the user manual for a detailed description.

5. Fill the config.ini file
* State the full path to netMHCpan 3.0 and VEP.
* State the full path to the reference files:
5. Fill in the config.ini file
* Provide the full path to NetMHCpan 3.0 and VEP.
* Provide the full path to the reference files:
- cDNA
- peptide
- cosmic
- COSMIC

Additional peptide references and liftover paths can be stated in the config.ini file, see the user manual for detailed information. Instructions on how to fill the config.ini file is found within the file. `config.ini` is automatically detected in the same directory as `MuPeXI.py` script but can also be placed elsewhere and referred to by the `-c` option.
Additional peptide references and liftover paths can be provided in the config.ini file; see the user manual for detailed information. Instructions on how to fill in the config.ini file are found within the file. `config.ini` is automatically detected in the same directory as `MuPeXI.py` script but can also be placed elsewhere and referred to by the `-c` option.



Expand Down
15 changes: 8 additions & 7 deletions config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,30 @@
## You should ONLY change the path to your references, not the groups nor the names.

[netMHC]
# Please specify the netMHCpan binary path
# Please specify the NetMHCpan binary path
MHC = /your/path/to/netMHCpan-3.0/netMHCpan

[EnsemblVEP]
# Please specify binary path to the Ensembl Variant effect predictor (VEP), and the directory containing the cache database.
# Please specify path to the Ensembl Variant effect predictor (VEP) binary, and the
# directory containing the cache database.
# The VEPdir is the directory containing the cached references. (human/GRCh38)
VEP = /your/path/to/ensembl-tools-release-85/scripts/variant_effect_predictor/variant_effect_predictor.pl
VEPdir = /your/path/to/ensembl-tools-release-85/scripts/variant_effect_predictor

[References]
# Please specify the binary path to the references used
# NOTE: The individual peptide reference files (pep8 etc.) is generated by MuPeXI in the first
# run, and can be obtained and saved by keeping temporary files (-t option)
# Please specify the path to the reference files
# NOTE: The individual peptide reference files (pep8 etc.) are generated by MuPeXI in the
# first run, and can be obtained and saved by keeping temporary files (-t option)
cDNA = /your/path/to/references/human_GRCh38/cDNA/Homo_sapiens.GRCh38.85.cdna.all.fa
pep = /your/path/to/references/human_GRCh38/pep/Homo_sapiens.GRCh38.85.pep.all.fa
cosmic = /your/path/to/references/cosmic/Census_allWed_Feb_17_09-33-40_2016.tsv
pep8 = /your/path/to/references/reference_peptide_9.txt
pep8 = /your/path/to/references/reference_peptide_8.txt
pep9 = /your/path/to/references/reference_peptide_9.txt
pep10 = /your/path/to/references/reference_peptide_10.txt
pep11 = /your/path/to/references/reference_peptide_11.txt

[PeptideMatch]
# Please specify the path to the pepmatch binary file matching you operating system
# Please specify the path to the pepmatch binary file matching your operating system
PM = /your/path/to/MuPeXI/bin/pepmatch_db

[LiftOver]
Expand Down

0 comments on commit ad0edc9

Please sign in to comment.