-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAA_subs_quant_TMT.py
181 lines (152 loc) · 8.7 KB
/
AA_subs_quant_TMT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/bin/usr python
import os
import pandas as pd
from Bio import SeqIO
import numpy as np
from itertools import groupby
import re
from operator import itemgetter
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from glob import glob
from collections import Counter
import matplotlib as mpl
from matplotlib.lines import Line2D
import random
from copy import deepcopy
""" This script is for use with TMT-labeled datasets. It follows the AAS_detection and AAS_validation scripts. It takes in highly confidently identified peptides with AAS, and quantifies them. Output metrics include normalized abundances, and ratios of peptide with AAS to its base peptide, at both precursor and reporter-ion levels, stored in a dict that structured by unique MTP-BP pairs, as opposed to TMT sets.
This script also generates a dictionary of quant data organized by AAS type
MTP = mistranslated peptide (SAAP or substituted amino acid peptide)
BP = base peptide (RNA-tempalted peptide)
"""
# setting directories and reading in data
code_dir = '/Users/shiri/Google Drive/My Drive/Mistranslation Project/AA_subst_pipeline/'
proj_dir = '/Users/shiri/Google Drive/My Drive/Mistranslation Project/Papers/Cao_PDAC/'
aa_subs_dir = proj_dir+'AA_subs_pipeline/'
sample_map = pd.read_excel(aa_subs_dir+'sample_map.xlsx')
samples = ['S'+str(i) for i in list(set(sample_map['TMT plex']))]
MQ_TMT_dict = {'126':1,'127N':2, '127C':3, '128N':4, '128C':5, '129N':6, '129C':7, '130N':8, '130C':9, '131':10, '131C':11}
mtp_dict = pickle.load(open(aa_subs_dir+'Ion_validated_MTP_dict.p', 'rb'))
val_evidence_dict = pickle.load(open(aa_subs_dir+'Validation_search_evidence_dict.p', 'rb'))
print("initializing")
# get unique pairs of MTP-BP seqs across all TMT sets
unq_pairs = []
unq_pair_dicts = []
for s, s_dict in mtp_dict.items():
print(s)
for i,v in s_dict['aa subs'].items():
sub = v
mtp = s_dict['mistranslated sequence'][i]
bp = s_dict['DP Base Sequence'][i]
ev_idx = s_dict['idx_val_evidence'][i]
pair = [mtp, bp]
if (pair not in unq_pairs):
unq_pairs.append(pair)
unq_pair_dicts.append({'MTP':mtp, 'BP':bp, 'AAS':sub, 'TMT_list':[s], 'ev_idx_list':ev_idx})
unq_pair_dicts[len(unq_pair_dicts)-1]['TMT_evidence'] = {s:[ev_idx]}
else:
curr_dict_idx = [i for i,x in enumerate(unq_pair_dicts) if (x['MTP']==mtp) and (x['BP']==bp)][0]
unq_pair_dicts[curr_dict_idx]['TMT_list'].append(s)
[unq_pair_dicts[curr_dict_idx]['ev_idx_list'].append(x) for x in ev_idx]
if s in unq_pair_dicts[curr_dict_idx]['TMT_evidence'].keys():
[unq_pair_dicts[curr_dict_idx]['TMT_evidence'][s].append(x) for x in ev_idx]
else:
unq_pair_dicts[curr_dict_idx]['TMT_evidence'][s] = ev_idx
print(len(unq_pairs))
# initialize dictionary to store quant info
MTP_quant_dict = {}
for i,pair in enumerate(unq_pair_dicts):
curr_dict = {'MTP_seq':pair['MTP'], 'BP_seq':pair['BP'], 'sub_index':[i for i,x in enumerate(pair['MTP']) if pair['MTP'][i]!=pair['BP'][i]], 'aa_sub':pair['AAS'],'tmt_sets':list(set(pair['TMT_list'])), 'tmt_evidence':pair['TMT_evidence'],
'MTP_PrecInt':{x:np.nan for x in samples}, 'BP_PrecInt':{x:np.nan for x in samples},'Prec_ratio':{x:np.nan for x in samples},'Patient_dict':{}}
curr_dict['Patient_dict'] = {x:{'MTP_ReportInt':np.nan,'BP_ReportInt':np.nan, 'Reporter_ratio':np.nan} for x in list(set(sample_map['sample_name']))}
MTP_quant_dict[i] = curr_dict
#pickle.dump(MTP_quant_dict, open(aa_subs_dir+'MTP_quant_dict.p', 'wb'))
""" Functions for quantification"""
# normalize reporter ion intensities
def reporter_ion_normalize(sample_raw):
"""
Input: list of raw intensities from a single TMT channel
Output: median-normalized list of intensities for TMT channel
"""
sample_median = np.median(sample_raw)
sample_norm = [x/sample_median for x in sample_raw]
return(sample_norm)
# Quantify precursor intensities and ratio of MTP/BP precursor intensities
# precursor intensities encompass all samples in tmt set
def precursor_intensity_quant(k, tmt_set):
"""
Input: k=index of MTP in MTP_quant_dict, tmt_set
Output: precursor intensity of mtp, its bp, their ratio
Precursor intensities are log-transformed values of those reported in evidence.txt result file from validation MQ search.
Precursor intensities represent the sum of all precursors mapped to the peptide sequence (across multiple fractions, charge states)
"""
bp = MTP_quant_dict[k]['BP_seq']
mtp = MTP_quant_dict[k]['MTP_seq']
ev_df = val_evidence_dict[tmt_set]
bp_ev_df = ev_df.loc[ev_df['Sequence']==bp,:]
bp_prec_int = bp_ev_df['Intensity'].sum()
mtp_ev_df = ev_df.loc[ev_df['Sequence']==mtp,:]
mtp_prec_int = mtp_ev_df['Intensity'].sum()
prec_ratio = np.log10(mtp_prec_int/bp_prec_int)
return([mtp_prec_int, bp_prec_int, prec_ratio])
# distribute precursor intensity by ratios of reporter ions in tmt set. these values used for sample-level quantification of peptide
def distribute_prec_int(k, tmt_set):
"""
Input: k=index of MTP in MTP_quant_dict, tmt set
Outpt: list of reporter intensity values for TMT channels that reflect precursor intensity distributed by fraction of each reporter ion to total reporter signal
"""
ev_df = val_evidence_dict[tmt_set]
bp = MTP_quant_dict[k]['BP_seq']
bp_ev = ev_df.loc[ev_df['Sequence']==bp,:]
bp_prec_int = MTP_quant_dict[k]['BP_PrecInt'][tmt_set]
mtp = MTP_quant_dict[k]['MTP_seq']
mtp_ev = ev_df.loc[ev_df['Sequence']==mtp,:]
mtp_prec_int = MTP_quant_dict[k]['MTP_PrecInt'][tmt_set]
# here we do not use normalized values, because we want the differences in sample loading to be accounted for in distribution of precursor intensity
bp_reporter_int = bp_ev.loc[:,[x for x in bp_ev.columns if ('Reporter intensity corrected' in x) and ('_Normalized' not in x)]]
bp_reporter_int_sums = bp_reporter_int.sum(axis=0)
bp_total_reporter_int = bp_reporter_int_sums.sum()
bp_distributed = bp_prec_int * bp_reporter_int_sums / bp_total_reporter_int
mtp_reporter_int = mtp_ev.loc[:,[x for x in mtp_ev.columns if ('Reporter intensity corrected' in x) and ('_Normalized' not in x)]]
mtp_reporter_int_sums = mtp_reporter_int.sum(axis=0)
mtp_total_reporter_int = mtp_reporter_int_sums.sum()
mtp_distributed = mtp_prec_int * mtp_reporter_int_sums / mtp_total_reporter_int
return(bp_distributed, mtp_distributed)
"""Quantification of precursor and reporter ions"""
print('Quantifying peptides with AAS')
for s in samples:
print(s)
tmt_set=s
ev_df = val_evidence_dict[s]
for col in [x for x in ev_df if 'Reporter intensity corrected' in x]:
val_evidence_dict[s][col+'_Normalized'] = reporter_ion_normalize(ev_df[col].values)
for k,v in MTP_quant_dict.items():
if s in v['tmt_sets']:
mtp = v['MTP_seq']
bp = v['BP_seq']
bp_ev = ev_df.loc[ev_df['Sequence']==bp,:]
mtp_ev = ev_df.loc[ev_df['Sequence']==mtp,:]
mtp_prec_int, bp_prec_int, prec_ratio = precursor_intensity_quant(k, tmt_set)
v['BP_PrecInt'][tmt_set] = bp_prec_int
v['MTP_PrecInt'][tmt_set] = mtp_prec_int
v['Prec_ratio'][tmt_set] = prec_ratio
bp_distributed, mtp_distributed = distribute_prec_int(k, tmt_set)
# Now we will assign the distributed reporter ion values to the respective patients
s_patients = sample_map.loc[sample_map['sample_ID']==s, 'sample_name'].values
p_dict = v['Patient_dict']
for pat in s_patients:
mq_reporter = sample_map.loc[sample_map['sample_name']==pat, 'MQ'].values[0]
patient_bp_reporter = bp_distributed['Reporter intensity corrected '+str(mq_reporter)]
patient_mtp_reporter = mtp_distributed['Reporter intensity corrected ' +str(mq_reporter)]
patient_ratio = np.log10(patient_mtp_reporter/patient_bp_reporter)
patient_bp_reporter_norm = bp_ev['Reporter intensity corrected '+str(mq_reporter)+'_Normalized'].sum()
patient_mtp_reporter_norm = mtp_ev['Reporter intensity corrected '+str(mq_reporter)+'_Normalized'].sum()
p_dict[pat]['BP_ReportInt'] = patient_bp_reporter
p_dict[pat]['MTP_ReportInt'] = patient_mtp_reporter
p_dict[pat]['BP_ReportInt_Norm'] = patient_bp_reporter_norm
p_dict[pat]['MTP_ReportInt_Norm'] = patient_mtp_reporter_norm
p_dict[pat]['Reporter_ratio'] = patient_ratio
print('saving results to file')
pickle.dump(MTP_quant_dict, open(aa_subs_dir+'MTP_quant_dict.p', 'wb'))