# For Python 2 uncomment these 2 lines
#from __future__ import division
#from __future__ import print_function
#
# run code:
# python Pipeline_part_2.py
#
# processes steps from correcting assignment (9) up to the relative_codon_fold_difference master table (12)
#
import os
import sys
import time
import gzip
import shutil
import pysam
import subprocess as sp
import numpy.random as rn
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use('Agg') # load backend - server safe
import matplotlib.pyplot as plt
import seaborn as sns
def cleanFile(File, Condition):
    # gzip a file and delete the un-gzipped version
    if Condition == "gzip":
        with open(File, 'rb') as FIn, gzip.open(File + ".gz", 'wb') as FOut:
            shutil.copyfileobj(FIn, FOut)
        os.remove(File)
    if Condition == "bgzip":
        # number of bgzip threads comes from the global Params dict
        BgZip = ["bgzip", "-@", Params['cpu'], File]
        BgZipIt = sp.Popen(BgZip, stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True)
        BgZipIt.wait()
    if Condition == "rm":
        os.remove(File)
def makeDirectory(Path):
    # Check if a folder exists at Path. If not, create it.
Split = Path.split("/")
if len(Split) == 1:
List = sp.Popen(["ls"], stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True)
else:
List = sp.Popen(["ls", "/".join(Split[:-1])], stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True)
if Path in List.communicate()[0].split("\n"):
pass
else:
Make = sp.Popen(["mkdir", Path])
Make.wait()
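# Note: makeDirectory() keeps the original shell-based approach; an
# equivalent pure-Python one-liner (which also creates intermediate
# folders) would be:
#
#   os.makedirs(Path, exist_ok=True)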
def parseParams(Path):
# Open the parameter file and read in parameters and files to operate on
File = open(Path)
SRAList = []
Names = []
ParamDict = {}
for iL in File:
if iL[0] == "#":
Split = iL[2:-1].split(":")
if len(Split) > 1:
ParamDict[Split[0].strip()] = Split[1].strip()
else:
if len(iL) != 1:
Split = iL[:-1].split("\t")
SRAList.append(Split[0])
Names.append(Split[1])
return ParamDict, SRAList, Names
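# parseParams() expects a plain-text file of the following shape (a
# hypothetical minimal example; keys used by this script include Mapping,
# OffsetFile, ReadLenMiN, ReadLenMaX, MetagSpancorr, Normalised, ...):
#
#   # Mapping: 5
#   # ReadLenMiN: 25
#   # ReadLenMaX: 35
#   # OffsetFile: offsets.txt
#   SRR0000001<TAB>WT_rep1
#   SRR0000002<TAB>Treated_rep1
#
# Lines starting with '# ' that contain a colon populate ParamDict;
# tab-separated lines give the SRA accession and the sample name.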
def yeastChr():
# Ordered yeast Chr list short names from ensembl
return ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI','Mito']
def corrAssignment(SRAList, Names, Params):
'''
    Reads the interval data (*_iv), fills in missing positions, applies the
    read-length-specific offset correction, and writes the result to a new
    *_assign_rpm.h5 file.
    Zero rows are kept only while shifting so positions stay correct; they
    are dropped again before writing. Keeping them permanently would be
    unreasonable for human data or other large genomes.
'''
makeDirectory("9-Assigncorr")
makeDirectory("9-Assigncorr/Reports")
# save_csv= False #save output to tab delim csv. *.h5 is saved anyway ##?
rlmin = int(Params["ReadLenMiN"])
rlmax = int(Params["ReadLenMaX"])
rlrange = str(rlmin) + "-" + str(rlmax) # read length range 4 filename
# todo: 3' mapping - not tested - Mapping goes to filename
Mapping = Params["Mapping"] # Mapping 5 or 3 prime end
df = pd.read_csv(Params["OffsetFile"], index_col=0, sep="\t")
for iN in Names:
fn_body = iN + "_" + Mapping + "-End_"
infile_idx_h5 = "6-AssignRaw/" + fn_body + rlrange + "_idx_iv" + ".h5"
infile_h5 = infile_idx_h5
storage = pd.HDFStore(infile_h5, "r")
readlen_and_offsets = {i: int(df.loc[i, iN]) for i in df[iN].dropna().index}
rl_l = list(readlen_and_offsets.keys())
rl_l.sort() # readlength with periodicity from the table
fn_body = fn_body + str(min(rl_l)) + "-" + str(max(rl_l))
outfile_hdf = "9-Assigncorr/" + fn_body + "_idx_iv" + "_assign_rpm.h5"
#outfile_hdf = "9-Assigncorr/" + fn_body + "_idx_assign_rpm.h5"
outp_h5 = pd.HDFStore(outfile_hdf, complevel=5, complib="zlib", mode="w")
LogFileName = "9-Assigncorr/Reports/" + fn_body + "_assign_corr_log.txt"
LOG_FILE = open(LogFileName, "wt")
# 2. for Forw & Rev strand AND for each chr separatedly
keys_list = [i for i in storage.keys() if "_rpm" in i] # get all keys
keys_for = [i for i in keys_list if "For_" in i]
keys_rev = [i for i in keys_list if "Rev_" in i]
# Process Log
report = "\nInput 1: {}\nRead length included: {}".format(Params["OffsetFile"], rl_l)
print(report); LOG_FILE.write(report + "\n")
report = "\nInput 2: {}\nrlmin: {}\nrlmax: {}\nName: {}\nMapping: {}".format(infile_idx_h5,
rlmin, rlmax, iN, Mapping)
print(report, "\n"); LOG_FILE.write(report + "\n")
# 3. get chr length
# todo: filename for Genome.fa as parameter
        # todo: 2G problem. Consider using another FastA reader. This one opens the file and reads its whole content
        # todo: Python 3.5 on OSX can't open files bigger than 2G - problems with human genomes
# todo: test it in py 3.6
genome = read_FASTA_dictionary("0-References/Genome.fa")
chr_length = {key: len(genome[key]) for key in genome.keys()}
# columns to include
columns = ["Chr", "Strand"] + [str(i) for i in rl_l] + ["sum"] # colum name #'s are str
read_length_to_use = [str(i) for i in rl_l]
# 5.Forward str each keys_for
for key in keys_for:
Chr = key.split("/")[-1]
# 5.1 read uncorrected data from h5 to df
df1 = storage[key]
# 5.2 reindex
new_index = list(range(chr_length[Chr]))
df1 = df1[columns].reindex(new_index)
# 5.3 apply offset correction
# 5' OK!
            # todo: 3' mapping correction
for rlen in [str(i) for i in rl_l]:
df1[rlen] = df1[rlen].shift(readlen_and_offsets[int(rlen)])
# 5.4
df1["Chr"] = Chr
df1["Srand"] = "+"
df1 = df1[read_length_to_use].fillna(0)
df1.loc[:, 'sum'] = df1.loc[:, read_length_to_use].sum(axis=1)
# drop lines 'sum' == 0 --> data interval
df1 = df1[df1['sum'] != 0]
# 5.5 write output to h5
outp_h5[key] = df1
# Process Log
report = "Forward done!" ; print(report, "\n"); LOG_FILE.write("\n" + report + "\n")
# 6. Reverse str each keys_rev
for key in keys_rev:
Chr = key.split("/")[-1]
# 6.1 read uncorrected data from h5 to df
df1 = storage[key]
# 6.2 reindex
new_index = list(range(chr_length[Chr]))
df1 = df1[columns].reindex(new_index)
# 6.3 apply offset correction
# 5' OK!
            # todo: 3' mapping correction
for rlen in [str(i) for i in rl_l]:
df1[rlen] = df1[rlen].shift(-readlen_and_offsets[int(rlen)])
# 6.4
df1["Chr"] = Chr
df1["Srand"] = "+"
df1 = df1[read_length_to_use].fillna(0)
df1.loc[:, 'sum'] = df1.loc[:, read_length_to_use].sum(axis=1)
# drop lines 'sum' == 0 --> data interval
df1 = df1[df1['sum'] != 0]
# 6.5 write output to h5
outp_h5[key] = df1
# Process Log
report = "Reverse done!"; print(report, "\n"); LOG_FILE.write(report + "\n")
        offsets = [readlen_and_offsets[i] for i in rl_l]
report = "\nOutput: {}\nRead length included: {}\nOffsets applied: {}\nName: {}\nMapping: {}".format(outfile_hdf, \
rl_l, offsets, iN, Mapping)
outp_h5.close()
storage.close()
print(report, "\n")
LOG_FILE.write(report + "\n")
LOG_FILE.close()
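# The offset correction above relies on pandas' shift() semantics: forward
# strand counts move downstream by `offset` positions, reverse strand counts
# move upstream (negative shift). A standalone toy example:
#
#   s = pd.Series([0, 0, 5, 0, 0])
#   s.shift(2)    # forward strand, offset 2: the count at position 2 moves to 4
#   s.shift(-2)   # reverse strand: the count at position 2 moves to 0
#
# shift() fills the vacated ends with NaN, which is why the read-length
# columns are fillna(0)-ed before 'sum' is computed.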
def metagTables2(SRAList, Names, Params):
"""From corrected assingments
data in h5 file with missing lines, i.e. lines without counts are missing
"""
# todo: it differs from a function metagTabels() in- and output files and Span
makeDirectory("10-corrMetagTbl")
makeDirectory("10-corrMetagTbl/Reports")
# todo: include UTR length req for metagenomic plots priority high
Span = int(Params["MetagSpancorr"]) # nr of nt before and after 5' position of start/stop codons
#time.sleep(0.1)
offsettbl = Params["OffsetFile"]
Mapping = Params["Mapping"] # 5 & 3
dataNorm = Params["Normalised"] # "raw" or "rpm"
LogFileName = "10-corrMetagTbl/Reports/" + "MetagTabl_" + Mapping + "-End_" + "corr_iv_log.txt"
LOG_FILE = open(LogFileName, "wt")
#time.sleep(0.1)
for iN in Names:
cf1 = cr1 = cf2 = cr2 = 0 # counters
report = "\nName: {}".format(iN)
LOG_FILE.write(report + "\n"); print(report)
        # todo: input file name construction relies on the offset table. Is that good or bad? What would be a more robust way?
df = pd.read_csv(offsettbl, index_col=0, sep="\t")
rl_l = [i for i in df[iN].dropna().index] # readlength with periodicity from the table
fn_body = iN + "_" + Mapping + "-End_" # filename body
fn_body+= str(min(rl_l)) + "-" + str(max(rl_l)) #
# file names
outf_start = "10-corrMetagTbl/" + fn_body + "_" + dataNorm + "_Start" + "_iv_Meta_Sum.txt"
outf_stop = "10-corrMetagTbl/" + fn_body + "_" + dataNorm + "_Stop" + "_iv_Meta_Sum.txt"
infile_h5 = "9-Assigncorr/" + fn_body + "_idx_iv_assign_rpm.h5"
# corrected assignment
hd5 = pd.HDFStore(infile_h5, "r")
# read from table and add 'sum'
columns = [str(i) for i in df[iN].dropna().index] + ['sum']
# Empty DataFrames
meta_start_dff = pd.DataFrame(index=range(0, 2 * Span + 1), columns=columns).fillna(0)
meta_start_dfr = pd.DataFrame(index=range(0, 2 * Span + 1), columns=columns).fillna(0)
meta_stop_dff = pd.DataFrame(index=range(0, 2 * Span + 1), columns=columns).fillna(0)
meta_stop_dfr = pd.DataFrame(index=range(0, 2 * Span + 1), columns=columns).fillna(0)
report = "Collecting data around genes Start & Stop - Span: {}".format(Span); print(report)
# Annotation
tabixfile = pysam.TabixFile("0-References/genome.gtf.gz", parser=pysam.asGTF())
# MetagThreshold is in raw counts
# Adjust Threshold if to RPM if Normalization is "rpm"
Threshold = int(Params["MetagThreshold"]) # has to be here # FILTER 3
if dataNorm == "rpm": # assumes BAM file is not deleted
BamName = "5-Aligned/" + iN + ".bam" # sorted and indexed BAM
rep = "Estimating normalisation factor from bam file {}".format(BamName); print(rep)
#todo: feed raw_metag_threshold_to_rpm() with normalisation_factors - reading BAM takes time
Threshold = raw_metag_threshold_to_rpm(BamName, Threshold, Params) # Adjusting FILTER 3 to rpm
else:
pass
report = "Metagene threshold {:.1f} {}".format(Threshold, dataNorm)
print(report); LOG_FILE.write(report + "\n")
for Chr in yeastChr():
key_f = "For_rpm" + "/" + Chr
key_r = "Rev_rpm" + "/" + Chr
# It is much faster to read data chr wise in df than access slices from hd5
# hd5[key].loc[gtf.start - Span:gtf.start + Span, :] # - is slow
df_f = hd5[key_f] # Forward
df_r = hd5[key_r] # Reverse
for gtf in tabixfile.fetch(reference=Chr):
if (gtf.feature == 'stop_codon') & (gtf.strand == '+'):
df = pd.DataFrame() # create empty df
# enough coverage?
if df_f.loc[gtf.start - Span:gtf.start + Span, "sum"].sum() < Threshold: #FILTER 3
continue
else:
df = df_f.loc[gtf.start - Span:gtf.start + Span, :]
index = range(gtf.start - Span, gtf.start + Span + 1)
                    # df_framing() fills missing positions in index with 0 - makes it interval safe
                    df = df_framing(df, index=index, columns=columns, strand=gtf.strand)  # expanded df, index reset
meta_stop_dff = meta_stop_dff + df # sum dataframes
cf2 += 1
# Stop codon in Rev strand
elif (gtf.feature == 'stop_codon') & (gtf.strand == '-'):
df = pd.DataFrame() # create empty df
if df_r.loc[gtf.end - Span - 1:gtf.end + Span - 1, "sum"].sum() < Threshold: #-1 rev str corr #FILTER 3
continue
else:
df = df_r.loc[gtf.end - Span - 1:gtf.end + Span - 1]
index = range(gtf.end - Span - 1, gtf.end + Span) # -1 correction
                    df = df_framing(df, index=index, columns=columns, strand=gtf.strand)  # expanded df, index reset
meta_stop_dfr = meta_stop_dfr + df # sum dataframes
cr2 += 1
                # Start codon in Forw strand
elif (gtf.feature == 'start_codon') & (gtf.strand == '+'):
df = pd.DataFrame() # create empty df
if df_f.loc[gtf.start - Span:gtf.start + Span, "sum"].sum() < Threshold: #FILTER 3
continue
else:
df = df_f.loc[gtf.start - Span:gtf.start + Span, :]
index = range(gtf.start - Span, gtf.start + Span + 1)
                    df = df_framing(df, index=index, columns=columns, strand=gtf.strand)  # expanded df, index reset
meta_start_dff = meta_start_dff + df # sum dataframes
cf1 += 1
# Start codon in Rev strand
elif (gtf.feature == 'start_codon') & (gtf.strand == '-'):
df = pd.DataFrame() # create empty df
if df_r.loc[gtf.end - Span - 1:gtf.end + Span - 1, "sum"].sum() < Threshold: #FILTER 3
continue
else:
df = df_r.loc[gtf.end - Span - 1:gtf.end + Span - 1] # -1 correction for rev strand
index = range(gtf.end - Span - 1, gtf.end + Span) # -1 correction for rev strand
                    df = df_framing(df, index=index, columns=columns, strand=gtf.strand)  # expanded df, index reset
meta_start_dfr = meta_start_dfr + df # sum dataframes
cr1 += 1
else:
pass
print("Summing up ...")
LOG_FILE.write("Summing up ...\n")
# summing up
meta_start_sum = meta_start_dff + meta_start_dfr
meta_stop_sum = meta_stop_dff + meta_stop_dfr
# saving to file
report = "Around START: {:,} included \t{}".format(cf1 + cr1, outf_start);
LOG_FILE.write(report + "\n");
print(report)
# print("Sum of saved table: {}".format(int(meta_start_sum["sum"].sum())))
        meta_start_sum['rel_Pos'] = list(range(-Span, Span + 1))
        meta_start_sum.to_csv(outf_start, sep='\t', header=True, index=True)
report = "Around STOP: {:,} included \t{}".format(cf2 + cr2, outf_stop)
LOG_FILE.write(report + "\n");
print(report)
meta_stop_sum['rel_Pos'] = list(range(-Span, Span + 1))
meta_stop_sum.to_csv(outf_stop, sep='\t', header=True, index=True)
hd5.close()
LOG_FILE.close()
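# df_framing() is defined elsewhere in this file. Based on how it is called
# above, a minimal sketch of the assumed behaviour (an illustration, not the
# author's implementation): reindex the slice so every position in `index`
# exists, fill missing rows with 0, flip the rows for '-' strand genes so all
# frames read 5'->3', and reset the index so frames from different genes can
# be summed position-wise.
def df_framing_sketch(df, index, columns, strand="+"):
    df = df.reindex(index)[columns].fillna(0)  # interval-safe: missing positions become 0
    if strand == "-":
        df = df.iloc[::-1]  # flip to 5'->3' orientation
    return df.reset_index(drop=True)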
def metagPlotsCorrpdf(SRAList, Names, Params):
#
#
    # ---------- Output graphics quality settings -------------
    #
    # modify according to your needs and system setup
    # for OSX users the safest option is to uncomment everything below
#
#
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')
    # Using LaTeX to set Helvetica as the default font
# from matplotlib import rc
# rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
# rc('text', usetex=True)
# -------------------------------------------------------
#
# using pandas, matplotlib, seaborn, numpy
makeDirectory("10-corrMetagTbl/MetagPlot")
sns.set_style("white") # seaborn_aesthetic
sns.set_context("paper") # seaborn_aesthetic
Span = int(Params["MetagSpan"])
Mapping = Params["Mapping"]
dataNorm = Params["Normalised"] # Mapping 5 or 3 prime end
# colors for plot
colors = {'25': 'fuchsia', '26': 'blueviolet', '27': 'darkblue', '28': 'b', '29': 'r',
'30': 'salmon', '31': 'orange', '32': 'olive', '33': 'g', '34': 'tan',
'35': 'y', 'sum': 'brown'}
for iN in Names:
ofdf = pd.read_csv(Params['OffsetFile'], index_col=0, sep="\t")
rl_l = [i for i in ofdf[iN].dropna().index] # readlength with periodicity from the table
rlrange = str(min(rl_l)) + "-" + str(max(rl_l))
readLen_l = [str(i) for i in rl_l] + ['sum'] # numbers to str
for iX in ["Start", "Stop"]:
infile = "10-corrMetagTbl/" + iN + "_" + Mapping + "-End" + "_" + rlrange + \
"_" + dataNorm + "_" + iX + "_iv_Meta_Sum.txt"
outfig = "10-corrMetagTbl/MetagPlot/" + iN + "-" + Mapping + "-End" + "-" + rlrange + \
"-" + dataNorm + "-" + iX + "-iv.pdf"
            outfig_title = "{} {} {}' mapping".format(iN.replace('_', '-'), iX, Mapping)
legend_location = 'upper right' if iX == 'Stop' else 'upper left'
if os.path.isfile(infile): # infile exits
w = 8 # figure width
h = 1.2 * len(readLen_l) # figure height
fig, axes = plt.subplots(nrows=len(readLen_l), figsize=(w, h))
fig.suptitle(outfig_title, y=0.9, fontsize=12)
df = pd.read_csv(infile, index_col=0, sep='\t')
df.set_index("rel_Pos", inplace=True)
# Adjust plot for mapping and Start/Stop
if iX == "Start":
df = dfTrimmiX5(df, Span, iX, inside_gene=39, outside_gene=18)
elif iX == "Stop":
df = dfTrimmiX5(df, Span, iX, inside_gene=39, outside_gene=18)
else:
pass
for i, readLen in enumerate(readLen_l):
a = 0.6
readLen = str(readLen)
colors = colorsCheck(colors, readLen)
x = df.index
y = list(df.loc[:, readLen])
axes[i].bar(x, y, color=colors[readLen], alpha=a)
axes[i].legend([readLen], loc=legend_location)
# colors for guide lines; adjust for beg and end for 5pr
b, e = (df.index.min(), df.index.max())
# todo: getting axvline colors can be function
                    if Mapping == '5':
                        for k in range(b, e + 1, 3):
                            # guide line after each 3 nt; highlight -12 (5' offset) and 0 (start/stop)
                            color, a = 'gray', 0.2
                            if k == -12:
                                color, a = 'g', 0.5
                            elif k == 0:
                                color, a = 'r', 0.4
                            axes[i].axvline(x=k, linewidth=1, alpha=a, color=color)
                    elif Mapping == '3':
                        for k in range(b, e + 1, 3):
                            # guide line after each 3 nt; highlight +12 (3' offset) and 0 (start/stop)
                            color, a = 'gray', 0.2
                            if k == 12:
                                color, a = 'g', 0.5
                            elif k == 0:
                                color, a = 'r', 0.4
                            axes[i].axvline(x=k, linewidth=1, alpha=a, color=color)
                    else:
                        # any other type of mapping
                        pass
axes[i].set_ylabel(Params["Normalised"])
sns.despine() # seaborn_aesthetic
fig.savefig(outfig, format='pdf', dpi=300, bbox_inches='tight')
print("{}".format(outfig))
else:
print("Missing InFile -> {}".format(infile))
print("\n")
def codonTablesA(SRAList, Names, Params):
makeDirectory("11-codonTables")
makeDirectory("11-codonTables/Reports")
# Input genome and annotation
genome_file = "0-References/ScerR64-1-1.85.fa"
annotation = "0-References/genome.gtf.gz"
logfile_name = "11-codonTables/Reports/codonTables_A.log"
LOGFILE = open(logfile_name, "w")
# genome sequence
genome = {info: seq for info, seq in read_FASTA(genome_file, SplitHeader=False)}
# Get number of exons for each protein_coding gene
tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
missing_keys = [] # check
Mapping = Params["Mapping"] # Mapping 5 or 3 prime end
# Generates dic {gene_id: number_of_exons}
gene_ids_exon_No = {}
# todo: include multi-exon genes
for ref in yeastChr():
for gtf in tabixfile.fetch(reference=ref, start=0, end=None):
if gtf.feature == 'CDS':
                if gtf.gene_id not in gene_ids_exon_No:  # gene not seen yet
                    gene_ids_exon_No[gtf.gene_id] = int(gtf.exon_number)  # pysam gives the exon number as str
                # keep the largest exon number seen so far
                if (gtf.gene_id in gene_ids_exon_No) and (gene_ids_exon_No[gtf.gene_id] < int(gtf.exon_number)):
                    gene_ids_exon_No[gtf.gene_id] = int(gtf.exon_number)
# list of sample names
ListA = [x.strip(' ') for x in Params["GroupA"].split(sep=";")]
for iN in ListA:
# read again to avoid half-way iterations
tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
fn_body = iN + "_" + Mapping + "-End_"
# read length from offsets file
rl_l = readlen_list_from_offset(Params["OffsetFile"], iN)
fn_body = fn_body + str(min(rl_l)) + "-" + str(max(rl_l))
# read data in
infile_h5 = "9-Assigncorr/" + fn_body + "_idx_iv_assign_rpm.h5"
storage = pd.HDFStore(infile_h5, "r")
# output file
outfile_name = "11-codonTables/" + fn_body + "_codon_table_A.txt"
# check file
if os.path.isfile(outfile_name):
message = "File exists {}\n Skipping !".format(outfile_name)
print(message)
LOGFILE.write(message + "\n")
continue
else:
outfile = open(outfile_name, 'w')
        # normalisation for library depth
        # a nonzero value here skips the (slow) estimation from the BAM file
        norm_factor = 0  # if 0, estimate from the BAM file
if norm_factor == 0:
BamFile = "5-Aligned/" + iN + ".bam"
print("\n\tProcessing ... {} \n".format(BamFile))
norm_factor = normalisation_factor_from_bam(BamFile, Params)
else: # take a given value
pass
# Thresholds
GeneRawMeanThr = float(Params["GeneRawMeanThr"]) # FILTER 1
CodonRawMeanThr = float(Params["CodonRawMeanThr"])# FILTER 2
GeneRpmMeanThr = GeneRawMeanThr / norm_factor # FILTER 1
CodonRpmMeanThr = CodonRawMeanThr / norm_factor # FILTER 2
#
report = "\n{}\nNormalisation factor: {}\nGeneRpmMeanThr: {}\nCodonRpmMeanThr: {}".format(iN,norm_factor,
GeneRpmMeanThr,CodonRpmMeanThr)
LOGFILE.write(report+"\n"); print(report + "\n")
###############################
# Generate codon based tables #
###############################
# Change header if changing output lines for gene or codon
header_gene = 'Chr\tExon\tNo_of_exons\tStrand\tGene_id_treated\tgene_1_leftmost\tgene_1_rigthmost\tu_rpm_1'
header_codon = 'Position_leftmost_1\tcodon_raw_1\tcodon_rpm_1\tcodon_relative_rpm_1\tcodon_P_1\tnorm_factor_1'
header_codon += '\tamplification_factor_1'
header = header_gene + "\t" + header_codon + "\n"
outfile.write(header)
for ref in yeastChr():
report = "Chr {:5s} ... ".format(ref); print(report); LOGFILE.write(report+"\n")
key_f = "For_rpm/" + ref
key_r = "Rev_rpm/" + ref
data_f = storage[key_f]
data_r = storage[key_r]
for gtf in tabixfile.fetch(reference=ref, start=0, end=None):
if gtf.feature == 'CDS':
#todo: introduce multiexon genes
                    if gene_ids_exon_No[gtf.gene_id] > 1:  # skip genes with 2 or more exons
continue
left_most, right_most = gtf.start, gtf.end # STOP excluded
#
#left_most, right_most = left_most, right_most - 1 # + strand correction for df.loc[i:j] []-slicing
                    # Retrieve dataframe for the gene; todo: check correctness on "-" strand
                    gene_df = data_f.loc[left_most:right_most-1, ] if gtf.strand == "+" else data_r.loc[left_most:right_most-1, ]
if gene_df['sum'].mean() < GeneRpmMeanThr: # FILTER 1
continue
# --------------- #
# Forward strand
if gtf.strand == '+':
# get normalization factors for regions
collection = gene_region_normalisation_factor(gene_df, left_most, right_most,
column='sum', method='mean')
norm_factors_coll = tuple([x if x > 0 else np.nan for x in collection]) # repl 0 with nan
u_rpm = gene_df["sum"].mean()
                        # todo: consider being consistent and taking only the CDS without the STOP codon? left_most+3
line_for_gene = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(gtf.contig, gtf.exon_number,
str(gene_ids_exon_No[gtf.gene_id]),
gtf.strand, gtf.gene_id,
left_most, right_most + 3, u_rpm)
# For each coding codon on gene, i. e. excluding STOP codon
for iPl in range(left_most, right_most, 3):
                            # skip codons with low coverage
                            if gene_df.loc[iPl:iPl + 2, 'sum'].mean() < CodonRpmMeanThr:  # FILTER 2
continue
gene_norm_factor = select_gene_region_norm_factor(norm_factors_coll, iPl, left_most,
right_most, strand=gtf.strand)
# how much rpms were multiplied to get codon_relative_rpm
amplification_factor = 1 / (norm_factor * gene_norm_factor)
codon_rpm = gene_df.loc[iPl:iPl + 2, 'sum'].sum()
codon_raw = int(codon_rpm * norm_factor) # pseudo raw because input is rpm's original raw is gone
codon_relative_rpm = codon_rpm / gene_norm_factor
codon_P = genome[ref][iPl:iPl + 3]
line_for_codon = "{}\t{}\t{}\t{}\t{}\t{}\t{:.3f}".format(int(iPl), codon_raw,
codon_rpm, codon_relative_rpm,
codon_P, gene_norm_factor,
amplification_factor)
line_to_add = line_for_gene + "\t" + line_for_codon + "\n"
outfile.write(line_to_add)
# --------------
# Reverse strand
if gtf.strand == '-':
# get normalization factors for regions
collection = gene_region_normalisation_factor(gene_df, left_most, right_most,
column='sum', method='mean')
norm_factors_coll = tuple([x if x > 0 else np.nan for x in collection]) # repl 0 with nan
u_rpm = gene_df["sum"].mean()
                        # todo: consider being consistent and taking only the CDS without the STOP codon? left_most-3
line_for_gene = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(gtf.contig, gtf.exon_number,
str(gene_ids_exon_No[gtf.gene_id]),
gtf.strand, gtf.gene_id,
left_most - 3, right_most, u_rpm)
# For each coding codon on gene, i. e. excluding STOP codon
for iPl in range(left_most, right_most, 3):
                            # skip codons with low coverage
                            if gene_df.loc[iPl:iPl + 2, 'sum'].mean() < CodonRpmMeanThr:  # FILTER 2
continue
gene_norm_factor = select_gene_region_norm_factor(norm_factors_coll, iPl, left_most,
right_most, strand=gtf.strand)
amplification_factor = 1 / (norm_factor * gene_norm_factor)
codon_rpm = gene_df.loc[iPl:iPl + 2, 'sum'].sum()
codon_raw = int(codon_rpm * norm_factor)
codon_relative_rpm = codon_rpm / gene_norm_factor
codon_P = genome[ref][iPl:iPl + 3]
codon_P = revcompl(codon_P)
line_for_codon = "{}\t{}\t{}\t{}\t{}\t{}\t{:.3f}".format(int(iPl), codon_raw,
codon_rpm, codon_relative_rpm,
codon_P,
gene_norm_factor,
amplification_factor)
line_to_add = line_for_gene + "\t" + line_for_codon + "\n"
outfile.write(line_to_add)
report = "\n otufile: {}\n\t{} done!\n".format(outfile_name, iN)
print(report)
LOGFILE.write(report)
outfile.close()
LOGFILE.close()
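# normalisation_factor_from_bam() is defined elsewhere in this file. The raw
# thresholds above are converted to rpm by dividing with it, so it has to be
# a reads-per-million factor. A minimal sketch consistent with that
# arithmetic (an assumption, not the author's exact code; needs a sorted,
# indexed BAM):
def normalisation_factor_sketch(bam_path):
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return bam.mapped / 1e6  # mapped-read count taken from the BAM index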
def codonTablesB(SRAList, Names, Params):
makeDirectory("11-codonTables")
makeDirectory("11-codonTables/Reports")
# Input genome and annotation
genome_file = "0-References/ScerR64-1-1.85.fa"
annotation = "0-References/genome.gtf.gz"
logfile_name = "11-codonTables/Reports/codonTables_B.log"
LOGFILE = open(logfile_name, "w")
# genome sequence
genome = {info: seq for info, seq in read_FASTA(genome_file, SplitHeader=False)}
# Get number of exons for each protein_coding gene
tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
missing_keys = [] # check
Mapping = Params["Mapping"] # Mapping 5 or 3 prime end
# todo: include multi-exon genes
# Generates dic {gene_id: number_of_exons}
gene_ids_exon_No = {}
for ref in yeastChr():
for gtf in tabixfile.fetch(reference=ref, start=0, end=None):
if gtf.feature == 'CDS':
                if gtf.gene_id not in gene_ids_exon_No:  # gene not seen yet
                    gene_ids_exon_No[gtf.gene_id] = int(gtf.exon_number)  # pysam gives the exon number as str
                # keep the largest exon number seen so far
                if (gtf.gene_id in gene_ids_exon_No) and (gene_ids_exon_No[gtf.gene_id] < int(gtf.exon_number)):
                    gene_ids_exon_No[gtf.gene_id] = int(gtf.exon_number)
# list of sample names
ListB = [x.strip(' ') for x in Params["GroupB"].split(sep=";")]
for iN in ListB:
# read again to avoid half-way iterations
tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
# nucleotides before P-site
nt = int(Params["CodonsBeforePSite"]) * 3
# read length from offsets file
rl_l = readlen_list_from_offset(Params["OffsetFile"], iN)
# file name body
fn_body = iN + "_" + Mapping + "-End_"
fn_body = fn_body + str(min(rl_l)) + "-" + str(max(rl_l))
outfile_name = "11-codonTables/" + fn_body + "_codon_table_B.txt"
if os.path.isfile(outfile_name):
message = "File exists {}\n Skipping !".format(outfile_name)
print(message)
LOGFILE.write(message + "\n")
continue
else:
outfile = open(outfile_name, 'w')
# read data in
infile_h5 = "9-Assigncorr/" + fn_body + "_idx_iv_assign_rpm.h5"
storage = pd.HDFStore(infile_h5, "r")
        # codon counters - not used yet
cP = 0 # plus strand
cM = 0 # minus strand
        # normalisation for library depth
        # a nonzero value here skips the (slow) estimation from the BAM file
        norm_factor = 0  # if 0, estimate from the BAM file
if norm_factor == 0:
BamFile = "5-Aligned/" + iN + ".bam"
print("\n\tProcessing ... {} \n".format(BamFile))
norm_factor = normalisation_factor_from_bam(BamFile, Params)
else: # take a given value
pass
# Thresholds
GeneRawMeanThr = float(Params["GeneRawMeanThr"]) # FILTER 1
CodonRawMeanThr = float(Params["CodonRawMeanThr"]) # FILTER 2
GeneRpmMeanThr = GeneRawMeanThr / norm_factor # FILTER 1
CodonRpmMeanThr = CodonRawMeanThr / norm_factor # FILTER 2
report = "\n{}\nNormalisation factor: {}\nGeneRpmMeanThr: {}\nCodonRpmMeanThr: {}".format(iN, norm_factor,
GeneRpmMeanThr,
CodonRpmMeanThr)
LOGFILE.write(report + "\n"); print(report + "\n")
# Change header if changing output lines for gene or codon
header_gene = 'Chr\tExon\tNo_of_exons\tStrand\tGene_id_WT\ttranscript_name\tgene_2_leftmost\tgene_2_rigthmost\tu_rpm_2'
header_codon = 'Position_leftmost_2\tcodon_raw_2\tcodon_rpm_2\tcodon_relative_rpm_2\tcodon_E\tcodon_P\tcodon_A\t'
header_codon = header_codon + 'sequence\treverse_complement\tpeptide\tnorm_factor_2\tamplification_factor_2'
header = header_gene + "\t" + header_codon + "\n"
outfile.write(header)
for ref in yeastChr():
report = "Chr {:5s} ... ".format(ref); print(report); LOGFILE.write(report + "\n")
key_f = "For_rpm/" + ref
key_r = "Rev_rpm/" + ref
data_f = storage[key_f]
data_r = storage[key_r]
for gtf in tabixfile.fetch(reference=ref, start=0, end=None):
if gtf.feature == 'CDS':
if gene_ids_exon_No[gtf.gene_id] > 1: # Skip genes containing 2 or more exons
continue
# --------------- #
# Forward strand
if gtf.strand == '+':
# correct for '+' strand df.loc[i:j] i,j both included
left_most, right_most = gtf.start, gtf.end-1
gene_df = data_f.loc[left_most:right_most, ] # Retrieve dataframe for a gene including stop
# continue if gene have few RPF's (rpm)
if gene_df['sum'].mean() < GeneRpmMeanThr: # FILTER 1
continue
# get normalization factors for regions
collection = gene_region_normalisation_factor(gene_df, left_most, right_most,
column='sum', method='mean')
# replace 0 with NaNs to avoid div by 0
norm_factors_coll = tuple([x if x > 0 else np.nan for x in collection])
# --------------- #
# Line_for_gene #
                        #### Chr; Exon; No_of_exons; Strand; Gene_id; left_most; right_most
# Changing here update codonBased_df columns !!!
u_rpm = gene_df["sum"].mean()
line_for_gene = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(gtf.contig, gtf.exon_number,
str(gene_ids_exon_No[gtf.gene_id]),
gtf.strand, gtf.gene_id,
gtf.transcript_name,
left_most, right_most+3, u_rpm)
# -------------------------------------------------------- #
# For each coding codon on gene, i. e. excluding STOP codon
for iPl in range(left_most, right_most, 3):
                            # skip codons with low coverage
                            if gene_df.loc[iPl:iPl + 2, 'sum'].mean() < CodonRpmMeanThr:  # FILTER 2
continue
cP += 1 # counting codons
gene_norm_factor = select_gene_region_norm_factor(norm_factors_coll, iPl,
left_most, right_most)
amplification_factor = 1 / (norm_factor * gene_norm_factor)
# calculate
codon_rpm = gene_df.loc[iPl:iPl + 2, 'sum'].sum()
codon_raw = int(codon_rpm * norm_factor)
codon_relative_rpm = codon_rpm/gene_norm_factor
# Special case when E-site is empty
codon_E = np.nan if iPl == left_most else genome[ref][iPl - 3:iPl]
codon_P = genome[ref][iPl:iPl + 3] # P site
codon_A = genome[ref][iPl + 3:iPl + 6] # A site
                            pos_30_codons_back = max(iPl - 30, left_most)  # 30 nt (10 codons) upstream, clipped at gene start
sequence = genome[ref][pos_30_codons_back:iPl + 3 + nt] # nt nucleotides before P site
revcomp = np.nan # + strand reverse complement is empty
translation = translate_DNA(sequence)
# ---------------- #
# Line_for_codon #
line_for_codon = "{}\t{}\t{}\t{}\t".format(int(iPl), codon_raw, codon_rpm,
codon_relative_rpm)
line_for_codon += "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(codon_E, codon_P, codon_A, sequence,
revcomp, translation, gene_norm_factor)
line_for_codon += "\t{:.3f}".format(amplification_factor)
line_to_add = line_for_gene + "\t" + line_for_codon + "\n"
outfile.write(line_to_add)
                    if gtf.strand == '-':
                        # left_most, right_most = left_most, right_most+1
                        # no correction for '-' strand
                        left_most, right_most = gtf.start, gtf.end
                        gene_df = data_r.loc[left_most:right_most, ]
                        # continue if gene has few RPFs (rpm); mirrors the '+' strand FILTER 1
                        if gene_df['sum'].mean() < GeneRpmMeanThr:  # FILTER 1
                            continue
# get normalization factors for regions (beg, body, end)
# select_gene_region_norm_factor() is strand aware and flips beg and end to fit "-" strand
collection = gene_region_normalisation_factor(gene_df, left_most, right_most,
column='sum', method='mean')
norm_factors_coll = tuple([x if x > 0 else np.nan for x in collection]) # repl 0 with nan
u_rpm = gene_df["sum"].mean()
                        # todo: consider being consistent and taking only the CDS without the STOP codon? left_most-3
line_for_gene = "{}\t{}\t{}\t{}\t{}\t{}".format(gtf.contig, gtf.exon_number,
str(gene_ids_exon_No[gtf.gene_id]),
gtf.strand, gtf.gene_id, gtf.transcript_name)
line_for_gene += "\t{}\t{}\t{}".format(left_most - 3, right_most, u_rpm)
# For each coding codon on gene, i. e. excluding STOP codon
for iPl in range(left_most, right_most, 3):
                            # skip codons with low coverage
if gene_df.loc[iPl:iPl + 2, 'sum'].mean() < CodonRpmMeanThr: # FILTER 2
continue
cM += 1 # counting codons
gene_norm_factor = select_gene_region_norm_factor(norm_factors_coll, iPl, left_most,
right_most, strand=gtf.strand)
amplification_factor = 1 / (norm_factor * gene_norm_factor)
codon_rpm = gene_df.loc[iPl:iPl + 2, 'sum'].sum()
codon_raw = int(codon_rpm * norm_factor)
codon_relative_rpm = codon_rpm / gene_norm_factor
line_for_codon = "{}\t{}\t{}\t{}\t".format(int(iPl), codon_raw, codon_rpm, codon_relative_rpm)
# Get sequences
codon_E = np.nan if iPl + 3 == right_most else revcompl(genome[ref][iPl + 3:iPl + 6]) # E site
codon_P = revcompl(genome[ref][iPl:iPl + 3]) # P site
codon_A = revcompl(genome[ref][iPl - 3:iPl]) # A site
                            pos_30_codons_forward = min(iPl + 30, right_most)  # 30 nt (10 codons) downstream, clipped at gene end
#
sequence = genome[ref][iPl - nt:pos_30_codons_forward + 3] # 11 codons including P-site & nt before P-Site
seqrevcomp = revcompl(sequence)
translation = translate_DNA(seqrevcomp)
# ---------------- #
# Line_for_codon #
line_for_codon += "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(codon_E, codon_P, codon_A, sequence,
seqrevcomp, translation, gene_norm_factor)
line_for_codon += "\t{:.3f}".format(amplification_factor)
line_to_add = line_for_gene + "\t" + line_for_codon + "\n"
outfile.write(line_to_add)
report = "\n otufile: {}\n\t{} done!\n".format(outfile_name, iN)
print(report)
LOGFILE.write(report)
outfile.close()
LOGFILE.close()
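# revcompl() and translate_DNA() are defined elsewhere in this file; a
# standard-library sketch of the expected revcompl() behaviour (an assumed
# reimplementation, not the author's code). translate_DNA() presumably walks
# the sequence codon by codon through the standard genetic code and returns
# the peptide string.
_COMPLEMENT = str.maketrans("ACGTacgt", "TGCAtgca")

def revcompl_sketch(seq):
    return seq.translate(_COMPLEMENT)[::-1]  # reverse complement of a DNA string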
def codonTablesAB(SRAList, Names, Params):
# running functions for tables A and B
codonTablesA(SRAList, Names, Params)
codonTablesB(SRAList, Names, Params)
def masterTable(SRAList, Names, Params):
#
makeDirectory("12-MasterTables")
makeDirectory("12-MasterTables/Reports")
Mapping = Params["Mapping"]
nt = int(Params["CodonsBeforePSite"]) * 3
logfile_name = "12-MasterTables/Reports/MasterTable.log"
if os.path.exists(logfile_name):
append_write = 'a' # append if already exists
else:
append_write = 'w' # make a new file if not
LOGFILE = open(logfile_name, append_write)
# list of sample names
ListA = [x.strip(' ') for x in Params["GroupA"].split(sep=";")]
ListB = [x.strip(' ') for x in Params["GroupB"].split(sep=";")]
for i, iNa in enumerate(ListA):
iNb = ListB[i]
        # input table name bodies
# A
fn_bodyA = iNa + "_" + Mapping + "-End_"
rl_l = readlen_list_from_offset(Params["OffsetFile"], iNa) # read length from offsets file
fn_bodyA += str(min(rl_l)) + "-" + str(max(rl_l))
# B
fn_bodyB = iNb + "_" + Mapping + "-End_"
rl_l = readlen_list_from_offset(Params["OffsetFile"], iNb)
fn_bodyB += str(min(rl_l)) + "-" + str(max(rl_l))
# input table names
infile_B = "11-codonTables/" + fn_bodyB + "_codon_table_B" + ".txt"
infile_A = "11-codonTables/" + fn_bodyA + "_codon_table_A" + ".txt"
report = "Replica No {}\n TReated: {}\nunTreated: {}".format(i + 1, iNa, iNb)
print(report); LOGFILE.write(report+ "\n")
# output file
outfile_csv = "12-MasterTables/Master_table_" + iNa + "_" + iNb + ".txt"
outfile_cleaned_csv= "12-MasterTables/Master_table_" + iNa + "_" + iNb + "_cleaned.txt"
#outfile = open(outfile_csv, 'w')
#outfile_cleaned = open(outfile_cleaned_csv, 'w')
# List of columns to be included into Master Table
columns_needed = ['Chr', 'Exon', 'No_of_exons', 'Strand', 'Gene_id_treated', 'codon_relative_fd',
'gene_1_leftmost', 'gene_1_rigthmost', 'Position_leftmost_1', 'codon_raw_1',