#!/usr/bin/env python
from optparse import OptionParser
from scipy.stats import poisson, nbinom
from numpy import array
from bisect import bisect_left, bisect_right
import copy, math, os, pdb, random, subprocess, sys
import pysam
import bam_fragments, fdr, gff, stats
################################################################################
# clip_peaks.py
#
# Call peaks in CLIP-Seq data.
#
# Conventions:
# 1. All indexes are GFF-based. I.e. the first bp in a sequence is 1 and the
# last is len(sequence). For annotations, the start marks the first bp of
# the annotation and the end marks the last. The length is end-start+1.
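#
#    Example of the convention (illustrative numbers): an annotation
#    spanning bp 101..150 has length 150-101+1 = 50, so a window of size
#    50 starting at 101 ends at 150.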
#
# Known issues:
# 1. Scan-statistic p-values can sometimes begin under the threshold for
# individual windows but emerge over it after adjacent significant windows
# are combined. This can happen if reads are dense, but clustered in a
# region smaller than the window size. The dumb Poisson model is too
# sensitive here.
################################################################################
# globals that are a pain to pass around
out_dir = None
verbose = None
print_filtered_peaks = None
################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] <clip_bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option('-a', dest='abundance_bam', help='BAM file to inform transcript abundance estimates [Default: <clip_bam>]')
    parser.add_option('-c', dest='control_bam', help='BAM file to inform control comparisons [Default: None]')
    parser.add_option('-o', dest='out_dir', default='peaks', help='Output directory [Default: %default]')

    # peak calling options
    parser.add_option('-w', dest='window_size', type='int', default=50, help='Window size for scan statistic [Default: %default]')
    parser.add_option('-p', dest='p_val', type='float', default=.01, help='P-value required of window scan statistic tests [Default: %default]')
    parser.add_option('-m', '--max_multimap_fraction', dest='max_multimap_fraction', type='float', default=0.3, help='Maximum proportion of the read count that can be contributed by multimapping reads [Default: %default]')
    parser.add_option('-f', dest='print_filtered_peaks', action='store_true', default=False, help='Print peaks filtered at each step [Default: %default]')
    parser.add_option('-i', '--ignore', dest='ignore_bed', help='Ignore peaks overlapping troublesome regions in the given BED file')
    parser.add_option('-u', '--unstranded', dest='unstranded', action='store_true', default=False, help='Sequencing is unstranded [Default: %default]')

    # cufflinks options
    parser.add_option('--cuff', dest='cuff_out_dir', help='Cufflinks output directory to estimate the model parameters from.')
    parser.add_option('--compatible-hits-norm', dest='compatible_hits_norm', action='store_true', default=True, help='Count only fragments compatible with the reference transcriptome [Default: %default]')
    parser.add_option('--total-hits-norm', dest='total_hits_norm', action='store_true', default=False, help='Count all mapped fragments [Default: %default]')
    parser.add_option('-t', dest='threads', type='int', default=1, help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Verbose output [Default: %default]')
    parser.add_option('-g', '--gene', dest='gene_only', help='Call peaks on the specified gene only')
    parser.add_option('--print_windows', dest='print_windows', default=False, action='store_true', help='Print statistics for all windows [Default: %default]')

    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        clip_bam = args[0]
        ref_gtf = args[1]

    if options.compatible_hits_norm == options.total_hits_norm:
        parser.error('Must choose one of compatible-hits-norm or total-hits-norm')

    # set globals
    global out_dir
    out_dir = options.out_dir
    global verbose
    verbose = options.verbose
    global print_filtered_peaks
    print_filtered_peaks = options.print_filtered_peaks

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    ############################################
    # parameterize
    ############################################
    if verbose:
        print >> sys.stderr, 'Estimating gene abundances...'

    if options.abundance_bam == None:
        options.abundance_bam = clip_bam

    if not options.cuff_out_dir:
        # run Cufflinks on new gtf file and abundance BAM
        if options.compatible_hits_norm:
            hits_norm = '--compatible-hits-norm'
        else:
            hits_norm = '--total-hits-norm'

        # compute read length
        read_length, read_sd = estimate_read_stats(options.abundance_bam)

        options.cuff_out_dir = out_dir
        subprocess.call('cufflinks -u -m %d -s %d -o %s -p %d %s -G %s %s' % (read_length, read_sd, options.cuff_out_dir, options.threads, hits_norm, ref_gtf, options.abundance_bam), shell=True)

    # store transcripts
    transcripts = read_genes('%s/transcripts.gtf'%options.cuff_out_dir, key_id='transcript_id')

    # merge overlapping genes
    g2t_merge, antisense_clusters = merged_g2t('%s/transcripts.gtf'%options.cuff_out_dir, options.unstranded)

    if options.unstranded:
        # alter strands
        ambiguate_strands(transcripts, g2t_merge, antisense_clusters)

    # set junctions
    set_transcript_junctions(transcripts)

    # set transcript FPKMs
    set_transcript_fpkms(transcripts, options.cuff_out_dir)

    if verbose:
        print >> sys.stderr, 'Computing global statistics...'

    if options.compatible_hits_norm:
        # count transcriptome CLIP reads (overestimates small RNA single ended reads by counting antisense)
        subprocess.call('intersectBed -abam %s -b %s > %s/clip.bam' % (clip_bam, '%s/transcripts.gtf'%options.cuff_out_dir, out_dir), shell=True)
        clip_reads = bam_fragments.count('%s/clip.bam' % out_dir)
        os.remove('%s/clip.bam' % out_dir)
    else:
        # count CLIP reads
        clip_reads = bam_fragments.count(clip_bam)
    if verbose:
        print >> sys.stderr, '\t%d CLIP reads' % clip_reads

    # compute # of tests we will perform
    txome_size = transcriptome_size(transcripts, g2t_merge, options.window_size)
    if verbose:
        print >> sys.stderr, '\t%d transcriptome windows' % txome_size

    ############################################
    # process genes
    ############################################
    # index
    subprocess.call('samtools index %s' % clip_bam, shell=True)

    # open clip-seq bam
    clip_in = pysam.Samfile(clip_bam, 'rb')

    # open window output
    windows_out = None
    if options.print_windows:
        windows_out = open('%s/window_stats.txt' % out_dir, 'w')

    # possibly limit genes to examine
    if options.gene_only:
        gene_ids = []
        for gids in g2t_merge.keys():
            if options.gene_only in gids.split(','):
                gene_ids.append(gids)
        if len(gene_ids) == 0:
            print >> sys.stderr, 'gene_id %s not found' % options.gene_only
            exit(1)
    else:
        gene_ids = g2t_merge.keys()

    # initialize peak lists
    putative_peaks = []
    multimap_peaks = []

    # for each gene
    for gene_id in gene_ids:
        if verbose:
            print >> sys.stderr, 'Processing %s...' % gene_id

        # make a more focused transcript hash for this gene
        gene_transcripts = {}
        for tid in g2t_merge[gene_id]:
            gene_transcripts[tid] = transcripts[tid]

        # obtain basic gene attributes
        (gchrom, gstrand, gstart, gend) = gene_attrs(gene_transcripts)

        if verbose:
            print >> sys.stderr, '\tFetching alignments...'

        # choose a single event position and weight the reads
        read_pos_weights = position_reads(clip_in, gchrom, gstart, gend, gstrand, mapq_zero=True)

        if verbose:
            print >> sys.stderr, '\tCounting and computing in windows...'

        # count reads and compute p-values in windows
        window_stats = count_windows(clip_in, options.window_size, read_pos_weights, gene_transcripts, gstart, gend, clip_reads, txome_size, windows_out)

        if verbose:
            print >> sys.stderr, '\tRefining peaks...'

        # post-process windows to peaks
        peaks = windows2peaks(read_pos_weights, gene_transcripts, gstart, window_stats, options.window_size, options.p_val, clip_reads, txome_size)

        # save peaks
        for pstart, pend, pfrags, pmmfrac, ppval in peaks:
            putative_peaks.append(Peak(gchrom, pstart, pend, gstrand, gene_id, pfrags, pmmfrac, ppval))

    clip_in.close()

    ############################################
    # filter peaks using ignore BED
    ############################################
    if options.ignore_bed:
        putative_peaks = filter_peaks_ignore(putative_peaks, options.ignore_bed)

    ############################################
    # filter peaks using the control
    ############################################
    if options.control_bam:
        # index
        subprocess.call('samtools index %s' % options.control_bam, shell=True)

        if options.compatible_hits_norm:
            # count transcriptome control reads
            subprocess.call('intersectBed -abam %s -b %s/transcripts.gtf > %s/control.bam' % (options.control_bam, options.cuff_out_dir, out_dir), shell=True)
            control_reads = bam_fragments.count('%s/control.bam' % out_dir)
            os.remove('%s/control.bam' % out_dir)
        else:
            # count control reads
            control_reads = bam_fragments.count(options.control_bam)
        if verbose:
            print >> sys.stderr, '\t%d Control reads' % control_reads

        # compute normalization factor for the control (as a float, to avoid integer division)
        normalization_factor = float(clip_reads) / control_reads

        # estimate overdispersion
        if verbose:
            print >> sys.stderr, 'Estimating overdispersion...'
        overdispersion = estimate_overdispersion(clip_bam, options.control_bam, g2t_merge, transcripts, options.window_size, normalization_factor)
        if verbose:
            print >> sys.stderr, 'Overdispersion estimated to be %f' % overdispersion

        # filter peaks
        if verbose:
            print >> sys.stderr, 'Filtering peaks using control BAM...'
        final_peaks = filter_peaks_control(putative_peaks, options.p_val, overdispersion, options.control_bam, normalization_factor)
    else:
        final_peaks = putative_peaks

    ############################################
    # output peaks
    ############################################
    peaks_out = open('%s/peaks.gff' % out_dir, 'w')
    if verbose or print_filtered_peaks:
        mm_peaks_out = open('%s/filtered_peaks_multimap.gff' % out_dir, 'w')

    peak_id = 1
    for peak in final_peaks:
        # filter out multimap-dominated peaks
        if peak.mm_frac <= options.max_multimap_fraction:
            peak.id = peak_id
            print >> peaks_out, peak.gff_str()
            peak_id += 1
        elif verbose or print_filtered_peaks:
            print >> mm_peaks_out, peak.gff_str()

    peaks_out.close()
    if verbose or print_filtered_peaks:
        mm_peaks_out.close()
################################################################################
# ambiguate_strands
#
# Remove transcript strands if the gene belongs to a cluster with antisense
# overlapping genes
#
# Input
# transcripts: Hash mapping transcript_id keys to Gene class instances.
# g2t: Hash mapping gene_id's to transcript_id's
# antisense_clusters: Set of gene_id's describing clusters with antisense overlap.
################################################################################
def ambiguate_strands(transcripts, g2t, antisense_clusters):
    for gene_id in g2t:
        if gene_id in antisense_clusters:
            for tid in g2t[gene_id]:
                transcripts[tid].strand = '*'
################################################################################
# cigar_endpoint
#
# Input
# aligned_read: pysam AlignedRead object.
#
# Output
# genome_pos: Endpoint of the alignment, considering the insertions and
# deletions in its CIGAR string (which includes splicing).
################################################################################
def cigar_endpoint(aligned_read):
    genome_pos = aligned_read.pos+1 # correction for 0-based

    for (operation,length) in aligned_read.cigar:
        # match (M, =, X): consumes both the read and the reference
        if operation in [0,7,8]:
            genome_pos += length

        # deletion (D) or spliced intron (N): consumes the reference only
        elif operation in [2,3]:
            genome_pos += length

        # insertion (I) or soft clip (S): consumes the read only
        elif operation in [1,4]:
            pass

        else:
            print >> sys.stderr, 'Unknown CIGAR operation - %d, %s' % (operation, aligned_read.qname)

    return genome_pos-1 # correction for pointing to the last nt
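
################################################################################
# _example_cigar_endpoint
#
# Illustrative sanity check, not called by the pipeline. Uses a minimal
# stand-in for a pysam AlignedRead with a hypothetical alignment.
################################################################################
def _example_cigar_endpoint():
    class _Read(object):
        pos = 99                        # 0-based start, i.e. 1-based 100
        cigar = [(0,10), (2,2), (0,10)] # 10M 2D 10M consumes 22 reference bp
        qname = 'example'
    # 1-based start 100 + 22 reference bp - 1 = 121
    assert cigar_endpoint(_Read()) == 121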
################################################################################
# cigar_midpoint
#
# Input
# aligned_read: pysam AlignedRead object.
#
# Output
# midpoint: Midpoint of the alignment, considering the insertions and
# deletions in its CIGAR string (which includes splicing).
################################################################################
def cigar_midpoint(aligned_read):
    read_half = aligned_read.qlen / 2.0
    read_walked = 0
    genome_pos = aligned_read.pos+1

    for (operation,length) in aligned_read.cigar:
        # match (M, =, X): consumes both the read and the reference
        if operation in [0,7,8]:
            if read_walked + length >= read_half:
                midpoint = genome_pos + (read_half - read_walked)
                break
            else:
                genome_pos += length
                read_walked += length

        # deletion (D) or spliced intron (N): consumes the reference only
        elif operation in [2,3]:
            genome_pos += length

        # insertion (I): consumes the read only
        elif operation == 1:
            read_walked += length

        # soft clip (S): part of neither pos nor qlen
        elif operation == 4:
            pass

        else:
            print >> sys.stderr, 'Unknown CIGAR operation - %d, %s' % (operation, aligned_read.qname)

    return midpoint
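
################################################################################
# _example_cigar_midpoint
#
# Illustrative sanity check, not called by the pipeline. Uses a minimal
# stand-in for a pysam AlignedRead with a hypothetical spliced alignment.
################################################################################
def _example_cigar_midpoint():
    class _Read(object):
        pos = 99                           # 0-based start, i.e. 1-based 100
        qlen = 50                          # aligned read length
        cigar = [(0,20), (3,100), (0,30)]  # 20M 100N 30M
        qname = 'example'
    # read_half = 25.0; the 20M walks genome_pos to 120 with 20 read bp
    # walked; the 100N jumps genome_pos to 220; the midpoint then lands
    # 5 read bp later, at 225
    assert cigar_midpoint(_Read()) == 225.0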
################################################################################
# convolute_lambda
#
# Determine the convoluted poisson lambda for the given window using the
# transcript's FPKM estimates.
#
# Recall that junctions contains the 1st bp of the next exon/intron.
#
# TODO: Most of the time is spent in this function. It could be sped up by
#       keeping better track of the contribution of each bp so that each next
#       window can be updated by dropping the most distant bp and adding the
#       next bp's contributions. It could also be sped up by ignoring
#       irrelevant isoforms, e.g. by dropping those whose 3' ends we've passed.
#
# Input
# window_start: Window start coordinate.
# window_end: Window end coordinate.
# gene_transcripts: Hash mapping transcript_id to isoform Gene objects,
# containing only keys for a specific gene.
# junctions_i: Hash mapping transcript_id to the index of the first
# junction ahead of the window start in that transcript's
# junctions array.
# total_reads: Total number of reads aligned to the transcriptome.
#
# Output
# lambda: Poisson lambda for this window.
################################################################################
def convolute_lambda(window_start, window_end, gene_transcripts, junctions_i, total_reads):
    # initialize FPKM
    fpkm_conv = 0

    # for each isoform
    for tid in gene_transcripts:
        # shortcuts
        tx = gene_transcripts[tid]
        ji = junctions_i[tid]

        # determine transcript coefficient
        tcoef = 0.0

        # after junctions
        if ji >= len(tx.junctions):
            tcoef = 0.0

        # next junction out of window
        elif window_end < tx.junctions[ji]:
            if ji % 2 == 1: # in an exon
                tcoef = 1.0

        # junctions
        else:
            # window start to first junction
            if ji % 2 == 1: # exon
                tcoef = tx.junctions[ji] - window_start

            # advance
            ji += 1

            # between junctions
            while ji < len(tx.junctions) and tx.junctions[ji] <= window_end:
                if ji % 2 == 1: # exon
                    tcoef += tx.junctions[ji] - tx.junctions[ji-1]
                ji += 1

            # back up
            ji -= 1

            # last junction to window end
            if ji % 2 == 0: # exon
                tcoef += window_end - tx.junctions[ji] + 1

            # normalize
            tcoef /= float(window_end-window_start+1)

        # add to fpkm
        fpkm_conv += tcoef * tx.fpkm

    # bump to min fpkm
    fpkm_conv = max(fpkm_conv, 0.1)

    # convert from fpkm to lambda
    return fpkm_conv / 1000.0 * (total_reads / 1000000.0)
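
# Worked example of the conversion above (illustrative numbers): a window
# fully inside an exon of a single isoform with FPKM 10, given 20 million
# transcriptome reads, gets lambda = 10/1000 * (20000000/1000000) = 0.2
# expected fragments per bp, i.e. ~10 expected fragments in a 50 bp window.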
################################################################################
# count_windows
#
# Count the number of reads and compute the scan statistic p-value in each
# window through the gene.
#
# Input
# clip_in: Open pysam BAM file for clip-seq alignments.
# window_size: Scan statistic window size.
# read_pos_weights: Sorted list of read alignment positions and weights.
# gene_transcripts: Hash mapping transcript_id to isoform Gene objects,
# containing only keys for a specific gene.
# gene_start: Start of the gene's span.
# gene_end: End of the gene's span.
# total_reads: Total number of reads aligned to the transcriptome.
# txome_size: Total number of bp in the transcriptome.
# windows_out: Open file if we should print window stats, or None.
#
# Output
# window_stats: List of tuples (alignment count, p value) for all windows.
################################################################################
def count_windows(clip_in, window_size, read_pos_weights, gene_transcripts, gene_start, gene_end, total_reads, txome_size, windows_out):
    # set lambda using whole region (some day, compare this to the cufflinks estimate)
    # poisson_lambda = float(len(read_pos_weights)) / (gene_end - gene_start)

    # get gene info
    tid0 = gene_transcripts.keys()[0]
    chrom = gene_transcripts[tid0].chrom
    gene_id = gene_transcripts[tid0].kv['gene_id']

    reads_window_start = 0 # index of the first read position that fits in the window (except I'm allowing 0)
    reads_window_end = 0 # index of the first read position past the window

    # combine all gene junctions
    gene_junctions_set = set()
    for tid in gene_transcripts:
        gene_junctions_set |= set(gene_transcripts[tid].junctions)
    gene_junctions = sorted(list(gene_junctions_set))

    junctions_window_start = 0 # index of the first junction that fits in the window (except I'm allowing 0)
    junctions_window_end = 0 # index of the first junction past the window

    # initialize index of the first junction ahead of the window start for each transcript
    junctions_i = {}
    for tid in gene_transcripts:
        junctions_i[tid] = 0

    # to avoid redundant computation
    precomputed_pvals = {}

    rpw_len = len(read_pos_weights)
    gj_len = len(gene_junctions)

    window_stats = []
    window_lambda = None

    # start at either gene_start or the 3rd read (since we require a count >2
    # before running the Poisson test)
    if rpw_len < 3:
        first_window_start = gene_end # skip iteration
    else:
        first_window_start = max(gene_start, int(read_pos_weights[2][0])-window_size+1)
        window_stats += [(0,1)]*(first_window_start-gene_start)

    for window_start in range(first_window_start, gene_end-window_size+1):
        window_end = window_start + window_size - 1

        # update reads_window_start
        while reads_window_start < rpw_len and read_pos_weights[reads_window_start][0] < window_start:
            reads_window_start += 1
        if reads_window_start >= rpw_len:
            break

        # update reads_window_end
        while reads_window_end < rpw_len and read_pos_weights[reads_window_end][0] <= window_end:
            reads_window_end += 1

        # count reads
        #window_count = reads_window_end - reads_window_start
        window_count_float = sum([read_pos_weights[i][1] for i in range(reads_window_start,reads_window_end)])

        # round count
        window_count = int(window_count_float + 0.5)

        # update junctions_window_start
        while junctions_window_start < gj_len and gene_junctions[junctions_window_start] < window_start:
            junctions_window_start += 1

        # update junctions_window_end
        while junctions_window_end < gj_len and gene_junctions[junctions_window_end] <= window_end:
            junctions_window_end += 1

        # update junction indexes and convolute lambda only if there are junctions in the window
        if window_lambda == None or junctions_window_start < junctions_window_end:
            # update junctions indexes (<= comparison because junctions holds the 1st bp of next exon/intron)
            for tid in gene_transcripts:
                tjunctions = gene_transcripts[tid].junctions
                while junctions_i[tid] < len(tjunctions) and tjunctions[junctions_i[tid]] <= window_start:
                    junctions_i[tid] += 1

            # set lambda
            window_lambda = convolute_lambda(window_start, window_end, gene_transcripts, junctions_i, total_reads)

        # compute p-value
        if window_count > 2:
            if (window_count,window_lambda) in precomputed_pvals:
                p_val = precomputed_pvals[(window_count,window_lambda)]
            else:
                p_val = scan_stat_approx3(window_count, window_size, txome_size, window_lambda)
                precomputed_pvals[(window_count,window_lambda)] = p_val
            window_stats.append((window_count,p_val))
        else:
            window_stats.append((window_count,1))

        # for debugging
        if windows_out:
            cols = (chrom, window_start, gene_id, window_count, window_stats[-1][1], window_lambda)
            print >> windows_out, '%-5s %9d %18s %5d %8.1e %8.2e' % cols

    return window_stats
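
# Example of the window_stats layout above (illustrative numbers): with
# gene_start=1000 and window_size=50, window_stats[i] describes the window
# [1000+i, 1049+i], so window_stats[250] covers 1250..1299. The [(0,1)]
# padding keeps this alignment when the leading windows are skipped.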
################################################################################
# estimate_overdispersion
#
# Inputs:
# clip_bam: CLIP sequencing BAM
# control_bam: Control sequencing BAM
# g2t: Hash mapping gene_id's to transcript_id's
# transcripts: Hash mapping transcript_id keys to Gene class instances.
# window_size: Scan statistic window size.
# norm_factor: Ratio of total transcriptome CLIP to control reads
#
# Outputs:
# overdispersion: Estimated overdispersion parameter.
################################################################################
def estimate_overdispersion(clip_bam, control_bam, g2t, transcripts, window_size, norm_factor):
    clip_in = pysam.Samfile(clip_bam)
    control_in = pysam.Samfile(control_bam)

    window_means = []
    window_variances = []

    for gene_id in g2t:
        # make a more focused transcript hash for this gene
        gene_transcripts = {}
        for tid in g2t[gene_id]:
            gene_transcripts[tid] = transcripts[tid]

        # obtain basic gene attributes
        (gchrom, gstrand, gstart, gend) = gene_attrs(gene_transcripts)

        # fetch reads
        clip_read_pos_weights = position_reads(clip_in, gchrom, gstart, gend, gstrand)
        control_read_pos_weights = position_reads(control_in, gchrom, gstart, gend, gstrand)

        clip_rpw_len = len(clip_read_pos_weights)
        control_rpw_len = len(control_read_pos_weights)

        # initialize
        clip_reads_start_i = 0
        clip_reads_end_i = 0
        control_reads_start_i = 0
        control_reads_end_i = 0

        window_start = gstart
        while window_start + window_size < gend:
            # update reads_start_i
            while clip_reads_start_i < clip_rpw_len and clip_read_pos_weights[clip_reads_start_i][0] < window_start:
                clip_reads_start_i += 1
            while control_reads_start_i < control_rpw_len and control_read_pos_weights[control_reads_start_i][0] < window_start:
                control_reads_start_i += 1

            # update reads_end_i
            while clip_reads_end_i < clip_rpw_len and clip_read_pos_weights[clip_reads_end_i][0] <= window_start+window_size:
                clip_reads_end_i += 1
            while control_reads_end_i < control_rpw_len and control_read_pos_weights[control_reads_end_i][0] <= window_start+window_size:
                control_reads_end_i += 1

            # count clip fragments
            if clip_reads_start_i >= clip_rpw_len:
                clip_frags = 0.0
            else:
                clip_frags = sum([clip_read_pos_weights[i][1] for i in range(clip_reads_start_i,clip_reads_end_i)])

            # count control fragments
            if control_reads_start_i >= control_rpw_len:
                control_frags = 0.0
            else:
                control_frags = sum([control_read_pos_weights[i][1] for i in range(control_reads_start_i,control_reads_end_i)])

            # normalize control fragments
            control_frags *= norm_factor

            # save mean and variance
            window_means.append(0.5*clip_frags + 0.5*control_frags)
            window_variances.append((clip_frags - window_means[-1])**2 + (control_frags - window_means[-1])**2)

            # update indexes
            window_start += window_size

    clip_in.close()
    control_in.close()

    # regress overdispersion
    u = array(window_means)
    var = array(window_variances)

    if verbose:
        mv_out = open('%s/overdispersion.txt' % out_dir, 'w')
        for i in range(len(u)):
            print >> mv_out, '%f\t%f' % (u[i],var[i])
        mv_out.close()

    return max(0, sum(u*var - u**2) / sum(u**3))
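
# The estimator above can be read as a least-squares fit, weighted by 1/u, of
# the quasi-negative-binomial relationship var = u + phi*u**2, which gives
#   phi = sum(u*(var - u)) / sum(u**3)
# For example (illustrative numbers), window means [2, 4, 8] with variances
# [3, 8, 24] yield phi = (2*1 + 4*4 + 8*16) / (8 + 64 + 512) = 146/584 = 0.25.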
################################################################################
# estimate_read_stats
#
# Compute mean read length by sampling the first N reads.
#
# Input
# bam_file: BAM.
#
# Output:
# read_length: Mean read length.
# read_sd: Read length standard deviation.
################################################################################
def estimate_read_stats(bam_file):
    samples = 2000000
    s = 0
    read_lengths = []

    for aligned_read in pysam.Samfile(bam_file, 'rb'):
        if aligned_read.mapq > 0:
            read_lengths.append(aligned_read.rlen)
            s += 1
            if s >= samples:
                break

    mean_f, sd_f = stats.mean_sd(read_lengths)

    return int(mean_f+0.5), int(sd_f+0.5)
################################################################################
# filter_peaks_control
#
# Input
# putative_peaks: List of Peak objects w/o attribute control_p.
# p_val: P-value to use for filtering.
# overdispersion: Negative binomial overdispersion parameter.
# control_bam: BAM file to inform control filtering.
# norm_factor: Ratio of total transcriptome CLIP to control reads
#
# Output
# filtered_peaks: List of filtered Peak objects w/ attribute control_p set.
################################################################################
def filter_peaks_control(putative_peaks, p_val, overdispersion, control_bam, norm_factor):
    # number of bp to expand each peak by to check the control
    fuzz = 5

    # open control BAM for fetching
    control_in = pysam.Samfile(control_bam, 'rb')

    # initialize p-value list for later FDR correction
    control_p_values = []

    if verbose or print_filtered_peaks:
        # open file to print filtered peaks
        control_filter_out = open('%s/filtered_peaks_control.gff' % out_dir, 'w')

    # for each peak
    for peak in putative_peaks:
        peak_length = peak.end - peak.start + 1

        # fetch reads
        read_pos_weights = position_reads(control_in, peak.chrom, peak.start-fuzz, peak.end+fuzz, peak.strand)

        # sum weights
        read_positions = [pos for (pos,w,mm) in read_pos_weights]
        reads_start_i = bisect_left(read_positions, peak.start-fuzz)
        reads_end_i = bisect_right(read_positions, peak.end+fuzz)
        control_frags = sum([read_pos_weights[i][1] for i in range(reads_start_i,reads_end_i)])

        # if there are fragments
        if control_frags > 0:
            # refactor for fuzz
            control_frags *= float(peak_length) / (peak_length + 2*fuzz)

            # normalize for read counts
            peak.control_frags = max(0.1, control_frags * norm_factor)

        # if there are no fragments
        else:
            # assume a small value that will pass
            peak.control_frags = 0.1

        if overdispersion == 0:
            # perform poisson test
            control_p_values.append( poisson.sf(peak.frags-1, peak.control_frags) )
        else:
            # perform negative binomial test
            nb_p = 1.0 / (1.0 + peak.control_frags*overdispersion)
            nb_n = 1.0 / overdispersion
            control_p_values.append( nbinom.sf(peak.frags-1, nb_n, nb_p) )

    # correct for multiple hypotheses
    control_q_values = fdr.ben_hoch(control_p_values)

    # attach q-values to peaks and filter
    filtered_peaks = []
    for i in range(len(putative_peaks)):
        peak = putative_peaks[i]
        peak.control_p = control_q_values[i]

        if control_q_values[i] <= p_val:
            filtered_peaks.append(peak)
        elif verbose or print_filtered_peaks:
            print >> control_filter_out, peak.gff_str()

    if verbose or print_filtered_peaks:
        control_filter_out.close()

    return filtered_peaks
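
################################################################################
# _example_nb_parameterization
#
# Illustrative sanity check, not called by the pipeline: the negative
# binomial parameterization used in filter_peaks_control. With mean mu and
# overdispersion phi, n = 1/phi and p = 1/(1 + mu*phi) give a distribution
# with mean mu and variance mu*(1 + phi*mu).
################################################################################
def _example_nb_parameterization(mu=4.0, phi=0.25):
    nb_p = 1.0 / (1.0 + mu*phi)
    nb_n = 1.0 / phi
    assert abs(nbinom.mean(nb_n, nb_p) - mu) < 1e-9
    assert abs(nbinom.var(nb_n, nb_p) - mu*(1.0 + phi*mu)) < 1e-9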
################################################################################
# filter_peaks_ignore
#
# Input
# putative_peaks: List of Peak objects.
# ignore_bed: BED file specifying troublesome regions to ignore.
#
# Output
# filtered_peaks: List of filtered Peak objects.
################################################################################
def filter_peaks_ignore(putative_peaks, ignore_bed):
    # temporarily print to file
    peaks_out = open('%s/putative.gff' % out_dir, 'w')
    for peak in putative_peaks:
        print >> peaks_out, peak.gff_str()
    peaks_out.close()

    # add fuzz to ignore_bed
    fuzz = 3
    ignorez_out = open('%s/ignore_fuzz.bed' % out_dir, 'w')
    for line in open(ignore_bed):
        a = line.split('\t')
        a[1] = str(max(1,int(a[1])-fuzz))
        a[2] = str(int(a[2])+fuzz)
        print >> ignorez_out, '\t'.join(a),
    ignorez_out.close()

    # intersect with ignore regions
    subprocess.call('intersectBed -wo -a %s/putative.gff -b %s/ignore_fuzz.bed > %s/filtered_peaks_ignore.gff' % (out_dir,out_dir,out_dir), shell=True)

    # hash ignored peaks
    ignored_peaks = set()
    for line in open('%s/filtered_peaks_ignore.gff' % out_dir):
        a = line.split('\t')
        peak_tuple = (a[0], int(a[3]), int(a[4]), a[6])
        ignored_peaks.add(peak_tuple)

    # filter putative_peaks
    filtered_peaks = []
    for peak in putative_peaks:
        peak_tuple = (peak.chrom, peak.start, peak.end, peak.strand)
        if peak_tuple not in ignored_peaks:
            filtered_peaks.append(peak)

    # clean
    os.remove('%s/ignore_fuzz.bed' % out_dir)
    os.remove('%s/putative.gff' % out_dir)

    return filtered_peaks
################################################################################
# gene_attrs
#
# Input
# gene_transcripts: Hash mapping transcript_id to isoform Gene objects,
# containing only keys for a specific gene.
#
# Output
# gene_chrom: Self explanatory...
# gene_strand:
# gene_start:
# gene_end:
################################################################################
def gene_attrs(gene_transcripts):
    gene_start = None
    gene_end = None

    for tx in gene_transcripts.values():
        gene_chrom = tx.chrom
        gene_strand = tx.strand

        if gene_start == None:
            gene_start = tx.exons[0].start
        else:
            gene_start = min(gene_start, tx.exons[0].start)

        if gene_end == None:
            gene_end = tx.exons[-1].end
        else:
            gene_end = max(gene_end, tx.exons[-1].end)

    return gene_chrom, gene_strand, gene_start, gene_end
################################################################################
# get_gene_regions
#
# Return a hash mapping each gene_id to the full span of its transcripts.
#
# Input:
# transcripts: Hash mapping transcript_id keys to Gene class instances.
#
# Output:
# gene_regions: Hash mapping gene_id keys to lists consisting of (chromosome,
# start, end, strand) tuples with coordinates in GTF format.
################################################################################
def get_gene_regions(transcripts):
    gene_regions = {}

    for tid in transcripts:
        tx = transcripts[tid]
        gid = tx.kv['gene_id']

        if not gid in gene_regions:
            gene_regions[gid] = [tx.chrom, tx.exons[0].start, tx.exons[-1].end, tx.strand]
        else:
            gene_regions[gid][1] = min(gene_regions[gid][1], tx.exons[0].start)
            gene_regions[gid][2] = max(gene_regions[gid][2], tx.exons[-1].end)

    return gene_regions
################################################################################
# merged_g2t
#
# Merge overlapping genes and return the resulting gene_id to transcript_id
# mapping.
#
# Input
# ref_gtf: GTF file
# unstranded: Sequencing is unstranded, so handle antisense overlap
#
# Output
# g2t: Hash mapping gene_id's to transcript_id's
# antisense_clusters: Set of gene_id's describing clusters with antisense overlap
################################################################################
def merged_g2t(ref_gtf, unstranded):
    # make gene span file
    span_gtf(ref_gtf, level='gene_id')

    un_str = '-s'
    if unstranded:
        un_str = ''

    # intersect spans
    p = subprocess.Popen('intersectBed -wo %s -a %s/span.gtf -b %s/span.gtf' % (un_str,out_dir,out_dir), shell=True, stdout=subprocess.PIPE)

    # map gene_id's to sets of overlapping genes
    id_map = {}
    antisense_genes = set()
    for line in p.stdout:
        a = line.split('\t')
        gid1 = gff.gtf_kv(a[8])['gene_id']
        gid2 = gff.gtf_kv(a[17])['gene_id']

        gene_cluster = set([gid1,gid2]) | id_map.get(gid1,set()) | id_map.get(gid2,set())
        for gid in gene_cluster:
            id_map[gid] = gene_cluster

        strand1 = a[6]
        strand2 = a[15]
        if strand1 != strand2:
            antisense_genes.add(gid1)
            antisense_genes.add(gid2)

    # clean
    p.communicate()
    os.remove('%s/span.gtf' % out_dir)

    # set cluster gene_id's and map genes to transcripts
    g2t = {}
    for line in open(ref_gtf):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])

        if kv['gene_id'] in id_map:
            gene_id = ','.join(sorted(list(id_map[kv['gene_id']])))
        else:
            gene_id = kv['gene_id']

        g2t.setdefault(gene_id,set()).add(kv['transcript_id'])

    # determine antisense clusters
    antisense_clusters = set()
    for gene_ids in g2t:
        for gene_id in gene_ids.split(','):
            if gene_id in antisense_genes:
                antisense_clusters.add(gene_ids)

    return g2t, antisense_clusters
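
# Example of the clustering above (hypothetical gene_ids): if geneA overlaps
# geneB and geneB overlaps geneC, the pairwise intersections place all three
# in one cluster, so g2t gains a single key 'geneA,geneB,geneC' holding the
# union of their transcript_ids.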
################################################################################
# merge_peaks_count
#
# Merge the given trimmed windows with counts using bedtools and then recount
# the reads in the new intervals.
#
# Input
# trimmed_windows: List of (start,end) tuples for significant windows, trimmed
# to be tight around read midpoints.
# read_pos_weights: Sorted list of read alignment positions and weights.
#
# Output
# peaks: List of (start,end,count,mm_fraction) tuples for
#        significant trimmed and merged windows.
################################################################################
def merge_peaks_count(trimmed_windows, read_pos_weights):
    peaks = []
    mm_peaks = []

    if trimmed_windows:
        # get read positions for easy bisecting
        read_positions = [pos for (pos,w,mm) in read_pos_weights]

        # initialize first peak
        pstart, pend = trimmed_windows[0]

        for t in range(1,len(trimmed_windows)):
            tw_start, tw_end = trimmed_windows[t]

            if pend < tw_start:
                # close last peak
                reads_start_i = bisect_left(read_positions, pstart)
                reads_end_i = bisect_right(read_positions, pend)
                read_count = sum([read_pos_weights[i][1] for i in range(reads_start_i,reads_end_i)])
                mm_count = sum([1 for i in range(reads_start_i,reads_end_i) if read_pos_weights[i][2]])
                mm_fraction = mm_count / float(reads_end_i-reads_start_i)
                peaks.append((pstart, pend, read_count, mm_fraction))

                # initialize next peak
                pstart = tw_start
                pend = tw_end
            else:
                # extend current peak
                pend = tw_end