expected_counter_v2.py
"""
Usage: python expected_counter_v2.py -d directory
Input: -d the directory and prefix of output files
Output: A list of gene with the number of reads. {gene_name \t number_of_read}
Function: Assuming read with the same prefix (gene name) comes from the same gene.
The script iterates through the bam file and count expected number of read for each gene.
Date: 2013-12-29
Author: Chelsea Ju
Note: this script is adopted from fragments_counter
"""
import re, argparse
import pysam
"""
iterate through the bam file, which is sorted by the read name
the file may contain multiple alignments and pair end mates, so the read is only counted when seen the first time
"""
def expected_read_counter(sorted_input):
expected_hash = {}
previous_read = ""
sorted_bam_fh = pysam.Samfile(sorted_input)
for read in sorted_bam_fh:
name = read.qname
# the first time seeing this read
# (note: reads contain multiple alignment and pair-end mates, but each read is counted once)
if(name != previous_read):
prefix_match = re.match(r"(.*?):.*", name)
if(prefix_match):
prefix = prefix_match.group(1)
# if the read name exists
if(expected_hash.has_key(prefix)):
expected_hash[prefix] += 1
else:
expected_hash[prefix] = 1
# update previous_read
previous_read = name
sorted_bam_fh.close()
return expected_hash
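# Hypothetical example of the returned structure: for reads named
# "GENE1:0001", "GENE1:0002", and "GENE2:0001", expected_read_counter()
# would return {"GENE1": 2, "GENE2": 1}.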
"""
write the expected counts to file
"""
def export_array(hash, outfile):
out_fh = open(outfile, "w")
for k, v in hash.items():
out_fh.write("%s\t%s\n" %(k,str(v)))
out_fh.close()
print ""
print "Writing Expected Count to File : %s" %(outfile)
def main(parser):
    options = parser.parse_args()
    directory = options.dir
    ## make sure the directory path ends with a slash
    if directory[-1] != "/":
        directory += "/"
    ## start counting the reads
    sorted_bam = directory + "accepted_hits_sortedByName.bam"
    counts_hash = expected_read_counter(sorted_bam)
    ## output data
    outfile = directory + "genes_expected_read_count.txt"
    export_array(counts_hash, outfile)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='expected_counter_v2.py')
    parser.add_argument("-d", "--directory", dest="dir", type=str, help="directory of input files", required=True)
    main(parser)
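# Example invocation (a sketch; assumes the directory already contains a name-sorted BAM
# called accepted_hits_sortedByName.bam, e.g. produced with "samtools sort -n" from a
# TopHat accepted_hits.bam):
#   python expected_counter_v2.py -d /path/to/sample_dir
# The counts are then written to /path/to/sample_dir/genes_expected_read_count.txt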