-
Notifications
You must be signed in to change notification settings - Fork 0
/
multiread_distributor.py
149 lines (117 loc) · 4.26 KB
/
multiread_distributor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
""" multiread_distributor.py
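Usage: python multiread_distributor.py -d directory -p is_paired
Input: -d the directory and prefix of output files ex: select_pseudogene_128_v1/10X_101L_1A/tophat_out
          (accepted_hits_sorted.bam, multireads.txt and locus_expectation.txt are read from this directory)
       -p are reads paired? [True|False]
Output: multireads_assignment.txt, written to the same directory
        (tab-separated: read name, chromosome, start, mate start)
Function: 1. retrieve the fragments for each locus in lasso coefficient
          2. retain all the uniquely mapped fragments
Date: 2014-04-04
Author: Chelsea Ju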
"""
import sys, re, pysam, os, random, argparse
# Multiread bookkeeping, keyed by read name. Each value is a dict mapping a
# gene name to the best alignment recorded for that gene, stored as
# (chromosome, start, mate start, score).
DISTRIBUTION = dict()

# Gene name -> number of expected reads not covered by the counted reads
# of that locus (only genes with a positive shortfall are stored).
EXPECTATION = dict()
"""
Function :
1. given a region, query the reads in this region
2. keep all the uniquely mapped reads
3. for multireads, record the best alignment for the given positions
4. return the value of max((expected_count - actual_count), 0)
"""
def query_reads(chromosome, start, end, count, gene, bamFH):
    """Count the non-multireads of a locus and record multiread candidates.

    Every read returned by ``bamFH.fetch(chromosome, start, end)`` whose
    first aligned position (plus one) lies within [start, end] is examined:

    * a read whose name is not a key of DISTRIBUTION is counted (it is
      treated as uniquely mapped);
    * a read whose name is a key of DISTRIBUTION (a multiread) is scored,
      and DISTRIBUTION[name][gene] is set to
      (chromosome, read start, mate start, score) when no entry exists for
      ``gene`` yet or the new score is strictly higher than the stored one.

    Scoring scheme: the mapping quality, plus 50 when the read is paired,
    its mate is mapped, and the mate lies on the same reference.

    Args:
        chromosome: reference name passed to ``fetch``.
        start: first position of the locus (int or numeric string).
        end: last position of the locus (int or numeric string).
        count: expected number of reads for the locus.
        gene: name of the locus, used as the key inside DISTRIBUTION[name].
        bamFH: object exposing ``fetch(chromosome, start, end)`` that yields
            reads with the attributes ``qname``, ``positions``, ``mapq``,
            ``is_paired``, ``mate_is_unmapped``, ``rnext``, ``tid``, ``pnext``.

    Returns:
        max(count - number of counted reads, 0).
    """
    # Convert the bounds once instead of on every comparison in the loop.
    region_start = int(start)
    region_end = int(end)
    actual_count = 0

    for read in bamFH.fetch(chromosome, region_start, region_end):
        # NOTE(review): the +1 presumably converts a 0-based aligned position
        # into the 1-based coordinates of the locus bounds -- confirm.
        read_start = int(read.positions[0] + 1)
        if not region_start <= read_start <= region_end:
            continue

        name = read.qname
        if name not in DISTRIBUTION:
            actual_count += 1
            continue

        score = read.mapq
        # properly paired = mate is mapped, same chromosome
        if read.is_paired and not read.mate_is_unmapped and read.rnext == read.tid:
            score += 50

        candidates = DISTRIBUTION[name]
        # Index 3 of a stored tuple is its score; ties keep the earlier entry.
        if gene not in candidates or score > candidates[gene][3]:
            candidates[gene] = (chromosome, read_start, read.pnext + 1, score)

    return max(count - actual_count, 0)
"""
Function : iterate through the gene list with expected counts and
query the fragments within each region
"""
def resolver(bamfile, expected, pairend):
    """Compare the expected read count of every locus with the BAM content.

    Each non-blank line of ``expected`` must hold five tab-separated fields:
    name, count, chromosome, start, end. The count is doubled when
    ``pairend`` is true, rounded to an integer and passed to ``query_reads``
    together with the locus. A positive result is stored in
    EXPECTATION[name].

    Args:
        bamfile: path of the BAM file opened with ``pysam.Samfile``.
        expected: path of the tab-separated expectation file.
        pairend: when true, every expected count is multiplied by two.

    Raises:
        ValueError: if a non-blank line does not have exactly five fields
            or its count is not a number.
    """
    # Text mode so the fields are str on Python 3 as well; the context
    # manager and the finally clause release both handles even on error.
    with open(expected) as fh:
        bamFH = pysam.Samfile(bamfile, "rb")
        try:
            for line in fh:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue

                (name, count, chromosome, start, end) = line.split("\t")
                count = float(count)
                if pairend:
                    count = count * 2

                missing_count = query_reads(
                    chromosome, start, end, int(round(count)), name, bamFH)
                if missing_count > 0:
                    EXPECTATION[name] = missing_count
        finally:
            bamFH.close()
"""
Function : read in a list of multireads name
"""
def import_multiread(file):
    """Register the multiread names listed in ``file``.

    Every line is stripped of surrounding whitespace and, when non-empty,
    used as a key of DISTRIBUTION with a fresh empty dict as its value.
    A name that is already present is reset to an empty dict.

    Args:
        file: path of a text file with one read name per line.
    """
    # Text mode keeps the keys as str on Python 3, so they can be compared
    # with the read names looked up later; 'with' closes the file on error.
    with open(file) as fh:
        for line in fh:
            read_name = line.strip()
            if read_name:
                DISTRIBUTION[read_name] = {}
"""
Function : export multireads assignment
"""
def export_multiread(file):
    """Write the multireads that have exactly one candidate gene.

    For each read name in DISTRIBUTION:

    * with exactly one gene entry, a line
      ``name<TAB>chromosome<TAB>start<TAB>mate start`` is written to
      ``file``; if the gene is a key of EXPECTATION, its value is reduced
      by 1 when the stored mate start is 0 and by 2 otherwise;
    * with zero or several gene entries, the name and its entries are
      printed to standard output and nothing is written for it.

    Args:
        file: path of the output file; it is overwritten.
    """
    with open(file, 'w') as fh:
        for key in DISTRIBUTION:
            candidates = DISTRIBUTION[key]
            if len(candidates) != 1:
                print("%s %s" % (key, candidates))
                continue

            gene_name = next(iter(candidates))
            (chromosome, start, pair_start, _score) = candidates[gene_name]

            if gene_name in EXPECTATION:
                # NOTE(review): a mate start of 0 presumably marks a read
                # without a mate position (one read instead of two) -- confirm.
                if pair_start == 0:
                    EXPECTATION[gene_name] -= 1
                else:
                    EXPECTATION[gene_name] -= 2

            fh.write("%s\t%s\t%s\t%s\n" % (key, chromosome, start, pair_start))
def main(parser):
    """Run the multiread assignment for one directory.

    Parses the command line with ``parser``, then reads
    ``multireads.txt``, ``accepted_hits_sorted.bam`` and
    ``locus_expectation.txt`` from the given directory and writes
    ``multireads_assignment.txt`` into it.

    Args:
        parser: argument parser whose parsed options provide ``dir`` (the
            working directory) and ``pair`` (whether reads are paired).
    """
    options = parser.parse_args()
    directory = options.dir
    pairend = options.pair

    # os.path.join adds the separator only when it is missing and, unlike
    # indexing the last character, does not fail on an empty directory.
    bamfile = os.path.join(directory, "accepted_hits_sorted.bam")
    multiread_file = os.path.join(directory, "multireads.txt")
    expected_file = os.path.join(directory, "locus_expectation.txt")
    assignment_file = os.path.join(directory, "multireads_assignment.txt")

    import_multiread(multiread_file)
    resolver(bamfile, expected_file, pairend)
    export_multiread(assignment_file)

    print("")
    print("Writing Multiread Assignment to File : %s" % (assignment_file))
def _parse_bool(value):
    """Convert a command-line token such as 'True' or 'False' to a bool.

    ``type=bool`` cannot be used for this: argparse would call
    ``bool("False")``, which is True because the string is non-empty.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised
            true/false spelling.
    """
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays False, as it was under the former bool() call.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected True or False, got %r" % (value,))


def build_parser():
    """Return the command-line parser of the script (-d/--directory, -p/--paired)."""
    parser = argparse.ArgumentParser(prog='multiread_distributor.py')
    parser.add_argument("-d", "--directory", dest="dir", type=str,
                        help="directory of input and output files", required=True)
    parser.add_argument("-p", "--paired", dest="pair", type=_parse_bool,
                        help="are reads paired? [True|False]", required=True)
    return parser


if __name__ == "__main__":
    main(build_parser())