-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcufflink_evaluator.py
140 lines (107 loc) · 3.48 KB
/
cufflink_evaluator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
""" cufflink_evaluator.py
Usage: python cufflink_evaluator.py -d cufflink_directory -a expected_count_file_A -b expected_count_file_B
Input: -d input/output directory (select_pseudogene_128_v2/10X_100L_1A_30X_100L_1A)
-a file of expected fragment counts in sample A (one "name \t count" pair per line)
-b file of expected fragment counts in sample B (one "name \t count" pair per line)
Output: cufflink_evaluation.txt in the given directory; each line combines the cufflink output with the expected counts {XLOC \t GENENAME \t OVERLAP \t FPKM_A \t FPKM_B \t COUNT_A \t COUNT_B \t EXPECTED \t EXPECTED_A \t EXPECTED_B}
Function: 1. combine the information from cufflink result with expected data
Date: 2014-03-30
Author: Chelsea Ju
"""
# XLOC id -> list of alternative gene ids; filled by build_name()
XLOC = {}
# name -> two-element list of expected counts (kept as strings), indexed by
# the `order` argument of build_expectation(); unfilled slots hold "0"
EXPECTATION = {}
import sys, re, os, subprocess, random, argparse
"""
Function : build the XLOC id to alternative gene ids lookup
"""
def build_name(file):
    """Load the XLOC id -> alternative gene ids table into the global XLOC.

    Every non-blank line of `file` is split on tabs: the first column is
    used as the key and the fourth column, split on commas, as the value.
    Blank lines are skipped.

    Args:
        file: path to the tab-separated gene file.

    Raises:
        IndexError: if a non-blank line has fewer than four columns.
    """
    # Text mode keeps lines as str on both Python 2 and 3; the "with" block
    # closes the handle even when a malformed line raises.
    with open(file, 'r') as fh:
        for line in fh:
            line = line.rstrip()
            if not line:
                # tolerate empty lines (e.g. a trailing newline at end of file)
                continue
            data = line.split("\t")
            XLOC[data[0]] = data[3].split(",")
"""
Function : read in expected count for each gene
"""
def build_expectation(file, order):
    """Record the expected counts found in `file` in the global EXPECTATION.

    Every non-blank line must be "name<TAB>count". The count is stored, as
    the string read from the file, at index `order` of the two-element list
    kept for that name; a slot that has not been filled holds "0".
    Blank lines are skipped.

    Args:
        file: path to the tab-separated expected-count file.
        order: index of the slot to fill (main() uses 0 for sample A and
            1 for sample B).

    Raises:
        ValueError: if a non-blank line does not have exactly two columns.
    """
    # Text mode keeps lines as str on both Python 2 and 3; the "with" block
    # closes the handle even when a malformed line raises.
    with open(file, 'r') as fh:
        for line in fh:
            line = line.rstrip()
            if not line:
                continue
            name, count = line.split("\t")
            # setdefault creates the ["0", "0"] placeholder the first time a
            # name is seen, replacing the Python-2-only has_key() branch
            EXPECTATION.setdefault(name, ["0", "0"])[order] = count
def combine_expectation_observation(result):
    """Join every row of the result file with its expected counts.

    The first line of `result` is treated as a header and discarded. Each
    remaining non-blank line is split on tabs and extended with three
    fields: the matched name and its two EXPECTATION entries. The match is
    the row's second column if it is a key of EXPECTATION; otherwise the
    first entry of XLOC[first column] that is a key of EXPECTATION. Rows
    without a match are extended with "-", "0", "0".

    After the file rows, one placeholder row is appended for every
    EXPECTATION key that was never matched.

    Args:
        result: path to the tab-separated result file.

    Returns:
        A list of rows, each a list of strings.

    Raises:
        IndexError: if a non-blank data line has fewer than two columns.
    """
    evaluation_data = []
    seen_ids = set()

    # Text mode keeps lines as str on both Python 2 and 3; the "with" block
    # closes the handle even when a malformed line raises.
    with open(result, 'r') as fh:
        fh.readline()  # discard the header line
        for line in fh:
            line = line.rstrip()
            if not line:
                continue
            data = line.split("\t")
            xid = data[0]
            xgene = data[1]

            # the row's own name takes priority, then the alternative ids
            # registered for its XLOC id, in their stored order
            candidates = [xgene] + XLOC.get(xid, [])
            match = next((g for g in candidates if g in EXPECTATION), None)

            if match is None:
                data = data + ["-", "0", "0"]
            else:
                data = data + [match] + EXPECTATION[match]
                seen_ids.add(match)
            evaluation_data.append(data)

    # expected names that never showed up in the result file
    for name in EXPECTATION:
        if name not in seen_ids:
            evaluation_data.append(
                ["-", name, "-", "-", "-", "-", "-", name] + EXPECTATION[name])

    return evaluation_data
def export_data(result, file):
    """Write the evaluation rows to `file` as a tab-separated table.

    A fixed header line is written first, then one line per row with the
    row's fields joined by tabs.

    Args:
        result: iterable of rows; each row is an iterable of fields. Fields
            are converted with str(), so non-string values are accepted.
        file: path of the output file (overwritten if it exists).
    """
    # "with" guarantees the file is flushed and closed even if a write fails
    with open(file, 'w') as fh:
        fh.write("XLOC\tGENENAME\tOVERLAP\tFPKM_A\tFPKM_B\tCOUNT_A\tCOUNT_B\tEXPECTED\tEXPECTED_A\tEXPECTED_B\n")
        for row in result:
            fh.write("\t".join(str(field) for field in row))
            fh.write("\n")
def main(parser):
    """Parse the command line and run the evaluation pipeline.

    Reads cufflink_genes.txt and cufflink_results.txt from the directory
    given by the `dir` option, loads the two expected-count files given by
    the `fileA` and `fileB` options, and writes the combined table to
    cufflink_evaluation.txt in the same directory.

    Args:
        parser: an argparse.ArgumentParser whose parsed options provide
            `dir`, `fileA` and `fileB`.
    """
    options = parser.parse_args()
    directory = options.dir
    file_A = options.fileA
    file_B = options.fileB

    # os.path.join adds the separator only when needed and, unlike indexing
    # the last character, does not fail on an empty directory string
    name_file = os.path.join(directory, "cufflink_genes.txt")
    result = os.path.join(directory, "cufflink_results.txt")
    outfile = os.path.join(directory, "cufflink_evaluation.txt")

    build_name(name_file)
    build_expectation(file_A, 0)
    build_expectation(file_B, 1)

    evaluation = combine_expectation_observation(result)
    export_data(evaluation, outfile)

    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("")
    print("Writing Gene List to File : %s" % (outfile))
if __name__ == "__main__":
    # Every option is a required string; only the flags, the destination
    # attribute and the help text differ, so they are listed as data.
    option_specs = (
        ("-d", "--directory", "dir", "cufflink directory of input and output files"),
        ("-a", "--sampleA", "fileA", "expected number of count from sample A"),
        ("-b", "--sampleB", "fileB", "expected number of count from sample B"),
    )
    parser = argparse.ArgumentParser(prog='cufflink_evaluator.py')
    for short_flag, long_flag, dest_name, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, dest=dest_name, type=str, help=help_text, required=True)
    main(parser)