forked from nextstrain/fauna
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrick_upload.py
138 lines (131 loc) · 5.18 KB
/
crick_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os, re, time, datetime, csv, sys, json
from upload import upload
import rethinkdb as r
from Bio import SeqIO
import argparse
import subprocess
from parse import parse
from upload import parser
sys.path.append('') # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
# import logging
# print 'yay'
# logger = logging.getLogger()
# print 'more yay'
# Register --assay_type on the parser imported from `upload`; the value is
# forwarded to parse_crick_matrix_to_tsv, which handles 'hi' and 'fra'.
parser.add_argument('--assay_type', default='hi')
def read_crick(path, fstem, assay_type):
    '''
    Convert an Excel workbook into per-sheet CSV files and flatten each
    sheet's titer matrix into a TSV file.

    The workbook is looked up as ``path + fstem + ext`` for ext in
    ``.xls``, ``.xlsm``, ``.xlsx`` (first match wins). ``path`` is
    concatenated as-is, so it must already end with a path separator.

    path       -- directory prefix of the workbook
    fstem      -- workbook file name without extension
    assay_type -- forwarded unchanged to parse_crick_matrix_to_tsv

    Returns the list of sheet stems produced by convert_xls_to_csv; for each
    stem, ``data/tmp/<stem>.csv`` is passed to parse_crick_matrix_to_tsv.

    Exits the process with status 1 when no workbook with a recognized
    extension exists.
    '''
    # Order matters: the index of the matching extension is handed to
    # convert_xls_to_csv, which resolves it against the same list.
    extensions = ['.xls', '.xlsm', '.xlsx']
    found = [os.path.isfile(path + fstem + ext) for ext in extensions]

    if True not in found:
        # Report the reason on stderr and exit non-zero so calling scripts
        # can detect the failure.
        sys.stderr.write("Unable to find {}{} with any of the extensions {}\n".format(
            path, fstem, ", ".join(extensions)))
        print("EXITING")
        sys.exit(1)

    sheets = convert_xls_to_csv(path, fstem, found.index(True))
    for sheet in sheets:
        parse_crick_matrix_to_tsv("data/tmp/{}.csv".format(sheet), path, assay_type)
    return sheets
def convert_xls_to_csv(path, fstem, ind):
    '''
    Write every sheet of an Excel workbook to its own CSV file.

    path  -- directory prefix of the workbook (concatenated as-is)
    fstem -- workbook file name without extension
    ind   -- index into ['.xls', '.xlsm', '.xlsx'] selecting the extension

    Each sheet is written to ``data/tmp/<fstem>_<sheet name>.csv`` with every
    cell converted to text. Returns the list of ``<fstem>_<sheet name>``
    stems, in workbook order.

    Cell conversion errors propagate to the caller instead of dropping into
    a debugger, so a row can never be written with a cell silently missing.
    '''
    # NOTE(review): xlrd 2.x reportedly reads only .xls -- confirm the pinned
    # version still opens .xlsm/.xlsx workbooks.
    import xlrd

    exts = ['.xls', '.xlsm', '.xlsx']
    py2 = sys.version_info[0] < 3
    text_type = type(u'')  # unicode on Python 2, str on Python 3

    out_dir = 'data/tmp'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    workbook = xlrd.open_workbook(path + fstem + exts[ind])
    sheets = []
    for sheet in workbook.sheets():
        sheet_stem = "{}_{}".format(fstem, sheet.name)
        csv_path = 'data/tmp/{}.csv'.format(sheet_stem)
        # The csv module wants a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if py2:
            handle = open(csv_path, 'wb')
        else:
            handle = open(csv_path, 'w', newline='', encoding='utf-8')
        with handle:
            writer = csv.writer(handle)
            print(sheet.name)
            for row in range(sheet.nrows):
                cells = [text_type(cell) for cell in sheet.row_values(row)]
                if py2:
                    # Python 2's csv writer cannot handle unicode objects.
                    cells = [cell.encode('utf-8') for cell in cells]
                writer.writerow(cells)
        print("wrote new csv to {}".format(csv_path))
        sheets.append(sheet_stem)
    return sheets
def parse_crick_matrix_to_tsv(fname, original_path, assay_type):
    '''
    Flatten a titer matrix stored as CSV into a long-format TSV file.

    fname         -- path of the CSV file; its base name minus the last four
                     characters (the ".csv" suffix) names the output file
                     ``data/tmp/<name>.tsv``
    original_path -- accepted for interface compatibility; it does not
                     influence the output
    assay_type    -- "hi" or "fra"; selects where the data rows/columns
                     start and which columns hold the virus strain/passage

    One TSV line is written per (virus row, serum column) pair. Serum strain,
    serum passage and serum id are read from rows 6, 8 and 9 of the matrix.
    The ``source`` field is ``crick_<csv base name>``.

    Raises ValueError for an unrecognized assay_type, before any file is
    written.
    '''
    # assay_type -> (start_row, start_col, col_span,
    #                virus_strain_col_index, virus_passage_col_index)
    layouts = {
        "hi": (14, 6, 1, 1, 5),
        "fra": (16, 5, 2, 1, 4),
    }
    if assay_type not in layouts:
        raise ValueError("Unsupported assay_type {!r}; expected one of {}".format(
            assay_type, ", ".join(sorted(layouts))))
    (start_row, start_col, col_span,
     virus_strain_col_index, virus_passage_col_index) = layouts[assay_type]

    py2 = sys.version_info[0] < 3
    src_id = fname.split('/')[-1]

    # Python 3's csv module needs a text file opened with newline=''.
    if py2:
        infile = open(fname)
    else:
        infile = open(fname, newline='', encoding='utf-8')
    with infile:
        mat = list(csv.reader(infile))

    out_dir = 'data/tmp'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    out_path = 'data/tmp/%s.tsv' % (src_id[:-4])
    # Binary mode on Python 2 and newline='' on Python 3 both keep "\n"
    # line endings untranslated.
    if py2:
        outfile = open(out_path, 'wb')
    else:
        outfile = open(out_path, 'w', newline='', encoding='utf-8')

    with outfile:
        header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
        outfile.write("%s\n" % ("\t".join(header)))
        source = "crick_%s" % (src_id)
        for i in range(start_row, len(mat)):
            virus_strain = mat[i][virus_strain_col_index]
            virus_passage = mat[i][virus_passage_col_index]
            for j in range(start_col, len(mat[0]), col_span):
                fields = [
                    virus_strain,
                    mat[6][j],   # serum_strain
                    mat[9][j],   # serum_id
                    mat[i][j],   # titer
                    source,
                    virus_passage,
                    '',          # virus_passage_category (left blank here)
                    mat[8][j],   # serum_passage
                    '',          # serum_passage_category (left blank here)
                    assay_type,
                ]
                outfile.write("%s\n" % ("\t".join(fields)))
def determine_subtype(fname):
    '''
    Map a file/sheet name to a subtype label based on its leading characters.

    The comparison is case-insensitive. Names starting with 'h3n2',
    'h1n1pdm', 'bvic' or 'byam' yield 'h3n2', 'h1n1pdm', 'vic' or 'yam'
    respectively; anything else yields 'unknown'.
    '''
    lowered = fname.lower()
    prefix_table = (
        ('h3n2', 'h3n2'),
        ('h1n1pdm', 'h1n1pdm'),
        ('bvic', 'vic'),
        ('byam', 'yam'),
    )
    for prefix, label in prefix_table:
        if lowered.startswith(prefix):
            return label
    return 'unknown'
if __name__=="__main__":
    # Entry point: convert the workbook to per-sheet TSV files, then run
    # tdb/elife_upload.py once per sheet.
    args = parser.parse_args()
    if args.path is None:
        args.path = "data/"
    if args.database is None:
        args.database = "crick_tdb"
    if not os.path.isdir(args.path):
        os.makedirs(args.path)

    sheets = read_crick(args.path, args.fstem, args.assay_type)
    for sheet in sheets:
        # An explicit --subtype wins; otherwise derive it from the sheet name.
        subtype = args.subtype if args.subtype else determine_subtype(sheet)

        # Pass arguments as a list (no shell) so sheet names containing
        # spaces or shell metacharacters reach the uploader unmodified.
        command = [
            "python", "tdb/elife_upload.py",
            "-db", args.database,
            "--subtype", subtype,
            "--path", "data/tmp/",
            "--fstem", sheet,
        ]
        if args.preview:
            print("Subtype: {}".format(subtype))
            print("Sheet: {}".format(sheet))
            command.append("--preview")

        print(" ".join(command))
        status = subprocess.call(command)
        if status != 0:
            # Keep processing the remaining sheets, but surface the failure.
            sys.stderr.write("tdb/elife_upload.py exited with status {} for sheet {}\n".format(status, sheet))