-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathcorrect.py
executable file
·111 lines (93 loc) · 3.34 KB
/
correct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2
import sys
#reads input assembly, breakpoints given by the method and outputs new contig file with lengths
#offsets bed file as well
def parse_fasta(fh):
fa = {}
current_short_name = None
# Part 1: compile list of lines per sequence
for ln in fh:
if ln[0] == '>':
# new name line; remember current sequence's short name
long_name = ln[1:].rstrip()
current_short_name = long_name.split()[0]
fa[current_short_name] = []
else:
# append nucleotides to current sequence
fa[current_short_name].append(ln.rstrip())
# Part 2: join lists into strings
for short_name, nuc_list in fa.iteritems():
# join this sequence's lines into one long string
fa[short_name] = ''.join(nuc_list)
return fa
#read fasta first
input_seqs = parse_fasta(open(sys.argv[1],'r'))
#read breakpoints
contig2breakpoints = {}
with open(sys.argv[2],'r') as f:
for line in f:
attrs = line.split()
contig2breakpoints[attrs[0]] = int(attrs[1])
#first breake the input assembly and store the mapping of old to new names of contigs
contig2new = {}
contig2newseq = {}
contig_id = 1
for seq in input_seqs:
if seq not in contig2breakpoints:
contig2new[seq] = seq
contig2newseq[seq] = input_seqs[seq]
contig_id += 1
else:
first = input_seqs[seq][:contig2breakpoints[seq]]
second = input_seqs[seq][contig2breakpoints[seq]:]
first_id = seq+'_1'
contig_id += 1
second_id = seq+'_2'
contig_id += 1
contig2new[seq] = [first_id,second_id]
contig2newseq[first_id] = first
contig2newseq[second_id] = second
#now update the bed file in streaming fashion
oline = ""
count = 0
ofile = open(sys.argv[4]+'/alignment_iteration_1.tmp.bed','w')
with open(sys.argv[3],'r') as f:
for line in f:
attrs = line.split()
if attrs[0] not in contig2breakpoints:
oline += str(contig2new[attrs[0]] +'\t'+attrs[1]+'\t'+attrs[2]+'\t'+attrs[3]+'\n')
count += 1
else:
pos1 = int(attrs[1])
pos2 = int(attrs[2])
breakpoint_loc = contig2breakpoints[attrs[0]]
if pos1 < breakpoint_loc and pos2 > breakpoint_loc:
continue
else:
if pos2 < breakpoint_loc:
oline += str(contig2new[attrs[0]][0]+'\t'+attrs[1]+'\t'+attrs[2]+'\t'+attrs[3]+'\n')
count += 1
else:
new_start = pos1 - breakpoint_loc
new_end = pos2 - breakpoint_loc
oline += str(contig2new[attrs[0]][1]+'\t'+str(new_start)+'\t'+str(new_end)+'\t'+attrs[3]+'\n')
count += 1
if count >= 1000000:
ofile.write(oline)
oline = ""
count = 0
ofile.close()
#write fasta file
ofasta = open(sys.argv[4]+'/asm.cleaned.fasta','w')
for seq in contig2newseq:
contig_seq = contig2newseq[seq]
chunks = [contig_seq[i:i+80] for i in xrange(0,len(contig_seq),80)]
ofasta.write('>'+seq+'\n')
for chunk in chunks:
ofasta.write(chunk+'\n')
ofasta.close()
#write lengths
olens = open(sys.argv[4]+'/scaffold_length_iteration_1','w')
for seq in contig2newseq:
olens.write(seq+'\t'+str(len(contig2newseq[seq]))+'\n')
olens.close()