-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patherror_reads.py
61 lines (52 loc) · 1.73 KB
/
error_reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
def find_errors(file_name):
    """Print single-base corrections for erroneous reads in a FASTA file.

    Every non-header, non-blank line of the file is treated as one read.
    A read and its reverse complement are counted together: a read seen
    twice or more (in either orientation) is considered correct, and a read
    seen exactly once is considered erroneous. For each erroneous read, the
    first correct read (tried as-is, then as its reverse complement) that
    differs from it at exactly one position is reported on its own line as
    ``wrong->right``.

    Args:
        file_name: Path to a text file in FASTA format, one read per line.

    Returns:
        None. The corrections are written to standard output.
    """
    read_counts = {}
    # The context manager closes the handle even if parsing raises.
    with open(file_name, 'r') as handle:
        for line in handle:
            read = line.strip()
            # Skip headers and blank lines; an empty "read" would otherwise
            # be counted as a sequence and later break the distance check.
            if not read or read.startswith('>'):
                continue
            if read in read_counts:
                read_counts[read] += 1
                continue
            complement = revComp(read)
            if complement in read_counts:
                read_counts[complement] += 1
            else:
                read_counts[read] = 1

    correct_reads = [seq for seq, count in read_counts.items() if count > 1]
    wrong_reads = [seq for seq, count in read_counts.items() if count == 1]

    # Each correct read is tried first as-is and then as its reverse
    # complement; building this list once avoids recomputing the
    # complements for every erroneous read.
    candidates = []
    for seq in correct_reads:
        candidates.append(seq)
        candidates.append(revComp(seq))

    answers = []
    for wrong in wrong_reads:
        for candidate in candidates:
            # Only equal-length reads can be one substitution apart.
            if len(candidate) == len(wrong) and hamming_one(wrong, candidate):
                answers.append(wrong + '->' + candidate)
                break

    for answer in answers:
        print(answer)
def revComp(dna):
    """Return the reverse complement of the DNA sequence ``dna``.

    The sequence is walked from its last base to its first and each base is
    swapped for its partner (A<->T, C<->G). A base outside ``ATCG`` raises
    ``KeyError``.
    """
    partner = {base: mate for base, mate in zip("ATCG", "TAGC")}
    return ''.join(partner[base] for base in reversed(dna))
def hamming_one(str1, str2):
    """Return True when ``str1`` and ``str2`` differ at exactly one position.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        True if the strings have equal length and exactly one mismatching
        position; False if they have two or more mismatches or differ in
        length.

    Raises:
        ValueError: If the strings are identical (zero mismatches).
    """
    # Strings of unequal length cannot be exactly one substitution apart.
    if len(str1) != len(str2):
        return False

    mismatches = 0
    for left, right in zip(str1, str2):
        if left != right:
            mismatches += 1
            # Stop at the second mismatch; the rest cannot change the answer.
            if mismatches > 1:
                return False

    # Explicit check rather than ``assert`` so it still runs under ``-O``.
    if mismatches == 0:
        raise ValueError("Hamming =" + str(mismatches) + "strings: " + str1 + ' ' + str2)
    return True