Skip to content

Commit 38922dd

Browse files
Add the scripts I use to the repository
1 parent 01975da commit 38922dd

18 files changed

+567
-0
lines changed

PacBio-indel-script/blast2fasta.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import re
2+
def main():
    """Strip alignment gaps from the "remaining" records and write plain FASTA.

    Reads ``Wt_remaining_with_hyphen.fasta`` from the current directory,
    splits it on ``>`` into records, and for every record writes the header
    line followed by the record's fifth line (index 4) with every ``-``
    removed to ``Wt_remaining.fasta``.  Each header is echoed to stdout.

    Records with fewer than five lines carry no sequence line at index 4;
    they are reported on stdout and skipped instead of aborting the run.
    """
    with open('Wt_remaining_with_hyphen.fasta') as source:
        # Text before the first '>' is not a record, hence the [1:].
        records = source.read().split('>')[1:]

    with open('Wt_remaining.fasta', 'w') as fasta_file:
        for record in records:
            lines = record.split('\n')
            header = lines[0]
            print('>' + header + '\n')

            if len(lines) < 5:
                print('Skipping record without a sequence line: ' + header)
                continue

            ungapped = lines[4].replace('-', '')
            fasta_file.write('>' + header + '\n')
            fasta_file.write(ungapped + '\n\n')


if __name__ == "__main__":
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import re
2+
def _alignment_residues(line, label):
    """Return the residues of one BLAST alignment line.

    Removes the leading *label* (``Query`` or ``Sbjct``) as a prefix, drops
    every digit (the coordinates) and trims surrounding whitespace.  The
    label is removed as a prefix rather than with ``str.strip(label)``,
    which treats its argument as a set of characters and could also eat
    residues such as a trailing ``c`` or ``t``.
    """
    text = line.strip()
    if text.startswith(label):
        text = text[len(label):]
    return ''.join(ch for ch in text if not ch.isdigit()).strip()


def _collect_alignment(lines, label):
    """Concatenate the residues of every line from index 5 on containing *label*."""
    return ''.join(
        _alignment_residues(line, label) for line in lines[5:] if label in line
    )


def main():
    """Extract query/read alignment strings from single-hit BLAST records.

    Reads ``Wt_single_hit.blast``, splits it on ``>`` into records and, for
    each record, joins the ``Query`` alignment lines (record line index 5
    onwards) into one string.  Records whose query string does not contain
    the target site (the ``CCACTGCATCCTGGGGA`` motif, with ``-`` allowed
    between letters) are dropped.  For the others the ``Sbjct`` lines are
    joined the same way and a five-line record is written to
    ``Wt_single_hit_with_query_and_hyphen.fasta``.

    The header of every record, and the read string and its length for
    every kept record, are echoed to stdout.
    """
    # Compiled once; the pattern does not depend on the record.
    target_site = re.compile('C-*C-*A-*C-*T-*G-*C-*A-*T-*C-*C-*T-*G-*G-*G-*G-*A')

    with open('Wt_single_hit.blast') as source:
        records = source.read().split('>')[1:]

    with open('Wt_single_hit_with_query_and_hyphen.fasta', 'w') as out_file:
        for record in records:
            lines = record.split('\n')
            header = lines[0]
            print('>' + header + '\n')

            queryparagraph = _collect_alignment(lines, 'Query')
            if target_site.search(queryparagraph) is None:
                continue

            subparagraph = _collect_alignment(lines, 'Sbjct')
            print(subparagraph)
            print(len(subparagraph))

            # The label lines are kept byte-for-byte (including the
            # "Sequencen" spelling): the scripts that read this file
            # address record lines by position.
            out_file.write('>' + header + '\n')
            out_file.write('Query Sequence \n')
            out_file.write(queryparagraph + '\n')
            out_file.write('Reads Sequencen \n')
            out_file.write(subparagraph + '\n\n')


if __name__ == "__main__":
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import re
2+
def _gap_run(sequence, position):
    """Return ``(start, length)`` of the run of ``-`` that covers *position*.

    *position* must index a ``-`` in *sequence*.  The scan runs all the way
    down to index 0, so a run that begins at the first character is
    measured in full.
    """
    start = position
    while start > 0 and sequence[start - 1] == '-':
        start -= 1
    stop = position + 1
    while stop < len(sequence) and sequence[stop] == '-':
        stop += 1
    return start, stop - start


def main():
    """Measure read deletions that overlap the core of the target site.

    Reads ``Wt_remaining_with_hyphen.fasta`` and splits it on ``>``.  In
    each record, line index 2 is searched for the target site (the
    ``CCACTGCATCCTGGGGA`` motif with ``-`` allowed between letters) and
    line index 4 is the string inspected for gaps; both are assumed to be
    aligned column by column (NOTE(review): confirm against the script that
    writes this file).

    Six overlapping four-letter windows of the site (``TCCT`` .. ``GGGA``)
    give six core columns.  For each core column that holds ``-`` in line
    4, the whole run of ``-`` around it is measured; runs are keyed by
    their start column so a run touched by several core columns counts
    once.  For every record containing the site, the summed run length and
    line 4 are written to ``Wt_del_count_file.fasta``.

    Records with fewer than five lines, or without the site, produce no
    output.
    """
    target_site = re.compile('C-*C-*A-*C-*T-*G-*C-*A-*T-*C-*C-*T-*G-*G-*G-*G-*A')
    core_windows = [
        re.compile(window)
        for window in (
            'T-*C-*C-*T',
            'C-*C-*T-*G',
            'C-*T-*G-*G',
            'T-*G-*G-*G',
            'G-*G-*G-*G',
            'G-*G-*G-*A',
        )
    ]

    with open('Wt_remaining_with_hyphen.fasta') as source:
        records = source.read().split('>')[1:]

    with open('Wt_del_count_file.fasta', 'w') as del_count_file:
        for record in records:
            lines = record.split('\n')
            if len(lines) < 5:
                continue
            header, query, subject = lines[0], lines[2], lines[4]

            site = target_site.search(query)
            if site is None:
                continue

            # Every window is a sub-pattern of the matched site, so each
            # search below is guaranteed to succeed.
            target = site.group()
            core_positions = [
                site.start() + window.search(target).start()
                for window in core_windows
            ]

            start_length_dictionary = {}
            for position in core_positions:
                # The bounds check guards against line 4 being shorter
                # than line 2.
                if position < len(subject) and subject[position] == '-':
                    print('There is an delection========================')
                    run_start, run_length = _gap_run(subject, position)
                    start_length_dictionary[run_start] = run_length

            print(start_length_dictionary)
            total_del_length = str(sum(start_length_dictionary.values()))
            print('The total indel length is : ' + total_del_length)

            del_count_file.write('>' + header + '\n')
            del_count_file.write(
                'The total delection length is : \n' + total_del_length + '\n'
            )
            del_count_file.write('The sequence is : \n' + subject + '\n\n')


if __name__ == "__main__":
    main()
+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/python
2+
3+
import re
4+
5+
def main():
    """Tabulate how many records share each gap length.

    Reads ``Wt_Specific_del_file.fasta``, splits it on ``>`` and parses the
    third line (index 2) of every record as an integer gap length.  Each
    length is echoed to stdout, the final tally is printed as a dict, and
    one ``length<TAB>count`` line per distinct length is written to
    ``Wt_specific_gap_length_and_count.txt`` in order of first appearance.

    Raises:
        ValueError: if a record has no third line or it is not an integer;
            the message names the offending record header.
    """
    from collections import Counter

    with open('Wt_Specific_del_file.fasta') as source:
        records = source.read().split('>')[1:]

    accumulated_for_each_lenth = Counter()
    for record in records:
        lines = record.split('\n')
        try:
            gap_length = int(lines[2].strip())
        except (IndexError, ValueError):
            raise ValueError(
                'record %r has no integer gap length on its third line'
                % lines[0]
            )
        print(gap_length)
        accumulated_for_each_lenth[gap_length] += 1

    # Printed as a plain dict so the console output keeps its "{...}" form.
    print(dict(accumulated_for_each_lenth))

    with open('Wt_specific_gap_length_and_count.txt', 'w') as outfile:
        for length, count in accumulated_for_each_lenth.items():
            outfile.write(str(length) + '\t' + str(count) + '\n')


if __name__ == "__main__":
    main()
33+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/python
2+
3+
import re
4+
5+
def main():
    """Tabulate how many records share each insert length.

    Reads ``Wt_insert.fasta_with_hyphen``, splits it on ``>`` and parses
    the fifth line (index 4) of every record as an integer insert length.
    Each length is echoed to stdout, the final tally is printed as a dict,
    and one ``length<TAB>count`` line per distinct length is written to
    ``Wt_specific_insert_length_and_count.txt`` in order of first
    appearance.

    Raises:
        ValueError: if a record has no fifth line or it is not an integer;
            the message names the offending record header.
    """
    from collections import Counter

    with open('Wt_insert.fasta_with_hyphen') as source:
        records = source.read().split('>')[1:]

    accumulated_for_each_lenth = Counter()
    for record in records:
        lines = record.split('\n')
        try:
            gap_length = int(lines[4].strip())
        except (IndexError, ValueError):
            raise ValueError(
                'record %r has no integer insert length on its fifth line'
                % lines[0]
            )
        print(gap_length)
        accumulated_for_each_lenth[gap_length] += 1

    # Printed as a plain dict so the console output keeps its "{...}" form.
    print(dict(accumulated_for_each_lenth))

    with open('Wt_specific_insert_length_and_count.txt', 'w') as outfile:
        for length, count in accumulated_for_each_lenth.items():
            outfile.write(str(length) + '\t' + str(count) + '\n')


if __name__ == "__main__":
    main()
32+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import re
2+
def main():
    """Sort records into "insert", "remaining" and "other" files.

    Reads ``Wt_single_hit_with_query_and_hyphen.fasta`` and splits it on
    ``>``.  In each record, line index 2 is searched for the left flank
    (``CCACTGCA``) and the right flank (``CCTGGGGA``), both with ``-``
    allowed between letters.  The ``-`` characters between the end of the
    left flank and the start of the right flank are counted:

    * one or more -> a summary (1-based flank coordinates, the count, and
      line index 4 of the record) is written to
      ``Wt_insert.fasta_with_hyphen``;
    * none -> the record is copied unchanged to
      ``Wt_remaining_with_hyphen.fasta``;
    * the record cannot be classified (fewer than five lines, or a flank is
      missing) -> it is copied unchanged to ``Wt_other_with_hyphen.fasta``.

    NOTE(review): in the original the "other" branch hung off the
    ``>= 1`` / ``== 0`` chain and could never run, so unclassifiable
    records were silently dropped.  Sending them to the "other" file is
    the presumed intent - confirm.

    The hyphen count of every classified record is echoed to stdout.
    """
    left_pattern = re.compile('C-*C-*A-*C-*T-*G-*C-*A')
    right_pattern = re.compile('C-*C-*T-*G-*G-*G-*G-*A')

    with open('Wt_single_hit_with_query_and_hyphen.fasta') as source:
        records = source.read().split('>')[1:]

    with open('Wt_insert.fasta_with_hyphen', 'w') as insert_file, \
            open('Wt_remaining_with_hyphen.fasta', 'w') as remaining_file, \
            open('Wt_other_with_hyphen.fasta', 'w') as other_file:
        for record in records:
            lines = record.split('\n')
            header = lines[0]

            left = right = None
            if len(lines) >= 5:
                left = left_pattern.search(lines[2])
                right = right_pattern.search(lines[2])
            if left is None or right is None:
                other_file.write('>' + record)
                continue

            # Index of the last character of the left flank, and of the
            # first character of the right flank (both 0-based).
            left_end = left.end() - 1
            downstream_position = right.start()

            between_string = lines[2][left_end + 1:downstream_position]
            hyphen_count = between_string.count('-')
            print(hyphen_count)

            if hyphen_count >= 1:
                insert_file.write('>' + header + '\n')
                insert_file.write(
                    'The upstream ends at : ' + str(left_end + 1) + '\n'
                )
                insert_file.write(
                    'The downstream begins at : '
                    + str(downstream_position + 1) + '\n'
                )
                insert_file.write('The insert length is :\n')
                insert_file.write(str(hyphen_count) + '\n')
                insert_file.write(
                    'The whole sequence is : \n' + lines[4] + '\n\n'
                )
            else:
                remaining_file.write('>' + record)


if __name__ == "__main__":
    main()

PacBio-indel-script/keep_one_hit.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import re
2+
def main():
    """Keep only the first alignment of every BLAST record.

    Reads ``Wt_blast_pure.blast`` and splits it on ``>``.  Within each
    record, lines from index 5 onwards are scanned for the text
    ``Identities``.  If a line at index ``k`` contains it, the record is
    cut to its first ``k - 1`` lines, i.e. everything before the line that
    precedes that ``Identities`` line.  NOTE(review): presumably this is a
    second hit and the preceding line is its ``Score`` line - confirm
    against real input.  Records with no such line are copied unchanged.
    Output goes to ``Wt_single_hit.blast``.

    A truncated record always ends with a newline so that the next ``>``
    starts on its own line.
    """
    with open('Wt_blast_pure.blast') as source:
        records = source.read().split('>')[1:]

    with open('Wt_single_hit.blast', 'w') as single_hit_file:
        for record in records:
            lines = record.split('\n')
            second_hit = next(
                (k for k in range(5, len(lines)) if 'Identities' in lines[k]),
                None,
            )
            if second_hit is None:
                single_hit_file.write('>' + record)
                continue

            kept = '\n'.join(lines[:second_hit - 1])
            if not kept.endswith('\n'):
                kept += '\n'
            single_hit_file.write('>' + kept)


if __name__ == "__main__":
    main()
+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from Bio import SeqIO
2+
from Bio.SeqUtils.CheckSum import seguid
3+
4+
def remove_dup_seqs(records):
    """Yield the records of *records*, skipping repeated sequences.

    A record counts as a repeat when the ``seguid`` checksum of its
    ``seq`` has already been seen earlier in the iteration; the first
    record with a given checksum is kept.  The ``id`` of every skipped
    record is reported on stdout.

    Args:
        records: iterable of objects exposing ``seq`` and ``id``
            (presumably Biopython ``SeqRecord`` objects, as produced by
            ``SeqIO.parse``).

    Yields:
        The records whose sequence checksum has not been seen before, in
        input order.
    """
    checksums = set()
    for record in records:
        checksum = seguid(record.seq)
        if checksum in checksums:
            # Call form of print: valid on Python 2 and Python 3 alike.
            print("Ignoring %s" % record.id)
            continue
        checksums.add(checksum)
        yield record
14+
15+
#records = remove_dup_seqs(SeqIO.parse("Sg2.fasta", "fasta"))
16+
#count = SeqIO.write(records, "Sg2_no_repeated_seq.fasta", "fasta")
17+
#records = remove_dup_seqs(SeqIO.parse("Sg5.fasta", "fasta"))
18+
#count = SeqIO.write(records, "Sg5_no_repeated_seq.fasta", "fasta")
19+
# De-duplicate Wt.fasta into Wt_no_repeat.fasta and report how many records
# SeqIO.write says it saved.
records = remove_dup_seqs(SeqIO.parse("Wt.fasta", "fasta"))
count = SeqIO.write(records, "Wt_no_repeat.fasta", "fasta")
# Call form of print: valid on Python 2 and Python 3 alike.
print("Saved %i records" % count)
22+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import re
2+
3+
4+
def main():
    """Split ``reads_of_insert.fastq`` into one file per barcode.

    Every line of the input except the first is tested against each
    barcode ``k``:

    * if the line starts with ``barcodelist[k]`` (case-insensitive), or
    * otherwise, if its last eight characters (newline excluded) contain
      ``reverse_complementary_barcodelist[k]`` (case-insensitive),

    then the four-line group starting one line above the match is written
    to ``<barcode>_.fa`` and blanked out of the remainder.  The matched
    line is assumed to be the sequence line of a four-line FASTQ record
    (NOTE(review): header, ``+`` and quality lines are tested too, exactly
    as before - confirm this is acceptable for the data).  Whatever is
    left, with the extracted lines replaced by empty lines, is written to
    ``remain_reads.fastq``.  Per-barcode match counts are printed at the
    end.

    The first line is skipped because it has no header line above it;
    matching it would require a slice with a negative start, which
    previously corrupted the remainder list.
    """
    barcodelist = ['gacttcag', 'gcttcaga', 'attcaggc', 'tggactca', 'agactctg', 'gactctag']
    # NOTE(review): entry k below is NOT the reverse complement of
    # barcodelist[k]; e.g. 'tgagtcca' is the reverse complement of
    # 'tggactca' (barcodelist[3]).  Left untouched because this may encode
    # a deliberate barcode pairing - confirm with the experiment design.
    reverse_complementary_barcodelist = ['tgagtcca', 'cagagtct', 'ctagagtc', 'ctgaagtc', 'tctgaagc', 'gcctgaat']
    barcode_num = len(barcodelist)

    with open('reads_of_insert.fastq') as sourcefile:
        lines = sourcefile.readlines()
    total_lines = len(lines)
    remainlines = lines[:]

    # Compiled once instead of once per line and barcode.
    start_patterns = [re.compile(code, re.IGNORECASE) for code in barcodelist]
    end_patterns = [
        re.compile(code, re.IGNORECASE)
        for code in reverse_complementary_barcodelist
    ]

    num_bar_code = [0] * barcode_num
    num_RC_bar_code_without_bar_code = [0] * barcode_num

    outputs = []
    try:
        for barcode in barcodelist:
            outputs.append(open(barcode + '_' + '.fa', 'w'))

        for i in range(1, total_lines):
            line = lines[i]
            # Last eight characters before the newline; also correct for a
            # final line that has no trailing newline.
            ends = line.rstrip('\n')[-8:]
            record_stop = min(i + 3, total_lines)

            for k in range(barcode_num):
                if start_patterns[k].match(line):
                    num_bar_code[k] += 1
                elif end_patterns[k].search(ends):
                    num_RC_bar_code_without_bar_code[k] += 1
                else:
                    continue

                outputs[k].writelines(lines[i - 1:record_stop])
                # Blank in place so the remainder keeps its line count
                # even when the record is cut short by end of file.
                for index in range(i - 1, record_stop):
                    remainlines[index] = "\n"
    finally:
        for handle in outputs:
            handle.close()

    with open('remain_reads.fastq', 'w') as remain_reads_file:
        remain_reads_file.writelines(remainlines)

    for k in range(barcode_num):
        print("The barcode----" + barcodelist[k] + " has total begin matching times: " + str(num_bar_code[k]))
        print("The reverse complementary barcode----" + barcodelist[k] + " has total end matching times (without begin matching) : " + str(num_RC_bar_code_without_bar_code[k]))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)