-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_demo.py
152 lines (111 loc) · 5.85 KB
/
extract_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
from Bio import AlignIO
import sys
import os
import argparse
from datetime import datetime
#--------------------------
# argparse!
#--------------------------
parsing=argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description="""
extract_demo.py is used to extract primer amplicons or variable regions from
a single MSA of the ribosomal 16S gene sequence (rrn). Because the position of these features are based
on the E. coli genome, the coordinates refer to the position on the E. coli sequence.
Either CP016007.2543965.2545520 or CP016007.3589827.3591382
from the Silva database (v132) are considered close enough, but the first is favored.
The start and end coordinates on the reference sequence are passed through
the --excise option. The start coordinate should be smaller than the end
coordinate, but I'm using <list name>.sort() to force the order from low to high.
Coordinates are as used by UGENE. The first nucleotide of the sequence that starts
the MSA is counted as 1. Gaps are not counted in individual sequences, but
are counted in the MSA. Coordinates values are inclusive.
seq3 A T C - - C G T A -->
1 2 3 . . 4 5 6 7
seq4 - - - C G - - T A -->
. . . 1 2 . . 3 4
MSA 1 2 3 4 5 6 7 8 9 -->
Just pass the start and end values seperated by a comma:
>extract_demo.py -i infile -e 2,5
If seq3 is the E. coli reference, the MSA coordinates are determined and extracted
seq3 T C - - C G
2 3 . . 4 5
seq4 - - C G - -
. . 1 2 . .
MSA 2 3 4 5 6 7
and written to a new multi-record fasta file, without the "-" characters.
>seq3
TCCG
>seq4
CG
""", epilog="This script has no other dependencies besides BioPython")
parsing.add_argument("--infile", "-i", required=True, help="relative or absolute path to the MSA")
parsing.add_argument("--outfile", "-o", help="new filename, automatically written to ../processed_data")
parsing.add_argument("--excise", "-e", help="the start and stop coordinates on the reference E. coli sequence.")
parsing.add_argument("--verbose", "-v", action="store_true", help="spell out what is happening")
args=parsing.parse_args()
#--------------------------
# get to work
#--------------------------
runtime=datetime.now().strftime('%Y-%m-%d_%H%M')
# *** validate data args ***
# check that the infile is a clustal msa
try:
msa = AlignIO.read(args.infile, "clustal")
if args.verbose:
print("Format of infile is fine.")
except ValueError or AssertionError:
print("\n\tThe format of the file \"{}\" doesn't meet the Clustal file specifications:\n\n\t1) The first line in the file must start with the words \"CLUSTAL W\" or \"CLUSTALW\". Other information in the first line is ignored.\n\t2) One or more empty lines.\n\t3) One or more blocks of sequence data. Each block consists of:\n\t\t* One line for each sequence in the alignment. Each line consists of:\n\t\t\t* the sequence name\n\t\t\t* white space\n\t\t\t* up to 60 sequence symbols.\n\t\t\t* optional - white space followed by a cumulative count of residues for the sequences\n\t\t* A line showing the degree of conservation for the columns of the alignment in this block\n\t\t(this can be left as blank. \" \" indicates no match, but for this application the conservation isn't important)\n\t\t* One or more empty lines.\n\n\tQuitting now.".format(args.infile))
sys.exit(1)
# check that there's an E. coli rrn sequence CP016007.2543965.2545520 or CP016007.3589827.3591382
standards = ["CP016007.2543965.2545520", "CP016007.3589827.3591382"]
names=[record.id for record in msa]
if set(standards).intersection(set(names)):
if standards[0] in names:
map_reference=standards[0]
else:
map_reference=standards[1]
if args.verbose:
print("Found at least one of the standards: {}".format(set(standards).intersection(set(names))))
else:
print("\n\tThe MSA doesn't appear to have either of the E. coli rrn sequences needed for coordinates.\n\tCheck that the MSN includes one of these two sequences:\n\t\tCP016007.2543965.2545520\n\t\tCP016007.3589827.3591382\n\n\tQuitting now.")
sys.exit(1)
# new folder to hold output
final_dir="processed_data/extracted_16s_regions_{}".format(datetime.now().strftime('%Y-%m-%d'))
if not os.path.exists(final_dir):
os.makedirs(final_dir)
if args.verbose==True:
print("creating "+final_dir)
if args.outfile:
outfile="{0}/extracted_{1}_{2}.fna".format(final_dir, args.outfile, runtime)
else:
outfile="{0}/extracted_16s_{1}.fna".format(final_dir, runtime)
coords=[int(x) for x in args.excise.split(',')]
coords.sort()
# add 1 to the last value, AlignIO format is [inclusive:exclusive]
coords[-1]+=1
# make a dict out of the MSA
msa_dict={m.id:m.seq for m in msa}
# list is coord of ref seq, index is position in MSA
# probably the kind of thing that's already solved, don't care
counter=0
ref_map=list()
for x in msa_dict[map_reference]:
if x!="-":
ref_map.append(counter)
counter+=1
else:
ref_map.append(None)
# write as dictionary
extract=msa[:, ref_map.index(coords[0]):ref_map.index(coords[1])]
# I can't figure out how to do this directly through Biopython
# so I'm going to do it the dumb way:
# parse the fasta file to a dictionary while removing the gap characters
# then write the dictionary to a new fasta file
dumb={u.id:u.seq.ungap('-') for u in extract}
print("writing to {}".format(outfile))
with open(outfile, "w") as f:
for k,v in dumb.items():
blocks=[v[i:i+60] for i in range(0, len(v), 60)]
f.write(">{}\n".format(k))
for b in blocks:
f.write(str(b)+"\n")