-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextract_align.py
executable file
·226 lines (201 loc) · 11.3 KB
/
extract_align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import argparse
import shutil
import re
import os
import subprocess
from Bio import SeqIO
import pandas as pd
import numpy as np
from pybedtools import BedTool
from pyfaidx import Fasta
import logging
import time
import datetime
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this whole module.
pd.options.mode.chained_assignment = None # default='warn'
# Module-level logger; its handlers and level are configured in main().
LOGGER = logging.getLogger(__name__)
## NOTE regarding the SOFTWARE path and dependencies. The path to several packages is hardcoded
## into this script.
## You will need to modify the SOFTWARE path in the MUSCLE and CONSENSUSGEN functions to match your system.
## Non-python dependencies for this script are:
## Muscle - https://www.drive5.com/muscle/manual/index.html
## Trimal - http://trimal.cgenomics.org/
## EMBOSS - http://emboss.sourceforge.net/
## These should be installed in your SOFTWARE directory and tweaks may be needed to the paths in
## the MUSCLE and CONSENSUSGEN functions depending on your installation.
## Set up input arguments
def get_args():
    """Parse the command line and return the run settings as a 12-tuple.

    The tuple holds, in order: genome fasta path, blast file path, TE library
    path, left buffer, right buffer, number of hits, align flag, trimal flag,
    emboss flag, maxiters2 flag, log level and keep-beds flag. The y/n flags
    and the log level are returned exactly as typed; nothing is validated here.
    """
    PARSER = argparse.ArgumentParser(
        description="Will process a blast output generated using a file of putative TEs (usually generated by RepeatModeler. For each putative consensus in the input putative TE library, it will generate an aligned file with N buffered instances from the queried genome, the input consensus, and, if requested, a new revised and extended consensus for inspection.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required inputs
    PARSER.add_argument(
        '-g', '--genome_fasta', type=str, required=True,
        help='Name of the fasta formatted genome to be queried.',
    )
    PARSER.add_argument(
        '-b', '--blastfile', type=str, required=True,
        help='Blast output to be used. Must be formatted using "outfmt 6".',
    )
    PARSER.add_argument(
        '-l', '--library', type=str, required=True,
        help='Library of putative TE consensus sequences to be extracted and aligned. Must be in fasta format with no # or / in the headers.',
    )

    # Extraction settings
    PARSER.add_argument(
        '-lb', '--leftbuffer', type=int, default=1000,
        help='Left buffer size. The number of bp of flanking sequence for each hit to be extracted along with the hit. Optional, Default = 1000',
    )
    PARSER.add_argument(
        '-rb', '--rightbuffer', type=int, default=1000,
        help='Right beffer size. The number of bp of flanking sequence for each hit to be extracted along with the hit. Optional, Default = 1000',
    )
    PARSER.add_argument(
        '-n', '--hitnumber', type=int, default=50,
        help='The number of hits to be exracted. Optional. Default = 50.',
    )

    # y/n switches for the optional processing steps
    PARSER.add_argument(
        '-a', '--align', type=str, default='y',
        help='Align the output fasta file, y or n?. Default is y.',
    )
    PARSER.add_argument(
        '-t', '--trimal', type=str, default='y',
        help='Use trimal to remove low-aligning regions, y or n? Trimal can sometimes encounter an error that prevents it from working, this results in an empty file in downstream analyses. Default is y.',
    )
    PARSER.add_argument(
        '-e', '--emboss', type=str, default='y',
        help='Generate a trimal/emboss consensus, y or n. Optional. Default=y.',
    )
    PARSER.add_argument(
        '-m', '--maxiters2', type=str, default='n',
        help='Limit muscle iterations to 2? y or n. Optional. Default=n.',
    )

    # Housekeeping
    PARSER.add_argument('-log', '--log_level', default='INFO')
    PARSER.add_argument(
        '-beds', '--keep_beds', type=str, default='n',
        help='Keep the bed files used for extractions. y or n. Optional. Default=n.',
    )

    OPTS = PARSER.parse_args()
    return (
        OPTS.genome_fasta,
        OPTS.blastfile,
        OPTS.library,
        OPTS.leftbuffer,
        OPTS.rightbuffer,
        OPTS.hitnumber,
        OPTS.align,
        OPTS.trimal,
        OPTS.emboss,
        OPTS.maxiters2,
        OPTS.log_level,
        OPTS.keep_beds,
    )
## Create TE outfiles function. Creates files for populating with blast hits.
def CREATE_TE_OUTFILES(LIBRARY):
    """Write each record of the fasta file LIBRARY to its own file in tmpTEfiles/.

    In every record id '#' is replaced by '__' and '/' by '___'. The cleaned
    id names the output file (tmpTEfiles/<id>.fa) and, prefixed with
    'CONSENSUS-', becomes the id of the record written there. The record
    description is blanked before writing.
    """
    for ENTRY in SeqIO.parse(LIBRARY, 'fasta'):
        # Plain substring replacement, applied in the same order: '#' first, then '/'.
        CLEANID = ENTRY.id.replace('#', '__').replace('/', '___')
        ENTRY.description = ''
        ENTRY.id = 'CONSENSUS-' + CLEANID
        OUTPATH = 'tmpTEfiles/' + CLEANID + '.fa'
        SeqIO.write(ENTRY, OUTPATH, 'fasta')
## Organize blast hits function. Will read in blast file, sort based on e-value and bitscore, determine top HITNUM hits for extraction, extract, and combine with TE file from previous function.
def _CONCATENATE_FILES(SOURCES, DESTINATION):
    """Write the contents of every existing file in SOURCES, in order, to DESTINATION."""
    with open(DESTINATION, 'wb') as OUTHANDLE:
        for SOURCE in SOURCES:
            try:
                with open(SOURCE, 'rb') as INHANDLE:
                    shutil.copyfileobj(INHANDLE, OUTHANDLE)
            except FileNotFoundError:
                # Same end result as `cat` given a missing operand: complain,
                # keep going, and still produce the output from what exists.
                LOGGER.warning('Cannot find ' + SOURCE + '; it was left out of ' + DESTINATION)


def EXTRACT_BLAST_HITS(GENOME, BLAST, LBUFFER, RBUFFER, HITNUM, BEDS):
    """Extract the best buffered hits for each blast query and pair them with its consensus.

    For every distinct query name in BLAST the HITNUM best rows (lowest
    e-value, ties broken by highest bitscore) are written to
    tmpbedfiles/<query>.bed, widened with bedtools slop, and pulled from
    GENOME in a strand-aware way into extracts/<query>.fa. That file followed
    by tmpTEfiles/<query>.fa is then written to catTEfiles/<query>.fa.

    Parameters:
        GENOME: path of the genome fasta. The bedtools genome file is expected
            at <GENOME without its extension>.fai.
        BLAST: path of a tab-separated, 12-column blast table.
        LBUFFER, RBUFFER: bp of flank added to the left/right of each hit.
        HITNUM: maximum number of hits kept per query.
        BEDS: 'n' deletes each query's .bed file once it has been used.

    The directories tmpbedfiles/, extracts/, tmpTEfiles/ and catTEfiles/ must
    already exist in the working directory.
    """
    ##Read in blast data. Names are read as text so that numeric-looking
    ##query/scaffold names still work when building file paths below.
    BLASTDF = pd.read_csv(
        BLAST,
        sep='\t',
        names=['QUERYNAME', 'SCAFFOLD', 'C', 'D', 'E', 'F', 'QUERYSTART', 'QUERYSTOP',
               'SCAFSTART', 'SCAFSTOP', 'E-VALUE', 'BITSCORE'],
        dtype={'QUERYNAME': str, 'SCAFFOLD': str},
    )
    ##Convert to bed format. Work on an explicit copy so the edits below never
    ##touch (or warn about touching) BLASTDF.
    BLASTBED = BLASTDF[['SCAFFOLD', 'SCAFSTART', 'SCAFSTOP', 'QUERYNAME', 'E-VALUE', 'BITSCORE']].copy()
    BLASTBED.insert(5, 'STRAND', '+')
    # Minus-strand hits are reported with stop < start; record the strand
    # first, then order the coordinates so that start <= stop.
    BLASTBED.loc[BLASTBED.SCAFSTOP < BLASTBED.SCAFSTART, 'STRAND'] = '-'
    COORDS = BLASTBED[['SCAFSTART', 'SCAFSTOP']]
    STARTS = COORDS.min(axis=1)
    STOPS = COORDS.max(axis=1)
    BLASTBED['SCAFSTART'] = STARTS
    BLASTBED['SCAFSTOP'] = STOPS
    # NOTE(review): SCAFSTART is written to the bed file unchanged. BLAST
    # tabular coordinates are normally 1-based while BED starts are 0-based,
    # so intervals may begin one base late. Left as-is so extractions stay
    # identical to earlier runs -- confirm before changing.

    ##Group hits by query name, keeping the order in which queries first appear
    QUERYGROUPS = BLASTBED.groupby('QUERYNAME', sort=False)
    LOGGER.info('There are ' + str(QUERYGROUPS.ngroups) + ' consensus sequences to process')
    # The genome file handed to bedtools slop is the same for every query.
    GENOMEFILE = os.path.splitext(GENOME)[0] + '.fai'

    ##For each query keep the top HITNUM hits, make bedfiles, extract, and combine
    for QUERY, QUERYFRAME in QUERYGROUPS:
        TOPHITS = QUERYFRAME.sort_values(by=['E-VALUE', 'BITSCORE'], ascending=[True, False]).head(HITNUM)
        BEDPATH = 'tmpbedfiles/' + QUERY + '.bed'
        SLOPPATH = 'tmpbedfiles/' + QUERY + '.slop'
        EXTRACTPATH = 'extracts/' + QUERY + '.fa'
        TOPHITS.to_csv(BEDPATH, sep='\t', header=False, index=False)
        BedTool(BEDPATH).slop(g=GENOMEFILE, l=LBUFFER, r=RBUFFER, output=SLOPPATH)
        SLOPBED = BedTool(SLOPPATH)
        SLOPBED.sequence(fi=GENOME, s=True)
        SLOPBED.save_seqs(EXTRACTPATH)
        os.remove(SLOPPATH)
        if BEDS == 'n':
            os.remove(BEDPATH)
        # Done in Python rather than `cat` through a shell: query names come
        # from the blast file and may contain characters the shell would
        # interpret (parentheses, spaces, ...).
        _CONCATENATE_FILES([EXTRACTPATH, 'tmpTEfiles/' + QUERY + '.fa'], 'catTEfiles/' + QUERY + '.fa')
##Alignment function
def MUSCLE(TOALIGN, MAXITERS, SOFTWARE='/lustre/work/daray/software/'):
    """Align catTEfiles/<TOALIGN> with MUSCLE and write muscle/<prefix>.fa.

    Parameters:
        TOALIGN: file name (not a path) of the fasta file inside catTEfiles/.
            The output keeps the same name with its extension replaced by .fa.
        MAXITERS: 'y' adds '-maxiters 2' to the MUSCLE command; any other
            value runs MUSCLE without that option.
        SOFTWARE: directory that contains muscle/muscle. Defaults to the path
            this script has always used.

    Raises:
        subprocess.CalledProcessError: MUSCLE exited with a non-zero status.
        FileNotFoundError: the MUSCLE executable does not exist.
    """
    TOALIGNPREFIX = os.path.splitext(TOALIGN)[0]
    # An argument list (no shell) hands file names to MUSCLE untouched, even
    # when they contain characters a shell would interpret.
    COMMAND = [
        os.path.join(SOFTWARE, 'muscle', 'muscle'),
        '-in', 'catTEfiles/' + TOALIGN,
        '-out', 'muscle/' + TOALIGNPREFIX + '.fa',
    ]
    if MAXITERS == 'y':
        COMMAND += ['-maxiters', '2']
    subprocess.check_call(COMMAND)
##Consensus generation function
def _RUN_TOOL(COMMAND):
    """Run COMMAND (an argument list, no shell); log a warning instead of raising on failure."""
    try:
        RETURNCODE = subprocess.call(COMMAND)
    except OSError as ERROR:
        LOGGER.warning('Could not run ' + COMMAND[0] + ': ' + str(ERROR))
        return
    if RETURNCODE != 0:
        LOGGER.warning(COMMAND[0] + ' exited with status ' + str(RETURNCODE))


def _JOIN_FILES(SOURCES, DESTINATION):
    """Write the contents of every existing file in SOURCES, in order, to DESTINATION."""
    with open(DESTINATION, 'wb') as OUTHANDLE:
        for SOURCE in SOURCES:
            try:
                with open(SOURCE, 'rb') as INHANDLE:
                    shutil.copyfileobj(INHANDLE, OUTHANDLE)
            except FileNotFoundError:
                # Mirrors `cat` with a missing operand: report it and carry on.
                LOGGER.warning('Cannot find ' + SOURCE + '; it was left out of ' + DESTINATION)


def CONSENSUSGEN(ALIGNED, TRIMAL, SOFTWARE='/lustre/work/daray/software/'):
    """Build an EMBOSS consensus for muscle/<ALIGNED>, optionally trimming with trimal first.

    With TRIMAL == 'y' the alignment is first trimmed into
    muscle/<prefix>_trimal.fa (trimal -gt 0.6 -cons 60) and the consensus is
    called on the trimmed file; with TRIMAL == 'n' the consensus is called on
    the alignment itself. The consensus is written to muscle/<prefix>_cons.fa,
    and the file it was built from followed by the consensus is written to
    consensusfiles/<prefix>_cons.fa. Any other TRIMAL value does nothing
    beyond logging a warning.

    Tool failures are logged and do not raise, so one bad alignment does not
    stop a run; the output for that alignment may then be incomplete.

    Parameters:
        ALIGNED: file name (not a path) of the alignment inside muscle/.
        TRIMAL: 'y' or 'n'.
        SOFTWARE: directory that contains trimal/source/trimal and
            EMBOSS-6.6.0/emboss/cons. Defaults to the path this script has
            always used.
    """
    FILEPREFIX = os.path.splitext(ALIGNED)[0]
    ALIGNMENT = 'muscle/' + ALIGNED
    TRIMMED = 'muscle/' + FILEPREFIX + '_trimal.fa'
    CONSENSUS = 'muscle/' + FILEPREFIX + '_cons.fa'

    if TRIMAL == 'y':
        _RUN_TOOL([
            os.path.join(SOFTWARE, 'trimal', 'source', 'trimal'),
            '-in', ALIGNMENT, '-gt', '0.6', '-cons', '60', '-fasta', '-out', TRIMMED,
        ])
        CONSINPUT = TRIMMED
    elif TRIMAL == 'n':
        CONSINPUT = ALIGNMENT
    else:
        LOGGER.warning('Unrecognised trimal setting ' + repr(TRIMAL) + '; no consensus generated for ' + ALIGNED)
        return

    # Argument lists (no shell) pass file names through unchanged even when
    # they contain characters a shell would interpret.
    _RUN_TOOL([
        os.path.join(SOFTWARE, 'EMBOSS-6.6.0', 'emboss', 'cons'),
        '-sequence', CONSINPUT,
        '-outseq', CONSENSUS,
        '-name', FILEPREFIX + '_cons',
        '-plurality', '3',
        '-identity', '3',
    ])
    _JOIN_FILES([CONSINPUT, CONSENSUS], 'consensusfiles/' + FILEPREFIX + '_cons.fa')
def DIRS(DIR):
    """Leave an empty directory at DIR, discarding any directory tree already there."""
    if not os.path.exists(DIR):
        # Nothing to clear away; just create it.
        os.mkdir(DIR)
        return
    # Something is already at DIR: remove the whole tree, then start fresh.
    shutil.rmtree(DIR)
    os.mkdir(DIR)
####MAIN function
def main():
    """Run the whole pipeline: parse arguments, extract hits, align, build consensuses, clean up.

    Steps, in order:
      1. Read the command-line settings and configure logging to both
         extract_align.log and the console.
      2. Index the genome and write a two-column (name, length) copy of the
         index to <genome without extension>.fai for bedtools.
      3. (Re)create the working directories.
      4. Split the TE library into per-TE files and extract the blast hits.
      5. If align is 'y', align every extracted set with MUSCLE.
      6. If align AND emboss are both 'y', build a consensus for each alignment.
      7. Remove temporary directories and intermediate files.

    The consensus step needs the alignments, so it is skipped (with a
    warning) whenever alignment is not enabled.
    """
    ##Get input arguments
    GENOMEFA, BLAST, LIB, LBUFFER, RBUFFER, HITNUM, ALIGN, TRIMAL, EMBOSS, MAXITERS, LOG, BEDS = get_args()

    # Setup logging and script timing
    handlers = [logging.FileHandler('extract_align.log'), logging.StreamHandler()]
    logging.basicConfig(format='', handlers=handlers)
    logging.getLogger().setLevel(getattr(logging, LOG.upper()))
    start_time = time.time()

    LOGGER.info('#\n# extract_align.py\n#')
    LOGGER.info('Genome file: ' + GENOMEFA)
    LOGGER.info('Blast file: ' + BLAST)
    LOGGER.info('TE library: ' + LIB)
    LOGGER.info('Left buffer size: ' + str(LBUFFER))
    LOGGER.info('Right buffer size: ' + str(RBUFFER))
    LOGGER.info('Number of hits evaluated: ' + str(HITNUM))
    LOGGER.info('Muscle alignment = ' + ALIGN)
    LOGGER.info('Trimal processing = ' + TRIMAL)
    LOGGER.info('Emboss consensus = ' + EMBOSS)
    LOGGER.info('Keep bed files = ' + BEDS)
    LOGGER.info('Log level: ' + LOG)

    ## Index the genome
    LOGGER.info('Indexing the genome')
    # Opening the genome with pyfaidx is done only for its side effect: the
    # index GENOMEFA + '.fai' is read on the next line.
    Fasta(GENOMEFA)
    GENOMEPREFIX = os.path.splitext(GENOMEFA)[0]
    FAIDX = pd.read_csv(GENOMEFA + '.fai', sep='\t', names=['one', 'two', 'three', 'four', 'five'])
    # Keep only the first two index columns as the genome file for bedtools.
    # NOTE(review): when GENOMEFA has no extension this path equals
    # GENOMEFA + '.fai', so the full index is overwritten -- confirm inputs
    # always carry an extension.
    FAIDX = FAIDX[['one', 'two']]
    FAIDX.to_csv(GENOMEPREFIX + '.fai', sep='\t', header=False, index=False)

    # Alignment must have run for there to be anything to build a consensus from.
    DOALIGN = ALIGN == 'y'
    DOCONSENSUS = DOALIGN and EMBOSS == 'y'

    ## Set up directories
    LOGGER.info('Creating tmp and permanent directories')
    DIRS('tmpTEfiles')
    DIRS('tmpbedfiles')
    if DOALIGN:
        DIRS('muscle')
    if DOCONSENSUS:
        DIRS('consensusfiles')
    DIRS('extracts')
    DIRS('catTEfiles')

    ##Determine optional arguments and print to screen.
    if ALIGN == 'n' and EMBOSS == 'y':
        LOGGER.warning('Input is contradictory. Generating a new consensus with emboss requires muscle alignment.')
        LOGGER.warning('The emboss consensus step will be skipped.')
    elif ALIGN == 'y' and EMBOSS == 'y':
        LOGGER.info('Output files will be aligned and a new consensus will be generated with emboss and trimal.')
    elif ALIGN == 'y' and EMBOSS == 'n':
        LOGGER.info('Output files will be aligned but without a new emboss/trimal consensus.')
    elif ALIGN == 'n' and EMBOSS == 'n':
        LOGGER.info('Extractions will be made but no alignment.')
    else:
        LOGGER.info('Invalid input for either align, or emboss, or both.')

    ##Create TE out files to populate with blast hits
    CREATE_TE_OUTFILES(LIB)

    ##Extract hits and combine them with the TE out files
    EXTRACT_BLAST_HITS(GENOMEFA, BLAST, LBUFFER, RBUFFER, HITNUM, BEDS)

    ##Align extracted hits if flagged
    if DOALIGN:
        for COUNTER, FILE in enumerate(os.listdir('extracts'), start=1):
            LOGGER.info('Aligning TE: ' + str(COUNTER))
            MUSCLE(FILE, MAXITERS)

    ##Generate new consensus with emboss if flagged (and alignments exist)
    if DOCONSENSUS:
        for FILE in os.listdir('muscle'):
            CONSENSUSGEN(FILE, TRIMAL)

    ##Remove empty tmp directories and unneeded files
    LOGGER.info('Removing tmp directories and extraneous files')
    if BEDS == 'n':
        shutil.rmtree('tmpbedfiles/')
    # extracts/ and catTEfiles/ are left in place.
    shutil.rmtree('tmpTEfiles/')
    if DOALIGN:
        # Intermediate consensus/trimal files were written into muscle/;
        # only the alignments themselves are kept there.
        for FILE in os.listdir('muscle/'):
            if FILE.endswith(('_cons.fa', '_trimal.fa')):
                os.remove('muscle/' + FILE)

    end_time = time.time()
    LOGGER.info('Run time: ' + str(datetime.timedelta(seconds=end_time-start_time)))
#
# Only start a run when this file is executed as a script; importing it
# (e.g. when help is called on the file) must not trigger main().
#
if __name__ == "__main__":
    main()