# extract_align.py

import argparse
import shutil
import re
import os
import subprocess
from Bio import SeqIO
import pandas as pd
import numpy as np
from pybedtools import BedTool
from pyfaidx import Fasta
import logging
import time
import datetime
# Turn off pandas' chained-assignment warning (the option's default is 'warn').
pd.options.mode.chained_assignment = None  # default='warn'

# Module-level logger used by every function in this script; handlers and the
# log level are set on the root logger inside main().
LOGGER = logging.getLogger(__name__)

## NOTE regarding SOFTWARE path and dependencies. A path to several packages is hardcoded into 
## this script. 
## You will need to modify the SOFTWARE path set in the MUSCLE and CONSENSUSGEN functions.
## Non-python dependencies for this script are:
## Muscle - https://www.drive5.com/muscle/manual/index.html
## Trimal - http://trimal.cgenomics.org/
## EMBOSS - http://emboss.sourceforge.net/
## These should be installed in your SOFTWARE directory and tweaks may be needed to the paths in 
## the MUSCLE and CONSENSUSGEN functions depending on your installation.

## Set up input arguments
def get_args():
	"""Parse the command line and return the run settings.

	Returns:
		A 12-tuple, in this order: genome fasta, blast file, TE library,
		left buffer, right buffer, hit number, align flag, trimal flag,
		emboss flag, maxiters2 flag, log level, keep-beds flag. The buffers
		and hit number are ints; everything else is returned as the string
		given on the command line (y/n flags are not validated here).
	"""
	cli = argparse.ArgumentParser(
		description="Will process a blast output generated using a file of putative TEs (usually generated by RepeatModeler. For each putative consensus in the input putative TE library, it will generate an aligned file with N buffered instances from the queried genome, the input consensus, and, if requested, a new revised and extended consensus for inspection.",
		formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	)

	# Required inputs.
	cli.add_argument(
		'-g', '--genome_fasta', type=str, required=True,
		help='Name of the fasta formatted genome to be queried.',
	)
	cli.add_argument(
		'-b', '--blastfile', type=str, required=True,
		help='Blast output to be used. Must be formatted using "outfmt 6".',
	)
	cli.add_argument(
		'-l', '--library', type=str, required=True,
		help='Library of putative TE consensus sequences to be extracted and aligned. Must be in fasta format with no # or / in the headers.',
	)

	# Extraction tuning.
	cli.add_argument(
		'-lb', '--leftbuffer', type=int, default=1000,
		help='Left buffer size. The number of bp of flanking sequence for each hit to be extracted along with the hit. Optional, Default = 1000',
	)
	cli.add_argument(
		'-rb', '--rightbuffer', type=int, default=1000,
		help='Right beffer size. The number of bp of flanking sequence for each hit to be extracted along with the hit. Optional, Default = 1000',
	)
	cli.add_argument(
		'-n', '--hitnumber', type=int, default=50,
		help='The number of hits to be exracted. Optional. Default = 50.',
	)

	# y/n switches for the optional pipeline stages.
	cli.add_argument(
		'-a', '--align', type=str, default='y',
		help='Align the output fasta file, y or n?. Default is y.',
	)
	cli.add_argument(
		'-t', '--trimal', type=str, default='y',
		help='Use trimal to remove low-aligning regions, y or n? Trimal can sometimes encounter an error that prevents it from working, this results in an empty file in downstream analyses. Default is y.',
	)
	cli.add_argument(
		'-e', '--emboss', type=str, default='y',
		help='Generate a trimal/emboss consensus, y or n. Optional. Default=y.',
	)
	cli.add_argument(
		'-m', '--maxiters2', type=str, default='n',
		help='Limit muscle iterations to 2? y or n. Optional. Default=n.',
	)
	cli.add_argument('-log', '--log_level', default='INFO')
	cli.add_argument(
		'-beds', '--keep_beds', type=str, default='n',
		help='Keep the bed files used for extractions. y or n. Optional. Default=n.',
	)

	opts = cli.parse_args()
	return (
		opts.genome_fasta,
		opts.blastfile,
		opts.library,
		opts.leftbuffer,
		opts.rightbuffer,
		opts.hitnumber,
		opts.align,
		opts.trimal,
		opts.emboss,
		opts.maxiters2,
		opts.log_level,
		opts.keep_beds,
	)

## Create TE outfiles function. Creates files for populating with blast hits.
def CREATE_TE_OUTFILES(LIBRARY):
	"""Split the TE library into one fasta file per sequence.

	Each record of the fasta file LIBRARY is written to
	tmpTEfiles/<name>.fa, where <name> is the record id with every '#'
	replaced by '__' and every '/' replaced by '___'. Inside the file the
	record id becomes 'CONSENSUS-<name>' and the description is blanked.
	The tmpTEfiles/ directory must already exist.
	"""
	for ENTRY in SeqIO.parse(LIBRARY, 'fasta'):
		# Make the id safe to use as a file name.
		SAFENAME = ENTRY.id.replace('#', '__').replace('/', '___')
		ENTRY.id = 'CONSENSUS-' + SAFENAME
		ENTRY.description = ''
		OUTPATH = 'tmpTEfiles/' + SAFENAME + '.fa'
		SeqIO.write(ENTRY, OUTPATH, 'fasta')
				
## Organize blast hits function. Will read in blast file, sort based on e-value and bitscore, determine top HITNUM hits for extraction, extract, and combine with TE file from previous function.
def EXTRACT_BLAST_HITS(GENOME, BLAST, LBUFFER, RBUFFER, HITNUM, BEDS):
	"""Extract the best buffered hits for every query in a blast table.

	Reads the 12-column tab-separated blast table BLAST, turns the hits into
	BED records (strand '-' when the subject stop precedes the subject
	start, coordinates reordered so start <= stop) and, for each query name,
	keeps the HITNUM best hits (lowest e-value first, ties broken by highest
	bitscore). The kept hits are extended with bedtools slop by
	LBUFFER/RBUFFER, extracted from GENOME with strand awareness into
	extracts/<name>.fa, and that file followed by tmpTEfiles/<name>.fa is
	written to catTEfiles/<name>.fa.

	<name> is the query name with '#' -> '__' and '/' -> '___', the same
	substitution CREATE_TE_OUTFILES applies, so the per-query files line up
	with the consensus files written there.

	Args:
		GENOME: path to the genome fasta. A genome-size file is expected at
			<GENOME without its extension>.fai.
		BLAST: path to the blast table.
		LBUFFER: bp added to the left of each hit.
		RBUFFER: bp added to the right of each hit.
		HITNUM: maximum number of hits kept per query.
		BEDS: 'n' deletes each query's bed file after use; any other value
			keeps it in tmpbedfiles/.

	The directories tmpbedfiles/, extracts/, tmpTEfiles/ and catTEfiles/
	must already exist.
	"""
	## Read in blast data
	BLASTDF = pd.read_csv(BLAST, sep='\t', names=['QUERYNAME', 'SCAFFOLD', 'C', 'D', 'E', 'F', 'QUERYSTART', 'QUERYSTOP', 'SCAFSTART', 'SCAFSTOP', 'E-VALUE', 'BITSCORE'])
	## Convert to bed format. Work on a copy so the edits below never touch
	## (or warn about) the original frame.
	BLASTBED = BLASTDF[['SCAFFOLD', 'SCAFSTART', 'SCAFSTOP', 'QUERYNAME', 'E-VALUE', 'BITSCORE']].copy()
	BLASTBED.insert(5, 'STRAND', '+')
	BLASTBED.loc[BLASTBED.SCAFSTOP < BLASTBED.SCAFSTART, 'STRAND'] = '-'
	# Reorder coordinates so start <= stop; compute both before assigning.
	LOWCOORD = np.minimum(BLASTBED.SCAFSTART, BLASTBED.SCAFSTOP)
	HIGHCOORD = np.maximum(BLASTBED.SCAFSTART, BLASTBED.SCAFSTOP)
	BLASTBED['SCAFSTART'] = LOWCOORD
	BLASTBED['SCAFSTOP'] = HIGHCOORD
	# NOTE(review): blast tabular coordinates are 1-based while BED starts are
	# 0-based, so each window presumably begins one base after the reported
	# hit start. Left unchanged here -- confirm whether it matters.
	## Generate list of query names
	QUERYLIST = BLASTBED.QUERYNAME.unique()
	LOGGER.info('There are ' + str(len(QUERYLIST)) + ' consensus sequences to process')
	# Loop-invariant: genome-size file handed to bedtools slop.
	GENOMESIZES = os.path.splitext(GENOME)[0] + '.fai'
	## Sort subsets of df based on query names, keep the top HITNUM hits, make bedfiles, extract, and combine
	for QUERY in QUERYLIST:
		QUERYFRAME = BLASTBED[BLASTBED['QUERYNAME'] == QUERY]
		QUERYFRAME = QUERYFRAME.sort_values(by=['E-VALUE', 'BITSCORE'], ascending=[True, False])
		QUERYFRAME = QUERYFRAME.head(HITNUM)
		# Same file-name sanitising as CREATE_TE_OUTFILES.
		SAFENAME = str(QUERY).replace('#', '__').replace('/', '___')
		BEDPATH = 'tmpbedfiles/' + SAFENAME + '.bed'
		SLOPPATH = 'tmpbedfiles/' + SAFENAME + '.slop'
		EXTRACTPATH = 'extracts/' + SAFENAME + '.fa'
		QUERYFRAME.to_csv(BEDPATH, sep='\t', header=False, index=False)
		BedTool(BEDPATH).slop(g=GENOMESIZES, l=LBUFFER, r=RBUFFER, output=SLOPPATH)
		SLOPBED = BedTool(SLOPPATH)
		SLOPBED.sequence(fi=GENOME, s=True)
		SLOPBED.save_seqs(EXTRACTPATH)
		os.remove(SLOPPATH)
		if BEDS == 'n':
			os.remove(BEDPATH)
		# Concatenate in Python rather than through a shell so query names
		# containing spaces, parentheses, etc. cannot break the command.
		# A missing input is skipped, as a shell `cat` would carry on.
		with open('catTEfiles/' + SAFENAME + '.fa', 'wb') as COMBINED:
			for PART in (EXTRACTPATH, 'tmpTEfiles/' + SAFENAME + '.fa'):
				if not os.path.isfile(PART):
					LOGGER.warning('Missing file skipped while combining: ' + PART)
					continue
				with open(PART, 'rb') as PIECE:
					shutil.copyfileobj(PIECE, COMBINED)
#		COUNTER = COUNTER + 1
		
##Alignment function
def MUSCLE(TOALIGN, MAXITERS, SOFTWARE='/lustre/work/daray/software/'):
	"""Align catTEfiles/<TOALIGN> with MUSCLE, writing muscle/<prefix>.fa.

	Args:
		TOALIGN: file name (not path) of the fasta file inside catTEfiles/.
		MAXITERS: 'y' adds '-maxiters 2' to the command; any other value
			runs MUSCLE with its default iteration setting.
		SOFTWARE: directory (with trailing slash) that contains
			muscle/muscle. Defaults to the original hard-coded location.

	Raises:
		subprocess.CalledProcessError: MUSCLE exited with a non-zero status.
		OSError: the MUSCLE executable could not be started (for example
			FileNotFoundError when SOFTWARE is wrong).
	"""
	TOALIGNPREFIX = os.path.splitext(TOALIGN)[0]
	# An argument list (no shell) keeps file names with spaces, parentheses
	# or other shell metacharacters intact.
	COMMAND = [
		SOFTWARE + 'muscle/muscle',
		'-in', 'catTEfiles/' + TOALIGN,
		'-out', 'muscle/' + TOALIGNPREFIX + '.fa',
	]
	if MAXITERS == 'y':
		COMMAND += ['-maxiters', '2']
	subprocess.check_call(COMMAND)

##Consensus generation function
def _RUN_BEST_EFFORT(COMMAND):
	"""Run COMMAND (an argument list); log instead of raising on failure."""
	try:
		STATUS = subprocess.call(COMMAND)
	except OSError as ERROR:
		LOGGER.warning('Could not run ' + COMMAND[0] + ': ' + str(ERROR))
		return
	if STATUS != 0:
		LOGGER.warning(COMMAND[0] + ' exited with status ' + str(STATUS))


def CONSENSUSGEN(ALIGNED, TRIMAL, SOFTWARE='/lustre/work/daray/software/'):
	"""Build an EMBOSS consensus for one alignment in muscle/.

	With TRIMAL == 'y' the alignment muscle/<ALIGNED> is first trimmed with
	trimal (-gt 0.6 -cons 60) into muscle/<prefix>_trimal.fa and the
	consensus is built from the trimmed file; with TRIMAL == 'n' it is built
	straight from muscle/<ALIGNED>. Any other TRIMAL value does nothing.
	The consensus goes to muscle/<prefix>_cons.fa, and the file it was built
	from followed by the consensus is written to
	consensusfiles/<prefix>_cons.fa.

	Tool failures are logged but not raised, so one bad alignment does not
	stop the run; the combined file is still created and simply lacks
	whatever input is missing.

	Args:
		ALIGNED: file name (not path) of the alignment inside muscle/.
		TRIMAL: 'y' to trim with trimal first, 'n' to skip trimming.
		SOFTWARE: directory (with trailing slash) that contains
			trimal/source/trimal and EMBOSS-6.6.0/emboss/cons. Defaults to
			the original hard-coded location.

	The consensusfiles/ directory must already exist.
	"""
	FILEPREFIX = os.path.splitext(ALIGNED)[0]
	ALIGNMENT = 'muscle/' + ALIGNED
	TRIMMED = 'muscle/' + FILEPREFIX + '_trimal.fa'
	CONSENSUS = 'muscle/' + FILEPREFIX + '_cons.fa'

	if TRIMAL == 'y':
		_RUN_BEST_EFFORT([
			SOFTWARE + 'trimal/source/trimal',
			'-in', ALIGNMENT,
			'-gt', '0.6', '-cons', '60', '-fasta',
			'-out', TRIMMED,
		])
		CONSINPUT = TRIMMED
	elif TRIMAL == 'n':
		CONSINPUT = ALIGNMENT
	else:
		return

	# Argument lists (no shell) keep unusual file names intact.
	_RUN_BEST_EFFORT([
		SOFTWARE + 'EMBOSS-6.6.0/emboss/cons',
		'-sequence', CONSINPUT,
		'-outseq', CONSENSUS,
		'-name', FILEPREFIX + '_cons',
		'-plurality', '3', '-identity', '3',
	])

	# Alignment first, then the new consensus. Inputs a failed tool never
	# produced are skipped, matching what a shell `cat` would leave behind.
	with open('consensusfiles/' + FILEPREFIX + '_cons.fa', 'wb') as COMBINED:
		for PART in (CONSINPUT, CONSENSUS):
			if not os.path.isfile(PART):
				LOGGER.warning('Missing file skipped while combining: ' + PART)
				continue
			with open(PART, 'rb') as PIECE:
				shutil.copyfileobj(PIECE, COMBINED)

def DIRS(DIR):
	"""Make DIR an empty directory, deleting any existing tree at that path."""
	ALREADYTHERE = os.path.exists(DIR)
	if ALREADYTHERE:
		# Start from a clean slate: drop the old directory and its contents.
		shutil.rmtree(DIR)
	os.mkdir(DIR)

####MAIN function
def main():
	"""Run the whole pipeline from the command line.

	Steps, in order: parse arguments, set up logging to extract_align.log
	and the console, index the genome and write a two-column genome-size
	file, (re)create the working directories, split the TE library into
	per-sequence files, extract blast hits, optionally align them with
	MUSCLE, optionally build consensus sequences, then delete temporary
	directories and intermediate files and log the run time.

	Consensus generation reads alignments from muscle/. If that directory
	does not exist (for example alignment was switched off), the step is
	skipped with a log message instead of failing.
	"""
	## Get input arguments
	GENOMEFA, BLAST, LIB, LBUFFER, RBUFFER, HITNUM, ALIGN, TRIMAL, EMBOSS, MAXITERS, LOG, BEDS = get_args()

	# Setup logging and script timing
	handlers = [logging.FileHandler('extract_align.log'), logging.StreamHandler()]
	logging.basicConfig(format='', handlers = handlers)
	logging.getLogger().setLevel(getattr(logging, LOG.upper()))

	start_time = time.time()

	LOGGER.info('#\n# extract_align.py\n#')

	LOGGER.info('Genome file: ' + GENOMEFA)
	LOGGER.info('Blast file: ' + BLAST)
	LOGGER.info('TE library: ' + LIB)
	LOGGER.info('Left buffer size: ' + str(LBUFFER))
	LOGGER.info('Right buffer size: ' + str(RBUFFER))
	LOGGER.info('Number of hits evaluated: ' + str(HITNUM))
	LOGGER.info('Muscle alignment = ' + ALIGN)
	LOGGER.info('Trimal processing = ' + TRIMAL)
	LOGGER.info('Emboss consensus = ' + EMBOSS)
	LOGGER.info('Keep bed files = ' + BEDS)
	LOGGER.info('Log level: ' + LOG)

	## Index the genome
	LOGGER.info('Indexing the genome')
	# Opening the fasta with pyfaidx is what produces <genome>.fai, read below.
	Fasta(GENOMEFA)
	GENOMEPREFIX = os.path.splitext(GENOMEFA)[0]
	FAIDX = pd.read_csv(GENOMEFA + '.fai', sep='\t', names=['one', 'two', 'three', 'four', 'five'])
	# Keep only the first two index columns; EXTRACT_BLAST_HITS passes this
	# file to bedtools slop as its genome file.
	FAIDX = FAIDX[['one', 'two']]
	FAIDX.to_csv(GENOMEPREFIX + '.fai', sep='\t', header=False, index=False)

	## Set up directories
	LOGGER.info('Creating tmp and permanent directories')
	DIRS('tmpTEfiles')
	DIRS('tmpbedfiles')
	if ALIGN == 'y':
		DIRS('muscle')
	if EMBOSS == 'y':
		DIRS('consensusfiles')
	DIRS('extracts')
	DIRS('catTEfiles')

	## Determine optional arguments and print to screen.
	if ALIGN == 'n' and EMBOSS == 'y':
		LOGGER.info('Input is contradictory. Generating a new consensus with emboss requires muscle alignment.')
	elif ALIGN == 'y' and EMBOSS == 'y':
		LOGGER.info('Output files will be aligned and a new consensus will be generated with emboss and trimal.')
	elif ALIGN == 'y' and EMBOSS == 'n':
		LOGGER.info('Output files will be aligned but without a new emboss/trimal consensus.')
	elif ALIGN == 'n' and EMBOSS == 'n':
		LOGGER.info('Extractions will be made but no alignment.')
	else:
		LOGGER.info('Invalid input for either align, or emboss, or both.')

	## Create TE out files to populate with blast hits
	CREATE_TE_OUTFILES(LIB)

	## Extract hits and combine them with the TE out files if flagged
	EXTRACT_BLAST_HITS(GENOMEFA, BLAST, LBUFFER, RBUFFER, HITNUM, BEDS)

	## Align extracted hits if flagged
	if ALIGN == 'y':
		for COUNTER, FILE in enumerate(os.listdir('extracts'), start=1):
			LOGGER.info('Aligning TE: ' + str(COUNTER))
			MUSCLE(FILE, MAXITERS)

	## Generate new consensus with emboss if flagged
	if EMBOSS == 'y':
		if os.path.isdir('muscle'):
			for FILE in os.listdir('muscle'):
				CONSENSUSGEN(FILE, TRIMAL)
		else:
			# Without alignments there is nothing to build a consensus from;
			# skip so the cleanup and timing below still run.
			LOGGER.info('No muscle directory found. Skipping consensus generation.')

	## Remove empty tmp directories and unneeded files
	LOGGER.info('Removing tmp directories and extraneous files')
	if BEDS == 'n':
		shutil.rmtree('tmpbedfiles/')
	shutil.rmtree('tmpTEfiles/')
	if ALIGN == 'y':
		# Intermediate consensus/trimal files are not kept in muscle/.
		LEFTOVERS = [F for F in os.listdir('muscle/') if F.endswith(('_cons.fa', '_trimal.fa'))]
		for FILE in LEFTOVERS:
			os.remove('muscle/' + FILE)

	end_time = time.time()
	LOGGER.info('Run time: ' + str(datetime.timedelta(seconds=end_time-start_time)))
#
# Wrap script functionality in main() to avoid automatic execution
# when imported ( e.g. when help is called on file )
#
if __name__ == "__main__":
	main()