-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathTE_process.py
68 lines (53 loc) · 2.43 KB
/
TE_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import argparse
import gzip
import subprocess
import re
from Bio.Blast.Applications import NcbiblastnCommandline
import os
####MAIN function
def main():
    """Prepare the genome FASTA and build the blastn command for the TE library.

    Options are read through ``get_args()``. A genome whose name ends in
    ``gz`` is decompressed to ``<prefix>.fa`` (same directory as the input);
    any other genome file is used as-is. A ``NcbiblastnCommandline`` with
    tabular output (``outfmt=6``) is then built for the library against
    that FASTA.

    NOTE(review): the command object is only constructed here. Nothing in
    this function runs makeblastdb, blastn or the downstream extraction
    script yet -- see the TODO at the end of the body.
    """
    genome, library, blast_out_file, seq_buffer, seq_number = get_args()
    print('Input file = ' + genome)

    if genome.lower().endswith('gz'):
        print('Using .gz file.')
        # Derive the prefix from the file name only, so a dot inside a
        # directory component ("./genome.fa.gz", "run.1/genome.fa.gz")
        # does not truncate the output path.
        directory, filename = os.path.split(genome)
        prefix = os.path.join(directory, filename.split('.')[0])
        fasta = prefix + '.fa'
        with gzip.open(genome, 'rt') as compressed, open(fasta, 'w') as plain:
            plain.writelines(compressed)
    else:
        # The original had no branch for uncompressed input, which left the
        # FASTA name without a usable value; use the input file directly.
        fasta = genome

    # Tabular (outfmt 6) blastn of the TE library against the genome FASTA.
    blastn_cline = NcbiblastnCommandline(
        query=library, out=blast_out_file, db=fasta, outfmt=6)

    # TODO: the remaining pipeline steps are still disabled. The earlier
    # shell-string attempts are recorded here with their typos corrected
    # (they used an en dash in "-query" and had no space before "-dbtype"):
    #   module load intel rmblast
    #   makeblastdb -in <fasta> -dbtype nucl
    #   blastn -query <library> -db <fasta> -out <blast_out_file> -outfmt 6
    #   perl /lustre/work/daray/software/extractAlignTEs.pl
    #       --genome <fasta> --blast <blast_out_file> --consTEs <library>
    #       --seqBuffer <seq_buffer> --seqNum <seq_number> --align
    # Afterwards the temporary FASTA was meant to be removed; when that is
    # re-enabled, only delete it if it was created by the gzip branch above,
    # otherwise the user's own input file would be deleted.
##Get arguments function
def get_args(argv=None):
    """Parse the command-line options of the TE processing script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. The default (``None``) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        A 5-tuple ``(genome, library, blast, seqbuffer, seqnumber)``:
        ``genome`` and ``library`` are required strings, ``blast`` is a
        string or ``None`` when ``-b`` is omitted, ``seqbuffer`` is an int
        (500 unless overridden) and ``seqnumber`` is an int or ``None``
        when ``-sn`` is omitted.

    Raises:
        SystemExit: Raised by argparse when a required option is missing
            or a value cannot be converted to the declared type.
    """
    parser = argparse.ArgumentParser(
        description="Assignment 4",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        '-g', '--genome', type=str, required=True,
        help='Name of the .fa or .fa.gz file to be analyzed',
    )
    parser.add_argument(
        '-l', '--library', type=str, required=True,
        help='Library of putative TEs',
    )
    parser.add_argument(
        '-b', '--blast', type=str,
        help='Name of blast file to generate and use',
    )
    parser.add_argument(
        '-sb', '--seqbuffer', type=int, default=500,
        help='Number of flanking bases to extract, default = 500 bp',
    )
    parser.add_argument(
        '-sn', '--seqnumber', type=int,
        help='Number of hits from each putative TE to analyze',
    )
    args = parser.parse_args(argv)
    return (
        args.genome,
        args.library,
        args.blast,
        args.seqbuffer,
        args.seqnumber,
    )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()