-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDBfetch.py
85 lines (64 loc) · 3.02 KB
/
DBfetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import sys
import argparse
import os
import subprocess
import textwrap
try:
from Bio import SeqIO
except:
print("SeqIO module is not installed! Please install SeqIO and try again.")
sys.exit()
try:
import tqdm
except :
print("tqdm module is not installed! Please install tqdm and try again.")
sys.exit()
parser = argparse.ArgumentParser(prog='python DBfetch.py',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=textwrap.dedent('''\
Author: Murat Buyukyoruk
DBfetch help:
This script is developed to fetch sequences from multifasta file by using a list of accession numbers to fetch.
SeqIO package from Bio is required to fetch sequences. Additionally, tqdm is required to provide a progress bar since some multifasta files can contain long and many sequences.
Syntax:
python DBfetch.py -i DB -l demo_sub_list.txt -o out.fasta
DBfetch dependencies:
Bio module and SeqIO available in this package refer to https://biopython.org/wiki/Download
tqdm refer to https://pypi.org/project/tqdm/
Input Paramaters (REQUIRED):
----------------------------
-i/--input DB Specify a DB file to process (i.e., /home/shared/amino-acid/bac_arc_db_01022023/bac_arc_db_01022023_prodigal.faa).
-l/--list List Specify a list of accession (Accession only). Each accession should be included in a new line (i.e. generated with Excel spreadsheet). Script works with or without '>' symbol before the accession.
-o/--output output file Specify a output file name that should contain fetched sequences.
Basic Options:
--------------
-h/--help HELP Shows this help text and exits the run.
'''))
parser.add_argument('-i', '--input', required=True, type=str, dest='filename',
help='Specify a DB file to process.\n')
parser.add_argument('-l', '--list', required=False, type=str, dest='list', default=None,
help='Specify a list of accession numbers to fetch.\n')
parser.add_argument('-o', '--output', required=True, dest='out',
help='Specify a output fasta file name.\n')
results = parser.parse_args()
filename = results.filename
list = results.list
out = results.out
os.system('> ' + out)
print(
"Locating index file! An index file will be generated if the fasta file is provided for the first time to generate DB.")
index = filename + '.index'
record_dict = SeqIO.index_db(index, filename, 'fasta')
proc = subprocess.Popen("wc -l < " + list, shell=True, stdout=subprocess.PIPE, text=True)
length = int(proc.communicate()[0].split('\n')[0])
if list != None:
f = open(out, 'a')
sys.stdout = f
with tqdm.tqdm(range(length + 1)) as pbar:
pbar.set_description('Reading...')
with open(list, 'r') as file:
for line in file:
pbar.update()
acc = line.split()[0].split("\n")[0]
found = record_dict[acc]
print(found.format("fasta"))