Skip to content

Commit

Permalink
Merge pull request #10 from Finn-Lab/implement_only_ips_gff_support
Browse files Browse the repository at this point in the history
Add support for protein FASTA input
  • Loading branch information
SantiagoSanchezF authored Feb 7, 2025
2 parents de0d17e + c06dd68 commit 780e841
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 51 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,22 @@ sanntis test/files/BGC0001472.fna
conda deactivate sanntis
```

#### Use precomputed InterProScan outputs
#### Support of preprocessed InterProScan outputs

SanntiS can be executed using preprocessed InterProScan outputs along with a GenBank (GBK) file specifying the coding sequences (CDSs). This integration facilitates a streamlined analysis pipeline for bioinformatics applications, allowing for enhanced functionality and user flexibility.
SanntiS can be executed using preprocessed InterProScan outputs along with a GenBank (GBK) file specifying the coding sequences (CDSs). This integration increases user flexibility.
```bash
conda activate sanntis
sanntis --ip-file test/files/BGC0001472.fna.prodigal.faa.gff3 test/files/BGC0001472.fna.prodigal.faa.gb
conda deactivate sanntis
```

Additionally, the --ip-file option can be used with a protein FASTA file as the sequence input, provided its headers are formatted according to Prodigal's convention. In this case, the --is_protein flag must be included to indicate that the sequence file is a protein FASTA.
```bash
conda activate sanntis
sanntis --is_protein --ip-file test/files/BGC0001472.fna.prodigal.faa.gff3 test/files/BGC0001472.fna.prodigal.faa
conda deactivate sanntis
```

### Docker:

#### Get InterProScan data:
Expand Down
28 changes: 19 additions & 9 deletions sanntis/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,18 @@ def main(args=None):
parser.add_argument(
"seq_file",
type=str,
help="input nucleotide sequence file. FASTA or GBK. mandatory",
help=(
"Input sequence file. Supported formats: nucleotide FASTA, GBK, or protein FASTA. "
"If the file is a protein FASTA, it must use Prodigal output headers and must be accompanied "
"by the --is_protein flag. Mandatory."
),
metavar="SEQUENCE_FILE",
)
parser.add_argument(
"--is_protein",
action="store_true",
help="Specify if the input SEQUENCE_FILE is a protein FASTA file. Will only process sequences with headers formatted like Prodigal protein outputs.",
)
parser.add_argument(
"-v",
"--version",
Expand Down Expand Up @@ -66,62 +75,62 @@ def main(args=None):
dest="score",
default=None,
type=float,
help="validation filter threshold. overrides --greed",
help="Validation filter threshold. overrides --greed",
metavar="FLOAT",
)
parser.add_argument(
"--meta",
dest="meta",
default="True",
type=str,
help="prodigal option meta [default True]",
help="Prodigal option meta [default True]",
metavar="True|False",
)
parser.add_argument(
"--outdir",
default=os.getcwd(),
dest="outdir",
type=str,
help="output directory [default $PWD/SEQUENCE_FILE.sanntis]",
help="Output directory [default $PWD/SEQUENCE_FILE.sanntis]",
metavar="DIRECTORY",
)
parser.add_argument(
"--outfile",
dest="outfile",
type=str,
help="output file [default outdir/SEQUENCE_FILE.sanntis.gff]",
help="Output file [default outdir/SEQUENCE_FILE.sanntis.gff]",
metavar="FILE",
)
parser.add_argument(
"--minimal",
dest="minimal_out",
default="True",
type=str,
help="minimal output in a gff3 file [default True]",
help="Minimal output in a gff3 file [default True]",
metavar="True|False",
)
parser.add_argument(
"--antismash_output",
dest="antismash_out",
default="False",
type=str,
help="write results in antiSMASH 6.0 JSON specification output [default False]",
help="Write results in antiSMASH 6.0 JSON specification output [default False]",
metavar="True|False",
)
parser.add_argument(
"--refined",
dest="ref_b",
default="False",
type=str,
help="annotate high probability borders [default False]",
help="Annotate high probability borders [default False]",
metavar="True|False",
)
parser.add_argument(
"--cpu",
dest="cpu",
default=1,
type=int,
help="cpus for INTERPROSCAN and HMMSCAN",
help="Cpus for INTERPROSCAN and HMMSCAN",
metavar="INT",
)

Expand Down Expand Up @@ -153,6 +162,7 @@ def main(args=None):
log.info("preprocessing files")
preprocess = Preprocess(
os.path.abspath(args.seq_file),
args.is_protein,
args.ip_file,
args.meta,
args.cpu,
Expand Down
69 changes: 32 additions & 37 deletions sanntis/modules/BGCdetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pickle
from itertools import groupby

from Bio import SeqIO
import numpy as np

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
Expand Down Expand Up @@ -93,48 +94,43 @@ def transformEmeraldHmm(self, hmmFile):
spl = l.split()
self.entriesDct.setdefault(spl[3], []).append(spl[0])

def transformCDSpredToCDScontigs(self, cdsPredFile, f):
def transformCDSpredToCDScontigs(self, cdsPredFile, file_format):

if not os.path.isfile(cdsPredFile):
log.exception(f"{cdsPredFile} file not found")

if file_format == "fasta":

_prodigal_pattern = re.compile(
r"_\d+\s#\s(\d+)\s#\s(\d+)\s#\s(-?1)\s#\sID=(\d+_\d+);partial=(\d{2});start_type="
r"(\w+);rbs_motif=(.+);rbs_spacer=(\S+);gc_cont=(\d+\.\d+)"
)

for record in SeqIO.parse(open(cdsPredFile, "r"), file_format):
header = record.description
prodigal_match = _prodigal_pattern.search(header)
if not prodigal_match:
log.warning(
f"Protein {record.id} does not follow the Prodigal header format. "
)
continue
start = int(prodigal_match.group(1))
end = int(prodigal_match.group(2))
protein_id = record.id

with open(cdsPredFile, "r") as h:

if f == "fasta":

for l in h:

if l[0] != ">":
continue

spl = l.split()
start, end = int(spl[2]), int(spl[4])

self.contigsDct.setdefault(
"_".join(spl[0].split("_")[:-1])[1:], []
).append((spl[0][1:].strip(), (start, end)))

elif f == "genbank":

from Bio import SeqIO

recs = list(SeqIO.parse(open(cdsPredFile, "r"), "gb"))

for rec in recs:
for f in rec.features:
if f.type == "CDS":

self.contigsDct.setdefault(
"_".join(record.id.split("_")[:-1]), []
).append((record.id, (start, end)))

start, end = int(f.location.start) + 1, int(f.location.end)
elif file_format == "genbank":

for record in SeqIO.parse(open(cdsPredFile, "r"), file_format):
for f in record.features:
if f.type == "CDS":
start, end = int(f.location.start) + 1, int(f.location.end)
protein_id = f.qualifiers["protein_id"][0].strip().replace(' ','') if "protein_id" in f.qualifiers else f.qualifiers["locus_tag"][0].strip().replace(' ','')

self.contigsDct.setdefault(rec.id, []).append(
(
f.qualifiers["protein_id"][0].strip().replace(' ','') # replace to avoid long id bug in gb files
if "protein_id" in f.qualifiers
else f.qualifiers["locus_tag"][0].strip().replace(' ',''),
(start, end),
)
)
self.contigsDct.setdefault(record.id, []).append( (protein_id,(start, end)))

def buildMatrices(self):

Expand All @@ -161,7 +157,6 @@ def buildMatrices(self):
]
self.annDct[contig][ix][modiAnn] = 1


def predictAnn(self, colapseFunc=max):

log.info("Predict BGC probability w/ TensorFlow")
Expand Down
6 changes: 4 additions & 2 deletions sanntis/modules/Preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
class Preprocess:
"""External tools needed for sanntis bgc detection"""

def __init__(self, seq_file, ip_file, meta, cpus, outdir):
def __init__(self, seq_file, seqfile_is_proteins, ip_file, meta, cpus, outdir):

self.seq_file = seq_file
self.seqfile_is_proteins = seqfile_is_proteins
self.ip_file = ip_file
self.meta = meta
self.cpus = int(cpus)
Expand Down Expand Up @@ -130,8 +131,9 @@ def process_sequence(self):
""" CDS prediction on sequence file"""

self.check_fmt()

if self.fmt == "fasta":
self.outFaa = self.runProdigal()
self.outFaa = self.seq_file if self.seqfile_is_proteins else self.runProdigal()
elif self.fmt == "genbank":
self.outFaa = self.gbkToProdigal()
else:
Expand Down
2 changes: 1 addition & 1 deletion sanntis/pkg_info.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "sanntis",
"version": "0.9.3.5",
"version": "0.9.4.0",
"description": "SMBGC detection tool",
"author": "Santiago Sanchez Fragoso",
"author_email": "[email protected]",
Expand Down

0 comments on commit 780e841

Please sign in to comment.