Skip to content

Commit

Permalink
Working on new SW alignment features. I have some functions that prod…
Browse files Browse the repository at this point in the history
…uce a minimizer index file (.kdbi.1), but the functionality is still nascent, and I'd really like to just accept reference and query sequences as inputs, rather than bother the user with the index files or .kdb functionality at all. Working on it in steps.
  • Loading branch information
MatthewRalston committed Jan 14, 2025
1 parent 1c13844 commit d029408
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 5 deletions.
16 changes: 15 additions & 1 deletion TODO.org
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,21 @@
# .kdb files should be debrujin graph databases
# The final prototype would be .bgzf format from biopython

* 12/16/24 | Minimizers (lexicographic with window size)
* 1/13/25 | Minimizers (lexicographic with window size)

** Use the most 5? abundant (wrt original genome/kdb) minimizers

*** Filtering the alignment seeds/kmers by abundance should be a good heuristic for creating biologically meaningful seed regions for alignment.

** The --window-size parameter is important for kmer/minimizer filtering and restricting the number of Smith Waterman alignments that must be done.

** Refactor minimizers.py to create a function that takes a fasta sequence as input, and returns a minimizer array. (lift and shift from get_minimizers in 0.8.11)

** [0/3] Final alignment command should manually produce temporary files:

- [ ] .kdb from reference (abundances)
- [ ] .kdbi.1 from reference (minimizers)
- [ ] .kdbi.1 from query (seeds/minimizers) :TODO:
* 10/5/24 | released from hospital (2 days) and working on refactoring kmer module
* 9/21/24 | somewhat changing from gpt4o-mini to claude sonnet
** using sonnet to create llc documents
Expand Down
81 changes: 77 additions & 4 deletions kmerdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def get_minimizers(arguments):

minimizers = minimizer.select_lexicographical_minimizers(seq, metadata['k'], arguments.window_size, kmer_ids)

mins_as_list = list(map(lambda s: int(s), minimizers))
mins_as_list = list(map(int, minimizers))
if mins is None:
mins=mins_as_list
else:
Expand All @@ -265,9 +265,74 @@ def get_minimizers(arguments):
elif kmer_is_selected == 1:
mins_as_list[i] = 1
# print minimizers in columnar format in new index file format (example.8.kdbi.1)
print(mins)
sys.exit(1)

output_index_file = "{0}.kdbi.1".format(str(arguments.kdb).strip(".kdb"))
logger.log_it("\n\nPrinting results of minimizers to '{0}'".format(output_index_file), "INFO")
with open(output_index_file, "w") as ofile:
for kmer, i in enumerate(kmer_ids):
ofile.write("{0}\t{1}\n".format(kmer, mins[i]))


def get_alignments(arguments):
"""
Produces Smith-Waterman alignment by loading minimizers
"""

from Bio import SeqIO

minimizers = []
reference_fasta_seqs = []
query_fasta_seqs = []

kdbi_sfx = ".".join(os.path.splitext(arguments.kdbi)[-2:])

print(kdbi_sfx)
sys.exit(1)

if kdbi_sfx != ".kdbi.1": # A filepath with invalid suffix
raise IOError("Input .kdb index filepath '{0}' does not end in '.kdb'".format(arguments.kdbi1))



# Create as temporary file, hold minimizer array in memory

with open(arguments.reference, mode="r") as ifile:
for record in SeqIO.parse(ifile, "fasta"):
reference_fasta_seqs.append(str(record.seq))

kdb_sfx = os.path.splitext(arguments.kdb)[-1]


if kdb_sfx != ".kdb": # A filepath with invalid suffix
raise IOError("Viewable .kdb filepath does not end in '.kdb'")
elif not os.path.exists(arguments.kdb):
raise IOError("Viewable .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb)

)
if kdb_sfx == ".kdb":
with fileutil.open(arguments.kdb, mode='r', slurp=True) as kdb_in:
metadata = kdb_in.metadata


kmer_ids_dtype = metadata["kmer_ids_dtype"]
N = 4**metadata["k"]
if metadata["version"] != config.VERSION:
logger.log_it("KDB version is out of date, may be incompatible with current KDBReader class", "WARNING")
if arguments.header:
yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict)
print(yaml.dump(metadata, sort_keys=False))
print(config.header_delimiter)
logger.log_it("Reading from file...", "INFO")

for i, idx in enumerate(kmer_ids_sorted_by_count):
kmer_id = kdb_in.kmer_ids[i]


print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id]))




def distances(arguments):
"""
Expand Down Expand Up @@ -2430,7 +2495,15 @@ def cli():
minimizer_parser.add_argument("-l", "--log-file", type=str, default="kmerdb.log", help=argparse.SUPPRESS)
minimizer_parser.set_defaults(func=get_minimizers)


alignment_parser = subparsers.add_parser("alignment", help="Create a Smith-Waterman-like alignment using a reference fasta, its minimizers, and query sequence, and its minimizers.")
alignment_parser.add_argument("reference", type=str, help="Reference sequences in .fa/.fna/.fasta format")
alignment_parser.add_argument("query", type=str, help="Query sequences in .fa/.fna/.fasta format")

alignment_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count")
alignment_parser.add_argument("--debug", action="store_true", default=False, help="Debug mode. Do not format errors and condense log")
alignment_parser.add_argument("-nl", "--num-log-lines", type=int, choices=config.default_logline_choices, default=50, help=argparse.SUPPRESS)
alignment_parser.add_argument("-l", "--log-file", type=str, default="kmerdb.log", help=argparse.SUPPRESS)
alignment_parser.set_defaults(func=get_alignments)

usage_parser = subparsers.add_parser("usage", help="provide expanded usage information on parameters and functions provided")
usage_parser.add_argument("method", type=str, choices=("usage", "help", "profile", "graph", "index", "shuf", "matrix", "distance"), help="Print expanded usage statement")
Expand Down
79 changes: 79 additions & 0 deletions kmerdb/minimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,82 @@ def select_lexicographical_minimizers(seq, k, window_size, kmer_ids):

return minimizers


def minimizers_from_fasta_and_kdb(fasta_file, kdb_file):
from Bio import SeqIO
import numpy as np
from kmerdb import fileutil, config, util, minimizer
import json
metadata = None
N = None
kmer_ids = None
fasta_seqs = []
mins = None

fa_sfx = os.path.splitext(fasta_file)[-1]
kdb_sfx = os.path.splitext(arguments.kdb)[-1]


if kdb_sfx != ".kdb" and kdb_sfx != ".kdbg": # A filepath with invalid suffix
raise IOError("Input .kdb filepath '{0}' does not end in '.kdb'".format(kdb_file))
elif not os.path.exists(kdb_file):
raise IOError("Input .kdb filepath '{0}' does not exist on the filesystem".format(kdb_file))
elif fa_sfx != ".fa" and fa_sfx != ".fna" and fa_sfx != ".fasta":
raise IOError("Input .fasta filepath '{0}' does not exist on the filesystem".format(arguments.fasta))

with open(fasta_file, mode="r") as ifile:
for record in SeqIO.parse(ifile, "fasta"):
fasta_seqs.append(str(record.seq))

with fileutil.open(arguments.kdb, mode='r', slurp=True) as kdb_in:
metadata = kdb_in.metadata


kmer_ids_dtype = metadata["kmer_ids_dtype"]
N = 4**metadata["k"]
if metadata["version"] != config.VERSION:
logger.log_it("KDB version is out of date, may be incompatible with current KDBReader class", "WARNING")
kmer_ids = kdb_in.kmer_ids

for seq in fasta_seqs:

minimizers = minimizer.select_lexicographical_minimizers(seq, metadata['k'], arguments.window_size, kmer_ids)

# TODO: FIXME

mins_as_list = list(map(int, minimizers))

#
if mins is None:
mins=mins_as_list
else:
for kmer_is_selected, i in enumerate(mins_as_list):
if mins_as_list[i] == 1:
pass
elif kmer_is_selected == 1:
mins[i] = 1

return minimizers

def make_alignment(reference_fasta_seqs, query_fasta_seqs, k):


reference_minimizers = select_lexicographical_minimizers()


def read_minimizer_kdbi_index_file(kdbi1):
"""
Read a minimizer file into memory
"""
minimizers = []
with open(kdbi1, 'r') as minimizer_index_file:
for l in minimizer_index_file:

line = l.strip("\n").split("\t")

assert len(line) == 2, "Index file corrupted. Use 'kmerdb minimizer' to regenerate the index file."
kmer_id, is_minimizer = line

if is_minimizer == 1:
minimizers.append(kmer_id)
return minimizers

0 comments on commit d029408

Please sign in to comment.