add genbank parser to python (#87)

This adds a genbank parser to python through the C shared library. Most of this was written by AI, because it's pretty annoying to write, and is mostly just piping data. Amazingly, it seems to work. I also added a little benchmark showing that we have comparable performance to biopython - but I'm sure some of our other features will be even better in that regard. - Keoni
Koeng101 · Aug 17, 2024 · 888031d · 888031d
1 parent bb29c87
commit 888031d
Show file tree

Hide file tree

Showing 9 changed files with 849 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+- Adds genbank parsing to python package. Release version 0.1.5 of dnadesign python. [#87](https://github.com/Koeng101/dnadesign/pull/87)
 - Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86)
 - Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84)
 - Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83)

diff --git a/py/README.md b/py/README.md
@@ -1,7 +1,20 @@
 # DnaDesign (Python)
 This directory contains code for allowing python users to use dnadesign through a shared C library.
 
-This is a work-in-progress. Right now, we have only ported the fasta parser. 
+This is a work-in-progress. Right now, we have only ported the fasta, fastq, and genbank parsers.
+
+For genbank files, we have this benchmark with bsub.gbk:
+```
+Running 10 iterations for each parser...
+
+Results:
+DNA design average time: 0.337005 seconds
+BioPython average time:  0.332007 seconds
+
+DNA design is 0.99x faster than BioPython
+```
+
+Turns out, we're just about as performant!
 
 ### Other platforms
 If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.

diff --git a/py/benchmark_against_biopython.py b/py/benchmark_against_biopython.py
@@ -0,0 +1,41 @@
+import timeit
+import os
+from Bio import SeqIO
+from dnadesign.parsers import parse_genbank_from_c_file
+
+def benchmark_dnadesign(file_path, num_iterations=10):
+    def parse_with_dnadesign():
+        records = parse_genbank_from_c_file(file_path)
+        return records
+
+    time_taken = timeit.timeit(parse_with_dnadesign, number=num_iterations)
+    return time_taken / num_iterations
+
+def benchmark_biopython(file_path, num_iterations=10):
+    def parse_with_biopython():
+        with open(file_path, "r") as handle:
+            records = list(SeqIO.parse(handle, "genbank"))
+        return records
+
+    time_taken = timeit.timeit(parse_with_biopython, number=num_iterations)
+    return time_taken / num_iterations
+
+def main():
+    current_dir = os.path.dirname(__file__)
+    example_path = os.path.join(current_dir, '../lib/bio/genbank/data/bsub.gbk')
+
+    print(f"Benchmarking GenBank parsing for file: {example_path}")
+    print("Running 10 iterations for each parser...")
+
+    dnadesign_time = benchmark_dnadesign(example_path)
+    biopython_time = benchmark_biopython(example_path)
+
+    print(f"\nResults:")
+    print(f"DNA design average time: {dnadesign_time:.6f} seconds")
+    print(f"BioPython average time:  {biopython_time:.6f} seconds")
+
+    speedup = biopython_time / dnadesign_time
+    print(f"\nDNA design is {speedup:.2f}x faster than BioPython")
+
+if __name__ == "__main__":
+    main()
diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h
@@ -39,3 +39,101 @@ typedef struct {
 
 FastqResult ParseFastqFromCFile(void* cfile);
 FastqResult ParseFastqFromCString(char* cstring);
+
+// Genbank definitions
+// GenbankLocation
+typedef struct GenbankLocation {
+    int start;
+    int end;
+    int complement;
+    int join;
+    int five_prime_partial;
+    int three_prime_partial;
+    char* gbk_location_string;
+    struct GenbankLocation* sub_locations;
+    int sub_locations_count;
+} GenbankLocation;
+
+// GenbankFeature
+typedef struct {
+    char* type_;
+    char* description;
+    char** attribute_keys;
+    char*** attribute_values;
+    int* attribute_value_counts;
+    int attribute_count;
+    char* sequence_hash;
+    char* sequence_hash_function;
+    char* sequence;
+    GenbankLocation location;
+} GenbankFeature;
+
+// GenbankReference
+typedef struct {
+    char* authors;
+    char* title;
+    char* journal;
+    char* pub_med;
+    char* remark;
+    char* range_;
+    char* consortium;
+} GenbankReference;
+
+// GenbankLocus
+typedef struct {
+    char* name;
+    char* sequence_length;
+    char* molecule_type;
+    char* genbank_division;
+    char* modification_date;
+    char* sequence_coding;
+    int circular;
+} GenbankLocus;
+
+// GenbankBaseCount
+typedef struct {
+    char base;
+    int count;
+} GenbankBaseCount;
+
+// GenbankMeta
+typedef struct {
+    char* date;
+    char* definition;
+    char* accession;
+    char* version;
+    char* keywords;
+    char* organism;
+    char* source;
+    char** taxonomy;
+    int taxonomy_count;
+    char* origin;
+    GenbankLocus locus;
+    GenbankReference* references;
+    int reference_count;
+    GenbankBaseCount* base_counts;
+    int base_count_count;
+    char** other_keys;
+    char** other_values;
+    int other_count;
+    char* name;
+    char* sequence_hash;
+    char* sequence_hash_function;
+} GenbankMeta;
+
+// Genbank
+typedef struct {
+    GenbankMeta meta;
+    GenbankFeature* features;
+    int feature_count;
+    char* sequence;
+} Genbank;
+
+typedef struct {
+    Genbank* records;
+    int numRecords;
+    char* error;
+} GenbankResult;
+
+GenbankResult ParseGenbankFromCFile(void* cfile);
+GenbankResult ParseGenbankFromCString(char* cstring);
diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py
@@ -78,3 +78,212 @@ def _process_fastq_result(result) -> List[FastqRecord]:
             optionals
         ))
     return fastq_records
+
+# Genbank time
+class GenbankLocus:
+    def __init__(self, name: str, sequence_length: str, molecule_type: str, genbank_division: str,
+                 modification_date: str, sequence_coding: str, circular: bool):
+        self.name = name
+        self.sequence_length = sequence_length
+        self.molecule_type = molecule_type
+        self.genbank_division = genbank_division
+        self.modification_date = modification_date
+        self.sequence_coding = sequence_coding
+        self.circular = circular
+
+class GenbankReference:
+    def __init__(self, authors: str, title: str, journal: str, pub_med: str, remark: str, range_: str, consortium: str):
+        self.authors = authors
+        self.title = title
+        self.journal = journal
+        self.pub_med = pub_med
+        self.remark = remark
+        self.range_ = range_
+        self.consortium = consortium
+
+class GenbankBaseCount:
+    def __init__(self, base: str, count: int):
+        self.base = base
+        self.count = count
+
+class GenbankMeta:
+    def __init__(self, date: str, definition: str, accession: str, version: str, keywords: str,
+                 organism: str, source: str, taxonomy: List[str], origin: str, locus: GenbankLocus,
+                 references: List[GenbankReference], base_counts: List[GenbankBaseCount], other: Dict[str, str],
+                 name: str, sequence_hash: str, hash_function: str):
+        self.date = date
+        self.definition = definition
+        self.accession = accession
+        self.version = version
+        self.keywords = keywords
+        self.organism = organism
+        self.source = source
+        self.taxonomy = taxonomy
+        self.origin = origin
+        self.locus = locus
+        self.references = references
+        self.base_counts = base_counts
+        self.other = other
+        self.name = name
+        self.sequence_hash = sequence_hash
+        self.hash_function = hash_function
+
+# Update the Location class to match the C struct
+class GenbankLocation:
+    def __init__(self, start: int, end: int, complement: bool, join: bool, five_prime_partial: bool,
+                 three_prime_partial: bool, gbk_location_string: str, sub_locations: List['GenbankLocation']):
+        self.start = start
+        self.end = end
+        self.complement = complement
+        self.join = join
+        self.five_prime_partial = five_prime_partial
+        self.three_prime_partial = three_prime_partial
+        self.gbk_location_string = gbk_location_string
+        self.sub_locations = sub_locations
+
+class GenbankFeature:
+    def __init__(self, type_: str, description: str, attributes: Dict[str, List[str]],
+                 sequence_hash: str, hash_function: str, sequence: str, location: GenbankLocation):
+        self.type_ = type_
+        self.description = description
+        self.attributes = attributes
+        self.sequence_hash = sequence_hash
+        self.hash_function = hash_function
+        self.sequence = sequence
+        self.location = location
+
+class Genbank:
+    def __init__(self, meta: GenbankMeta, features: List[GenbankFeature], sequence: str):
+        self.meta = meta
+        self.features = features
+        self.sequence = sequence
+
+def _safe_open_file(file_path: str):
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"The file {file_path} does not exist.")
+    cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
+    if cfile == ffi.NULL:
+        raise IOError(f"Failed to open the file {file_path}.")
+    return cfile
+
+def parse_genbank_from_c_file(file_path: str) -> List[Genbank]:
+    try:
+        cfile = _safe_open_file(file_path)
+        result = lib.ParseGenbankFromCFile(cfile)
+        return _process_genbank_result(result)
+    finally:
+        if 'cfile' in locals() and cfile != ffi.NULL:
+            lib.fclose(cfile)
+
+def parse_genbank_from_c_string(cstring: str) -> List[Genbank]:
+    result = lib.ParseGenbankFromCString(cstring.encode('utf-8'))
+    return _process_genbank_result(result)
+
+def _process_genbank_result(result) -> List[Genbank]:
+    if result.error != ffi.NULL:
+        error_str = ffi.string(result.error).decode('utf-8')
+        raise Exception("Error parsing Genbank: " + error_str)
+    num_records = result.numRecords
+    records = ffi.cast("Genbank*", result.records)
+    return [_convert_genbank_record(records[i]) for i in range(num_records)]
+
+def _convert_genbank_record(record) -> Genbank:
+    meta = _convert_meta(record.meta)
+    features = [_convert_feature(record.features[i]) for i in range(record.feature_count)]
+    sequence = ffi.string(record.sequence).decode('utf-8')
+    return Genbank(meta, features, sequence)
+
+def _convert_meta(meta) -> GenbankMeta:
+    locus = GenbankLocus(
+        ffi.string(meta.locus.name).decode('utf-8'),
+        ffi.string(meta.locus.sequence_length).decode('utf-8'),
+        ffi.string(meta.locus.molecule_type).decode('utf-8'),
+        ffi.string(meta.locus.genbank_division).decode('utf-8'),
+        ffi.string(meta.locus.modification_date).decode('utf-8'),
+        ffi.string(meta.locus.sequence_coding).decode('utf-8'),
+        meta.locus.circular
+    )
+
+    references = [
+        GenbankReference(
+            ffi.string(ref.authors).decode('utf-8'),
+            ffi.string(ref.title).decode('utf-8'),
+            ffi.string(ref.journal).decode('utf-8'),
+            ffi.string(ref.pub_med).decode('utf-8'),
+            ffi.string(ref.remark).decode('utf-8'),
+            ffi.string(ref.range_).decode('utf-8'),
+            ffi.string(ref.consortium).decode('utf-8')
+        ) for ref in meta.references[0:meta.reference_count]
+    ]
+
+    base_counts = [
+        GenbankBaseCount(
+            ffi.string(bc.base).decode('utf-8'),
+            bc.count
+        ) for bc in meta.base_counts[0:meta.base_count_count]
+    ]
+
+    other = {}
+    for i in range(meta.other_count):
+        key = ffi.string(meta.other_keys[i]).decode('utf-8')
+        value = ffi.string(meta.other_values[i]).decode('utf-8')
+        other[key] = value
+
+    return GenbankMeta(
+        ffi.string(meta.date).decode('utf-8'),
+        ffi.string(meta.definition).decode('utf-8'),
+        ffi.string(meta.accession).decode('utf-8'),
+        ffi.string(meta.version).decode('utf-8'),
+        ffi.string(meta.keywords).decode('utf-8'),
+        ffi.string(meta.organism).decode('utf-8'),
+        ffi.string(meta.source).decode('utf-8'),
+        [ffi.string(tax).decode('utf-8') for tax in meta.taxonomy[0:meta.taxonomy_count]],
+        ffi.string(meta.origin).decode('utf-8'),
+        locus,
+        references,
+        base_counts,
+        other,
+        ffi.string(meta.name).decode('utf-8'),
+        ffi.string(meta.sequence_hash).decode('utf-8'),
+        ffi.string(meta.sequence_hash_function).decode('utf-8')
+    )
+
+def _convert_location(loc) -> GenbankLocation:
+    sub_locations = []
+
+    if loc.sub_locations_count > 0:
+        sub_locations_array = ffi.cast("GenbankLocation *", loc.sub_locations)
+        for i in range(loc.sub_locations_count):
+            sub_loc = sub_locations_array[i]
+            sub_locations.append(_convert_location(sub_loc))
+
+    return GenbankLocation(
+        loc.start,
+        loc.end,
+        bool(loc.complement),
+        bool(loc.join),
+        bool(loc.five_prime_partial),
+        bool(loc.three_prime_partial),
+        ffi.string(loc.gbk_location_string).decode('utf-8') if loc.gbk_location_string else "",
+        sub_locations
+    )
+
+def _convert_feature(feature) -> GenbankFeature:
+    attributes = {}
+    for i in range(feature.attribute_count):
+        key = ffi.string(feature.attribute_keys[i]).decode('utf-8')
+        values = [ffi.string(feature.attribute_values[i][j]).decode('utf-8')
+                  for j in range(feature.attribute_value_counts[i])]
+        attributes[key] = values
+
+    location = _convert_location(feature.location)
+
+    return GenbankFeature(
+        ffi.string(feature.type_).decode('utf-8'),
+        ffi.string(feature.description).decode('utf-8'),
+        attributes,
+        ffi.string(feature.sequence_hash).decode('utf-8'),
+        ffi.string(feature.sequence_hash_function).decode('utf-8'),
+        ffi.string(feature.sequence).decode('utf-8'),
+        location
+    )