Skip to content

Commit

Permalink
add genbank parser to python (#87)
Browse files Browse the repository at this point in the history
This adds a genbank parser to python through the C shared library. Most of this was written by AI, because it's pretty annoying to write, and is mostly just piping data. Amazingly, it seems to work. I also added a little benchmark showing that we have comparable performance to biopython - but I'm sure some of our other features will be even better in that regard.

- Keoni
  • Loading branch information
Koeng101 authored Aug 17, 2024
1 parent bb29c87 commit 888031d
Show file tree
Hide file tree
Showing 9 changed files with 849 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Adds genbank parsing to python package. Release version 0.1.5 of dnadesign python. [#87](https://github.com/Koeng101/dnadesign/pull/87)
- Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86)
- Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84)
- Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83)
Expand Down
15 changes: 14 additions & 1 deletion py/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
# DnaDesign (Python)
This directory contains code for allowing python users to use dnadesign through a shared C library.

This is a work-in-progress. Right now, we have only ported the fasta parser.
This is a work-in-progress. Right now, we have only ported the fasta, fastq, and genbank parsers.

For genbank files, we have this benchmark with bsub.gbk:
```
Running 10 iterations for each parser...
Results:
DNA design average time: 0.337005 seconds
BioPython average time: 0.332007 seconds
DNA design is 0.99x faster than BioPython
```

Turns out, we're just about as performant!

### Other platforms
If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.
Expand Down
41 changes: 41 additions & 0 deletions py/benchmark_against_biopython.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import timeit
import os
from Bio import SeqIO
from dnadesign.parsers import parse_genbank_from_c_file

def benchmark_dnadesign(file_path, num_iterations=10):
def parse_with_dnadesign():
records = parse_genbank_from_c_file(file_path)
return records

time_taken = timeit.timeit(parse_with_dnadesign, number=num_iterations)
return time_taken / num_iterations

def benchmark_biopython(file_path, num_iterations=10):
def parse_with_biopython():
with open(file_path, "r") as handle:
records = list(SeqIO.parse(handle, "genbank"))
return records

time_taken = timeit.timeit(parse_with_biopython, number=num_iterations)
return time_taken / num_iterations

def main():
current_dir = os.path.dirname(__file__)
example_path = os.path.join(current_dir, '../lib/bio/genbank/data/bsub.gbk')

print(f"Benchmarking GenBank parsing for file: {example_path}")
print("Running 10 iterations for each parser...")

dnadesign_time = benchmark_dnadesign(example_path)
biopython_time = benchmark_biopython(example_path)

print(f"\nResults:")
print(f"DNA design average time: {dnadesign_time:.6f} seconds")
print(f"BioPython average time: {biopython_time:.6f} seconds")

speedup = biopython_time / dnadesign_time
print(f"\nDNA design is {speedup:.2f}x faster than BioPython")

if __name__ == "__main__":
main()
98 changes: 98 additions & 0 deletions py/dnadesign/definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,101 @@ typedef struct {

FastqResult ParseFastqFromCFile(void* cfile);
FastqResult ParseFastqFromCString(char* cstring);

// Genbank definitions
// GenbankLocation
typedef struct GenbankLocation {
int start;
int end;
int complement;
int join;
int five_prime_partial;
int three_prime_partial;
char* gbk_location_string;
struct GenbankLocation* sub_locations;
int sub_locations_count;
} GenbankLocation;

// GenbankFeature
typedef struct {
char* type_;
char* description;
char** attribute_keys;
char*** attribute_values;
int* attribute_value_counts;
int attribute_count;
char* sequence_hash;
char* sequence_hash_function;
char* sequence;
GenbankLocation location;
} GenbankFeature;

// GenbankReference
typedef struct {
char* authors;
char* title;
char* journal;
char* pub_med;
char* remark;
char* range_;
char* consortium;
} GenbankReference;

// GenbankLocus
typedef struct {
char* name;
char* sequence_length;
char* molecule_type;
char* genbank_division;
char* modification_date;
char* sequence_coding;
int circular;
} GenbankLocus;

// GenbankBaseCount
typedef struct {
char base;
int count;
} GenbankBaseCount;

// GenbankMeta
typedef struct {
char* date;
char* definition;
char* accession;
char* version;
char* keywords;
char* organism;
char* source;
char** taxonomy;
int taxonomy_count;
char* origin;
GenbankLocus locus;
GenbankReference* references;
int reference_count;
GenbankBaseCount* base_counts;
int base_count_count;
char** other_keys;
char** other_values;
int other_count;
char* name;
char* sequence_hash;
char* sequence_hash_function;
} GenbankMeta;

// Genbank
typedef struct {
GenbankMeta meta;
GenbankFeature* features;
int feature_count;
char* sequence;
} Genbank;

typedef struct {
Genbank* records;
int numRecords;
char* error;
} GenbankResult;

GenbankResult ParseGenbankFromCFile(void* cfile);
GenbankResult ParseGenbankFromCString(char* cstring);
209 changes: 209 additions & 0 deletions py/dnadesign/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,212 @@ def _process_fastq_result(result) -> List[FastqRecord]:
optionals
))
return fastq_records

# Genbank time
class GenbankLocus:
def __init__(self, name: str, sequence_length: str, molecule_type: str, genbank_division: str,
modification_date: str, sequence_coding: str, circular: bool):
self.name = name
self.sequence_length = sequence_length
self.molecule_type = molecule_type
self.genbank_division = genbank_division
self.modification_date = modification_date
self.sequence_coding = sequence_coding
self.circular = circular

class GenbankReference:
def __init__(self, authors: str, title: str, journal: str, pub_med: str, remark: str, range_: str, consortium: str):
self.authors = authors
self.title = title
self.journal = journal
self.pub_med = pub_med
self.remark = remark
self.range_ = range_
self.consortium = consortium

class GenbankBaseCount:
def __init__(self, base: str, count: int):
self.base = base
self.count = count

class GenbankMeta:
def __init__(self, date: str, definition: str, accession: str, version: str, keywords: str,
organism: str, source: str, taxonomy: List[str], origin: str, locus: GenbankLocus,
references: List[GenbankReference], base_counts: List[GenbankBaseCount], other: Dict[str, str],
name: str, sequence_hash: str, hash_function: str):
self.date = date
self.definition = definition
self.accession = accession
self.version = version
self.keywords = keywords
self.organism = organism
self.source = source
self.taxonomy = taxonomy
self.origin = origin
self.locus = locus
self.references = references
self.base_counts = base_counts
self.other = other
self.name = name
self.sequence_hash = sequence_hash
self.hash_function = hash_function

# Update the Location class to match the C struct
class GenbankLocation:
def __init__(self, start: int, end: int, complement: bool, join: bool, five_prime_partial: bool,
three_prime_partial: bool, gbk_location_string: str, sub_locations: List['GenbankLocation']):
self.start = start
self.end = end
self.complement = complement
self.join = join
self.five_prime_partial = five_prime_partial
self.three_prime_partial = three_prime_partial
self.gbk_location_string = gbk_location_string
self.sub_locations = sub_locations

class GenbankFeature:
def __init__(self, type_: str, description: str, attributes: Dict[str, List[str]],
sequence_hash: str, hash_function: str, sequence: str, location: GenbankLocation):
self.type_ = type_
self.description = description
self.attributes = attributes
self.sequence_hash = sequence_hash
self.hash_function = hash_function
self.sequence = sequence
self.location = location

class Genbank:
def __init__(self, meta: GenbankMeta, features: List[GenbankFeature], sequence: str):
self.meta = meta
self.features = features
self.sequence = sequence

def _safe_open_file(file_path: str):
if not os.path.exists(file_path):
raise FileNotFoundError(f"The file {file_path} does not exist.")
cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
if cfile == ffi.NULL:
raise IOError(f"Failed to open the file {file_path}.")
return cfile

def parse_genbank_from_c_file(file_path: str) -> List[Genbank]:
try:
cfile = _safe_open_file(file_path)
result = lib.ParseGenbankFromCFile(cfile)
return _process_genbank_result(result)
finally:
if 'cfile' in locals() and cfile != ffi.NULL:
lib.fclose(cfile)

def parse_genbank_from_c_string(cstring: str) -> List[Genbank]:
result = lib.ParseGenbankFromCString(cstring.encode('utf-8'))
return _process_genbank_result(result)

def _process_genbank_result(result) -> List[Genbank]:
if result.error != ffi.NULL:
error_str = ffi.string(result.error).decode('utf-8')
raise Exception("Error parsing Genbank: " + error_str)
num_records = result.numRecords
records = ffi.cast("Genbank*", result.records)
return [_convert_genbank_record(records[i]) for i in range(num_records)]

def _convert_genbank_record(record) -> Genbank:
meta = _convert_meta(record.meta)
features = [_convert_feature(record.features[i]) for i in range(record.feature_count)]
sequence = ffi.string(record.sequence).decode('utf-8')
return Genbank(meta, features, sequence)

def _convert_meta(meta) -> GenbankMeta:
locus = GenbankLocus(
ffi.string(meta.locus.name).decode('utf-8'),
ffi.string(meta.locus.sequence_length).decode('utf-8'),
ffi.string(meta.locus.molecule_type).decode('utf-8'),
ffi.string(meta.locus.genbank_division).decode('utf-8'),
ffi.string(meta.locus.modification_date).decode('utf-8'),
ffi.string(meta.locus.sequence_coding).decode('utf-8'),
meta.locus.circular
)

references = [
GenbankReference(
ffi.string(ref.authors).decode('utf-8'),
ffi.string(ref.title).decode('utf-8'),
ffi.string(ref.journal).decode('utf-8'),
ffi.string(ref.pub_med).decode('utf-8'),
ffi.string(ref.remark).decode('utf-8'),
ffi.string(ref.range_).decode('utf-8'),
ffi.string(ref.consortium).decode('utf-8')
) for ref in meta.references[0:meta.reference_count]
]

base_counts = [
GenbankBaseCount(
ffi.string(bc.base).decode('utf-8'),
bc.count
) for bc in meta.base_counts[0:meta.base_count_count]
]

other = {}
for i in range(meta.other_count):
key = ffi.string(meta.other_keys[i]).decode('utf-8')
value = ffi.string(meta.other_values[i]).decode('utf-8')
other[key] = value

return GenbankMeta(
ffi.string(meta.date).decode('utf-8'),
ffi.string(meta.definition).decode('utf-8'),
ffi.string(meta.accession).decode('utf-8'),
ffi.string(meta.version).decode('utf-8'),
ffi.string(meta.keywords).decode('utf-8'),
ffi.string(meta.organism).decode('utf-8'),
ffi.string(meta.source).decode('utf-8'),
[ffi.string(tax).decode('utf-8') for tax in meta.taxonomy[0:meta.taxonomy_count]],
ffi.string(meta.origin).decode('utf-8'),
locus,
references,
base_counts,
other,
ffi.string(meta.name).decode('utf-8'),
ffi.string(meta.sequence_hash).decode('utf-8'),
ffi.string(meta.sequence_hash_function).decode('utf-8')
)

def _convert_location(loc) -> GenbankLocation:
sub_locations = []

if loc.sub_locations_count > 0:
sub_locations_array = ffi.cast("GenbankLocation *", loc.sub_locations)
for i in range(loc.sub_locations_count):
sub_loc = sub_locations_array[i]
sub_locations.append(_convert_location(sub_loc))

return GenbankLocation(
loc.start,
loc.end,
bool(loc.complement),
bool(loc.join),
bool(loc.five_prime_partial),
bool(loc.three_prime_partial),
ffi.string(loc.gbk_location_string).decode('utf-8') if loc.gbk_location_string else "",
sub_locations
)

def _convert_feature(feature) -> GenbankFeature:
attributes = {}
for i in range(feature.attribute_count):
key = ffi.string(feature.attribute_keys[i]).decode('utf-8')
values = [ffi.string(feature.attribute_values[i][j]).decode('utf-8')
for j in range(feature.attribute_value_counts[i])]
attributes[key] = values

location = _convert_location(feature.location)

return GenbankFeature(
ffi.string(feature.type_).decode('utf-8'),
ffi.string(feature.description).decode('utf-8'),
attributes,
ffi.string(feature.sequence_hash).decode('utf-8'),
ffi.string(feature.sequence_hash_function).decode('utf-8'),
ffi.string(feature.sequence).decode('utf-8'),
location
)
Loading

0 comments on commit 888031d

Please sign in to comment.