diff --git a/CHANGELOG.md b/CHANGELOG.md index c2fe0c05..78bfc0a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,15 +8,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) -- Added a parser and writer for the `pileup` sequence alignment format (#329) +- Generic parser is now implemented across all parsers for consistent interactions. [(#339)](https://github.com/TimothyStiles/poly/issues/339) +- Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package [(#305)](https://github.com/TimothyStiles/poly/issues/305) +- Added a parser and writer for the `pileup` sequence alignment format [(#329)](https://github.com/TimothyStiles/poly/issues/329) +- Created copy methods for Feature and Location to address concerns raised by [(#342)](https://github.com/TimothyStiles/poly/issues/342) +- Created new methods to convert polyjson -> genbank. +- Created new `Feature.StoreSequence` method to enable [(#388)](https://github.com/TimothyStiles/poly/issues/388) + +### Changed +- **Breaking**: Genbank parser uses new custom multimap for `Feature.Attributes`, which allows for duplicate keys. This changes the type of Features.Attributes from `map[string]string` to `MultiMap[string, string]`, an alias for `map[string]string` defined in `multimap.go`. [(#383)](https://github.com/TimothyStiles/poly/issues/383) +- Improves error reporting for genbank parse errors via a new `ParseError` struct. ### Fixed -- `fastq` parser no longer becomes de-aligned when reading (#325) -- `fastq` now handles optionals correctly (#323) +- `fastq` parser no longer becomes de-aligned when reading [(#325)](https://github.com/TimothyStiles/poly/issues/325) +- `fastq` now handles optionals correctly [(#323)](https://github.com/TimothyStiles/poly/issues/323) +- Adds functional test and fix for [(#313)](https://github.com/TimothyStiles/poly/issues/313). +- In addition to expanding the set of genbank files which can be validly parsed, the parser is more vocal when it encounters unusual syntax in the "feature" section. This "fail fast" approach is better as there were cases where inputs triggered a codepath which would neither return a valid Genbank object nor an error, and should help with debugging. ## [0.26.0] - 2023-07-22 Oops, we weren't keeping a changelog before this tag! [unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main -[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 \ No newline at end of file +[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d4bcb86..b29dc2a8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -93,6 +93,18 @@ In order to simplify the development experience, and environment setup, the poly Whether you're a beginner with Go or you're an experienced developer, You should see the suggestions popup automatically when you goto the *Plugins* tab in VSCode. Using these plugins can help accelerate the development experience and also allow you to work more collaboratively with other poly developers. +## Local Checks + +Poly runs numerous CI/CD checks via Github Actions before a PR can be merged. In order to make your PR mergeable, your PR must pass all of these checks. + +A quick way to check your PR will pass is to run: + +```sh +gofmt -s -w . && go test ./... 
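+# If golangci-lint is installed locally (see the link below), the linter can be run as well:
+golangci-lint run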
+``` + +Additionally, you may want to [install](https://golangci-lint.run/usage/install/#local-installation) and run the linter. + # How to report a bug ### Security disclosures diff --git a/bio/bio.go b/bio/bio.go new file mode 100644 index 00000000..628e189b --- /dev/null +++ b/bio/bio.go @@ -0,0 +1,275 @@ +/* +Package bio provides utilities for reading and writing sequence data. + +There are many different biological file formats for different applications. +The poly bio package provides a consistent way to work with each of these file +formats. The underlying data returned by each parser is as raw as we can return +while still being easy to use for downstream applications, and should be +immediately recognizable as the original format. +*/ +package bio + +import ( + "bufio" + "context" + "errors" + "io" + "math" + + "github.com/TimothyStiles/poly/bio/fasta" + "github.com/TimothyStiles/poly/bio/fastq" + "github.com/TimothyStiles/poly/bio/genbank" + "github.com/TimothyStiles/poly/bio/pileup" + "github.com/TimothyStiles/poly/bio/slow5" + "golang.org/x/sync/errgroup" +) + +// Format is a enum of different parser formats. +type Format int + +const ( + Fasta Format = iota + Fastq + Genbank + Slow5 + Pileup +) + +// DefaultMaxLineLength variables are defined for performance reasons. While +// parsing, reading byte-by-byte takes far, far longer than reading many bytes +// into a buffer. In golang, this buffer in bufio is usually 64kb. However, +// many files, especially slow5 files, can contain single lines that are much +// longer. We use the default maxLineLength from bufio unless there is a +// particular reason to use a different number. +const defaultMaxLineLength int = bufio.MaxScanTokenSize // 64kB is a magic number often used by the Go stdlib for parsing. +var DefaultMaxLengths = map[Format]int{ + Fasta: defaultMaxLineLength, + Fastq: 8 * 1024 * 1024, // The longest single nanopore sequencing read so far is 4Mb. A 8mb buffer should be large enough for any sequencing. + Genbank: defaultMaxLineLength, + Slow5: 128 * 1024 * 1024, // 128mb is used because slow5 lines can be massive, since a single read can be many millions of base pairs. + Pileup: defaultMaxLineLength, +} + +/****************************************************************************** +Aug 30, 2023 + +Lower level interfaces + +******************************************************************************/ + +// parserInterface is a generic interface that all parsers must support. It is +// very simple, only requiring two functions, Header() and Next(). Header() +// returns the header of the file if there is one: most files, like fasta, +// fastq, and pileup do not contain headers, while others like sam and slow5 do +// have headers. Next() returns a record/read/line from the file format, and +// terminates on an io.EOF error. +// +// Next() may terminate with an io.EOF error with a nil Data or with a +// full Data, depending on where the EOF is in the actual file. A check +// for this is needed at the last Next(), when it returns an io.EOF error. A +// pointer is used to represent the difference between a null Data and an +// empty Data. +type parserInterface[Data io.WriterTo, Header io.WriterTo] interface { + Header() (Header, error) + Next() (Data, error) +} + +/****************************************************************************** + +Higher level parse + +******************************************************************************/ + +// Parser is generic bioinformatics file parser. 
It contains a LowerLevelParser +// and implements useful functions on top of it: such as Parse(), ParseToChannel(), and +// ParseWithHeader(). +type Parser[Data io.WriterTo, Header io.WriterTo] struct { + parserInterface parserInterface[Data, Header] +} + +// NewFastaParser initiates a new FASTA parser from an io.Reader. +func NewFastaParser(r io.Reader) (*Parser[*fasta.Record, *fasta.Header], error) { + return NewFastaParserWithMaxLineLength(r, DefaultMaxLengths[Fasta]) +} + +// NewFastaParserWithMaxLineLength initiates a new FASTA parser from an +// io.Reader and a user-given maxLineLength. +func NewFastaParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*fasta.Record, *fasta.Header], error) { + return &Parser[*fasta.Record, *fasta.Header]{parserInterface: fasta.NewParser(r, maxLineLength)}, nil +} + +// NewFastqParser initiates a new FASTQ parser from an io.Reader. +func NewFastqParser(r io.Reader) (*Parser[*fastq.Read, *fastq.Header], error) { + return NewFastqParserWithMaxLineLength(r, DefaultMaxLengths[Fastq]) +} + +// NewFastqParserWithMaxLineLength initiates a new FASTQ parser from an +// io.Reader and a user-given maxLineLength. +func NewFastqParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*fastq.Read, *fastq.Header], error) { + return &Parser[*fastq.Read, *fastq.Header]{parserInterface: fastq.NewParser(r, maxLineLength)}, nil +} + +// NewGenbankParser initiates a new Genbank parser form an io.Reader. +func NewGenbankParser(r io.Reader) (*Parser[*genbank.Genbank, *genbank.Header], error) { + return NewGenbankParserWithMaxLineLength(r, DefaultMaxLengths[Genbank]) +} + +// NewGenbankParserWithMaxLineLength initiates a new Genbank parser from an +// io.Reader and a user-given maxLineLength. +func NewGenbankParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*genbank.Genbank, *genbank.Header], error) { + return &Parser[*genbank.Genbank, *genbank.Header]{parserInterface: genbank.NewParser(r, maxLineLength)}, nil +} + +// NewSlow5Parser initiates a new SLOW5 parser from an io.Reader. +func NewSlow5Parser(r io.Reader) (*Parser[*slow5.Read, *slow5.Header], error) { + return NewSlow5ParserWithMaxLineLength(r, DefaultMaxLengths[Slow5]) +} + +// NewSlow5ParserWithMaxLineLength initiates a new SLOW5 parser from an +// io.Reader and a user-given maxLineLength. +func NewSlow5ParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*slow5.Read, *slow5.Header], error) { + parser, err := slow5.NewParser(r, maxLineLength) + return &Parser[*slow5.Read, *slow5.Header]{parserInterface: parser}, err +} + +// NewPileupParser initiates a new Pileup parser from an io.Reader. +func NewPileupParser(r io.Reader) (*Parser[*pileup.Line, *pileup.Header], error) { + return NewPileupParserWithMaxLineLength(r, DefaultMaxLengths[Pileup]) +} + +// NewPileupParserWithMaxLineLength initiates a new Pileup parser from an +// io.Reader and a user-given maxLineLength. +func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*pileup.Line, *pileup.Header], error) { + return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)}, nil +} + +/****************************************************************************** + +Parser higher-level functions + +******************************************************************************/ + +// Next is a parsing primitive that should be used when low-level control is +// needed. It returns the next record/read/line from the parser. 
On EOF, it +// returns an io.EOF error, though the returned FASTA/FASTQ/SLOW5/Pileup may or +// may not be nil, depending on where the io.EOF is. This should be checked by +// downstream software. Next can only be called as many times as there are +// records in a file, as the parser reads the underlying io.Reader in a +// straight line. +func (p *Parser[Data, Header]) Next() (Data, error) { + return p.parserInterface.Next() +} + +// Header is a parsing primitive that should be used when low-level control is +// needed. It returns the header of the parser, which is usually parsed prior +// to the parser being returned by the "NewXXXParser" functions. Unlike the +// Next() function, Header() can be called as many times as needed. Sometimes +// files have useful headers, while other times they do not. +// +// The following file formats do not have a useful header: +// +// FASTA +// FASTQ +// Pileup +// +// The following file formats do have a useful header: +// +// SLOW5 +func (p *Parser[Data, Header]) Header() (Header, error) { + return p.parserInterface.Header() +} + +// ParseN returns a countN number of records/reads/lines from the parser. +func (p *Parser[Data, Header]) ParseN(countN int) ([]Data, error) { + var records []Data + for counter := 0; counter < countN; counter++ { + record, err := p.Next() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil // EOF not treated as parsing error. + } + return records, err + } + records = append(records, record) + } + return records, nil +} + +// Parse returns all records/reads/lines from the parser, but does not include +// the header. It can only be called once on a given parser because it will +// read all the input from the underlying io.Reader before exiting. +func (p *Parser[Data, Header]) Parse() ([]Data, error) { + return p.ParseN(math.MaxInt) +} + +// ParseWithHeader returns all records/reads/lines, plus the header, from the +// parser. It can only be called once on a given parser because it will read +// all the input from the underlying io.Reader before exiting. +func (p *Parser[Data, Header]) ParseWithHeader() ([]Data, Header, error) { + header, headerErr := p.Header() + data, err := p.Parse() + if headerErr != nil { + return data, header, err + } + if err != nil { + return data, header, err + } + return data, header, nil +} + +/****************************************************************************** + +Concurrent higher-level functions + +******************************************************************************/ + +// ParseToChannel pipes all records/reads/lines from a parser into a channel, +// then optionally closes that channel. If parsing a single file, +// "keepChannelOpen" should be set to false, which will close the channel once +// parsing is complete. If many files are being parsed to a single channel, +// keepChannelOpen should be set to true, so that an external function will +// close channel once all are done parsing. +// +// Context can be used to close the parser in the middle of parsing - for +// example, if an error is found in another parser elsewhere and all files +// need to close. +func (p *Parser[Data, Header]) ParseToChannel(ctx context.Context, channel chan<- Data, keepChannelOpen bool) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + record, err := p.Next() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil // EOF not treated as parsing error. 
+ } + if !keepChannelOpen { + close(channel) + } + return err + } + channel <- record + } + } +} + +// ManyToChannel is a generic function that implements the ManyXXXToChannel +// functions. It properly does concurrent parsing of many parsers to a +// single channel, then closes that channel. If any of the files fail to +// parse, the entire pipeline exits and returns. +func ManyToChannel[Data io.WriterTo, Header io.WriterTo](ctx context.Context, channel chan<- Data, parsers ...*Parser[Data, Header]) error { + errorGroup, ctx := errgroup.WithContext(ctx) + // For each parser, start a new goroutine to parse data to the channel + for _, p := range parsers { + parser := p // Copy to local variable to avoid loop variable scope issues + errorGroup.Go(func() error { + // Use the parser's ParseToChannel function, but keep the channel open + return parser.ParseToChannel(ctx, channel, true) + }) + } + // Wait for all parsers to complete + err := errorGroup.Wait() + close(channel) + return err +} diff --git a/bio/example_test.go b/bio/example_test.go new file mode 100644 index 00000000..d125c095 --- /dev/null +++ b/bio/example_test.go @@ -0,0 +1,290 @@ +package bio_test + +import ( + "bytes" + "compress/gzip" + "context" + "fmt" + "os" + "strings" + + "github.com/TimothyStiles/poly/bio" + "github.com/TimothyStiles/poly/bio/fasta" +) + +// Example_read shows an example of reading a file from disk. +func Example_read() { + // Read lets you read files from disk into a parser. + file, _ := os.Open("fasta/data/base.fasta") + parser, _ := bio.NewFastaParser(file) + + records, _ := parser.Parse() + + fmt.Println(records[1].Sequence) + // Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK* +} + +// Example_readGz shows an example of reading and parsing a gzipped file. +func Example_readGz() { + fileGz, _ := os.Open("fasta/data/base.fasta.gz") + file, _ := gzip.NewReader(fileGz) + parser, _ := bio.NewFastaParser(file) + records, _ := parser.Parse() + + fmt.Println(records[1].Sequence) + // Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK* +} + +func Example_newParserGz() { + // First, lets make a file that is gzip'd, represented by this + // buffer. 
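+	// The gzip writer is closed before reading so that the compressed stream is fully flushed.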
+ var file bytes.Buffer + zipWriter := gzip.NewWriter(&file) + _, _ = zipWriter.Write([]byte(`>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] +LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV +EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG +LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL +GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX +IENY + +>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken +ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID +FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA +DIDGDGQVNYEEFVQMMTAK*`)) + zipWriter.Close() + + fileDecompressed, _ := gzip.NewReader(&file) // Decompress the file + parser, _ := bio.NewFastaParser(fileDecompressed) + records, _ := parser.Parse() // Parse all data records from file + + fmt.Println(records[1].Sequence) + // Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK* +} + +func ExampleParser_ParseWithHeader() { + // The following can be replaced with a any io.Reader. For example, + // `file, err := os.Open(path)` for file would also work. + file := strings.NewReader(`#slow5_version 0.2.0 +#num_read_groups 1 +@asic_id 4175987214 +#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char* +#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number +0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10 +`) + parser, _ := bio.NewSlow5Parser(file) + reads, header, _ := parser.ParseWithHeader() // Parse all data records from file + + fmt.Printf("%s, %s\n", header.HeaderValues[0].Slow5Version, reads[0].ReadID) + // Output: 0.2.0, 0026631e-33a3-49ab-aa22-3ab157d71f8b +} + +func ExampleParser_ParseToChannel() { + // The following can be replaced with a any io.Reader. For example, + // `file, err := os.Open(path)` for file would also work. 
+ file := strings.NewReader(`>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] +LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV +EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG +LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL +GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX +IENY + +>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken +ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID +FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA +DIDGDGQVNYEEFVQMMTAK*`) + parser, _ := bio.NewFastaParser(file) + + channel := make(chan *fasta.Record) + ctx := context.Background() + go func() { _ = parser.ParseToChannel(ctx, channel, false) }() + + var records []*fasta.Record + for record := range channel { + records = append(records, record) + } + + fmt.Println(records[1].Sequence) + // Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK* +} + +func ExampleManyToChannel() { + file1 := strings.NewReader(`>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] +LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV +EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG +LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL +GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX +IENY +`) + file2 := strings.NewReader(`>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken +ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID +FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA +DIDGDGQVNYEEFVQMMTAK*`) + parser1, _ := bio.NewFastaParser(file1) + parser2, _ := bio.NewFastaParser(file2) + + channel := make(chan *fasta.Record) + ctx := context.Background() + go func() { _ = bio.ManyToChannel(ctx, channel, parser1, parser2) }() + + var records []*fasta.Record + for record := range channel { + records = append(records, record) + } + + fmt.Println(len(records)) // Records come out in a stochastic order, so we just make sure there are 2 + // Output: 2 +} + +func Example_writeAll() { + // The following can be replaced with a any io.Reader. For example, + // `file, err := os.Open(path)` for file would also work. + file := strings.NewReader(`#slow5_version 0.2.0 +#num_read_groups 1 +@asic_id 4175987214 +#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char* +#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number +0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10 +`) + parser, _ := bio.NewSlow5Parser(file) + reads, header, _ := parser.ParseWithHeader() // Parse all data records from file + + // Write the files to an io.Writer. + // All headers and all records implement io.WriterTo interfaces. 
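+	// A bytes.Buffer is used here, but any io.Writer (an os.File, for example) works just as well.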
+ var buffer bytes.Buffer + _, _ = header.WriteTo(&buffer) + for _, read := range reads { + _, _ = read.WriteTo(&buffer) + } + + fmt.Println(buffer.String()) + // Output: #slow5_version 0.2.0 + //#num_read_groups 1 + //@asic_id 4175987214 + //#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char* + //#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number + //0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10 + // + // +} + +func ExampleNewFastaParser() { + // The following can be replaced with a any io.Reader. For example, + // `file, err := os.Open(path)` for file would also work. + file := strings.NewReader(`>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] +LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV +EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG +LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL +GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX +IENY + +>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken +ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID +FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA +DIDGDGQVNYEEFVQMMTAK*`) + parser, _ := bio.NewFastaParser(file) + records, _ := parser.Parse() // Parse all data records from file + + fmt.Println(records[1].Sequence) + // Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK* +} + +func ExampleNewFastqParser() { + // The following can be replaced with a any io.Reader. For example, + // `file, err := os.Open(path)` for file would also work. 
+ file := strings.NewReader(`@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;' && !p.start: + record, err := p.newRecord() + // New name + p.identifier = string(line[1:]) + return &record, err + // Process first line of file + case line[0] == '>' && p.start: + p.identifier = string(line[1:]) + p.start = false + } + } + p.more = false + // Add final sequence in file + record, err := p.newRecord() + if err != nil { + return &record, err + } + return &record, nil +} + +func (p *Parser) newRecord() (Record, error) { + sequence := p.buff.String() + if sequence == "" { + return Record{}, fmt.Errorf("%s has no sequence", p.identifier) + } + record := Record{ + Identifier: p.identifier, + Sequence: sequence, + } + // Reset sequence buffer + p.buff.Reset() + return record, nil +} + +///****************************************************************************** +// +//Start of Write functions +// +//******************************************************************************/ + +// WriteTo implements the io.WriterTo interface for fasta records. 
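+// It writes a '>' identifier line followed by the sequence wrapped at 80 characters per line.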
+func (record *Record) WriteTo(w io.Writer) (int64, error) { + var writtenBytes int64 + var newWrittenBytes int + newWrittenBytes, err := w.Write([]byte(">")) + if err != nil { + return writtenBytes, err + } + writtenBytes += int64(newWrittenBytes) + newWrittenBytes, err = w.Write([]byte(record.Identifier)) + if err != nil { + return writtenBytes, err + } + writtenBytes += int64(newWrittenBytes) + newWrittenBytes, err = w.Write([]byte("\n")) + if err != nil { + return writtenBytes, err + } + writtenBytes += int64(newWrittenBytes) + + lineCount := 0 + // write the fasta sequence 80 characters at a time + for _, character := range record.Sequence { + newWrittenBytes, err = w.Write([]byte{byte(character)}) + if err != nil { + return writtenBytes, err + } + writtenBytes += int64(newWrittenBytes) + lineCount++ + if lineCount == 80 { + newWrittenBytes, err = w.Write([]byte("\n")) + if err != nil { + return writtenBytes, err + } + writtenBytes += int64(newWrittenBytes) + lineCount = 0 + } + } + newWrittenBytes, err = w.Write([]byte("\n\n")) + if err != nil { + return writtenBytes, err + } + writtenBytes += int64(newWrittenBytes) + return writtenBytes, nil +} diff --git a/bio/fasta/fasta_test.go b/bio/fasta/fasta_test.go new file mode 100644 index 00000000..ba5dfa67 --- /dev/null +++ b/bio/fasta/fasta_test.go @@ -0,0 +1,168 @@ +package fasta + +import ( + "bytes" + "errors" + "io" + "strings" + "testing" +) + +func TestHeader(t *testing.T) { + _, err := NewParser(nil, 256).Header() + if err != nil { + t.Errorf("Unexpected error while parsing header: %v", err) + } +} + +func TestParser(t *testing.T) { + for testIndex, test := range []struct { + content string + expected []Record + }{ + { + content: ">humen\nGATTACA\nCATGAT", // EOF-ended Fasta is valid + expected: []Record{{Identifier: "humen", Sequence: "GATTACACATGAT"}}, + }, + { + content: ">humen\nGATTACA\nCATGAT\n", + expected: []Record{{Identifier: "humen", Sequence: "GATTACACATGAT"}}, + }, + { + content: ">doggy or something\nGATTACA\n\nCATGAT\n\n;a fun comment\n" + + ">homunculus\nAAAA\n", + expected: []Record{ + {Identifier: "doggy or something", Sequence: "GATTACACATGAT"}, + {Identifier: "homunculus", Sequence: "AAAA"}, + }, + }, + } { + var fastas []Record + parser := NewParser(strings.NewReader(test.content), 256) + for { + fa, err := parser.Next() + if err != nil { + if !errors.Is(err, io.EOF) { + t.Errorf("Got error: %s", err) + } + break + } + fastas = append(fastas, *fa) + } + if len(fastas) != len(test.expected) { + t.Errorf("case index %d: got %d fastas, expected %d", testIndex, len(fastas), len(test.expected)) + continue + } + for index, gotFasta := range fastas { + expected := test.expected[index] + if expected != gotFasta { + t.Errorf("got!=expected: %+v != %+v", gotFasta, expected) + } + } + } +} + +// TestReadEmptyFasta tests that an empty fasta file is parsed correctly. +func TestReadEmptyFasta(t *testing.T) { + var fastas []Record + var targetError error + emptyFasta := "testing\natagtagtagtagtagatgatgatgatgagatg\n\n\n\n\n\n\n\n\n\n\n" + parser := NewParser(strings.NewReader(emptyFasta), 256) + for { + fa, err := parser.Next() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil // EOF not treated as parsing error. 
+ } + targetError = err + break + } + fastas = append(fastas, *fa) + } + if targetError == nil { + t.Errorf("expected error reading empty fasta stream") + } + if len(fastas) != 0 { + t.Errorf("expected 1 fastas, got %d", len(fastas)) + } +} + +func TestReadEmptySequence(t *testing.T) { + var targetError error + emptyFasta := ">testing\natagtagtagtagtagatgatgatgatgagatg\n>testing2\n\n\n\n\n\n\n\n\n\n" + parser := NewParser(strings.NewReader(emptyFasta), 256) + for { + _, err := parser.Next() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil // EOF not treated as parsing error. + } + targetError = err + break + } + } + if targetError == nil { + t.Errorf("expected error reading empty fasta sequence stream: %s", targetError) + } +} + +func TestBufferSmall(t *testing.T) { + var targetError error + emptyFasta := ">test\natagtagtagtagtagatgatgatgatgagatg\n>test\n\n\n\n\n\n\n\n\n\n" + parser := NewParser(strings.NewReader(emptyFasta), 8) + for { + _, err := parser.Next() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil // EOF not treated as parsing error. + } + targetError = err + break + } + } + if targetError == nil { + t.Errorf("expected error with too small of a buffer") + } +} + +// The following functions help test for writing with a limit in order to get +// that sweet sweet test coverage. +// LimitedWriter wraps another io.Writer and returns an error if more than maxSize bytes are written. +type LimitedWriter struct { + w io.Writer + written int64 + maxSize int64 +} + +func NewLimitedWriter(w io.Writer, maxSize int64) *LimitedWriter { + return &LimitedWriter{ + w: w, + written: 0, + maxSize: maxSize, + } +} + +func (lw *LimitedWriter) Write(p []byte) (int, error) { + if int64(len(p)) > (lw.maxSize - lw.written) { + return 0, errors.New("write exceeds maximum size") + } + n, err := lw.w.Write(p) + lw.written += int64(n) + return n, err +} + +func TestWrite(t *testing.T) { + // 81 polyA + s := ">test\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + parser := NewParser(strings.NewReader(s), 1024) + fa, err := parser.Next() + if err != nil { + t.Errorf("Failed to read polyA: %s", err) + } + byteSizes := []int{0, 1, 5, 6, 7, 85, 86, 87, 89, 128} + for _, i := range byteSizes { + var buf bytes.Buffer + writer := NewLimitedWriter(&buf, int64(i)) + _, _ = fa.WriteTo(writer) + } +} diff --git a/io/fastq/data/nanosavseq.fastq b/bio/fastq/data/nanosavseq.fastq similarity index 100% rename from io/fastq/data/nanosavseq.fastq rename to bio/fastq/data/nanosavseq.fastq diff --git a/io/fastq/data/nanosavseq.fastq.gz b/bio/fastq/data/nanosavseq.fastq.gz similarity index 100% rename from io/fastq/data/nanosavseq.fastq.gz rename to bio/fastq/data/nanosavseq.fastq.gz diff --git a/io/fastq/data/nanosavseq_emptyseq.fastq b/bio/fastq/data/nanosavseq_emptyseq.fastq similarity index 100% rename from io/fastq/data/nanosavseq_emptyseq.fastq rename to bio/fastq/data/nanosavseq_emptyseq.fastq diff --git a/io/fastq/data/nanosavseq_noidentifier.fastq b/bio/fastq/data/nanosavseq_noidentifier.fastq similarity index 100% rename from io/fastq/data/nanosavseq_noidentifier.fastq rename to bio/fastq/data/nanosavseq_noidentifier.fastq diff --git a/io/fastq/data/nanosavseq_noplus.fastq b/bio/fastq/data/nanosavseq_noplus.fastq similarity index 100% rename from io/fastq/data/nanosavseq_noplus.fastq rename to bio/fastq/data/nanosavseq_noplus.fastq diff --git a/io/fastq/data/nanosavseq_noquality.fastq b/bio/fastq/data/nanosavseq_noquality.fastq similarity index 100% rename 
from io/fastq/data/nanosavseq_noquality.fastq rename to bio/fastq/data/nanosavseq_noquality.fastq diff --git a/io/fastq/data/nanosavseq_noquality2.fastq b/bio/fastq/data/nanosavseq_noquality2.fastq similarity index 100% rename from io/fastq/data/nanosavseq_noquality2.fastq rename to bio/fastq/data/nanosavseq_noquality2.fastq diff --git a/io/fastq/data/nanosavseq_noseq.fastq b/bio/fastq/data/nanosavseq_noseq.fastq similarity index 100% rename from io/fastq/data/nanosavseq_noseq.fastq rename to bio/fastq/data/nanosavseq_noseq.fastq diff --git a/bio/fastq/data/pOpen_v3.fastq.gz b/bio/fastq/data/pOpen_v3.fastq.gz new file mode 100644 index 00000000..bcd497f0 Binary files /dev/null and b/bio/fastq/data/pOpen_v3.fastq.gz differ diff --git a/bio/fastq/example_test.go b/bio/fastq/example_test.go new file mode 100644 index 00000000..22b59786 --- /dev/null +++ b/bio/fastq/example_test.go @@ -0,0 +1,30 @@ +package fastq_test + +import ( + _ "embed" + "fmt" + "strings" + + "github.com/TimothyStiles/poly/bio/fastq" +) + +//go:embed data/nanosavseq.fastq +var baseFastq string + +func ExampleParser() { + parser := fastq.NewParser(strings.NewReader(baseFastq), 2*32*1024) + for { + fastq, err := parser.Next() + if err != nil { + fmt.Println(err) + break + } + fmt.Println(fastq.Identifier) + } + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 + //92728f25-b658-426c-8cd7-d82dc70dbf71 + //60907b6b-5e38-498e-9c07-f036ebd8c658 + //990e110e-5e50-41a2-8ad5-92044d4465b8 + //EOF +} diff --git a/bio/fastq/fastq.go b/bio/fastq/fastq.go new file mode 100644 index 00000000..c5ef5312 --- /dev/null +++ b/bio/fastq/fastq.go @@ -0,0 +1,217 @@ +/* +Package fastq contains fastq parsers and writers. + +Fastq is a flat text file format developed in ~2000 to store nucleotide +sequencing data. While similar to fastq, fastq has a few differences. First, +the sequence identifier begins with @ instead of >, and includes quality +values for a sequence. + +This package provides a parser and writer for working with Fastq formatted +sequencing data. +*/ +package fastq + +import ( + "bufio" + "errors" + "fmt" + "io" + "strings" +) + +/****************************************************************************** +March 22, 2023 + +Fastq Parser begins here + +I basically stole everything from the fasta parser, and added a few bits +for parsing out additional data from fastq nanopore files. Mwhahaha, stealing +code! + +Keoni + +******************************************************************************/ + +// Read is a struct representing a single Fastq read element with an Identifier, its corresponding sequence, its quality score, and any optional pieces of data. +type Read struct { + Identifier string `json:"identifier"` + Optionals map[string]string `json:"optionals"` // Nanopore, for example, carries along data like: `read=13956 ch=53 start_time=2020-11-11T01:49:01Z` + Sequence string `json:"sequence"` + Quality string `json:"quality"` +} + +// Header is a blank struct, needed for compatibility with bio parsers. It contains nothing. +type Header struct{} + +// WriteTo is a blank function, needed for compatibility with bio parsers. It doesn't do anything. +func (header *Header) WriteTo(w io.Writer) (int64, error) { + return 0, nil +} + +// Parser is a flexible parser that provides ample +// control over reading fastq-formatted sequences. +// It is initialized with NewParser. +type Parser struct { + // reader keeps state of current reader. + reader bufio.Reader + line uint + atEOF bool +} + +// Header returns nil,nil. 
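+// Fastq files contain no header, so an empty Header and a nil error are always returned.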
+func (parser *Parser) Header() (*Header, error) { + return &Header{}, nil +} + +// NewParser returns a Parser that uses r as the source +// from which to parse fastq formatted sequences. +func NewParser(r io.Reader, maxLineSize int) *Parser { + return &Parser{ + reader: *bufio.NewReaderSize(r, maxLineSize), + } +} + +// Next reads next fastq genome in underlying reader and returns the result +// and the amount of bytes read during the call. +// Next only returns an error if it: +// - Attempts to read and fails to find a valid fastq sequence. +// - Returns reader's EOF if called after reader has been exhausted. +// - If a EOF is encountered immediately after a sequence with no newline ending. +// In this case the Read up to that point is returned with an EOF error. +// +// It is worth noting the amount of bytes read are always right up to before +// the next fastq starts which means this function can effectively be used +// to index where fastqs start in a file or string. +// +// Next is simplified for fastq files from fasta files. Unlike fasta +// files, fastq always have 4 lines following each other - not variable with +// a line limit of 80 like fasta files have. So instead of a for loop, you +// can just parse 4 lines at once. +func (parser *Parser) Next() (*Read, error) { + if parser.atEOF { + return &Read{}, io.EOF + } + // Initialization of parser state variables. + var ( + // Parser looks for a line starting with '@' + // that contains the next fastq sequence identifier. + lookingForIdentifier = true + seqIdentifier, quality string + optionals map[string]string + sequence, line []byte + err error + ) + + // parse identifier + line, err = parser.reader.ReadSlice('\n') + parser.line++ + if err != nil { + return &Read{}, err + } + + line = line[:len(line)-1] // Exclude newline delimiter. + if string(line)[0] == '@' { + lookingForIdentifier = false + } + lineSplits := strings.Split(string(line), " ") + seqIdentifier = lineSplits[0][1:] + optionals = make(map[string]string) + for _, optionalDatum := range lineSplits[1:] { + optionalSplits := strings.Split(optionalDatum, "=") + optionalKey := optionalSplits[0] + optionalValue := optionalSplits[1] + optionals[optionalKey] = optionalValue + } + + // parse sequence + line, err = parser.reader.ReadSlice('\n') + parser.line++ + if err != nil { + return &Read{}, err + } + if len(line) <= 1 { // newline delimiter - actually checking for empty line + return &Read{}, fmt.Errorf("empty fastq sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) + } + sequence = line[:len(line)-1] // Exclude newline delimiter. + var newSequence []byte + for _, char := range sequence { + newSequence = append(newSequence, char) + if !strings.ContainsRune("ATGCN", rune(char)) { + return &Read{}, errors.New("Only letters ATGCN are allowed for DNA/RNA in fastq file. Got letter: " + string(char)) + } + } + + // skip + + _, err = parser.reader.ReadSlice('\n') + parser.line++ + if err != nil { + return &Read{}, err + } + + // parse quality + line, err = parser.reader.ReadSlice('\n') + parser.line++ + if err != nil { + // If the line is EOF, just continue and finish off returning the new read. + if err != io.EOF { + return &Read{}, nil + } + parser.atEOF = true + } + if len(line) <= 1 { // newline delimiter - actually checking for empty line + return &Read{}, fmt.Errorf("empty quality sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) + } + quality = string(line[:len(line)-1]) + + // Parsing ended. Check for inconsistencies. 
+ if lookingForIdentifier { + return &Read{}, fmt.Errorf("did not find fastq start '@', got to line %d: %w", parser.line, err) + } + fastq := Read{ + Identifier: seqIdentifier, + Optionals: optionals, + Quality: quality, + Sequence: string(newSequence), + } + // Gotten to this point err is non-nil only in EOF case. + // We report this error to note the fastq may be incomplete/corrupt + // like in the case of using an io.LimitReader wrapping the underlying reader. + return &fastq, nil +} + +// Reset discards all data in buffer and resets state. +func (parser *Parser) Reset(r io.Reader) { + parser.reader.Reset(r) + parser.line = 0 +} + +/****************************************************************************** + +Start of Write functions + +******************************************************************************/ + +func (read *Read) WriteTo(w io.Writer) (int64, error) { + var err error + var writtenBytes int64 + var newWrittenBytes int + newWrittenBytes, err = fmt.Fprintf(w, "@%s", read.Identifier) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + for key, val := range read.Optionals { + newWrittenBytes, err = fmt.Fprintf(w, " %s=%s", key, val) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + } + + // fastq doesn't limit at 80 characters, since it is + // mainly reading big ole' sequencing files without + // human input. + newWrittenBytes, err = fmt.Fprintf(w, "\n%s\n+\n%s\n", read.Sequence, read.Quality) + writtenBytes += int64(newWrittenBytes) + return writtenBytes, err +} diff --git a/bio/fastq/fastq_test.go b/bio/fastq/fastq_test.go new file mode 100644 index 00000000..223c7be0 --- /dev/null +++ b/bio/fastq/fastq_test.go @@ -0,0 +1,49 @@ +package fastq + +import ( + "io" + "os" + "strings" + "testing" +) + +func testException(t *testing.T, filePath string, errorString string) { + file, err := os.Open(filePath) + if err != nil { + t.Errorf("Failed to read %s. 
Got error: %s", filePath, err) + } + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(file, maxLineSize) + for err == nil { + _, err = parser.Next() + } + if err == nil { + t.Errorf("%s parser should have gotten error: %s", filePath, errorString) + } +} + +func TestParseExceptions(t *testing.T) { + testException(t, "data/nanosavseq_noseq.fastq", "no seq") + testException(t, "data/nanosavseq_noquality.fastq", "no quality") + testException(t, "data/nanosavseq_noidentifier.fastq", "no identifier") + testException(t, "data/nanosavseq_emptyseq.fastq", "empty seq") + testException(t, "data/nanosavseq_noplus.fastq", "no plus EOF") + testException(t, "data/nanosavseq_noquality2.fastq", "no quality EOF") +} + +func TestParser(t *testing.T) { + file := strings.NewReader(`@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;6147) + /locus_tag="YIL177C" + /db_xref="GeneID:854630" + mRNA complement(join(<483..4598,4987..>6147)) + /locus_tag="YIL177C" + /product="Y' element ATP-dependent helicase" + /transcript_id="NM_001179522.1" + /db_xref="GeneID:854630" + CDS complement(join(483..4598,4987..6147)) + /locus_tag="YIL177C" + /EC_number="3.6.4.12" + /note="Putative Y' element ATP-dependent helicase" + /codon_start=1 + /product="Y' element ATP-dependent helicase" + /protein_id="NP_012092.1" + /db_xref="GeneID:854630" + /db_xref="SGD:S000001439" + /translation="MKVSDRRKFEKANFDEFESALNNKNDLVHCPSITLFESIPTEVR + SFYEDEKSGLIKVVKFRTGAMDRKRSFEKVVISVMVGKNVKKFLTFVEDEPDFQGGPI + PSKYLIPKKINLMVYTLFQVHTLKFNRKDYDTLSLFYLNRGYYNELSFRVLERCHEIA + SARPNDSSTMRTFTDFVSGAPIVRSLQKSTIRKYGYNLAPYMFLLLHVDELSIFSAYQ + ASLPGEKKVDTERLKRDLCPRKPIEIKYFSQICNDMMNKKDRLGDILHIILRACALNF + GAGPRGGAGDEEDRSITNEEPIIPSVDEHGLKVCKLRSPNTPRRLRKTLDAVKALLVS + SCACTARDLDIFDDNNGVAMWKWIKILYHEVAQETTLKDSYRITLVPSSDGISLLAFA + GPQRNVYVDDTTRRIQLYTDYNKNGSSEPRLKTLDGLTSDYVFYFVTVLRQMQICALG + NSYDAFNHDPWMDVVGFEDPNQVTNRDISRIVLYSYMFLNTAKGCLVEYATFRQYMRE + LPKNAPQKLNFREMRQGLIALGRHCVGSRFETDLYESATSELMANHSVQTGRNIYGVD + SFSLTSVSGTTATLLQERASERWIQWLGLESDYHCSFSSTRNAEDVVAGEAASSNHHQ + KISRVTRKRPREPKSTNDILVAGQKLFGSSFEFRDLHQLRLCYEIYMADTPSVAVQAP + PGYGKTELFHLPLIALASKGDVEYVSFLFVPYTVLLANCMIRLGRCGCLNVAPVRNFI + EEGYDGVTDLYVGIYDDLASTNFTDRIAAWENIVECTFRTNNVKLGYLIVDEFHNFET + EVYRQSQFGGITNLDFDAFEKAIFLSGTAPEAVADAALQRIGLTGLAKKSMDINELKR + SEDLSRGLSSYPTRMFNLIKEKSEVPLGHVHKIRKKVESQPEEALKLLLALFESEPES + KAIVVASTTNEVEELACSWRKYFRVVWIHGKLGAAEKVSRTKEFVTDGSMQVLIGTKL + VTEGIDIKQLMMVIMLDNRLNIIELIQGVGRLRDGGLCYLLSRKNSWAARNRKGELPP + IKEGCITEQVREFYGLESKKGKKGQHVGCCGSRTDLSADTVELIERMDRLAEKQATAS + 
MSIVALPSSFQESNSSDRYRKYCSSDEDSNTCIHGSANASTNASTNAITTASTNVRTN + ATTNASTNATTNASTNASTNATTNASTNATTNSSTNATTTASTNVRTSATTTASINVR + TSATTTESTNSSTNATTTESTNSSTNATTTESTNSNTSATTTASINVRTSATTTESTN + SSTSATTTASINVRTSATTTKSINSSTNATTTESTNSNTNATTTESTNSSTNATTTES + TNSSTNATTTESTNSNTSAATTESTNSNTSATTTESTNASAKEDANKDGNAEDNRFHP + VTDINKESYKRKGSQMVLLERKKLKAQFPNTSENMNVLQFLGFRSDEIKHLFLYGIDI + YFCPEGVFTQYGLCKGCQKMFELCVCWAGQKVSYRRIAWEALAVERMLRNDEEYKEYL + EDIEPYHGDPVGYLKYFSVKRREIYSQIQRNYAWYLAITRRRETISVLDSTRGKQGSQ + VFRMSGRQIKELYFKVWSNLRESKTEVLQYFLNWDEKKCQEEWEAKDDTVVVEALEKG + GVFQRLRSMTSAGLQGPQYVKLQFSRHHRQLRSRYELSLGMHLRDQIALGVTPSKVPH + WTAFLSMLIGLFYNKTFRQKLEYLLEQISEVWLLPHWLDLANVEVLAADDTRVPLYML + MVAVHKELDSDDVPDGRFDILLCRDSSREVGE" + rep_origin 7470..8793 + /note="ARS902; Putative replication origin; identified in + multiple array studies, not yet confirmed by plasmid-based + assay" + /db_xref="SGD:S000130156" + mRNA join(<155222,155311..>155765) + /gene="COX5B" + /locus_tag="YIL111W" + /product="cytochrome c oxidase subunit Vb" + /transcript_id="NM_001179459.1" + /db_xref="GeneID:854695" +CONTIG join(BK006942.2:1..439888) +// + diff --git a/data/benchling.gb b/bio/genbank/data/benchling.gb similarity index 100% rename from data/benchling.gb rename to bio/genbank/data/benchling.gb diff --git a/data/bsub.gbk b/bio/genbank/data/bsub.gbk similarity index 100% rename from data/bsub.gbk rename to bio/genbank/data/bsub.gbk diff --git a/data/flatGbk_test.seq b/bio/genbank/data/flatGbk_test.seq similarity index 100% rename from data/flatGbk_test.seq rename to bio/genbank/data/flatGbk_test.seq diff --git a/data/flatGbk_test.seq.gz b/bio/genbank/data/flatGbk_test.seq.gz similarity index 100% rename from data/flatGbk_test.seq.gz rename to bio/genbank/data/flatGbk_test.seq.gz diff --git a/data/long_comment.seq b/bio/genbank/data/long_comment.seq similarity index 100% rename from data/long_comment.seq rename to bio/genbank/data/long_comment.seq diff --git a/data/malformed_read_test.gbk b/bio/genbank/data/malformed_read_test.gbk similarity index 100% rename from data/malformed_read_test.gbk rename to bio/genbank/data/malformed_read_test.gbk diff --git a/data/multiGbk_test.seq b/bio/genbank/data/multiGbk_test.seq similarity index 100% rename from data/multiGbk_test.seq rename to bio/genbank/data/multiGbk_test.seq diff --git a/data/phix174.gb b/bio/genbank/data/phix174.gb similarity index 100% rename from data/phix174.gb rename to bio/genbank/data/phix174.gb diff --git a/data/pichia_chr1_head.gb b/bio/genbank/data/pichia_chr1_head.gb similarity index 100% rename from data/pichia_chr1_head.gb rename to bio/genbank/data/pichia_chr1_head.gb diff --git a/data/puc19.gbk b/bio/genbank/data/puc19.gbk similarity index 100% rename from data/puc19.gbk rename to bio/genbank/data/puc19.gbk diff --git a/data/puc19_303_regression.gbk b/bio/genbank/data/puc19_303_regression.gbk similarity index 100% rename from data/puc19_303_regression.gbk rename to bio/genbank/data/puc19_303_regression.gbk diff --git a/data/puc19_consrtm.gbk b/bio/genbank/data/puc19_consrtm.gbk similarity index 100% rename from data/puc19_consrtm.gbk rename to bio/genbank/data/puc19_consrtm.gbk diff --git a/data/puc19_snapgene.gb b/bio/genbank/data/puc19_snapgene.gb similarity index 100% rename from data/puc19_snapgene.gb rename to bio/genbank/data/puc19_snapgene.gb diff --git a/data/sample.gbk b/bio/genbank/data/sample.gbk similarity index 99% rename from data/sample.gbk rename to bio/genbank/data/sample.gbk index 51d0fe4e..ad755cf4 100644 --- a/data/sample.gbk +++ 
b/bio/genbank/data/sample.gbk @@ -163,4 +163,4 @@ ORIGIN 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc -// \ No newline at end of file +// diff --git a/data/t4_intron.gb b/bio/genbank/data/t4_intron.gb similarity index 100% rename from data/t4_intron.gb rename to bio/genbank/data/t4_intron.gb diff --git a/io/genbank/genbank.go b/bio/genbank/genbank.go similarity index 50% rename from io/genbank/genbank.go rename to bio/genbank/genbank.go index 468e17cf..f2152b23 100644 --- a/io/genbank/genbank.go +++ b/bio/genbank/genbank.go @@ -33,7 +33,7 @@ GBK specific IO related things begin here. var ( readFileFn = os.ReadFile - parseMultiNthFn = ParseMultiNth + parseMultiNthFn = parseMultiNth parseReferencesFn = parseReferences ) @@ -66,14 +66,14 @@ type Meta struct { // Feature holds the information for a feature in a Genbank file and other annotated sequence files. type Feature struct { - Type string `json:"type"` - Description string `json:"description"` - Attributes map[string]string `json:"attributes"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` - Sequence string `json:"sequence"` - Location Location `json:"location"` - ParentSequence *Genbank `json:"-"` + Type string `json:"type"` + Description string `json:"description"` + Attributes map[string][]string `json:"attributes"` + SequenceHash string `json:"sequence_hash"` + SequenceHashFunction string `json:"hash_function"` + Sequence string `json:"sequence"` + Location Location `json:"location"` + ParentSequence *Genbank `json:"-"` } // Reference holds information for one reference in a Meta struct. @@ -125,7 +125,25 @@ var ( sequenceRegex = regexp.MustCompile("[^a-zA-Z]+") ) +// StoreFeatureSequences calls StoreSequence on all features. +// The resulting JSON is guaranteed to have useful Feature.Sequence values. +// Useful when exporting for downstream analysis, such as with json.Marshal. +func (sequence *Genbank) StoreFeatureSequences() error { + for i := range sequence.Features { + _, err := sequence.Features[i].StoreSequence() + if err != nil { + return err + } + } + return nil +} + // AddFeature adds a feature to a Genbank struct. +// NOTE: This method assumes feature is not referenced in another location +// as this only creates a shallow copy. +// If you intend to duplicate a feature from another Genbank and plan +// to modify in either location, it is recommended you first call feature.Copy() +// before passing as input to save yourself trouble. func (sequence *Genbank) AddFeature(feature *Feature) error { feature.ParentSequence = sequence sequence.Features = append(sequence.Features, *feature) @@ -137,256 +155,362 @@ func (feature Feature) GetSequence() (string, error) { return getFeatureSequence(feature, feature.Location) } +// StoreSequence infers and assigns the value of feature.Sequence +// if currently an empty string. +func (feature *Feature) StoreSequence() (string, error) { + if feature.Sequence != "" { + return feature.Sequence, nil + } + seq, err := getFeatureSequence(*feature, feature.Location) + if err == nil { + feature.Sequence = seq + } + return seq, err +} + +// Copy creates deep copy of Feature, which supports safe duplication. 
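+// Attributes and Location (including all SubLocations) are duplicated, so mutating the copy does not affect the original; the ParentSequence pointer is shared.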
+func (feature *Feature) Copy() Feature { + copy := *feature + copy.Location = CopyLocation(feature.Location) + copy.Attributes = NewMultiMap[string, string]() + ForEachKey(feature.Attributes, func(k string, v []string) { + copy.Attributes[k] = MapSlice(v, identity[string]) + }) + return copy +} + +// CopyLocation creates deep copy of Location, which supports safe duplication +func CopyLocation(location Location) Location { + location.SubLocations = MapSlice(location.SubLocations, CopyLocation) + return location +} + // getFeatureSequence takes a feature and location object and returns a sequence string. func getFeatureSequence(feature Feature, location Location) (string, error) { var sequenceBuffer bytes.Buffer - var sequenceString string parentSequence := feature.ParentSequence.Sequence if len(location.SubLocations) == 0 { sequenceBuffer.WriteString(parentSequence[location.Start:location.End]) } else { for _, subLocation := range location.SubLocations { - sequence, _ := getFeatureSequence(feature, subLocation) + sequence, err := getFeatureSequence(feature, subLocation) + if err != nil { + return "", err + } sequenceBuffer.WriteString(sequence) } } // reverse complements resulting string if needed. + sequenceString := sequenceBuffer.String() if location.Complement { - sequenceString = transform.ReverseComplement(sequenceBuffer.String()) - } else { - sequenceString = sequenceBuffer.String() + sequenceString = transform.ReverseComplement(sequenceString) } return sequenceString, nil } -// Read reads a GBK file from path and returns a Genbank struct. -func Read(path string) (Genbank, error) { - genbankSlice, err := ReadMultiNth(path, 1) - if err != nil { - return Genbank{}, err +// WriteTo implements the io.WriterTo interface on genbank records. +func (sequence *Genbank) WriteTo(w io.Writer) (int64, error) { + var writtenBytes int64 + var newWrittenBytes int + var err error + + locus := sequence.Meta.Locus + var shape string + + if locus.Circular { + shape = "circular" + } else { + shape = "linear" } - genbank := genbankSlice[0] - return genbank, err -} -// ReadMulti reads a multi Gbk from path and parses it into a slice of Genbank structs. -func ReadMulti(path string) ([]Genbank, error) { - return ReadMultiNth(path, -1) -} + fivespace := generateWhiteSpace(subMetaIndex) -// ReadMultiNth reads a multi Gbk from path and parses N entries into a slice of Genbank structs. -func ReadMultiNth(path string, count int) ([]Genbank, error) { - file, err := os.Open(path) + // building locus + locusData := locus.Name + fivespace + locus.SequenceLength + " bp" + fivespace + locus.MoleculeType + fivespace + shape + fivespace + locus.GenbankDivision + fivespace + locus.ModificationDate + locusString := "LOCUS " + locusData + "\n" + newWrittenBytes, err = w.Write([]byte(locusString)) + writtenBytes += int64(newWrittenBytes) if err != nil { - return []Genbank{}, err + return writtenBytes, err } - sequence, err := parseMultiNthFn(file, count) + // building other standard meta features + definitionString := buildMetaString("DEFINITION", sequence.Meta.Definition) + newWrittenBytes, err = w.Write([]byte(definitionString)) + writtenBytes += int64(newWrittenBytes) if err != nil { - return []Genbank{}, err + return writtenBytes, err } - return sequence, nil -} - -// Write takes an Genbank list and a path string and writes out a genbank record to that path. -func Write(sequences Genbank, path string) error { - // build function always returns nil error. 
- // This is for API consistency in case we need to - // add error handling in the future. - gbk, _ := Build(sequences) - - err := os.WriteFile(path, gbk, 0644) - return err -} - -// WriteMulti takes a slice of Genbank structs and a path string and writes out a multi genbank record to that path. -func WriteMulti(sequences []Genbank, path string) error { - // buildmulti function always returns nil error. - // This is for API consistency in case we need to - // add error handling in the future. - gbk, _ := BuildMulti(sequences) - - err := os.WriteFile(path, gbk, 0644) - return err -} - -// Build builds a GBK byte slice to be written out to db or file. -func Build(gbk Genbank) ([]byte, error) { - gbkSlice := []Genbank{gbk} - multiGBK, err := BuildMulti(gbkSlice) - return multiGBK, err -} - -// BuildMulti builds a MultiGBK byte slice to be written out to db or file. -func BuildMulti(sequences []Genbank) ([]byte, error) { - var gbkString bytes.Buffer - for _, sequence := range sequences { - locus := sequence.Meta.Locus - var shape string - - if locus.Circular { - shape = "circular" - } else { - shape = "linear" - } - - fivespace := generateWhiteSpace(subMetaIndex) - - // building locus - locusData := locus.Name + fivespace + locus.SequenceLength + " bp" + fivespace + locus.MoleculeType + fivespace + shape + fivespace + locus.GenbankDivision + fivespace + locus.ModificationDate - locusString := "LOCUS " + locusData + "\n" - gbkString.WriteString(locusString) - - // building other standard meta features - definitionString := buildMetaString("DEFINITION", sequence.Meta.Definition) - gbkString.WriteString(definitionString) - - accessionString := buildMetaString("ACCESSION", sequence.Meta.Accession) - gbkString.WriteString(accessionString) + accessionString := buildMetaString("ACCESSION", sequence.Meta.Accession) + newWrittenBytes, err = w.Write([]byte(accessionString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } - versionString := buildMetaString("VERSION", sequence.Meta.Version) - gbkString.WriteString(versionString) + versionString := buildMetaString("VERSION", sequence.Meta.Version) + newWrittenBytes, err = w.Write([]byte(versionString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } - keywordsString := buildMetaString("KEYWORDS", sequence.Meta.Keywords) - gbkString.WriteString(keywordsString) + keywordsString := buildMetaString("KEYWORDS", sequence.Meta.Keywords) + newWrittenBytes, err = w.Write([]byte(keywordsString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } - sourceString := buildMetaString("SOURCE", sequence.Meta.Source) - gbkString.WriteString(sourceString) + sourceString := buildMetaString("SOURCE", sequence.Meta.Source) + newWrittenBytes, err = w.Write([]byte(sourceString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } - organismString := buildMetaString(" ORGANISM", sequence.Meta.Organism) - gbkString.WriteString(organismString) + organismString := buildMetaString(" ORGANISM", sequence.Meta.Organism) + newWrittenBytes, err = w.Write([]byte(organismString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } - if len(sequence.Meta.Taxonomy) > 0 { - var taxonomyString strings.Builder - for i, taxonomyData := range sequence.Meta.Taxonomy { - taxonomyString.WriteString(taxonomyData) - if len(sequence.Meta.Taxonomy) == i+1 { - taxonomyString.WriteString(".") - } else { - 
taxonomyString.WriteString("; ") - } + if len(sequence.Meta.Taxonomy) > 0 { + var taxonomyString strings.Builder + for i, taxonomyData := range sequence.Meta.Taxonomy { + taxonomyString.WriteString(taxonomyData) + if len(sequence.Meta.Taxonomy) == i+1 { + taxonomyString.WriteString(".") + } else { + taxonomyString.WriteString("; ") } - gbkString.WriteString(buildMetaString("", taxonomyString.String())) } + newWrittenBytes, err = w.Write([]byte(buildMetaString("", taxonomyString.String()))) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + } - // building references - // TODO: could use reflection to get keys and make more general. - for referenceIndex, reference := range sequence.Meta.References { - referenceString := buildMetaString("REFERENCE", fmt.Sprintf("%d %s", referenceIndex+1, reference.Range)) - gbkString.WriteString(referenceString) + // building references + // TODO: could use reflection to get keys and make more general. + for referenceIndex, reference := range sequence.Meta.References { + referenceString := buildMetaString("REFERENCE", fmt.Sprintf("%d %s", referenceIndex+1, reference.Range)) + newWrittenBytes, err = w.Write([]byte(referenceString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } - if reference.Authors != "" { - authorsString := buildMetaString(" AUTHORS", reference.Authors) - gbkString.WriteString(authorsString) + if reference.Authors != "" { + authorsString := buildMetaString(" AUTHORS", reference.Authors) + newWrittenBytes, err = w.Write([]byte(authorsString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } + } - if reference.Title != "" { - titleString := buildMetaString(" TITLE", reference.Title) - gbkString.WriteString(titleString) + if reference.Title != "" { + titleString := buildMetaString(" TITLE", reference.Title) + newWrittenBytes, err = w.Write([]byte(titleString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } + } - if reference.Journal != "" { - journalString := buildMetaString(" JOURNAL", reference.Journal) - gbkString.WriteString(journalString) + if reference.Journal != "" { + journalString := buildMetaString(" JOURNAL", reference.Journal) + newWrittenBytes, err = w.Write([]byte(journalString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } + } - if reference.PubMed != "" { - pubMedString := buildMetaString(" PUBMED", reference.PubMed) - gbkString.WriteString(pubMedString) + if reference.PubMed != "" { + pubMedString := buildMetaString(" PUBMED", reference.PubMed) + newWrittenBytes, err = w.Write([]byte(pubMedString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } - if reference.Consortium != "" { - consrtmString := buildMetaString(" CONSRTM", reference.Consortium) - gbkString.WriteString(consrtmString) + } + if reference.Consortium != "" { + consrtmString := buildMetaString(" CONSRTM", reference.Consortium) + newWrittenBytes, err = w.Write([]byte(consrtmString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } } + } - // building other meta fields that are catch all - otherKeys := make([]string, 0, len(sequence.Meta.Other)) - for key := range sequence.Meta.Other { - otherKeys = append(otherKeys, key) - } + // building other meta fields that are catch all + otherKeys := make([]string, 0, len(sequence.Meta.Other)) + for key := range sequence.Meta.Other { + 
otherKeys = append(otherKeys, key) + } - for _, otherKey := range otherKeys { - otherString := buildMetaString(otherKey, sequence.Meta.Other[otherKey]) - gbkString.WriteString(otherString) + for _, otherKey := range otherKeys { + otherString := buildMetaString(otherKey, sequence.Meta.Other[otherKey]) + newWrittenBytes, err = w.Write([]byte(otherString)) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } + } - // start writing features section. - gbkString.WriteString("FEATURES Location/Qualifiers\n") - for _, feature := range sequence.Features { - gbkString.WriteString(BuildFeatureString(feature)) + // start writing features section. + newWrittenBytes, err = w.Write([]byte("FEATURES Location/Qualifiers\n")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + for _, feature := range sequence.Features { + newWrittenBytes, err = w.Write([]byte(BuildFeatureString(feature))) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } + } - if len(sequence.Meta.BaseCount) > 0 { - gbkString.WriteString("BASE COUNT ") - for _, baseCount := range sequence.Meta.BaseCount { - gbkString.WriteString(strconv.Itoa(baseCount.Count) + " " + baseCount.Base + " ") + // start writing base count + if len(sequence.Meta.BaseCount) > 0 { + newWrittenBytes, err = w.Write([]byte("BASE COUNT ")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + for _, baseCount := range sequence.Meta.BaseCount { + newWrittenBytes, err = w.Write([]byte(strconv.Itoa(baseCount.Count) + " " + baseCount.Base + " ")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } - gbkString.WriteString("\n") } - // start writing sequence section. - gbkString.WriteString("ORIGIN\n") - - // iterate over every character in sequence range. - for index, base := range sequence.Sequence { - // if 60th character add newline then whitespace and index number and space before adding next base. - if index%60 == 0 { - if index != 0 { - gbkString.WriteString("\n") + newWrittenBytes, err = w.Write([]byte("\n")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + } + + // start writing sequence section. + newWrittenBytes, err = w.Write([]byte("ORIGIN\n")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + + // iterate over every character in sequence range. + for index, base := range sequence.Sequence { + // if 60th character add newline then whitespace and index number and space before adding next base. 
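As context for the loop that follows, the sequence section uses the GenBank ORIGIN layout: positions are 1-indexed, each line number is right-aligned in a nine-character field, bases are grouped in blocks of ten, and each line holds sixty bases. The sketch below is a hypothetical standalone helper (`formatOrigin` is not part of poly) that illustrates the same convention `WriteTo` implements.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// formatOrigin is a hypothetical helper mirroring the ORIGIN convention used
// by Genbank.WriteTo: 1-indexed positions right-aligned to nine characters,
// bases grouped in blocks of ten, sixty bases per line, terminated by "//".
func formatOrigin(sequence string) string {
	var builder strings.Builder
	builder.WriteString("ORIGIN\n")
	for index, base := range sequence {
		if index%60 == 0 {
			if index != 0 {
				builder.WriteString("\n")
			}
			lineNumber := strconv.Itoa(index + 1)
			builder.WriteString(strings.Repeat(" ", 9-len(lineNumber)))
			builder.WriteString(lineNumber + " ")
		} else if index%10 == 0 {
			builder.WriteString(" ")
		}
		builder.WriteRune(base)
	}
	builder.WriteString("\n//\n")
	return builder.String()
}

func main() {
	// 80 bases produce two ORIGIN lines, numbered 1 and 61.
	fmt.Print(formatOrigin(strings.Repeat("atgc", 20)))
}
```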
+ if index%60 == 0 { + if index != 0 { + newWrittenBytes, err = w.Write([]byte("\n")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } - lineNumberString := strconv.Itoa(index + 1) // genbank indexes at 1 for some reason - leadingWhiteSpaceLength := 9 - len(lineNumberString) // <- I wish I was kidding - for i := 0; i < leadingWhiteSpaceLength; i++ { - gbkString.WriteString(" ") + } + lineNumberString := strconv.Itoa(index + 1) // genbank indexes at 1 for some reason + leadingWhiteSpaceLength := 9 - len(lineNumberString) // <- I wish I was kidding + for i := 0; i < leadingWhiteSpaceLength; i++ { + newWrittenBytes, err = w.Write([]byte(" ")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } - gbkString.WriteString(lineNumberString + " ") - gbkString.WriteRune(base) - // if base index is divisible by ten add a space (genbank convention) - } else if index%10 == 0 { - gbkString.WriteString(" ") - gbkString.WriteRune(base) - // else just add the base. - } else { - gbkString.WriteRune(base) + } + newWrittenBytes, err = w.Write([]byte(lineNumberString + " ")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + newWrittenBytes, err = w.Write([]byte{byte(base)}) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + // if base index is divisible by ten add a space (genbank convention) + } else if index%10 == 0 { + newWrittenBytes, err = w.Write([]byte(" ")) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + newWrittenBytes, err = w.Write([]byte{byte(base)}) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err + } + // else just add the base. + } else { + newWrittenBytes, err = w.Write([]byte{byte(base)}) + writtenBytes += int64(newWrittenBytes) + if err != nil { + return writtenBytes, err } } - // finish genbank file with "//" on newline (again a genbank convention) - gbkString.WriteString("\n//\n") } - - return gbkString.Bytes(), nil -} - -// Parse takes in a reader representing a single gbk/gb/genbank file and parses it into a Genbank struct. -func Parse(r io.Reader) (Genbank, error) { - genbankSlice, err := parseMultiNthFn(r, 1) - + // finish genbank file with "//" on newline (again a genbank convention) + newWrittenBytes, err = w.Write([]byte("\n//\n")) + writtenBytes += int64(newWrittenBytes) if err != nil { - return Genbank{}, err + return writtenBytes, err } - return genbankSlice[0], err + return writtenBytes, nil } -// ParseMulti takes in a reader representing a multi gbk/gb/genbank file and parses it into a slice of Genbank structs. -func ParseMulti(r io.Reader) ([]Genbank, error) { - genbankSlice, err := parseMultiNthFn(r, -1) +// ParseError represents failures encountered while parsing, +// and pointers to it are fully compatable with the error interface. 
+type ParseError struct { + file string // the file origin + line string // the offending line + before bool // whether the error occurred before or on this line + lineNo int // the line number, 0 indexed + info string `default:"syntax error"` // description of the error type + wraps error // stores the error that led to this, if any +} - if err != nil { - return []Genbank{}, err +func (e ParseError) Error() string { + var out, loc string + if e.wraps == io.EOF { + out = "unexpected EOF" + if e.file != "" { + return fmt.Sprintf("%s in %s", out, e.file) + } else { + return out + } } - - return genbankSlice, err + if e.file == "" { + loc = fmt.Sprintf("line %d", e.lineNo) + } else { + loc = fmt.Sprintf("%s:%d", e.file, e.lineNo) + } + if e.before { + out = fmt.Sprintf("%s encountered before %s", e.info, loc) + } else { + out = fmt.Sprintf("%s encountered on %s: %s", e.info, loc, e.line) + } + if e.wraps != nil { + out = fmt.Sprintf("%s\nfrom %v", out, e.wraps) + } + return out } +// defines state for the parser, and utility methods to modify type parseLoopParameters struct { newLocation bool - quoteActive bool attribute string attributeValue string emptyAttribute bool @@ -406,238 +530,268 @@ type parseLoopParameters struct { // method to init loop parameters func (params *parseLoopParameters) init() { params.newLocation = true - params.feature.Attributes = make(map[string]string) + params.feature.Attributes = NewMultiMap[string, string]() params.parseStep = "metadata" params.genbankStarted = false params.genbank.Meta.Other = make(map[string]string) } -// ParseMultiNth takes in a reader representing a multi gbk/gb/genbank file and parses the first n records into a slice of Genbank structs. -func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { - scanner := bufio.NewScanner(r) - var genbanks []Genbank +// save our completed attribute / qualifier string to the current feature +// useful as a wrap-up step from multiple states +func (params *parseLoopParameters) saveLastAttribute() { + newValue := params.attributeValue != "" + emptyType := params.feature.Type != "" + if newValue || emptyType { + if newValue { + Put(params.feature.Attributes, params.attribute, params.attributeValue) + } + params.features = append(params.features, params.feature) + + // reset attribute state + params.attributeValue = "" + params.attribute = "" + params.feature = Feature{} + params.feature.Attributes = NewMultiMap[string, string]() + } +} + +// Header is a blank struct, needed for compatibility with bio parsers. It contains nothing. +type Header struct{} - // Sequence setup +// WriteTo is a blank function, needed for compatibility with bio parsers. It doesn't do anything. +func (header *Header) WriteTo(w io.Writer) (int64, error) { + return 0, nil +} + +// Parser is a genbank parser created on an io.Reader. +type Parser struct { + scanner bufio.Scanner + parameters parseLoopParameters +} - var parameters parseLoopParameters - parameters.init() +// Header returns nil,nil. +func (parser *Parser) Header() (*Header, error) { + return &Header{}, nil +} + +// NewParser returns a Parser that uses r as the source +// from which to parse genbank formatted sequences. 
+func NewParser(r io.Reader, maxLineSize int) *Parser { + scanner := bufio.NewScanner(r) + buf := make([]byte, maxLineSize) + scanner.Buffer(buf, maxLineSize) + return &Parser{ + scanner: *scanner, + } +} +// Next takes in a reader representing a multi gbk/gb/genbank file and outputs the next record +func (parser *Parser) Next() (*Genbank, error) { + parser.parameters.init() // Loop through each line of the file - for lineNum := 0; scanner.Scan(); lineNum++ { + for lineNum := 0; parser.scanner.Scan(); lineNum++ { // get line from scanner and split it - line := scanner.Text() + line := parser.scanner.Text() splitLine := strings.Split(strings.TrimSpace(line), " ") - prevline := parameters.currentLine - parameters.currentLine = line - parameters.prevline = prevline + prevline := parser.parameters.currentLine + parser.parameters.currentLine = line + parser.parameters.prevline = prevline // keep scanning until we find the start of the first record - if !parameters.genbankStarted { + if !parser.parameters.genbankStarted { // We detect the beginning of a new genbank file with "LOCUS" locusFlag := strings.Contains(line, "LOCUS") if locusFlag { - parameters = parseLoopParameters{} - parameters.init() - parameters.genbank.Meta.Locus = parseLocus(line) - parameters.genbankStarted = true + parser.parameters = parseLoopParameters{} + parser.parameters.init() + parser.parameters.genbank.Meta.Locus = parseLocus(line) + parser.parameters.genbankStarted = true } continue } - switch parameters.parseStep { + // define parser state machine + switch parser.parameters.parseStep { case "metadata": // Handle empty lines if len(line) == 0 { - return genbanks, fmt.Errorf("Empty metadata line on line %d", lineNum) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected empty metadata"} } // If we are currently reading a line, we need to figure out if it is a new meta line. - if string(line[0]) != " " || parameters.metadataTag == "FEATURES" { + if string(line[0]) != " " || parser.parameters.metadataTag == "FEATURES" { // If this is true, it means we are beginning a new meta tag. In that case, let's save // the older data, and then continue along. - switch parameters.metadataTag { + switch parser.parameters.metadataTag { case "DEFINITION": - parameters.genbank.Meta.Definition = parseMetadata(parameters.metadataData) + parser.parameters.genbank.Meta.Definition = parseMetadata(parser.parameters.metadataData) case "ACCESSION": - parameters.genbank.Meta.Accession = parseMetadata(parameters.metadataData) + parser.parameters.genbank.Meta.Accession = parseMetadata(parser.parameters.metadataData) case "VERSION": - parameters.genbank.Meta.Version = parseMetadata(parameters.metadataData) + parser.parameters.genbank.Meta.Version = parseMetadata(parser.parameters.metadataData) case "KEYWORDS": - parameters.genbank.Meta.Keywords = parseMetadata(parameters.metadataData) + parser.parameters.genbank.Meta.Keywords = parseMetadata(parser.parameters.metadataData) case "SOURCE": - parameters.genbank.Meta.Source, parameters.genbank.Meta.Organism, parameters.genbank.Meta.Taxonomy = getSourceOrganism(parameters.metadataData) + parser.parameters.genbank.Meta.Source, parser.parameters.genbank.Meta.Organism, parser.parameters.genbank.Meta.Taxonomy = getSourceOrganism(parser.parameters.metadataData) case "REFERENCE": - reference, err := parseReferencesFn(parameters.metadataData) + // parseReferencesFn = parseReferences in genbank_test. We use Fn for testing purposes. 
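As a usage sketch (not part of the diff itself), the new parser is consumed like the other bio parsers: construct it with NewParser and call Next() until io.EOF. The file path below is only illustrative.

```go
package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/TimothyStiles/poly/bio/genbank"
)

func main() {
	// Illustrative path; any single- or multi-record genbank file works the same way.
	file, err := os.Open("data/multiGbk_test.seq")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	parser := genbank.NewParser(file, bufio.MaxScanTokenSize)
	for {
		record, err := parser.Next()
		if errors.Is(err, io.EOF) {
			break // no more records
		}
		if err != nil {
			log.Fatal(err) // parse failures come back as *genbank.ParseError
		}
		fmt.Printf("%s: %d features, %d bp\n",
			record.Meta.Locus.Name, len(record.Features), len(record.Sequence))
	}
}
```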
+ reference, err := parseReferencesFn(parser.parameters.metadataData) if err != nil { - return []Genbank{}, fmt.Errorf("Failed in parsing reference above line %d. Got error: %s", lineNum, err) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, before: true, wraps: err, info: "failed in parsing reference"} } - parameters.genbank.Meta.References = append(parameters.genbank.Meta.References, reference) + parser.parameters.genbank.Meta.References = append(parser.parameters.genbank.Meta.References, reference) case "FEATURES": - parameters.parseStep = "features" + parser.parameters.parseStep = "features" // We know that we are now parsing features, so lets initialize our first feature - parameters.feature.Type = strings.TrimSpace(splitLine[0]) - parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) - parameters.newLocation = true + parser.parameters.feature.Type = strings.TrimSpace(splitLine[0]) + parser.parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) + parser.parameters.newLocation = true continue default: - if parameters.metadataTag != "" { - parameters.genbank.Meta.Other[parameters.metadataTag] = parseMetadata(parameters.metadataData) + if parser.parameters.metadataTag != "" { + parser.parameters.genbank.Meta.Other[parser.parameters.metadataTag] = parseMetadata(parser.parameters.metadataData) } } - parameters.metadataTag = strings.TrimSpace(splitLine[0]) - parameters.metadataData = []string{strings.TrimSpace(line[len(parameters.metadataTag):])} + parser.parameters.metadataTag = strings.TrimSpace(splitLine[0]) + parser.parameters.metadataData = []string{strings.TrimSpace(line[len(parser.parameters.metadataTag):])} } else { - parameters.metadataData = append(parameters.metadataData, line) + parser.parameters.metadataData = append(parser.parameters.metadataData, line) } case "features": - baseCountFlag := strings.Contains(line, "BASE COUNT") // example string for BASE COUNT: "BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n" if baseCountFlag { fields := strings.Fields(line) for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields count, err := strconv.Atoi(fields[countIndex]) if err != nil { - return []Genbank{}, err + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, wraps: err, info: "invalid base count"} } baseCount := BaseCount{ Base: fields[countIndex+1], Count: count, } - parameters.genbank.Meta.BaseCount = append(parameters.genbank.Meta.BaseCount, baseCount) + parser.parameters.genbank.Meta.BaseCount = append(parser.parameters.genbank.Meta.BaseCount, baseCount) } break } // Switch to sequence parsing originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN" - if originFlag { - parameters.parseStep = "sequence" + contigFlag := strings.Contains(line, "CONTIG") + if originFlag || contigFlag { + parser.parameters.parseStep = "sequence" - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.features = append(parameters.features, parameters.feature) - parameters.attributeValue = "" - parameters.attribute = "" - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - } else { - parameters.features = append(parameters.features, parameters.feature) - } + 
parser.parameters.saveLastAttribute() // add our features to the genbank - for _, feature := range parameters.features { + for _, feature := range parser.parameters.features { + // TODO: parse location when line is read, or track line number so error is localized location, err := parseLocation(feature.Location.GbkLocationString) if err != nil { - return []Genbank{}, err + return &Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "invalid feature location"} } feature.Location = location - err = parameters.genbank.AddFeature(&feature) + err = parser.parameters.genbank.AddFeature(&feature) if err != nil { - return []Genbank{}, err + return &Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "problem adding feature"} } } + + if contigFlag { + parser.parameters.genbank.Meta.Other["CONTIG"] = parseMetadata(splitLine[1:]) + } continue - } // end sequence parsing flag logic + } // check if current line contains anything but whitespace trimmedLine := strings.TrimSpace(line) - if len(trimmedLine) < 1 { + if len(trimmedLine) == 0 { continue } + indent := countLeadingSpaces(parser.parameters.currentLine) // determine if current line is a new top level feature - if countLeadingSpaces(parameters.currentLine) < countLeadingSpaces(parameters.prevline) || parameters.prevline == "FEATURES" { - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.features = append(parameters.features, parameters.feature) - parameters.attributeValue = "" - parameters.attribute = "" - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - } - - // } - // checks for empty types - if parameters.feature.Type != "" { - parameters.features = append(parameters.features, parameters.feature) - } + if indent == 0 { + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected metadata when parsing feature"} + } else if indent < countLeadingSpaces(parser.parameters.prevline) || parser.parameters.prevline == "FEATURES" { + parser.parameters.saveLastAttribute() - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) + parser.parameters.feature = Feature{} + parser.parameters.feature.Attributes = NewMultiMap[string, string]() // An initial feature line looks like this: `source 1..2686` with a type separated by its location if len(splitLine) < 2 { - return genbanks, fmt.Errorf("Feature line malformed on line %d. 
Got line: %s", lineNum, line) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "malformed feature"} } - parameters.feature.Type = strings.TrimSpace(splitLine[0]) - parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) - parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier - } else if !strings.Contains(parameters.currentLine, "/") { // current line is continuation of a feature or qualifier (sub-constituent of a feature) + parser.parameters.feature.Type = strings.TrimSpace(splitLine[0]) + parser.parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) + parser.parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier + } else if !strings.Contains(parser.parameters.currentLine, "/") { // current line is continuation of a feature or qualifier (sub-constituent of a feature) // if it's a continuation of the current feature, add it to the location - if !strings.Contains(parameters.currentLine, "\"") && (countLeadingSpaces(parameters.currentLine) > countLeadingSpaces(parameters.prevline) || parameters.multiLineFeature) { - parameters.feature.Location.GbkLocationString += strings.TrimSpace(line) - parameters.multiLineFeature = true // without this we can't tell if something is a multiline feature or multiline qualifier + if !strings.Contains(parser.parameters.currentLine, "\"") && (countLeadingSpaces(parser.parameters.currentLine) > countLeadingSpaces(parser.parameters.prevline) || parser.parameters.multiLineFeature) { + parser.parameters.feature.Location.GbkLocationString += strings.TrimSpace(line) + parser.parameters.multiLineFeature = true // without this we can't tell if something is a multiline feature or multiline qualifier } else { // it's a continued line of a qualifier removeAttributeValueQuotes := strings.Replace(trimmedLine, "\"", "", -1) - parameters.attributeValue = parameters.attributeValue + removeAttributeValueQuotes + parser.parameters.attributeValue = parser.parameters.attributeValue + removeAttributeValueQuotes } - } else if strings.Contains(parameters.currentLine, "/") { // current line is a new qualifier - trimmedCurrentLine := strings.TrimSpace(parameters.currentLine) + } else if strings.Contains(parser.parameters.currentLine, "/") { // current line is a new qualifier + trimmedCurrentLine := strings.TrimSpace(parser.parameters.currentLine) if trimmedCurrentLine[0] != '/' { // if we have an exception case, like (adenine(1518)-N(6)/adenine(1519)-N(6))- - parameters.attributeValue = parameters.attributeValue + trimmedCurrentLine + parser.parameters.attributeValue = parser.parameters.attributeValue + trimmedCurrentLine continue } // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" || parameters.emptyAttribute { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.emptyAttribute = false + if parser.parameters.attributeValue != "" || parser.parameters.emptyAttribute { + Put(parser.parameters.feature.Attributes, parser.parameters.attribute, parser.parameters.attributeValue) + parser.parameters.emptyAttribute = false } - parameters.attributeValue = "" + parser.parameters.attributeValue = "" splitAttribute := strings.Split(line, "=") trimmedSpaceAttribute := strings.TrimSpace(splitAttribute[0]) removedForwardSlashAttribute := 
strings.Replace(trimmedSpaceAttribute, "/", "", 1) - parameters.attribute = removedForwardSlashAttribute + parser.parameters.attribute = removedForwardSlashAttribute var removeAttributeValueQuotes string if len(splitAttribute) == 1 { // handle case of ` /pseudo `, which has no text removeAttributeValueQuotes = "" - parameters.emptyAttribute = true + parser.parameters.emptyAttribute = true } else { // this is normally triggered removeAttributeValueQuotes = strings.Replace(splitAttribute[1], "\"", "", -1) } - parameters.attributeValue = removeAttributeValueQuotes - parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier + parser.parameters.attributeValue = removeAttributeValueQuotes + parser.parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier + } else { + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "invalid feature"} } case "sequence": if len(line) < 2 { // throw error if line is malformed - return genbanks, fmt.Errorf("Too short line found while parsing genbank sequence on line %d. Got line: %s", lineNum, line) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "too short line found while parsing genbank sequence"} } else if line[0:2] == "//" { // end of sequence - parameters.genbank.Sequence = parameters.sequenceBuilder.String() + parser.parameters.genbank.Sequence = parser.parameters.sequenceBuilder.String() - genbanks = append(genbanks, parameters.genbank) - parameters.genbankStarted = false - parameters.sequenceBuilder.Reset() + parser.parameters.genbankStarted = false + parser.parameters.sequenceBuilder.Reset() + return &parser.parameters.genbank, nil } else { // add line to total sequence - parameters.sequenceBuilder.WriteString(sequenceRegex.ReplaceAllString(line, "")) + parser.parameters.sequenceBuilder.WriteString(sequenceRegex.ReplaceAllString(line, "")) } default: - log.Warnf("Unknown parse step: %s", parameters.parseStep) - parameters.genbankStarted = false + log.Warnf("Unknown parse step: %s", parser.parameters.parseStep) + parser.parameters.genbankStarted = false } } - return genbanks, nil + return &Genbank{}, io.EOF } func countLeadingSpaces(line string) int { @@ -844,7 +998,7 @@ func parseLocation(locationString string) (Location, error) { location.GbkLocationString = locationString if !strings.ContainsAny(locationString, "(") { // Case checks for simple expression of x..x if !strings.ContainsAny(locationString, ".") { //Case checks for simple expression x - position, err := strconv.Atoi(locationString) + position, err := strconv.Atoi(partialRegex.ReplaceAllString(locationString, "")) if err != nil { return Location{}, err } @@ -1000,13 +1154,10 @@ func BuildFeatureString(feature Feature) string { featureHeader := generateWhiteSpace(subMetaIndex) + feature.Type + whiteSpaceTrail + location + "\n" returnString := featureHeader - qualifierKeys := make([]string, 0, len(feature.Attributes)) - for key := range feature.Attributes { - qualifierKeys = append(qualifierKeys, key) - } - - for _, qualifier := range qualifierKeys { - returnString += generateWhiteSpace(qualifierIndex) + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n" + if feature.Attributes != nil { + ForEachValue(feature.Attributes, func(key string, value string) { + returnString += generateWhiteSpace(qualifierIndex) + "/" + key + "=\"" + value + "\"\n" + }) } return returnString } @@ -1026,3 +1177,91 @@ func 
generateWhiteSpace(length int) string { GBK specific IO related things end here. ******************************************************************************/ + +/****************************************************************************** +Old functions for testing here. + +We used to have integrated read files, before we had the generic parser +interface. These are still here because of the need to switch over our tests. +******************************************************************************/ + +// read reads a GBK file from path and returns a Genbank struct. +func read(path string) (Genbank, error) { + genbankSlice, err := readMultiNth(path, 1) + if err != nil { + return Genbank{}, err + } + genbank := genbankSlice[0] + return genbank, err +} + +// readMulti reads a multi Gbk from path and parses it into a slice of Genbank structs. +func readMulti(path string) ([]Genbank, error) { + return readMultiNth(path, -1) +} + +// readMultiNth reads a multi Gbk from path and parses N entries into a slice of Genbank structs. +func readMultiNth(path string, count int) ([]Genbank, error) { + file, err := os.Open(path) + if err != nil { + return []Genbank{}, err + } + + sequence, perr := parseMultiNthFn(file, count) + if perr != nil { + perr.file = path + return []Genbank{}, perr + } + + return sequence, nil +} + +func parseMultiNth(r io.Reader, count int) ([]Genbank, *ParseError) { + parser := NewParser(r, bufio.MaxScanTokenSize) + var genbanks []Genbank + for i := 0; i < count; i++ { + gb, err := parser.Next() + if err != nil { + var perr *ParseError + if err == io.EOF { + perr = &ParseError{wraps: io.EOF} + } else if err != nil { + perr = err.(*ParseError) + } + return genbanks, perr + } + genbanks = append(genbanks, *gb) + } + return genbanks, nil +} + +func parse(r io.Reader) (Genbank, error) { + parser := NewParser(r, bufio.MaxScanTokenSize) + gb, err := parser.Next() + return *gb, err +} + +func write(gb Genbank, path string) error { + file, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return err + } + defer file.Close() + _, err = gb.WriteTo(file) + return err +} + +func writeMulti(gbs []Genbank, path string) error { + file, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return err + } + defer file.Close() + for _, gb := range gbs { + _, err = gb.WriteTo(file) + if err != nil { + return err + } + } + return nil +} diff --git a/io/genbank/genbank_test.go b/bio/genbank/genbank_test.go similarity index 83% rename from io/genbank/genbank_test.go rename to bio/genbank/genbank_test.go index 9466c6b1..8e148c5c 100644 --- a/io/genbank/genbank_test.go +++ b/bio/genbank/genbank_test.go @@ -25,13 +25,13 @@ Gbk/gb/genbank related benchmarks begin here. 
******************************************************************************/ var singleGbkPaths = []string{ - "../../data/t4_intron.gb", - "../../data/puc19.gbk", - "../../data/puc19_snapgene.gb", - "../../data/benchling.gb", - "../../data/phix174.gb", - "../../data/sample.gbk", - // "../../data/pichia_chr1_head.gb", + "data/t4_intron.gb", + "data/puc19.gbk", + "data/puc19_snapgene.gb", + "data/benchling.gb", + "data/phix174.gb", + "data/sample.gbk", + // "data/pichia_chr1_head.gb", } func TestGbkIO(t *testing.T) { @@ -43,20 +43,20 @@ func TestGbkIO(t *testing.T) { // test single gbk read, write, build, parse for _, gbkPath := range singleGbkPaths { - gbk, _ := Read(gbkPath) + gbk, _ := read(gbkPath) tmpGbkFilePath := filepath.Join(tmpDataDir, filepath.Base(gbkPath)) - _ = Write(gbk, tmpGbkFilePath) + _ = write(gbk, tmpGbkFilePath) - writeTestGbk, _ := Read(tmpGbkFilePath) + writeTestGbk, _ := read(tmpGbkFilePath) if diff := cmp.Diff(gbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) + t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) } } // end test single gbk read, write, build, parse } func TestMultiLineFeatureParse(t *testing.T) { - pichia, _ := Read("../../data/pichia_chr1_head.gb") + pichia, _ := read("data/pichia_chr1_head.gb") var multilineOutput string for _, feature := range pichia.Features { multilineOutput = feature.Location.GbkLocationString @@ -75,15 +75,15 @@ func TestMultiGenbankIO(t *testing.T) { defer os.RemoveAll(tmpDataDir) // Test multiline Genbank features - gbkPath := "../../data/multiGbk_test.seq" - multiGbk, _ := ReadMulti(gbkPath) + gbkPath := "data/multiGbk_test.seq" + multiGbk, _ := readMulti(gbkPath) tmpGbkFilePath := filepath.Join(tmpDataDir, filepath.Base(gbkPath)) - _ = WriteMulti(multiGbk, tmpGbkFilePath) + _ = writeMulti(multiGbk, tmpGbkFilePath) - writeTestGbk, _ := ReadMulti(tmpGbkFilePath) + writeTestGbk, _ := readMulti(tmpGbkFilePath) if diff := cmp.Diff(multiGbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) + t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with read(). 
Got this diff:\n%s", filepath.Base(gbkPath), diff) } } @@ -94,7 +94,7 @@ func TestGbkLocationStringBuilder(t *testing.T) { } defer os.RemoveAll(tmpDataDir) - scrubbedGbk, err := Read("../../data/sample.gbk") + scrubbedGbk, err := read("data/sample.gbk") if err != nil { t.Error(err) } @@ -105,13 +105,13 @@ func TestGbkLocationStringBuilder(t *testing.T) { } tmpGbkFilePath := filepath.Join(tmpDataDir, "sample.gbk") - _ = Write(scrubbedGbk, tmpGbkFilePath) + _ = write(scrubbedGbk, tmpGbkFilePath) - testInputGbk, _ := Read("../../data/sample.gbk") - testOutputGbk, _ := Read(tmpGbkFilePath) + testInputGbk, _ := read("data/sample.gbk") + testOutputGbk, _ := read(tmpGbkFilePath) if diff := cmp.Diff(testInputGbk, testOutputGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Issue with partial location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff) + t.Errorf("Issue with partial location building. Parsing the output of Build() does not produce the same output as parsing the original file read with read(). Got this diff:\n%s", diff) } } @@ -122,7 +122,7 @@ func TestGbLocationStringBuilder(t *testing.T) { } defer os.RemoveAll(tmpDataDir) - scrubbedGb, _ := Read("../../data/t4_intron.gb") + scrubbedGb, _ := read("data/t4_intron.gb") // removing gbkLocationString from features to allow testing for gbkLocationBuilder for featureIndex := range scrubbedGb.Features { @@ -130,34 +130,34 @@ func TestGbLocationStringBuilder(t *testing.T) { } tmpGbFilePath := filepath.Join(tmpDataDir, "t4_intron_test.gb") - _ = Write(scrubbedGb, tmpGbFilePath) + _ = write(scrubbedGb, tmpGbFilePath) - testInputGb, _ := Read("../../data/t4_intron.gb") - testOutputGb, _ := Read(tmpGbFilePath) + testInputGb, _ := read("data/t4_intron.gb") + testOutputGb, _ := read(tmpGbFilePath) if diff := cmp.Diff(testInputGb, testOutputGb, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Issue with either Join or complement location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff) + t.Errorf("Issue with either Join or complement location building. Parsing the output of Build() does not produce the same output as parsing the original file read with read(). Got this diff:\n%s", diff) } } func TestPartialLocationParseRegression(t *testing.T) { - gbk, _ := Read("../../data/sample.gbk") + gbk, _ := read("data/sample.gbk") for _, feature := range gbk.Features { if feature.Location.GbkLocationString == "687..3158>" && (feature.Location.Start != 686 || feature.Location.End != 3158) { - t.Errorf("Partial location for three prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with Read()") + t.Errorf("Partial location for three prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with read()") } } - gbk, err := Read("../../data/sample.gbk") + gbk, err := read("data/sample.gbk") if err != nil { t.Errorf("Failed to read sample.gbk. Got err: %s", err) } for _, feature := range gbk.Features { if feature.Location.GbkLocationString == "687..3158>" && (feature.Location.Start != 686 || feature.Location.End != 3158) { - t.Errorf("Partial location for three prime location parsing has failed. 
Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got location start %d and location end %d. Expected 687..3158>.", feature.Location.Start, feature.Location.End) + t.Errorf("Partial location for three prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with read(). Got location start %d and location end %d. Expected 687..3158>.", feature.Location.Start, feature.Location.End) } else if feature.Location.GbkLocationString == "<1..206" && (feature.Location.Start != 0 || feature.Location.End != 206) { - t.Errorf("Partial location for five prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with Read().") + t.Errorf("Partial location for five prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with read().") } } } @@ -188,7 +188,7 @@ func TestSubLocationStringParseRegression(t *testing.T) { } func TestSnapgeneGenbankRegression(t *testing.T) { - snapgene, err := Read("../../data/puc19_snapgene.gb") + snapgene, err := read("data/puc19_snapgene.gb") if snapgene.Sequence == "" { t.Errorf("Parsing snapgene returned an empty string. Got error: %s", err) @@ -196,7 +196,7 @@ func TestSnapgeneGenbankRegression(t *testing.T) { } func TestGetSequenceMethod(t *testing.T) { - gbk, _ := Read("../../data/t4_intron.gb") + gbk, _ := read("data/t4_intron.gb") // Check to see if GetSequence method works on Features struct feature, _ := gbk.Features[1].GetSequence() @@ -207,30 +207,30 @@ func TestGetSequenceMethod(t *testing.T) { } func TestLocationParser(t *testing.T) { - gbk, _ := Read("../../data/t4_intron.gb") + gbk, _ := read("data/t4_intron.gb") - // Read 1..243 + // read 1..243 feature, _ := gbk.Features[1].GetSequence() seq := "atgagattacaacgccagagcatcaaagattcagaagttagaggtaaatggtattttaatatcatcggtaaagattctgaacttgttgaaaaagctgaacatcttttacgtgatatgggatgggaagatgaatgcgatggatgtcctctttatgaagacggagaaagcgcaggattttggatttaccattctgacgtcgagcagtttaaagctgattggaaaattgtgaaaaagtctgtttga" if feature != seq { t.Errorf("Feature sequence parser has changed on test '1..243'. Got this:\n%s instead of \n%s", feature, seq) } - // Read join(893..1441,2459..2770) + // read join(893..1441,2459..2770) featureJoin, _ := gbk.Features[6].GetSequence() seqJoin := "atgaaacaataccaagatttaattaaagacatttttgaaaatggttatgaaaccgatgatcgtacaggcacaggaacaattgctctgttcggatctaaattacgctgggatttaactaaaggttttcctgcggtaacaactaagaagctcgcctggaaagcttgcattgctgagctaatatggtttttatcaggaagcacaaatgtcaatgatttacgattaattcaacacgattcgttaatccaaggcaaaacagtctgggatgaaaattacgaaaatcaagcaaaagatttaggataccatagcggtgaacttggtccaatttatggaaaacagtggcgtgattttggtggtgtagaccaaattatagaagttattgatcgtattaaaaaactgccaaatgataggcgtcaaattgtttctgcatggaatccagctgaacttaaatatatggcattaccgccttgtcatatgttctatcagtttaatgtgcgtaatggctatttggatttgcagtggtatcaacgctcagtagatgttttcttgggtctaccgtttaatattgcgtcatatgctacgttagttcatattgtagctaagatgtgtaatcttattccaggggatttgatattttctggtggtaatactcatatctatatgaatcacgtagaacaatgtaaagaaattttgaggcgtgaacctaaagagctttgtgagctggtaataagtggtctaccttataaattccgatatctttctactaaagaacaattaaaatatgttcttaaacttaggcctaaagatttcgttcttaacaactatgtatcacaccctcctattaaaggaaagatggcggtgtaa" if featureJoin != seqJoin { t.Errorf("Feature sequence parser has changed on test 'join(893..1441,2459..2770)'. 
Got this:\n%s instead of \n%s", featureJoin, seqJoin) } - // Read complement(2791..3054) + // read complement(2791..3054) featureComplement, _ := gbk.Features[10].GetSequence() seqComplement := "ttattcactacccggcatagacggcccacgctggaataattcgtcatattgtttttccgttaaaacagtaatatcgtagtaacagtcagaagaagttttaactgtggaaattttattatcaaaatactcacgagtcattttatgagtatagtattttttaccataaatggtaataggctgttctggtcctggaacttctaactcgcttgggttaggaagtgtaaaaagaactacaccagaagtatctttaaatcgtaaaatcat" if featureComplement != seqComplement { t.Errorf("Feature sequence parser has changed on test 'complement(2791..3054)'. Got this:\n%s instead of \n%s", featureComplement, seqComplement) } - // Read join(complement(315..330),complement(339..896)) + // read join(complement(315..330),complement(339..896)) // Note: it is known that some software, like Snapgene, assumes that since both strands are in the reverse direction // that the first sequence should be appended to the reverse sequence, instead of the second sequence // getting appended to the first. Biopython appends the second sequence to the first, and that is logically @@ -241,7 +241,7 @@ func TestLocationParser(t *testing.T) { t.Errorf("Feature sequence parser has changed on test 'join(complement(315..330),complement(339..896))'. Got this:\n%s instead of \n%s", featureJoinComplement, seqJoinComplement) } - // Read complement(join(893..1098,1101..2770)) + // read complement(join(893..1098,1101..2770)) featureComplementJoin, _ := gbk.Features[5].GetSequence() seqComplementJoin := "ttacaccgccatctttcctttaataggagggtgtgatacatagttgttaagaacgaaatctttaggcctaagtttaagaacatattttaattgttctttagtagaaagatatcggaatttataaggtagaccacttattaccagctcacaaagctctttaggttcacgcctcaaaatttctttacattgttctacgtgattcatatagatatgagtattaccaccagaaaatatcaaatcccctggaataagattacacatcttagctacaatatgaactaacgtagcatatgacgcaatattaaacggtagcattatgttcagataaggtcgttaatcttaccccggaattatatccagctgcatgtcaccatgcagagcagactatatctccaacttgttaaagcaagttgtctatcgtttcgagtcacttgaccctactccccaaagggatagtcgttaggcatttatgtagaaccaattccatttatcagattttacacgataagtaactaatccagacgaaattttaaaatgtctagctgcatctgctgcacaatcaaaaataaccccatcacatgaaatctttttaatattactaggctttttacctttcatcttttctgatattttagatttagttatgtctgaatgcttatgattaaagaatgaattattttcacctgaacgatttctgcatttactacaagtataagcagaagtttgtatgcgaacaccgcacttacaaaacttatgggtttctggattccaacgcccgtttttacttccgggtttactgtaaagagctttccgaccatcaggtccaagtttaagcatcttagctttaacagtttcagaacgtttcttaataatttcttcttttaatggatgcgtagaacatgtatcaccaaacgttgcatcagcaatattgtatccattaattttagaattaagctctttaatccaaaaattttctcgttcaataatcaaatctttctcatatggaatttcttccaaaatagaacattcaaacacattaccatgtttgttaaaagacctctgaagttttatagaagaatggcatcctttttctaaatctttaaaatgcctcttccatctcttttcaaaatctttagcacttcctacatatactttattgtttaaagtatttttaatctgataaattccgcttttcataaatacctctttaaatatagaagtatttattaaagggcaagtcctacaatttagcacgggattgtctactagagaggttccccgtttagatagattacaagtataagtcaccttatactcaggcctcaattaacccaagaaaacatctactgagcgttgataccactgcaaatccaaatagccattacgcacattaaactgatagaacatatgacaaggcggtaatgccatatatttaagttcagctggattccatgcagaaacaatttgacgcctatcatttggcagttttttaatacgatcaataacttctataatttggtctacaccaccaaaatcacgccactgttttccataaattggaccaagttcaccgctatggtatcctaaatcttttgcttgattttcgtaattttcatcccagactgttttgccttggattaacgaatcgtgttgaattaatcgtaaatcatacatttgtgcttcctgataaaaaccatattagctcagcaatgcaagctttccaggcgagcttcttagttgttaccgcaggaaaacctttagttaaatcccagcgtaatttagatccgaacagagcaattgttcctgtgcctgtacgatcatcggtttcataaccattttcaaaaatgtctttaattaaatcttggtattgtttcat" if featureComplementJoin != seqComplementJoin { @@ -250,11 +250,11 @@ func TestLocationParser(t *testing.T) { } func TestGenbankNewlineParsingRegression(t 
*testing.T) { - gbk, _ := Read("../../data/puc19.gbk") + gbk, _ := read("data/puc19.gbk") for _, feature := range gbk.Features { if feature.Location.Start == 410 && feature.Location.End == 1750 && feature.Type == "CDS" { - if feature.Attributes["product"] != "chromosomal replication initiator informational ATPase" { + if feature.Attributes["product"][0] != "chromosomal replication initiator informational ATPase" { t.Errorf("Newline parsing has failed.") } break @@ -264,7 +264,7 @@ func TestGenbankNewlineParsingRegression(t *testing.T) { func BenchmarkRead(b *testing.B) { for i := 0; i < b.N; i++ { - _, _ = Read("../../data/bsub.gbk") + _, _ = read("data/bsub.gbk") } } @@ -281,72 +281,13 @@ Gbk/gb/genbank related benchmarks end here. ******************************************************************************/ func TestBenchlingGenbank(t *testing.T) { - sequence, _ := Read("../../data/benchling.gb") + sequence, _ := read("data/benchling.gb") if len(sequence.Features) != 17 { t.Errorf("Parsing benchling genbank file not returned the correct quantity of features") } } -func TestParse(t *testing.T) { - type args struct { - r io.Reader - } - tests := []struct { - name string - args args - want Genbank - wantErr bool - }{ - // TODO: Add test cases. - // empty line in genbank meta data - // { - - // name: "empty line in genbank meta data", - // args: args{r: strings.NewReader("LOCUS puc19.gbk 2686 bp DNA circular 22-OCT-2019")}, - // wantErr: true, - // }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := Parse(tt.args.r) - if (err != nil) != tt.wantErr { - t.Errorf("Parse() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Parse() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestParseMulti(t *testing.T) { - type args struct { - r io.Reader - } - tests := []struct { - name string - args args - want []Genbank - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ParseMulti(tt.args.r) - if (err != nil) != tt.wantErr { - t.Errorf("ParseMulti() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("ParseMulti() = %v, want %v", got, tt.want) - } - }) - } -} - // this was hand-written and tests the same as the above suite. func TestFeature_GetSequence_Legacy(t *testing.T) { // This test is a little too complex and contrived for an example function. 
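The attribute assertions above reflect the breaking change to Feature.Attributes: lookups now return a slice of values rather than a single string. A minimal sketch of the new access pattern, using the MultiMap helpers added later in this diff:

```go
package main

import (
	"fmt"

	"github.com/TimothyStiles/poly/bio/genbank"
)

func main() {
	// Feature.Attributes is now a MultiMap, i.e. a map from key to a slice of
	// values, so duplicate qualifier keys are preserved instead of overwritten.
	attributes := genbank.NewMultiMap[string, string]()
	genbank.Put(attributes, "locus_tag", "JCVISYN3A_0004")
	genbank.Put(attributes, "note", "first note")
	genbank.Put(attributes, "note", "second note") // duplicate key is kept

	// The old single-value lookup becomes an indexed lookup into the slice.
	if values, ok := attributes["locus_tag"]; ok && len(values) > 0 {
		fmt.Println("locus_tag:", values[0])
	}
	fmt.Println("notes:", attributes["note"]) // [first note second note]
}
```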
@@ -401,7 +342,6 @@ func TestFeature_GetSequence_Legacy(t *testing.T) { func Test_parseLoopParameters_init(t *testing.T) { type fields struct { newLocation bool - quoteActive bool attribute string attributeValue string sequenceBuilder strings.Builder @@ -423,7 +363,6 @@ func Test_parseLoopParameters_init(t *testing.T) { t.Run(tt.name, func(t *testing.T) { params := &parseLoopParameters{ newLocation: tt.fields.newLocation, - quoteActive: tt.fields.quoteActive, attribute: tt.fields.attribute, attributeValue: tt.fields.attributeValue, sequenceBuilder: tt.fields.sequenceBuilder, @@ -455,7 +394,7 @@ func TestParseMultiNth(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := ParseMultiNth(tt.args.r, tt.args.count) + got, err := parseMultiNthFn(tt.args.r, tt.args.count) if (err != nil) != tt.wantErr { t.Errorf("ParseMultiNth() error = %v, wantErr %v", err, tt.wantErr) return @@ -592,20 +531,20 @@ func TestRead(t *testing.T) { { name: "error on malformed file", args: args{ - path: "../../data/malformed_read_test.gbk", + path: "data/malformed_read_test.gbk", }, wantErr: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := Read(tt.args.path) + got, err := read(tt.args.path) if (err != nil) != tt.wantErr { - t.Errorf("Read() error = %v, wantErr %v", err, tt.wantErr) + t.Errorf("read() error = %v, wantErr %v", err, tt.wantErr) return } if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Read() = %v, want %v", got, tt.want) + t.Errorf("read() = %v, want %v", got, tt.want) } }) } @@ -625,13 +564,13 @@ func TestReadMulti(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := ReadMulti(tt.args.path) + got, err := readMulti(tt.args.path) if (err != nil) != tt.wantErr { - t.Errorf("ReadMulti() error = %v, wantErr %v", err, tt.wantErr) + t.Errorf("readMulti() error = %v, wantErr %v", err, tt.wantErr) return } if !reflect.DeepEqual(got, tt.want) { - t.Errorf("ReadMulti() = %v, want %v", got, tt.want) + t.Errorf("readMulti() = %v, want %v", got, tt.want) } }) } @@ -726,14 +665,14 @@ func Test_generateWhiteSpace(t *testing.T) { func TestRead_error(t *testing.T) { readErr := errors.New("open /tmp/file: no such file or directory") - oldReadFileFn := readFileFn + oldreadFileFn := readFileFn readFileFn = func(filename string) ([]byte, error) { return nil, readErr } defer func() { - readFileFn = oldReadFileFn + readFileFn = oldreadFileFn }() - _, err := Read("/tmp/file") + _, err := read("/tmp/file") assert.EqualError(t, err, readErr.Error()) } @@ -750,23 +689,22 @@ func TestBuildFeatureString(t *testing.T) { } func TestParse_error(t *testing.T) { - parseMultiErr := errors.New("parse error") + parseMultiErr := &ParseError{wraps: io.EOF} oldParseMultiNthFn := parseMultiNthFn - parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, error) { + parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, *ParseError) { return nil, parseMultiErr } defer func() { parseMultiNthFn = oldParseMultiNthFn }() - _, err := Parse(strings.NewReader("")) - assert.EqualError(t, err, parseMultiErr.Error()) + _, err := parse(strings.NewReader("")) + assert.Equal(t, err, parseMultiErr.wraps) - _, err = ParseMulti(strings.NewReader("")) - assert.EqualError(t, err, parseMultiErr.Error()) + _, err = parseMultiNth(strings.NewReader(""), 10000) + assert.Equal(t, err, parseMultiErr) } func TestParseReferences_error(t *testing.T) { - parseReferencesErr := errors.New("Failed in parsing reference above line 13. 
Got error: ") oldParseReferencesFn := parseReferencesFn parseReferencesFn = func(metadataData []string) (Reference, error) { return Reference{}, errors.New("") @@ -774,21 +712,21 @@ defer func() { parseReferencesFn = oldParseReferencesFn }() - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open("data/puc19.gbk") _, err := parseMultiNthFn(file, 1) - assert.EqualError(t, err, parseReferencesErr.Error()) + assert.Equal(t, err.info, "failed in parsing reference") } func TestIssue303Regression(t *testing.T) { - seq, _ := Read("../../data/puc19_303_regression.gbk") - expectedAttribute := "16S rRNA(adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase" + seq, _ := read("data/puc19_303_regression.gbk") + expectedAttribute := []string{"16S rRNA(adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase"} for _, feature := range seq.Features { - if feature.Attributes["locus_tag"] == "JCVISYN3A_0004" && feature.Type == "CDS" { - if feature.Attributes["product"] != expectedAttribute { + if cmp.Equal(feature.Attributes["locus_tag"], []string{"JCVISYN3A_0004"}) && feature.Type == "CDS" { + if !cmp.Equal(feature.Attributes["product"], expectedAttribute) { t.Errorf("Failed to get proper expected attribute. Got: %s Expected: %s", feature.Attributes["product"], expectedAttribute) } } - if feature.Attributes["locus_tag"] == "JCVISYN3A_0051" && feature.Type == "CDS" { + if cmp.Equal(feature.Attributes["locus_tag"], []string{"JCVISYN3A_0051"}) && feature.Type == "CDS" { if _, ok := feature.Attributes["pseudo"]; !ok { t.Errorf("pseudo should be in attributes") } @@ -797,7 +735,7 @@ } func TestConsortiumRegression(t *testing.T) { - _, err := Read("../../data/puc19_consrtm.gbk") + _, err := read("data/puc19_consrtm.gbk") if err != nil { t.Errorf("Failed to read consrtm. Got err: %s", err) } diff --git a/bio/genbank/multimap.go b/bio/genbank/multimap.go new file mode 100644 index 00000000..3f067272 --- /dev/null +++ b/bio/genbank/multimap.go @@ -0,0 +1,65 @@ +/* +This file provides utilities for working with a MultiMap, +which is simply a map that can store multiple values for a single key instead +of the usual one. + +This is useful when we expect to encounter repeated keys but want to store all pairs, +not just the last-inserted one. This implementation has the advantage of being compatible with +json.Marshal, cmp.Diff, pretty printing, and bracket indexing out of the box. +It does not make uniqueness guarantees for key-value pairs. +Currently this is only used in genbank.Feature.Attributes; however, it may end up +being useful for other parsers which allow repeated keys, in which case +it should be made into its own module. +*/ +package genbank + +// MultiMap is a simple type defined over a map of slices. +type MultiMap[K, V comparable] map[K][]V + +// NewMultiMap creates a new empty multimap. +func NewMultiMap[K, V comparable]() MultiMap[K, V] { + return make(map[K][]V) +} + +// Put adds a key-value pair to the multimap. +func Put[K, V comparable](m MultiMap[K, V], k K, v ...V) { + if _, ok := m[k]; !ok { + m[k] = v + } else { + m[k] = append(m[k], v...) + } +} + +// ForEachKey iterates over the multimap, calling do once per key with all of that key's values passed as a slice. +// do is a callback that takes the key and the slice of values stored under that key. +// This exists purely as a convenience function; if your use case would benefit from +// early break/return, use the usual range iteration instead.
+func ForEachKey[K, V comparable](m MultiMap[K, V], do func(K, []V)) { + for k, values := range m { + do(k, values) + } +} + +// ForEachValue iterates over the multimap, once for each value +// do is a callback that takes and a key and value. +func ForEachValue[K, V comparable](m MultiMap[K, V], do func(K, V)) { + ForEachKey(m, func(k K, values []V) { + for _, v := range values { + do(k, v) + } + }) +} + +// MapSlice efficiently applies a transformation to each element of a slice to create a new slice +func MapSlice[X any, Y any](slice []X, mapper func(X) Y) []Y { + y := make([]Y, len(slice)) + for i, x := range slice { + y[i] = mapper(x) + } + return y +} + +// identity returns its input, useful for using MapSlice to do a shallow copy +func identity[X any](x X) X { + return x +} diff --git a/io/gff/example_test.go b/bio/gff/example_test.go similarity index 99% rename from io/gff/example_test.go rename to bio/gff/example_test.go index 040e9403..3fef3a16 100644 --- a/io/gff/example_test.go +++ b/bio/gff/example_test.go @@ -7,7 +7,7 @@ import ( "path/filepath" "testing" - "github.com/TimothyStiles/poly/io/gff" + "github.com/TimothyStiles/poly/bio/gff" "github.com/TimothyStiles/poly/transform" ) diff --git a/io/gff/gff.go b/bio/gff/gff.go similarity index 100% rename from io/gff/gff.go rename to bio/gff/gff.go diff --git a/io/gff/gff_test.go b/bio/gff/gff_test.go similarity index 100% rename from io/gff/gff_test.go rename to bio/gff/gff_test.go diff --git a/io/pileup/data/test.pileup b/bio/pileup/data/test.pileup similarity index 100% rename from io/pileup/data/test.pileup rename to bio/pileup/data/test.pileup diff --git a/io/pileup/data/test_not_enough_fields.pileup b/bio/pileup/data/test_not_enough_fields.pileup similarity index 100% rename from io/pileup/data/test_not_enough_fields.pileup rename to bio/pileup/data/test_not_enough_fields.pileup diff --git a/io/pileup/data/test_position_non_int.pileup b/bio/pileup/data/test_position_non_int.pileup similarity index 100% rename from io/pileup/data/test_position_non_int.pileup rename to bio/pileup/data/test_position_non_int.pileup diff --git a/io/pileup/data/test_read_off.pileup b/bio/pileup/data/test_read_off.pileup similarity index 100% rename from io/pileup/data/test_read_off.pileup rename to bio/pileup/data/test_read_off.pileup diff --git a/io/pileup/data/test_readcount_non_int.pileup b/bio/pileup/data/test_readcount_non_int.pileup similarity index 100% rename from io/pileup/data/test_readcount_non_int.pileup rename to bio/pileup/data/test_readcount_non_int.pileup diff --git a/io/pileup/data/test_unknown_rune.pileup b/bio/pileup/data/test_unknown_rune.pileup similarity index 100% rename from io/pileup/data/test_unknown_rune.pileup rename to bio/pileup/data/test_unknown_rune.pileup diff --git a/io/pileup/pileup.go b/bio/pileup/pileup.go similarity index 57% rename from io/pileup/pileup.go rename to bio/pileup/pileup.go index f8a6bb02..5e1aaff4 100644 --- a/io/pileup/pileup.go +++ b/bio/pileup/pileup.go @@ -30,28 +30,24 @@ wikipedia (https://en.wikipedia.org/wiki/Pileup_format) is shown below: 6. Quality: Phred quality scores associated with each base This package provides a parser and writer for working with pileup files. + +Wikipedia reference: https://en.wikipedia.org/wiki/Pileup_format */ package pileup import ( "bufio" - "bytes" - "errors" "fmt" "io" - "math" - "os" "strconv" "strings" "unicode" ) -// https://en.wikipedia.org/wiki/Pileup_format - // Pileup struct is a single position in a pileup file. 
Pileup files "pile" // a bunch of separate bam/sam alignments into something more readable at a per // base pair level, so are only useful as a grouping. -type Pileup struct { +type Line struct { Sequence string `json:"sequence"` Position uint `json:"position"` ReferenceBase string `json:"reference_base"` @@ -60,18 +56,24 @@ type Pileup struct { Quality string `json:"quality"` } -// Parse parses a given Pileup file into an array of Pileup structs. -func Parse(r io.Reader) ([]Pileup, error) { - // 32kB is a magic number often used by the Go stdlib for parsing. We multiply it by two. - const maxLineSize = 2 * 32 * 1024 - parser := NewParser(r, maxLineSize) - return parser.ParseAll() +// Header is a blank struct, needed for compatibility with bio parsers. It contains nothing. +type Header struct{} + +// WriteTo is a blank function, needed for compatibility with bio parsers. It doesn't do anything. +func (header *Header) WriteTo(w io.Writer) (int64, error) { + return 0, nil } // Parser is a pileup parser. type Parser struct { reader bufio.Reader line uint + atEOF bool +} + +// Header returns nil,nil. +func (parser *Parser) Header() (*Header, error) { + return &Header{}, nil } // NewParser creates a parser from an io.Reader for pileup data. @@ -81,59 +83,45 @@ func NewParser(r io.Reader, maxLineSize int) *Parser { } } -// ParseAll parses all sequences in underlying reader only returning non-EOF errors. -// It returns all valid pileup sequences up to error if encountered. -func (parser *Parser) ParseAll() ([]Pileup, error) { - return parser.ParseN(math.MaxInt) -} - -// ParseN parses up to maxRows pileup sequences from the Parser's underlying reader. -// ParseN does not return EOF if encountered. -// If an non-EOF error is encountered it returns it and all correctly parsed sequences up to then. -func (parser *Parser) ParseN(maxRows int) (pileups []Pileup, err error) { - for counter := 0; counter < maxRows; counter++ { - pileup, err := parser.ParseNext() - if err != nil { - if errors.Is(err, io.EOF) { - err = nil // EOF not treated as parsing error. - } - return pileups, err - } - pileups = append(pileups, pileup) - } - return pileups, nil -} - -// ParseNext parses the next pileup row in a pileup file. -// ParseNext returns an EOF if encountered. -func (parser *Parser) ParseNext() (Pileup, error) { - if _, err := parser.reader.Peek(1); err != nil { - // Early return on error. Probably will be EOF. - return Pileup{}, err +// Next parses the next pileup row in a pileup file. +// Next returns an EOF if encountered. +func (parser *Parser) Next() (*Line, error) { + if parser.atEOF { + return &Line{}, io.EOF } // Parse out a single line lineBytes, err := parser.reader.ReadSlice('\n') if err != nil { - return Pileup{}, err + if err != io.EOF { + return &Line{}, err + } + parser.atEOF = true } parser.line++ line := string(lineBytes) + + // In this case, the file has ended on a newline. Just return + // with an io.EOF + if len(strings.TrimSpace(line)) == 0 && parser.atEOF { + return &Line{}, io.EOF + } + line = line[:len(line)-1] // Exclude newline delimiter. 
// Check that there are 6 values, as defined by the pileup format values := strings.Split(line, "\t") if len(values) != 6 { - return Pileup{}, fmt.Errorf("Error on line %d: Got %d values, expected 6.", parser.line, len(values)) + return &Line{}, fmt.Errorf("Error on line %d: Got %d values, expected 6.", parser.line, len(values)) } // Convert Position and ReadCount to integers - positionInteger, err := strconv.Atoi(values[1]) + positionInteger, err := strconv.Atoi(strings.TrimSpace(values[1])) if err != nil { - return Pileup{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) + return &Line{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) } - readCountInteger, err := strconv.Atoi(values[3]) + readCountInteger, err := strconv.Atoi(strings.TrimSpace(values[3])) if err != nil { - return Pileup{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) + return &Line{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) } // Parse ReadResults @@ -150,6 +138,8 @@ func (parser *Parser) ParseNext() (Pileup, error) { } resultRune := resultsString[resultIndex] switch resultRune { + case ' ': + continue case '^': starts = starts + 1 skip = skip + 2 @@ -179,40 +169,18 @@ func (parser *Parser) ParseNext() (Pileup, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'T', 'G', 'C', 'N', 'a', 't', 'g', 'c', 'n', '-', '+': continue default: - return Pileup{}, fmt.Errorf("Rune within +,- not found on line %d. Got %c: only runes allowed are: [0 1 2 3 4 5 6 7 8 9 A T G C N a t g c n - +]", parser.line, letter) + return &Line{}, fmt.Errorf("Rune within +,- not found on line %d. Got %c: only runes allowed are: [0 1 2 3 4 5 6 7 8 9 A T G C N a t g c n - +]", parser.line, letter) } } readResults = append(readResults, readResult) skip = skip + regularExpressionInt + len(numberOfJumps) // The 1 makes sure to include the regularExpressionInt in readResult string default: - return Pileup{}, fmt.Errorf("Rune not found on line %d. Got %c: only runes allowed are: [^ $ . , * A T G C N a t g c n - +]", parser.line, resultRune) + return &Line{}, fmt.Errorf("Rune not found on line %d. Got %c: only runes allowed are: [^ $ . , * A T G C N a t g c n - +]", parser.line, resultRune) } readCount = readCount + 1 } - return Pileup{Sequence: values[0], Position: uint(positionInteger), ReferenceBase: values[2], ReadCount: uint(readCountInteger), ReadResults: readResults, Quality: values[5]}, nil -} - -// Reset discards all data in buffer and resets state. 
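The pileup package now exposes the same Header/Next/WriteTo shape as the other bio parsers, and the older Parse/ParseAll/ParseNext/Read/Write entry points are removed below. A rough usage sketch of the new flow, assuming the bio/pileup import path and the package's data/test.pileup sample (mirroring the updated tests further down in this diff):

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	"github.com/TimothyStiles/poly/bio/pileup"
)

func main() {
	file, err := os.Open("data/test.pileup") // sample file from the package's test data
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	const maxLineSize = 2 * 32 * 1024
	parser := pileup.NewParser(file, maxLineSize)

	for {
		line, err := parser.Next()
		if err != nil {
			if !errors.Is(err, io.EOF) {
				fmt.Println("parse error:", err)
			}
			break // io.EOF marks the end of the file
		}
		// Each parsed Line can be written straight back out via io.WriterTo.
		if _, err := line.WriteTo(os.Stdout); err != nil {
			fmt.Println("write error:", err)
			break
		}
	}
}
```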
-func (parser *Parser) Reset(r io.Reader) { - parser.reader.Reset(r) - parser.line = 0 -} - -/****************************************************************************** - -Start of Read functions - -******************************************************************************/ - -// Read reads a file into an array of Pileup structs -func Read(path string) ([]Pileup, error) { - file, err := os.Open(path) - if err != nil { - return nil, err - } - defer file.Close() - return Parse(file) + return &Line{Sequence: values[0], Position: uint(positionInteger), ReferenceBase: values[2], ReadCount: uint(readCountInteger), ReadResults: readResults, Quality: values[5]}, nil } /****************************************************************************** @@ -221,32 +189,11 @@ Start of Write functions ******************************************************************************/ -// WritePileups writes a pileup array to an io.Writer -func WritePileups(pileups []Pileup, w io.Writer) error { - for _, pileup := range pileups { - _, err := w.Write([]byte( - strings.Join( - []string{ - pileup.Sequence, - strconv.FormatUint(uint64(pileup.Position), 10), - pileup.ReferenceBase, - strconv.FormatUint(uint64(pileup.ReadCount), 10), - strings.Join(pileup.ReadResults, ""), - pileup.Quality, - }, - "\t") + "\n")) - if err != nil { - return err - } - } - return nil -} - -// Write writes a pileup array to a file -func Write(pileups []Pileup, path string) error { - var b bytes.Buffer - if err := WritePileups(pileups, &b); err != nil { - return err +func (line *Line) WriteTo(w io.Writer) (int64, error) { + var buffer strings.Builder + for _, readResult := range line.ReadResults { + buffer.WriteString(readResult) } - return os.WriteFile(path, b.Bytes(), 0644) + written, err := fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n", line.Sequence, strconv.FormatUint(uint64(line.Position), 10), line.ReferenceBase, strconv.FormatUint(uint64(line.ReadCount), 10), buffer.String(), line.Quality) + return int64(written), err } diff --git a/io/pileup/pileup_test.go b/bio/pileup/pileup_test.go similarity index 55% rename from io/pileup/pileup_test.go rename to bio/pileup/pileup_test.go index a38aa2ba..dccb62ca 100644 --- a/io/pileup/pileup_test.go +++ b/bio/pileup/pileup_test.go @@ -1,7 +1,6 @@ package pileup import ( - "bytes" "errors" "fmt" "io" @@ -18,16 +17,16 @@ func TestParse(t *testing.T) { defer file.Close() const maxLineSize = 2 * 32 * 1024 parser := NewParser(file, maxLineSize) - var pileupReads []Pileup + var pileupReads []Line for { - pileupRead, err := parser.ParseNext() + pileupRead, err := parser.Next() if err != nil { if !errors.Is(err, io.EOF) { t.Errorf("Got unknown error: %s", err) } break } - pileupReads = append(pileupReads, pileupRead) + pileupReads = append(pileupReads, *pileupRead) } if pileupReads[2].ReadCount != 1401 { t.Errorf("Expected 1401 read counts. 
Got: %d", pileupReads[2].ReadCount) @@ -41,7 +40,7 @@ func TestParse(t *testing.T) { defer file2.Close() parser = NewParser(file2, maxLineSize) for { - _, err = parser.ParseNext() + _, err = parser.Next() if err != nil { if !strings.Contains(fmt.Sprint(err), "values, expected 6.") { t.Errorf("Got unknown error: %s", err) @@ -58,7 +57,7 @@ func TestParse(t *testing.T) { defer file3.Close() parser = NewParser(file3, maxLineSize) for { - _, err = parser.ParseNext() + _, err = parser.Next() if err != nil { if !strings.Contains(fmt.Sprint(err), "Error on line") { t.Errorf("Got unknown error: %s", err) @@ -75,7 +74,7 @@ func TestParse(t *testing.T) { defer file4.Close() parser = NewParser(file4, maxLineSize) for { - _, err = parser.ParseNext() + _, err = parser.Next() if err != nil { if !strings.Contains(fmt.Sprint(err), "Error on line") { t.Errorf("Got unknown error: %s", err) @@ -92,7 +91,7 @@ func TestParse(t *testing.T) { defer file5.Close() parser = NewParser(file5, maxLineSize) for { - _, err = parser.ParseNext() + _, err = parser.Next() if err != nil { if !strings.Contains(fmt.Sprint(err), "Rune within +,- not found on line") { t.Errorf("Got unknown error: %s", err) @@ -109,7 +108,7 @@ func TestParse(t *testing.T) { defer file6.Close() parser = NewParser(file6, maxLineSize) for { - _, err = parser.ParseNext() + _, err = parser.Next() if err != nil { if !strings.Contains(fmt.Sprint(err), "Rune not found on line") { t.Errorf("Got unknown error: %s", err) @@ -117,72 +116,4 @@ func TestParse(t *testing.T) { break } } - - // Test ParseN - file7, err := os.Open("data/test.pileup") - if err != nil { - t.Errorf("Failed to open test.pileup: %s", err) - } - defer file7.Close() - parser = NewParser(file7, maxLineSize) - _, err = parser.ParseN(2) - if err != nil { - t.Errorf("Got unknown error: %s", err) - } -} - -func TestBuild(t *testing.T) { - file, err := os.Open("data/test.pileup") - if err != nil { - t.Errorf("Failed to open test.pileup: %s", err) - } - const maxLineSize = 2 * 32 * 1024 - parser := NewParser(file, maxLineSize) - pileups, err := parser.ParseAll() // Parse all - if err != nil { - t.Errorf("Failed to parse pileup: %s", err) - } - parser.Reset(file) - var b bytes.Buffer - if err := WritePileups(pileups, &b); err != nil { - t.Errorf("Failed to write pileups: %v", err) - } - newBody := b.Bytes() - file.Close() - - // Open file again, read all - file, err = os.Open("data/test.pileup") - if err != nil { - t.Errorf("Failed to open test.pileup: %s", err) - } - body, err := io.ReadAll(file) // Read them bytes - if err != nil { - t.Errorf("Failed to readall: %s", err) - } - if string(body) != string(newBody) { - t.Errorf("input bytes do not equal output bytes of build") - } -} - -func TestRead(t *testing.T) { - _, err := Read("data/DOESNT_EXIST.pileup") - if err == nil { - t.Errorf("Should have failed on empty file") - } - _, err = Read("data/test.pileup") - if err != nil { - t.Errorf("Should have succeeded in reading pileup file") - } -} - -func TestWrite(t *testing.T) { - pileups, _ := Read("data/test.pileup") - err := Write(pileups, "tmp.pileup") - if err != nil { - t.Errorf("Failed to write temporary pileup") - } - err = os.Remove("tmp.pileup") - if err != nil { - t.Errorf("Failed to delete temporary pileup") - } } diff --git a/io/polyjson/example_test.go b/bio/polyjson/example_test.go similarity index 99% rename from io/polyjson/example_test.go rename to bio/polyjson/example_test.go index eb65a190..ec589977 100644 --- a/io/polyjson/example_test.go +++ b/bio/polyjson/example_test.go @@ 
-6,7 +6,7 @@ import ( "path/filepath" "time" - "github.com/TimothyStiles/poly/io/polyjson" + "github.com/TimothyStiles/poly/bio/polyjson" "github.com/TimothyStiles/poly/seqhash" ) diff --git a/io/polyjson/polyjson.go b/bio/polyjson/polyjson.go similarity index 67% rename from io/polyjson/polyjson.go rename to bio/polyjson/polyjson.go index 81370480..359370e5 100644 --- a/io/polyjson/polyjson.go +++ b/bio/polyjson/polyjson.go @@ -13,6 +13,7 @@ import ( "os" "time" + "github.com/TimothyStiles/poly/bio/genbank" "github.com/TimothyStiles/poly/transform" ) @@ -153,6 +154,82 @@ func Write(sequence Poly, path string) error { return os.WriteFile(path, file, 0644) } +// Utilities to convert polyjson objects -> their genbank equivalents +// TODO add convert <- genbank methods, which is currently difficult as most +// genbank Meta values are discarded due to lack of support for wildcard metadata in polyjson. + +func (sequence *Poly) ToGenbank() genbank.Genbank { + gb := genbank.Genbank{ + Meta: sequence.Meta.ToGenbank(), + Features: make([]genbank.Feature, len(sequence.Features)), + Sequence: sequence.Sequence, + } + for i, f := range sequence.Features { + gb.Features[i] = f.ToGenbank() + gb.Features[i].ParentSequence = &gb + } + return gb +} + +func (meta *Meta) ToGenbank() genbank.Meta { + other := make(map[string]string) + if meta.URL != "" { + other["URL"] = meta.URL + } + if meta.CreatedBy != "" { + other["CreatedBy"] = meta.CreatedBy + } + if meta.CreatedWith != "" { + other["CreatedWith"] = meta.CreatedWith + } + other["CreatedOn"] = meta.CreatedOn.String() + if meta.Schema != "" { + other["Schema"] = meta.Schema + } + return genbank.Meta{ + Definition: meta.Description, + Source: meta.CreatedBy, + Origin: meta.CreatedWith, + Name: meta.Name, + SequenceHash: meta.Hash, + Other: other, + } +} + +func (feature *Feature) ToGenbank() genbank.Feature { + attributes := genbank.NewMultiMap[string, string]() + for key, value := range feature.Tags { + genbank.Put(attributes, key, value) + } + genbank.Put(attributes, "Name", feature.Name) + + return genbank.Feature{ + Type: feature.Type, + Description: feature.Description, + Attributes: attributes, + SequenceHash: feature.Hash, + Sequence: feature.Sequence, + Location: feature.Location.ToGenbank(), + } +} + +func (location *Location) ToGenbank() genbank.Location { + loc := genbank.Location{ + Start: location.Start, + End: location.End, + Complement: location.Complement, + Join: location.Join, + FivePrimePartial: location.FivePrimePartial, + ThreePrimePartial: location.ThreePrimePartial, + SubLocations: genbank.MapSlice( + location.SubLocations, + func(s Location) genbank.Location { return s.ToGenbank() }, + ), + } + loc.GbkLocationString = genbank.BuildLocationString(loc) + return loc +} + /****************************************************************************** JSON specific IO related things end here. 
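The conversion helpers above only go from polyjson to genbank for now (the TODO notes the reverse direction is still missing). A small sketch of the conversion path, assuming the bio/polyjson import path and the repository's data/cat.json sample used by TestConvert below:

```go
package main

import (
	"fmt"

	"github.com/TimothyStiles/poly/bio/polyjson"
)

func main() {
	// Read a polyjson sequence from the repository's sample data.
	sequence, err := polyjson.Read("data/cat.json")
	if err != nil {
		fmt.Println(err)
		return
	}

	// Convert it to the genbank representation. Metadata without a direct
	// genbank field (URL, CreatedBy, CreatedWith, ...) ends up in Meta.Other.
	gb := sequence.ToGenbank()
	fmt.Println(gb.Meta.Definition, len(gb.Features))
}
```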
diff --git a/io/polyjson/polyjson_test.go b/bio/polyjson/polyjson_test.go similarity index 97% rename from io/polyjson/polyjson_test.go rename to bio/polyjson/polyjson_test.go index eaee25b1..dedd7039 100644 --- a/io/polyjson/polyjson_test.go +++ b/bio/polyjson/polyjson_test.go @@ -104,3 +104,9 @@ func TestWrite_error(t *testing.T) { err := Write(Poly{}, "/tmp/file") assert.EqualError(t, err, marshalIndentErr.Error()) } + +func TestConvert(t *testing.T) { + sequence, err := Read("../../data/cat.json") + assert.NoError(t, err) + sequence.ToGenbank() +} diff --git a/io/rebase/data/rebase_test.txt b/bio/rebase/data/rebase_test.txt similarity index 100% rename from io/rebase/data/rebase_test.txt rename to bio/rebase/data/rebase_test.txt diff --git a/io/rebase/example_test.go b/bio/rebase/example_test.go similarity index 94% rename from io/rebase/example_test.go rename to bio/rebase/example_test.go index 313da658..a6814a22 100644 --- a/io/rebase/example_test.go +++ b/bio/rebase/example_test.go @@ -3,7 +3,7 @@ package rebase_test import ( "fmt" - "github.com/TimothyStiles/poly/io/rebase" + "github.com/TimothyStiles/poly/bio/rebase" ) // This example reads rebase into an enzymeMap and returns the AarI recognition diff --git a/io/rebase/rebase.go b/bio/rebase/rebase.go similarity index 100% rename from io/rebase/rebase.go rename to bio/rebase/rebase.go diff --git a/io/rebase/rebase_test.go b/bio/rebase/rebase_test.go similarity index 100% rename from io/rebase/rebase_test.go rename to bio/rebase/rebase_test.go diff --git a/io/slow5/data/example.slow5 b/bio/slow5/data/example.slow5 similarity index 100% rename from io/slow5/data/example.slow5 rename to bio/slow5/data/example.slow5 diff --git a/io/slow5/data/header_tests/test_header_empty.slow5 b/bio/slow5/data/header_tests/test_header_empty.slow5 similarity index 100% rename from io/slow5/data/header_tests/test_header_empty.slow5 rename to bio/slow5/data/header_tests/test_header_empty.slow5 diff --git a/io/slow5/data/header_tests/test_header_not_enough_attributes.slow5 b/bio/slow5/data/header_tests/test_header_not_enough_attributes.slow5 similarity index 100% rename from io/slow5/data/header_tests/test_header_not_enough_attributes.slow5 rename to bio/slow5/data/header_tests/test_header_not_enough_attributes.slow5 diff --git a/io/slow5/data/header_tests/test_header_numReadGroups_bad.slow5 b/bio/slow5/data/header_tests/test_header_numReadGroups_bad.slow5 similarity index 100% rename from io/slow5/data/header_tests/test_header_numReadGroups_bad.slow5 rename to bio/slow5/data/header_tests/test_header_numReadGroups_bad.slow5 diff --git a/io/slow5/data/header_tests/test_header_without_tabs.slow5 b/bio/slow5/data/header_tests/test_header_without_tabs.slow5 similarity index 100% rename from io/slow5/data/header_tests/test_header_without_tabs.slow5 rename to bio/slow5/data/header_tests/test_header_without_tabs.slow5 diff --git a/io/slow5/data/read_tests/continue.slow5 b/bio/slow5/data/read_tests/continue.slow5 similarity index 100% rename from io/slow5/data/read_tests/continue.slow5 rename to bio/slow5/data/read_tests/continue.slow5 diff --git a/io/slow5/data/read_tests/digitisation.slow5 b/bio/slow5/data/read_tests/digitisation.slow5 similarity index 100% rename from io/slow5/data/read_tests/digitisation.slow5 rename to bio/slow5/data/read_tests/digitisation.slow5 diff --git a/io/slow5/data/read_tests/endReason.slow5 b/bio/slow5/data/read_tests/endReason.slow5 similarity index 100% rename from io/slow5/data/read_tests/endReason.slow5 rename to 
bio/slow5/data/read_tests/endReason.slow5 diff --git a/io/slow5/data/read_tests/end_reason.slow5 b/bio/slow5/data/read_tests/end_reason.slow5 similarity index 100% rename from io/slow5/data/read_tests/end_reason.slow5 rename to bio/slow5/data/read_tests/end_reason.slow5 diff --git a/io/slow5/data/read_tests/end_reason_unknown.slow5 b/bio/slow5/data/read_tests/end_reason_unknown.slow5 similarity index 100% rename from io/slow5/data/read_tests/end_reason_unknown.slow5 rename to bio/slow5/data/read_tests/end_reason_unknown.slow5 diff --git a/io/slow5/data/read_tests/len_raw_signal.slow5 b/bio/slow5/data/read_tests/len_raw_signal.slow5 similarity index 100% rename from io/slow5/data/read_tests/len_raw_signal.slow5 rename to bio/slow5/data/read_tests/len_raw_signal.slow5 diff --git a/io/slow5/data/read_tests/median_before.slow5 b/bio/slow5/data/read_tests/median_before.slow5 similarity index 100% rename from io/slow5/data/read_tests/median_before.slow5 rename to bio/slow5/data/read_tests/median_before.slow5 diff --git a/io/slow5/data/read_tests/offset.slow5 b/bio/slow5/data/read_tests/offset.slow5 similarity index 100% rename from io/slow5/data/read_tests/offset.slow5 rename to bio/slow5/data/read_tests/offset.slow5 diff --git a/io/slow5/data/read_tests/range.slow5 b/bio/slow5/data/read_tests/range.slow5 similarity index 100% rename from io/slow5/data/read_tests/range.slow5 rename to bio/slow5/data/read_tests/range.slow5 diff --git a/io/slow5/data/read_tests/raw_signal.slow5 b/bio/slow5/data/read_tests/raw_signal.slow5 similarity index 100% rename from io/slow5/data/read_tests/raw_signal.slow5 rename to bio/slow5/data/read_tests/raw_signal.slow5 diff --git a/io/slow5/data/read_tests/read_group.slow5 b/bio/slow5/data/read_tests/read_group.slow5 similarity index 100% rename from io/slow5/data/read_tests/read_group.slow5 rename to bio/slow5/data/read_tests/read_group.slow5 diff --git a/io/slow5/data/read_tests/read_number.slow5 b/bio/slow5/data/read_tests/read_number.slow5 similarity index 100% rename from io/slow5/data/read_tests/read_number.slow5 rename to bio/slow5/data/read_tests/read_number.slow5 diff --git a/io/slow5/data/read_tests/sampling_rate.slow5 b/bio/slow5/data/read_tests/sampling_rate.slow5 similarity index 100% rename from io/slow5/data/read_tests/sampling_rate.slow5 rename to bio/slow5/data/read_tests/sampling_rate.slow5 diff --git a/io/slow5/data/read_tests/start_mux.slow5 b/bio/slow5/data/read_tests/start_mux.slow5 similarity index 100% rename from io/slow5/data/read_tests/start_mux.slow5 rename to bio/slow5/data/read_tests/start_mux.slow5 diff --git a/io/slow5/data/read_tests/start_time.slow5 b/bio/slow5/data/read_tests/start_time.slow5 similarity index 100% rename from io/slow5/data/read_tests/start_time.slow5 rename to bio/slow5/data/read_tests/start_time.slow5 diff --git a/io/slow5/data/read_tests/unknown.slow5 b/bio/slow5/data/read_tests/unknown.slow5 similarity index 100% rename from io/slow5/data/read_tests/unknown.slow5 rename to bio/slow5/data/read_tests/unknown.slow5 diff --git a/io/slow5/data/test_example.slow5 b/bio/slow5/data/test_example.slow5 similarity index 100% rename from io/slow5/data/test_example.slow5 rename to bio/slow5/data/test_example.slow5 diff --git a/bio/slow5/example_test.go b/bio/slow5/example_test.go new file mode 100644 index 00000000..1683448f --- /dev/null +++ b/bio/slow5/example_test.go @@ -0,0 +1,64 @@ +package slow5_test + +import ( + "fmt" + "os" + + "github.com/TimothyStiles/poly/bio/slow5" +) + +func ExampleNewParser() { + // 
example.slow5 is a file I generated using slow5tools from nanopore fast5 + // run where I was testing using nanopore for doing COVID testing. It + // contains real nanopore data. + file, _ := os.Open("data/example.slow5") + defer file.Close() + // Set maxLineSize to 64kb. If you expect longer reads, + // make maxLineSize longer! + const maxLineSize = 2 * 32 * 1024 + parser, _ := slow5.NewParser(file, maxLineSize) + + var outputReads []slow5.Read + for { + read, err := parser.Next() + if err != nil { + // Break at EOF + break + } + outputReads = append(outputReads, *read) + } + + fmt.Println(outputReads[0].RawSignal[0:10]) + // Output: [430 472 463 467 454 465 463 450 450 449] +} + +func ExampleSvbCompressRawSignal() { + // example.slow5 is a file I generated using slow5tools from nanopore fast5 + // run where I was testing using nanopore for doing COVID testing. It + // contains real nanopore data. + file, _ := os.Open("data/example.slow5") + defer file.Close() + // Set maxLineSize to 64kb. If you expect longer reads, + // make maxLineSize longer! + const maxLineSize = 2 * 32 * 1024 + parser, _ := slow5.NewParser(file, maxLineSize) + read, _ := parser.Next() + + // Get the raw signal from a read + rawSignal := read.RawSignal + + // Compress that raw signal into a mask and data + mask, data := slow5.SvbCompressRawSignal(rawSignal) + + // Decompress mask and data back into raw signal + rawSignalDecompressed := slow5.SvbDecompressRawSignal(len(rawSignal), mask, data) + + for idx := range rawSignal { + if rawSignal[idx] != rawSignalDecompressed[idx] { + fmt.Println("Compression failed!") + } + } + fmt.Println(data[:10]) + + // Output: [174 1 216 1 207 1 211 1 198 1] +} diff --git a/io/slow5/slow5.go b/bio/slow5/slow5.go similarity index 61% rename from io/slow5/slow5.go rename to bio/slow5/slow5.go index 3d43f0cd..a652db6b 100644 --- a/io/slow5/slow5.go +++ b/bio/slow5/slow5.go @@ -25,6 +25,8 @@ import ( "sort" "strconv" "strings" + + "github.com/koeng101/svb" ) /****************************************************************************** @@ -57,6 +59,10 @@ Keoni // Header contains metadata about the sequencing run in general. type Header struct { + HeaderValues []HeaderValue +} + +type HeaderValue struct { ReadGroupID uint32 Slow5Version string Attributes map[string]string @@ -82,7 +88,7 @@ type Read struct { StartTime uint64 EndReason string // enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} - Error error // in case there is an error while parsing! + EndReasonMap map[string]int // Used for writing } var knownEndReasons = map[string]bool{"unknown": true, @@ -99,19 +105,27 @@ var knownEndReasons = map[string]bool{"unknown": true, // It is initialized with NewParser. type Parser struct { // reader keeps state of current reader. - reader bufio.Reader - line uint - headerMap map[int]string - endReasonMap map[int]string + reader bufio.Reader + line uint + headerMap map[int]string + endReasonMap map[int]string + endReasonHeaderMap map[string]int + header Header + hitEOF bool +} + +// Header returns the header +func (parser *Parser) Header() (*Header, error) { + return &parser.header, nil } // NewParser parsers a slow5 file. 
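With this change NewParser no longer returns a slice of headers; read groups live on Header.HeaderValues and are fetched through parser.Header(), and parse errors come back directly from Next instead of being stashed on Read.Error. A short sketch of the updated call pattern, assuming the data/example.slow5 test file shipped with the package:

```go
package main

import (
	"fmt"
	"os"

	"github.com/TimothyStiles/poly/bio/slow5"
)

func main() {
	file, err := os.Open("data/example.slow5")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer file.Close()

	const maxLineSize = 2 * 32 * 1024
	parser, err := slow5.NewParser(file, maxLineSize) // no longer also returns []Header
	if err != nil {
		fmt.Println(err)
		return
	}

	// Read group metadata is now requested explicitly from the parser.
	header, _ := parser.Header()
	fmt.Println("read groups:", len(header.HeaderValues))

	// Reads stream one at a time; parse errors are returned directly rather
	// than being stored on the Read struct.
	for {
		read, err := parser.Next()
		if err != nil {
			break // io.EOF, or a real parse error in a production caller
		}
		fmt.Println(read.ReadID, read.LenRawSignal)
	}
}
```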
-func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) { +func NewParser(r io.Reader, maxLineSize int) (*Parser, error) { parser := &Parser{ reader: *bufio.NewReaderSize(r, maxLineSize), line: 0, } - var headers []Header + var headers []HeaderValue var slow5Version string var numReadGroups uint32 headerMap := make(map[int]string) @@ -121,13 +135,13 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) { for { lineBytes, err := parser.reader.ReadSlice('\n') if err != nil { - return parser, []Header{}, err + return parser, err } line := strings.TrimSpace(string(lineBytes)) parser.line++ values := strings.Split(line, "\t") if len(values) < 2 { - return parser, []Header{}, fmt.Errorf("Got following line without tabs: %s", line) + return parser, fmt.Errorf("Got following line without tabs: %s", line) } // First, we need to identify the number of read groups. This number will be the length of our @@ -139,11 +153,11 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) { case "#num_read_groups": numReadGroupsUint, err := strconv.ParseUint(values[1], 10, 32) if err != nil { - return parser, []Header{}, err + return parser, err } numReadGroups = uint32(numReadGroupsUint) for id := uint32(0); id < numReadGroups; id++ { - headers = append(headers, Header{Slow5Version: slow5Version, ReadGroupID: id, Attributes: make(map[string]string)}) + headers = append(headers, HeaderValue{Slow5Version: slow5Version, ReadGroupID: id, Attributes: make(map[string]string)}) } } continue @@ -159,7 +173,7 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) { for endReasonIndex, endReason := range endReasons { if _, ok := knownEndReasons[endReason]; !ok { - return parser, headers, fmt.Errorf("unknown end reason '%s' found in end_reason enum. Please report", endReason) + return parser, fmt.Errorf("unknown end reason '%s' found in end_reason enum. Please report", endReason) } endReasonMap[endReasonIndex] = endReason endReasonHeaderMap[endReason] = endReasonIndex @@ -184,7 +198,7 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) { // Check to make sure we have the right amount of information for the num_read_groups if len(values) != int(numReadGroups+1) { - return parser, []Header{}, fmt.Errorf("Improper amount of information for read groups. Needed %d, got %d, in line: %s", numReadGroups+1, len(values), line) + return parser, fmt.Errorf("Improper amount of information for read groups. Needed %d, got %d, in line: %s", numReadGroups+1, len(values), line) } for id := 0; id < int(numReadGroups); id++ { headers[id].Attributes[values[0]] = values[id+1] @@ -193,14 +207,27 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) { } parser.headerMap = headerMap parser.endReasonMap = endReasonMap - return parser, headers, nil + parser.endReasonHeaderMap = endReasonHeaderMap + parser.header = Header{HeaderValues: headers} + return parser, nil } -// ParseNext parses the next read from a parser. -func (parser *Parser) ParseNext() (Read, error) { +// Next parses the next read from a parser. +func (parser *Parser) Next() (*Read, error) { + if parser.hitEOF { + return &Read{}, io.EOF + } lineBytes, err := parser.reader.ReadSlice('\n') if err != nil { - return Read{}, err + if err == io.EOF { + parser.hitEOF = true + if strings.TrimSpace(string(lineBytes)) == "" { + // If EOF line is empty, return. If not, continue! 
+ return nil, io.EOF + } + } else { + return nil, err + } } parser.line++ line := strings.TrimSpace(string(lineBytes)) @@ -209,6 +236,7 @@ func (parser *Parser) ParseNext() (Read, error) { // Reads have started. // Once we have the read headers, start to parse the actual reads var newRead Read + newRead.EndReasonMap = parser.endReasonHeaderMap for valueIndex := 0; valueIndex < len(values); valueIndex++ { fieldValue := parser.headerMap[valueIndex] if values[valueIndex] == "." { @@ -220,37 +248,37 @@ func (parser *Parser) ParseNext() (Read, error) { case "read_group": readGroupID, err := strconv.ParseUint(values[valueIndex], 10, 32) if err != nil { - newRead.Error = fmt.Errorf("Failed convert read_group '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed convert read_group '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.ReadGroupID = uint32(readGroupID) case "digitisation": digitisation, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert digitisation '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert digitisation '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.Digitisation = digitisation case "offset": offset, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert offset '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert offset '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.Offset = offset case "range": nanoporeRange, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert range '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert range '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.Range = nanoporeRange case "sampling_rate": samplingRate, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert sampling_rate '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert sampling_rate '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.SamplingRate = samplingRate case "len_raw_signal": lenRawSignal, err := strconv.ParseUint(values[valueIndex], 10, 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert len_raw_signal '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert len_raw_signal '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.LenRawSignal = lenRawSignal case "raw_signal": @@ -258,7 +286,7 @@ func (parser *Parser) ParseNext() (Read, error) { for rawSignalIndex, rawSignalString := range strings.Split(values[valueIndex], ",") { rawSignal, err := strconv.ParseInt(rawSignalString, 10, 16) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert raw signal '%s' to int on line %d, signal index %d. Got Error: %w", rawSignalString, parser.line, rawSignalIndex, err) + return nil, fmt.Errorf("Failed to convert raw signal '%s' to int on line %d, signal index %d. 
Got Error: %w", rawSignalString, parser.line, rawSignalIndex, err) } rawSignals = append(rawSignals, int16(rawSignal)) } @@ -266,44 +294,44 @@ func (parser *Parser) ParseNext() (Read, error) { case "start_time": startTime, err := strconv.ParseUint(values[valueIndex], 10, 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert start_time '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert start_time '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.StartTime = startTime case "read_number": readNumber, err := strconv.ParseInt(values[valueIndex], 10, 32) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert read_number '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert read_number '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.ReadNumber = int32(readNumber) case "start_mux": startMux, err := strconv.ParseUint(values[valueIndex], 10, 8) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert start_mux '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert start_mux '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.StartMux = uint8(startMux) case "median_before": medianBefore, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert median_before '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert median_before '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.MedianBefore = medianBefore case "end_reason": endReasonIndex, err := strconv.ParseInt(values[valueIndex], 10, 64) if err != nil { - newRead.Error = fmt.Errorf("Failed to convert end_reason '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return nil, fmt.Errorf("Failed to convert end_reason '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) } if _, ok := parser.endReasonMap[int(endReasonIndex)]; !ok { - newRead.Error = fmt.Errorf("End reason out of range. Got '%d' on line %d. Cannot find valid enum reason", int(endReasonIndex), parser.line) + return nil, fmt.Errorf("End reason out of range. Got '%d' on line %d. Cannot find valid enum reason", int(endReasonIndex), parser.line) } newRead.EndReason = parser.endReasonMap[int(endReasonIndex)] case "channel_number": // For whatever reason, this is a string. newRead.ChannelNumber = values[valueIndex] default: - newRead.Error = fmt.Errorf("Unknown field to parser '%s' found on line %d. Please report to github.com/TimothyStiles/poly", fieldValue, parser.line) + return nil, fmt.Errorf("Unknown field to parser '%s' found on line %d. Please report to github.com/TimothyStiles/poly", fieldValue, parser.line) } } - return newRead, nil + return &newRead, nil } /****************************************************************************** @@ -326,19 +354,22 @@ Keoni ******************************************************************************/ -// Write writes a list of headers and a channel of reads to an output. 
-func Write(headers []Header, reads <-chan Read, output io.Writer) error { +func (header *Header) WriteTo(w io.Writer) (int64, error) { + headers := header.HeaderValues + var written int64 // First, write the slow5 version number slow5Version := headers[0].Slow5Version endReasonHeaderMap := headers[0].EndReasonHeaderMap - _, err := fmt.Fprintf(output, "#slow5_version\t%s\n", slow5Version) + newWriteN, err := fmt.Fprintf(w, "#slow5_version\t%s\n", slow5Version) + written += int64(newWriteN) if err != nil { - return err + return written, err } // Then, write the number of read groups (ie, the number of headers) - _, err = fmt.Fprintf(output, "#num_read_groups\t%d\n", len(headers)) + newWriteN, err = fmt.Fprintf(w, "#num_read_groups\t%d\n", len(headers)) + written += int64(newWriteN) if err != nil { - return err + return written, err } // Next, we need a map of what attribute values are available possibleAttributeKeys := make(map[string]bool) @@ -378,9 +409,10 @@ func Write(headers []Header, reads <-chan Read, output io.Writer) error { // Write the header attribute strings to the output for _, headerAttributeString := range headerAttributeStrings { - _, err = fmt.Fprintf(output, "%s\n", headerAttributeString) + newWriteN, err = fmt.Fprintf(w, "%s\n", headerAttributeString) + written += int64(newWriteN) if err != nil { - return err + return written, err } } @@ -404,37 +436,88 @@ func Write(headers []Header, reads <-chan Read, output io.Writer) error { // Write the read headers // These are according to the slow5 specifications - _, err = fmt.Fprintf(output, "#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{%s} char*\n", endReasonString) + newWriteN, err = fmt.Fprintf(w, "#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{%s} char*\n", endReasonString) + written += int64(newWriteN) if err != nil { - return err + return written, err } - _, err = fmt.Fprintln(output, "#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number") + newWriteN, err = fmt.Fprintln(w, "#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number") + written += int64(newWriteN) if err != nil { - return err + return written, err } + return written, nil +} - // Iterate over reads. This is reading from a channel, and will end - // when the channel is closed. - for read := range reads { - // converts []int16 to string - var rawSignalStringBuilder strings.Builder - for signalIndex, signal := range read.RawSignal { - _, err = fmt.Fprint(&rawSignalStringBuilder, signal) +func (read *Read) WriteTo(w io.Writer) (int64, error) { + var err error + var written int64 + // converts []int16 to string + var rawSignalStringBuilder strings.Builder + for signalIndex, signal := range read.RawSignal { + _, err = fmt.Fprint(&rawSignalStringBuilder, signal) + if err != nil { + return written, err + } + if signalIndex != len(read.RawSignal)-1 { // Don't add a comma to last number + _, err = fmt.Fprint(&rawSignalStringBuilder, ",") if err != nil { - return err - } - if signalIndex != len(read.RawSignal)-1 { // Don't add a comma to last number - _, err = fmt.Fprint(&rawSignalStringBuilder, ",") - if err != nil { - return err - } + return written, err } } - // Look at above output.Write("#read_id ... for the values here. 
- _, err = fmt.Fprintf(output, "%s\t%d\t%g\t%g\t%g\t%g\t%d\t%s\t%d\t%d\t%d\t%g\t%d\t%s\n", read.ReadID, read.ReadGroupID, read.Digitisation, read.Offset, read.Range, read.SamplingRate, read.LenRawSignal, rawSignalStringBuilder.String(), read.StartTime, read.ReadNumber, read.StartMux, read.MedianBefore, endReasonHeaderMap[read.EndReason], read.ChannelNumber) - if err != nil { - return err - } } - return nil + // Look at above output.Write("#read_id ... for the values here. + newWriteN, err := fmt.Fprintf(w, "%s\t%d\t%g\t%g\t%g\t%g\t%d\t%s\t%d\t%d\t%d\t%g\t%d\t%s\n", read.ReadID, read.ReadGroupID, read.Digitisation, read.Offset, read.Range, read.SamplingRate, read.LenRawSignal, rawSignalStringBuilder.String(), read.StartTime, read.ReadNumber, read.StartMux, read.MedianBefore, read.EndReasonMap[read.EndReason], read.ChannelNumber) + written += int64(newWriteN) + if err != nil { + return written, err + } + return written, nil +} + +/****************************************************************************** +Aug 15, 2023 + +StreamVByte (svb) compression of raw signal strength is used to decrease the +overall size of records in blow5 files. In my tests using a SQLite database +that contained both slow5 and fastq files, switching from raw signal TEXT to +svb compressed BLOBs brought the total database size from 12GB to 7.1GB, a +~40% reduction in size. + +This is the primary method that can be used to decrease the size of storing +slow5 records in alternative datastores to blow5 (SQL databases, etc). + +svb is an integer compression algorithm, which are specialized in compressing +integer arrays, which perfectly fits raw signal strength data. When converting +from slow5 to blow5, files are additionally compressed with zstd or zlib on top +of raw signal compression with svb. Still, svb compression is where most the +data size saving comes from. + +Stream VByte paper: https://doi.org/10.48550/arXiv.1709.08990 + +Keoni + +******************************************************************************/ + +// SvbCompressRawSignal takes a read and converts its raw signal field to two +// arrays: a mask array and a data array. Both are needed for decompression. +func SvbCompressRawSignal(rawSignal []int16) (mask, data []byte) { + rawSignalUint32 := make([]uint32, len(rawSignal)) + for idx := range rawSignal { + rawSignalUint32[idx] = uint32(rawSignal[idx]) + } + return svb.Uint32Encode(rawSignalUint32) +} + +// SvbDecompressRawSignal decompresses raw signal back to a []int16. It +// requires not only the mask array and data array returned by +// SvbCompressRawSignal, but also the length of the raw signals. 
+func SvbDecompressRawSignal(lenRawSignal int, mask, data []byte) []int16 { + rawSignalUint32 := make([]uint32, lenRawSignal) + rawSignal := make([]int16, lenRawSignal) + svb.Uint32Decode32(mask, data, rawSignalUint32) + for idx := 0; idx < lenRawSignal; idx++ { + rawSignal[idx] = int16(rawSignalUint32[idx]) + } + return rawSignal } diff --git a/io/slow5/slow5_test.go b/bio/slow5/slow5_test.go similarity index 67% rename from io/slow5/slow5_test.go rename to bio/slow5/slow5_test.go index 65027bfc..7bf8d452 100644 --- a/io/slow5/slow5_test.go +++ b/bio/slow5/slow5_test.go @@ -5,7 +5,10 @@ import ( "io" "io/ioutil" "os" + "strings" "testing" + + "github.com/google/go-cmp/cmp" ) const maxLineSize = 2 * 32 * 1024 @@ -15,29 +18,30 @@ func TestParse(t *testing.T) { if err != nil { t.Errorf("Failed to open example.slow5: %s", err) } - parser, headers, err := NewParser(file, maxLineSize) + parser, err := NewParser(file, maxLineSize) if err != nil { t.Errorf("Failed to parse headers of file: %s", err) } // Test headers - if len(headers) != 1 { - t.Errorf("There should only be 1 read group. Got: %d", len(headers)) + headers, _ := parser.Header() + if len(headers.HeaderValues) != 1 { + t.Errorf("There should only be 1 read group. Got: %d", len(headers.HeaderValues)) } - if headers[0].Attributes["@asic_id"] != "4175987214" { - t.Errorf("Expected AsicId 4175987214. Got: %s", headers[0].Attributes["asic_id"]) + if headers.HeaderValues[0].Attributes["@asic_id"] != "4175987214" { + t.Errorf("Expected AsicId 4175987214. Got: %s", headers.HeaderValues[0].Attributes["asic_id"]) } // Test reads var outputReads []Read for { - read, err := parser.ParseNext() + read, err := parser.Next() if err != nil { if !errors.Is(err, io.EOF) { t.Errorf("Got unknown error: %s", err) } break } - outputReads = append(outputReads, read) + outputReads = append(outputReads, *read) } if outputReads[0].RawSignal[0] != 430 { t.Errorf("Expected first outputRead to have a raw_signal of 430. 
Got: %d", outputReads[0].RawSignal[0]) @@ -50,7 +54,7 @@ func TestParseImproperHeaders(t *testing.T) { if err != nil { t.Errorf("Failed to open file with error: %s", err) } - _, _, err = NewParser(file, maxLineSize) + _, err = NewParser(file, maxLineSize) if err == nil { t.Errorf("Test should have failed if header line doesn't have any tabs") } @@ -59,7 +63,7 @@ func TestParseImproperHeaders(t *testing.T) { if err != nil { t.Errorf("Failed to open file with error: %s", err) } - _, _, err = NewParser(file, maxLineSize) + _, err = NewParser(file, maxLineSize) if err == nil { t.Errorf("Test should have failed if numReadGroup can't be converted to an int") } @@ -68,7 +72,7 @@ func TestParseImproperHeaders(t *testing.T) { if err != nil { t.Errorf("Failed to open file with error: %s", err) } - _, _, err = NewParser(file, maxLineSize) + _, err = NewParser(file, maxLineSize) if err == nil { t.Errorf("Test should have failed if the header doesn't have enough attributes for numReadGroup") } @@ -77,7 +81,7 @@ func TestParseImproperHeaders(t *testing.T) { if err != nil { t.Errorf("Failed to open file with error: %s", err) } - _, _, err = NewParser(file, maxLineSize) + _, err = NewParser(file, maxLineSize) if err == nil { t.Errorf("Test should have failed if the file is empty") } @@ -88,20 +92,16 @@ func testParseReadsHelper(t *testing.T, fileTarget string, errorMessage string) if err != nil { t.Errorf("Failed to open file with error: %s", err) } - parser, _, _ := NewParser(file, maxLineSize) + parser, _ := NewParser(file, maxLineSize) var targetErr []error for { - read, err := parser.ParseNext() + _, err := parser.Next() if err != nil { if !errors.Is(err, io.EOF) { - t.Errorf("Got unknown error: %s", err) + targetErr = append(targetErr, err) } break } - err = read.Error - if err != nil { - targetErr = append(targetErr, err) - } } if len(targetErr) == 0 { t.Errorf(errorMessage) @@ -113,22 +113,18 @@ func TestParseReads(t *testing.T) { if err != nil { t.Errorf("Failed to open file with error: %s", err) } - parser, _, _ := NewParser(file, maxLineSize) + parser, _ := NewParser(file, maxLineSize) var outputReads []Read for { - read, err := parser.ParseNext() + read, err := parser.Next() if err != nil { if !errors.Is(err, io.EOF) { t.Errorf("Got unknown error: %s", err) } break } - err = read.Error - if err != nil { - t.Errorf("Failed ParseReads with error: %s", err) - } - outputReads = append(outputReads, read) + outputReads = append(outputReads, *read) } if outputReads[0].ReadID != "0026631e-33a3-49ab-aa22-3ab157d71f8b" { t.Errorf("First read id should be 0026631e-33a3-49ab-aa22-3ab157d71f8b. Got: %s", outputReads[0].ReadID) @@ -159,7 +155,7 @@ func TestWrite(t *testing.T) { t.Errorf("Failed to open file with error: %s", err) } // Parse headers - parser, headers, err := NewParser(file, maxLineSize) + parser, err := NewParser(file, maxLineSize) if err != nil { t.Errorf("Failed to parse headers with error: %s", err) } @@ -173,20 +169,27 @@ func TestWrite(t *testing.T) { reads := make(chan Read) go func() { for { - read, err := parser.ParseNext() + read, err := parser.Next() if err != nil { // Break at EOF break } - reads <- read + reads <- *read } close(reads) }() - // Write - err = Write(headers, reads, testFile) + // Write header + headers, _ := parser.Header() + _, err = headers.WriteTo(testFile) if err != nil { - t.Errorf("Failed to write slow5 file. Got error: %s", err) + t.Errorf("Failed to write slow5 header. 
Got error: %s", err) + } + for read := range reads { + _, err = read.WriteTo(testFile) + if err != nil { + t.Errorf("Failed to write slow5 read. Got error: %s", err) + } } // Compare both files @@ -204,3 +207,47 @@ func TestWrite(t *testing.T) { t.Errorf("Example and test write are different") } } + +func TestSimpleExample(t *testing.T) { + file := strings.NewReader(`#slow5_version 0.2.0 +#num_read_groups 1 +@asic_id 4175987214 +#char* uint32_t double double double double uint64_t int16_t* uint64_t int32_t uint8_t double enum{unknown,partial,mux_change,unblock_mux_change,data_service_unblock_mux_change,signal_positive,signal_negative} char* +#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal start_time read_number start_mux median_before end_reason channel_number +0026631e-33a3-49ab-aa22-3ab157d71f8b 0 8192 16 1489.52832 4000 5347 430,472,463 8318394 5383 1 219.133423 5 10`) + parser, err := NewParser(file, maxLineSize) + if err != nil { + t.Errorf("failed: %s", err) + } + read, _ := parser.Next() + if read.RawSignal[0] != 430 { + t.Errorf("Should have gotten 430. Got: %d", read.RawSignal[0]) + } +} + +func TestSvb(t *testing.T) { + file, _ := os.Open("data/example.slow5") + defer file.Close() + const maxLineSize = 2 * 32 * 1024 + parser, _ := NewParser(file, maxLineSize) + var outputReads []Read + for { + read, err := parser.Next() + if err != nil { + if !errors.Is(err, io.EOF) { + t.Errorf("Got unknown error: %s", err) + } + break + } + outputReads = append(outputReads, *read) + } + + for readNum, read := range outputReads { + rawSignal := read.RawSignal + mask, data := SvbCompressRawSignal(rawSignal) + rawSignalDecompressed := SvbDecompressRawSignal(len(rawSignal), mask, data) + if !cmp.Equal(rawSignal, rawSignalDecompressed) { + t.Errorf("Read signal at readNum %d (%v) didn't match decompressed signal %v", readNum, rawSignal, rawSignalDecompressed) + } + } +} diff --git a/io/uniprot/data/check_uniprot b/bio/uniprot/data/check_uniprot similarity index 100% rename from io/uniprot/data/check_uniprot rename to bio/uniprot/data/check_uniprot diff --git a/io/uniprot/data/test b/bio/uniprot/data/test similarity index 100% rename from io/uniprot/data/test rename to bio/uniprot/data/test diff --git a/io/uniprot/data/uniprot.xsd b/bio/uniprot/data/uniprot.xsd similarity index 100% rename from io/uniprot/data/uniprot.xsd rename to bio/uniprot/data/uniprot.xsd diff --git a/io/uniprot/data/uniprot_sprot_mini.xml.gz b/bio/uniprot/data/uniprot_sprot_mini.xml.gz similarity index 100% rename from io/uniprot/data/uniprot_sprot_mini.xml.gz rename to bio/uniprot/data/uniprot_sprot_mini.xml.gz diff --git a/io/uniprot/data/xml_go_generator b/bio/uniprot/data/xml_go_generator similarity index 100% rename from io/uniprot/data/xml_go_generator rename to bio/uniprot/data/xml_go_generator diff --git a/io/uniprot/example_test.go b/bio/uniprot/example_test.go similarity index 91% rename from io/uniprot/example_test.go rename to bio/uniprot/example_test.go index 5b1af139..069352f7 100644 --- a/io/uniprot/example_test.go +++ b/bio/uniprot/example_test.go @@ -3,7 +3,7 @@ package uniprot_test import ( "fmt" - "github.com/TimothyStiles/poly/io/uniprot" + "github.com/TimothyStiles/poly/bio/uniprot" ) // This example shows how to open a uniprot data dump file and read the results diff --git a/io/uniprot/uniprot.go b/bio/uniprot/uniprot.go similarity index 100% rename from io/uniprot/uniprot.go rename to bio/uniprot/uniprot.go diff --git a/io/uniprot/uniprot_test.go 
b/bio/uniprot/uniprot_test.go similarity index 100% rename from io/uniprot/uniprot_test.go rename to bio/uniprot/uniprot_test.go diff --git a/io/uniprot/xml.go b/bio/uniprot/xml.go similarity index 100% rename from io/uniprot/xml.go rename to bio/uniprot/xml.go diff --git a/io/uniprot/xml_test.go b/bio/uniprot/xml_test.go similarity index 100% rename from io/uniprot/xml_test.go rename to bio/uniprot/xml_test.go diff --git a/go.mod b/go.mod index 41892465..659c5439 100644 --- a/go.mod +++ b/go.mod @@ -9,14 +9,18 @@ require ( github.com/mroth/weightedrand v0.4.1 github.com/pmezard/go-difflib v1.0.0 github.com/sergi/go-diff v1.2.0 + github.com/spaolacci/murmur3 v1.1.0 golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 lukechampine.com/blake3 v1.1.5 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect + github.com/intel-go/cpuid v0.0.0-20181003105527-1a4a6f06a1c6 // indirect + github.com/koeng101/svb v0.0.0-20230815034912-d6737f9ed8b8 // indirect github.com/mattn/go-sqlite3 v1.14.13 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect + golang.org/x/sync v0.3.0 // indirect ) require ( diff --git a/go.sum b/go.sum index d47c80d5..6962438e 100644 --- a/go.sum +++ b/go.sum @@ -3,8 +3,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/intel-go/cpuid v0.0.0-20181003105527-1a4a6f06a1c6 h1:XboatR7lasl05yel5hNXF7kQBw2oFUGdMztcgisfhNU= +github.com/intel-go/cpuid v0.0.0-20181003105527-1a4a6f06a1c6/go.mod h1:RmeVYf9XrPRbRc3XIx0gLYA8qOFvNoPOfaEZduRlEp4= github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s= github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4= +github.com/koeng101/svb v0.0.0-20230815034912-d6737f9ed8b8 h1:oaPrvMY8VJEYfJ/r/VTKh2zshX/SxYHcImJRPZ++JPI= +github.com/koeng101/svb v0.0.0-20230815034912-d6737f9ed8b8/go.mod h1:/zIPMIRhcEjka8JxY3mo7jexMl4ncHLRUnv9RIiAS9E= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= @@ -16,6 +20,7 @@ github.com/mattn/go-sqlite3 v1.14.13 h1:1tj15ngiFfcZzii7yd82foL+ks+ouQcj8j/TPq3f github.com/mattn/go-sqlite3 v1.14.13/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= +github.com/mmcloughlin/avo v0.0.0-20200504053806-fa88270b07e4/go.mod h1:wqKykBG2QzQDJEzvRkcS8x6MiSJkF52hXZsXcjaB3ls= github.com/mroth/weightedrand v0.4.1 h1:rHcbUBopmi/3x4nnrvwGJBhX9d0vk+KgoLUZeDP6YyI= github.com/mroth/weightedrand v0.4.1/go.mod h1:3p2SIcC8al1YMzGhAIoXD+r9olo/g/cdJgAD905gyNE= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -27,8 +32,28 @@ github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/yuin/goldmark 
v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +golang.org/x/arch v0.0.0-20190909030613-46d78d1859ac/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 h1:LGJsf5LRplCck6jUCH3dBL2dmycNruWNF5xugkSlfXw= golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200425043458-8463f397d07c/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -38,3 +63,4 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= lukechampine.com/blake3 v1.1.5 h1:hsACfxWvLdGmjYbWGrumQIphOvO+ZruZehWtgd2fxoM= lukechampine.com/blake3 v1.1.5/go.mod h1:hE8RpzdO8ttZ7446CXEwDP1eu2V4z7stv0Urj1El20g= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/io/example_test.go b/io/example_test.go deleted file mode 100644 index 08fd21f4..00000000 --- a/io/example_test.go +++ /dev/null @@ -1,32 +0,0 @@ -package io_test - -import ( - "github.com/TimothyStiles/poly/io/fasta" - "github.com/TimothyStiles/poly/io/genbank" - "github.com/TimothyStiles/poly/io/gff" - "github.com/TimothyStiles/poly/io/polyjson" -) - -// This is where the integration tests that make effed up cyclic dependencies go. - -func Example() { - // Poly can take in basic gff, gbk, fasta, and JSON. 
- // We call the json package "pson" (poly JSON) to prevent namespace collision with Go's standard json package. - - gffInput, _ := gff.Read("../data/ecoli-mg1655-short.gff") - gbkInput, _ := genbank.Read("../data/puc19.gbk") - fastaInput, _ := fasta.Read("fasta/data/base.fasta") - jsonInput, _ := polyjson.Read("../data/cat.json") - - // Poly can also output these file formats. Every file format has a corresponding Write function. - _ = gff.Write(gffInput, "test.gff") - _ = genbank.Write(gbkInput, "test.gbk") - _ = fasta.Write(fastaInput, "test.fasta") - _ = polyjson.Write(jsonInput, "test.json") - - // Extra tips: - - // 1. All of these file formats can be read and written in JSON format using their native schemas. - // 2. If you want to convert from one format to another (e.g. genbank to polyjson), you can easily do so with a for-loop and some field mapping. - // 3. Every file format is unique but they all share a common interface so you can use them with almost every native function in Poly. -} diff --git a/io/fasta/example_test.go b/io/fasta/example_test.go deleted file mode 100644 index e1bb6eeb..00000000 --- a/io/fasta/example_test.go +++ /dev/null @@ -1,114 +0,0 @@ -package fasta_test - -import ( - "bytes" - _ "embed" - "fmt" - "os" - "strings" - - "github.com/TimothyStiles/poly/io/fasta" -) - -//go:embed data/base.fasta -var baseFasta string - -// This example shows how to open a file with the fasta parser. The sequences -// within that file can then be analyzed further with different software. -func Example_basic() { - fastas, _ := fasta.Read("data/base.fasta") - fmt.Println(fastas[1].Sequence) - // Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK* -} - -// ExampleRead shows basic usage for Read. -func ExampleRead() { - fastas, _ := fasta.Read("data/base.fasta") - fmt.Println(fastas[0].Name) - // Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] -} - -// ExampleParse shows basic usage for Parse. -func ExampleParse() { - file, _ := os.Open("data/base.fasta") - fastas, _ := fasta.Parse(file) - - fmt.Println(fastas[0].Name) - // Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] -} - -// ExampleBuild shows basic usage for Build -func ExampleBuild() { - fastas, _ := fasta.Read("data/base.fasta") // get example data - fasta, _ := fasta.Build(fastas) // build a fasta byte array - firstLine := string(bytes.Split(fasta, []byte("\n"))[0]) - - fmt.Println(firstLine) - // Output: >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] -} - -// ExampleWrite shows basic usage of the writer. -func ExampleWrite() { - fastas, _ := fasta.Read("data/base.fasta") // get example data - _ = fasta.Write(fastas, "data/test.fasta") // write it out again - testSequence, _ := fasta.Read("data/test.fasta") // read it in again - - os.Remove("data/test.fasta") // getting rid of test file - - fmt.Println(testSequence[0].Name) - // Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] -} - -// ExampleReadGz shows basic usage for ReadGz on a gzip'd file. 
-func ExampleReadGz() { - fastas, _ := fasta.ReadGz("data/uniprot_1mb_test.fasta.gz") - var name string - for _, fasta := range fastas { - name = fasta.Name - } - - fmt.Println(name) - // Output: sp|P86857|AGP_MYTCA Alanine and glycine-rich protein (Fragment) OS=Mytilus californianus OX=6549 PE=1 SV=1 -} - -// ExampleReadGzConcurrent shows how to use the concurrent parser for larger files. -func ExampleReadGzConcurrent() { - fastas := make(chan fasta.Fasta, 1000) - go fasta.ReadGzConcurrent("data/uniprot_1mb_test.fasta.gz", fastas) - var name string - for fasta := range fastas { - name = fasta.Name - } - - fmt.Println(name) - // Output: sp|P86857|AGP_MYTCA Alanine and glycine-rich protein (Fragment) OS=Mytilus californianus OX=6549 PE=1 SV=1 -} - -// ExampleReadConcurrent shows how to use the concurrent parser for decompressed fasta files. -func ExampleReadConcurrent() { - fastas := make(chan fasta.Fasta, 100) - go fasta.ReadConcurrent("data/base.fasta", fastas) - var name string - for fasta := range fastas { - name = fasta.Name - } - - fmt.Println(name) - // Output: MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken -} - -func ExampleParser() { - parser := fasta.NewParser(strings.NewReader(baseFasta), 256) - for { - fasta, _, err := parser.ParseNext() - if err != nil { - fmt.Println(err) - break - } - fmt.Println(fasta.Name) - } - //Output: - // gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus] - // MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken - // EOF -} diff --git a/io/fasta/fasta.go b/io/fasta/fasta.go deleted file mode 100644 index ab2d14d9..00000000 --- a/io/fasta/fasta.go +++ /dev/null @@ -1,387 +0,0 @@ -/* -Package fasta contains fasta parsers and writers. - -Fasta is a flat text file format developed in 1985 to store nucleotide and -amino acid sequences. It is extremely simple and well-supported across many -languages. However, this simplicity means that annotation of genetic objects -is not supported. - -This package provides a parser and writer for working with Fasta formatted -genetic sequences. -*/ -package fasta - -import ( - "bufio" - "bytes" - "compress/gzip" - "errors" - "fmt" - "io" - "math" - "os" - "strings" - "unsafe" -) - -/****************************************************************************** -Apr 25, 2021 - -Fasta Parser begins here - -Many thanks to Jordan Campbell (https://github.com/0x106) for building the first -parser for Poly and thanks to Tim Stiles (https://github.com/TimothyStiles) -for helping complete that PR. This work expands on the previous work by allowing -for concurrent parsing and giving Poly a specific parser subpackage, -as well as few bug fixes. - -Fasta is a very simple file format for working with DNA, RNA, or protein sequences. -It was first released in 1985 and is still widely used in bioinformatics. - -https://en.wikipedia.org/wiki/_format - -One interesting use of the concurrent parser is working with the Uniprot -fasta dump files, which are far too large to fit into RAM. This parser is able -to easily handle those files by doing computation actively while the data dump -is getting parsed. - -https://www.uniprot.org/downloads - -I have removed the Parsers from the io.go file and moved them into this -subpackage. 
- -Hack the Planet, - -Keoni - -******************************************************************************/ - -var ( - gzipReaderFn = gzip.NewReader - openFn = os.Open - buildFn = Build -) - -// Fasta is a struct representing a single Fasta file element with a Name and its corresponding Sequence. -type Fasta struct { - Name string `json:"name"` - Sequence string `json:"sequence"` -} - -// Parse parses a given Fasta file into an array of Fasta structs. Internally, it uses ParseFastaConcurrent. -func Parse(r io.Reader) ([]Fasta, error) { - // 32kB is a magic number often used by the Go stdlib for parsing. We multiply it by two. - const maxLineSize = 2 * 32 * 1024 - parser := NewParser(r, maxLineSize) - return parser.ParseAll() -} - -// Parser is a flexible parser that provides ample -// control over reading fasta-formatted sequences. -// It is initialized with NewParser. -type Parser struct { - // reader keeps state of current reader. - reader bufio.Reader - line uint -} - -// NewParser returns a Parser that uses r as the source -// from which to parse fasta formatted sequences. -func NewParser(r io.Reader, maxLineSize int) *Parser { - return &Parser{ - reader: *bufio.NewReaderSize(r, maxLineSize), - } -} - -// ParseAll parses all sequences in underlying reader only returning non-EOF errors. -// It returns all valid fasta sequences up to error if encountered. -func (parser *Parser) ParseAll() ([]Fasta, error) { - return parser.ParseN(math.MaxInt) -} - -// ParseN parses up to maxSequences fasta sequences from the Parser's underlying reader. -// ParseN does not return EOF if encountered. -// If an non-EOF error is encountered it returns it and all correctly parsed sequences up to then. -func (parser *Parser) ParseN(maxSequences int) (fastas []Fasta, err error) { - for counter := 0; counter < maxSequences; counter++ { - fasta, _, err := parser.ParseNext() - if err != nil { - if errors.Is(err, io.EOF) { - err = nil // EOF not treated as parsing error. - } - return fastas, err - } - fastas = append(fastas, fasta) - } - return fastas, nil -} - -// ParseByteLimited parses fastas until byte limit is reached. -// This is NOT a hard limit. To set a hard limit on bytes read use a -// io.LimitReader to wrap the reader passed to the Parser. -func (parser *Parser) ParseByteLimited(byteLimit int64) (fastas []Fasta, bytesRead int64, err error) { - for bytesRead < byteLimit { - fasta, n, err := parser.ParseNext() - bytesRead += n - if err != nil { - if errors.Is(err, io.EOF) { - err = nil // EOF not treated as parsing error. - } - return fastas, bytesRead, err - } - fastas = append(fastas, fasta) - } - return fastas, bytesRead, nil -} - -// ParseNext reads next fasta genome in underlying reader and returns the result -// and the amount of bytes read during the call. -// ParseNext only returns an error if it: -// - Attempts to read and fails to find a valid fasta sequence. -// - Returns reader's EOF if called after reader has been exhausted. -// - If a EOF is encountered immediately after a sequence with no newline ending. -// In this case the Fasta up to that point is returned with an EOF error. -// -// It is worth noting the amount of bytes read are always right up to before -// the next fasta starts which means this function can effectively be used -// to index where fastas start in a file or string. -func (parser *Parser) ParseNext() (Fasta, int64, error) { - if _, err := parser.reader.Peek(1); err != nil { - // Early return on error. Probably will be EOF. 
- return Fasta{}, 0, err - } - // Initialization of parser state variables. - var ( - // Parser looks for a line starting with '>' (U+003E) - // that contains the next fasta sequence name. - lookingForName = true - seqName string - sequence, line []byte - err error - totalRead int64 - ) - - // parse loop begins here. - for { - line, err = parser.reader.ReadSlice('\n') - isSkippable := len(line) <= 1 || line[0] == ';' // OR short circuits so no panic here. - totalRead += int64(len(line)) - parser.line++ - - // More general case of error handling. - if err != nil { - isEOF := errors.Is(err, io.EOF) - if isSkippable { - if isEOF { - // got EOF on a empty or commented line. - err = nil - } - break - } else if errors.Is(err, bufio.ErrBufferFull) { - // Buffer size too small to read fasta line. - return Fasta{}, totalRead, fmt.Errorf("line %d too large for buffer, use larger maxLineSize: %w", parser.line+1, err) - } else if !isEOF { - return Fasta{}, totalRead, err // Unexpected error. - } - - // So got to this point the line is probably OK, we will return a Fasta. - // with the EOF error. - sequence = append(sequence, line...) - break - } - - line = line[:len(line)-1] // Exclude newline delimiter. - peek, _ := parser.reader.Peek(1) - if !lookingForName && len(peek) == 1 && peek[0] == '>' { - // We are currently parsing a fasta and next line contains a new fasta. - // We handle this situation by appending current line to sequence if not a comment - // and ending the current fasta parsing. - if !isSkippable { - sequence = append(sequence, line...) - } - break - } else if isSkippable { - continue - } - - if lookingForName { - if line[0] == '>' { - // We got the start of a fasta. - seqName = string(line[1:]) - lookingForName = false - } - // This continue will also skip line if we are looking for name - // and the current line does not contain the name. - continue - } - // If we got to this point we are currently inside of the fasta - // sequence contents. We append line to what we found of sequence so far. - sequence = append(sequence, line...) - } // parse loop ends here. - - // Parsing ended. Check for inconsistencies. - if lookingForName { - return Fasta{}, totalRead, fmt.Errorf("did not find fasta start '>', got to line %d: %w", parser.line, err) - } - if !lookingForName && len(sequence) == 0 { - // We found a fasta name but no sequence to go with it. - return Fasta{}, totalRead, fmt.Errorf("empty fasta sequence for %q, got to line %d: %w", seqName, parser.line, err) - } - fasta := Fasta{ - Name: seqName, - Sequence: *(*string)(unsafe.Pointer(&sequence)), // Stdlib strings.Builder.String() does this so it *should* be safe. - } - // Gotten to this point err is non-nil only in EOF case. - // We report this error to note the fasta may be incomplete/corrupt - // like in the case of using an io.LimitReader wrapping the underlying reader. - // We return the fasta as well since some libraries generate fastas with no - // ending newline i.e Zymo. It is up to the user to decide whether they want - // an EOF-ended fasta or not, the rest of this library discards EOF-ended fastas. - return fasta, totalRead, err -} - -// Reset discards all data in buffer and resets state. -func (parser *Parser) Reset(r io.Reader) { - parser.reader.Reset(r) - parser.line = 0 -} - -// ParseConcurrent concurrently parses a given Fasta file in an io.Reader into a channel of Fasta structs. 
-func ParseConcurrent(r io.Reader, sequences chan<- Fasta) { - // Initialize necessary variables - var sequenceLines []string - var name string - start := true - - // Start the scanner - scanner := bufio.NewScanner(r) - for scanner.Scan() { - line := scanner.Text() - switch { - // if there's nothing on this line skip this iteration of the loop - case len(line) == 0: - continue - // if it's a comment skip this line - case line[0:1] == ";": - continue - // start of a fasta line - case line[0:1] != ">": - sequenceLines = append(sequenceLines, line) - // Process normal new lines - case line[0:1] == ">" && !start: - sequence := strings.Join(sequenceLines, "") - newFasta := Fasta{ - Name: name, - Sequence: sequence} - // Reset sequence lines - sequenceLines = []string{} - // New name - name = line[1:] - sequences <- newFasta - // Process first line of file - case line[0:1] == ">" && start: - name = line[1:] - start = false - } - } - // Add final sequence in file to channel - sequence := strings.Join(sequenceLines, "") - newFasta := Fasta{ - Name: name, - Sequence: sequence} - sequences <- newFasta - close(sequences) -} - -/****************************************************************************** - -Start of Read functions - -******************************************************************************/ - -// ReadGzConcurrent concurrently reads a gzipped Fasta file into a Fasta channel. -// Deprecated: Use Parser.ParseNext() instead. -func ReadGzConcurrent(path string, sequences chan<- Fasta) { - file, _ := os.Open(path) // TODO: these errors need to be handled/logged - reader, _ := gzipReaderFn(file) - go func() { - defer file.Close() - defer reader.Close() - ParseConcurrent(reader, sequences) - }() -} - -// ReadConcurrent concurrently reads a flat Fasta file into a Fasta channel. -func ReadConcurrent(path string, sequences chan<- Fasta) { - file, _ := os.Open(path) // TODO: these errors need to be handled/logged - go func() { - defer file.Close() - ParseConcurrent(file, sequences) - }() -} - -// ReadGz reads a gzipped file into an array of Fasta structs. -func ReadGz(path string) ([]Fasta, error) { - file, err := openFn(path) - if err != nil { - return nil, err - } - defer file.Close() - reader, err := gzipReaderFn(file) - if err != nil { - return nil, err - } - defer reader.Close() - return Parse(reader) -} - -// Read reads a file into an array of Fasta structs -func Read(path string) ([]Fasta, error) { - file, err := openFn(path) - if err != nil { - return nil, err - } - defer file.Close() - return Parse(file) -} - -/****************************************************************************** - -Start of Write functions - -******************************************************************************/ - -// Build converts a Fastas array into a byte array to be written to a file. -func Build(fastas []Fasta) ([]byte, error) { - var fastaString bytes.Buffer - fastaLength := len(fastas) - for fastaIndex, fasta := range fastas { - fastaString.WriteString(">") - fastaString.WriteString(fasta.Name) - fastaString.WriteString("\n") - - lineCount := 0 - // write the fasta sequence 80 characters at a time - for _, character := range fasta.Sequence { - fastaString.WriteRune(character) - lineCount++ - if lineCount == 80 { - fastaString.WriteString("\n") - lineCount = 0 - } - } - if fastaIndex != fastaLength-1 { - fastaString.WriteString("\n\n") - } - } - return fastaString.Bytes(), nil -} - -// Write writes a fasta array to a file. 
-func Write(fastas []Fasta, path string) error { - fastaBytes, err := buildFn(fastas) // fasta.Build returns only nil errors. - if err != nil { - return err - } - return os.WriteFile(path, fastaBytes, 0644) -} diff --git a/io/fasta/fasta_test.go b/io/fasta/fasta_test.go deleted file mode 100644 index 7063821e..00000000 --- a/io/fasta/fasta_test.go +++ /dev/null @@ -1,241 +0,0 @@ -package fasta - -import ( - "compress/gzip" - "errors" - "io" - "os" - "strings" - "testing" - - "github.com/stretchr/testify/assert" -) - -const ( - // This fasta stream contains no Fasta. - emptyFasta = "testing\natagtagtagtagtagatgatgatgatgagatg\n\n\n\n\n\n\n\n\n\n\n" -) - -// Initialized at TestMain. -var uniprotFasta string - -func TestMain(m *testing.M) { - const uniprotFastaGzFilePath = "data/uniprot_1mb_test.fasta.gz" - // unzip uniprot data and create uniprotFasta string for benchmarks and testing. - uniprotFastaGzFile, err := os.Open(uniprotFastaGzFilePath) - if err != nil { - panic(uniprotFastaGzFilePath + " required for tests!") - } - defer uniprotFastaGzFile.Close() - - uniprotFastaGzReader, _ := gzip.NewReader(uniprotFastaGzFile) - defer uniprotFastaGzReader.Close() - - uniprotFastaBytes, err := io.ReadAll(uniprotFastaGzReader) - if err != nil { - panic(err) - } - - uniprotFasta = string(uniprotFastaBytes) - m.Run() -} - -func BenchmarkFastaLegacy(b *testing.B) { - var fastas []Fasta - var err error - for i := 0; i < b.N; i++ { - fastas, err = Parse(strings.NewReader(uniprotFasta)) - if err != nil { - b.Fatal(err) - } - } - _ = fastas -} - -func BenchmarkParser(b *testing.B) { - var fastaRecords []Fasta - for i := 0; i < b.N; i++ { - parser := NewParser(strings.NewReader(uniprotFasta), 256) - for { - fasta, _, err := parser.ParseNext() - if err != nil { - if !errors.Is(err, io.EOF) { - b.Fatal(err) - } - break - } - fastaRecords = append(fastaRecords, fasta) - } - fastaRecords = nil // Reset memory - } - _ = fastaRecords -} - -func TestRead_error(t *testing.T) { - t.Run("Read errors opening the file", func(t *testing.T) { - openErr := errors.New("open error") - oldOpenFn := openFn - openFn = func(name string) (*os.File, error) { - return nil, openErr - } - defer func() { - openFn = oldOpenFn - }() - _, err := Read("/tmp/file") - assert.EqualError(t, err, openErr.Error()) - }) - - t.Run("ReadGz errors opening the file", func(t *testing.T) { - openErr := errors.New("open error") - oldOpenFn := openFn - openFn = func(name string) (*os.File, error) { - return nil, openErr - } - defer func() { - openFn = oldOpenFn - }() - _, err := ReadGz("/tmp/file") - assert.EqualError(t, err, openErr.Error()) - }) - - t.Run("ReadGz errors reading the file", func(t *testing.T) { - readErr := errors.New("read error") - oldOpenFn := openFn - oldGzipReaderFn := gzipReaderFn - openFn = func(name string) (*os.File, error) { - return &os.File{}, nil - } - gzipReaderFn = func(r io.Reader) (*gzip.Reader, error) { - return nil, readErr - } - defer func() { - openFn = oldOpenFn - gzipReaderFn = oldGzipReaderFn - }() - _, err := ReadGz("/tmp/file") - assert.EqualError(t, err, readErr.Error()) - }) -} - -func TestWrite_error(t *testing.T) { - buildErr := errors.New("build error") - oldBuildFn := buildFn - buildFn = func(fastas []Fasta) ([]byte, error) { - return nil, buildErr - } - defer func() { - buildFn = oldBuildFn - }() - err := Write([]Fasta{}, "/tmp/file") - assert.EqualError(t, err, buildErr.Error()) -} - -func TestParser(t *testing.T) { - parser := NewParser(nil, 256) - for testIndex, test := range []struct { - content 
string - expected []Fasta - }{ - { - content: ">humen\nGATTACA\nCATGAT", // EOF-ended Fasta not valid - expected: []Fasta{}, - }, - { - content: ">humen\nGATTACA\nCATGAT\n", - expected: []Fasta{{Name: "humen", Sequence: "GATTACACATGAT"}}, - }, - { - content: ">doggy or something\nGATTACA\n\nCATGAT\n" + - ">homunculus\nAAAA\n", - expected: []Fasta{ - {Name: "doggy or something", Sequence: "GATTACACATGAT"}, - {Name: "homunculus", Sequence: "AAAA"}, - }, - }, - } { - parser.Reset(strings.NewReader(test.content)) - fastas, err := parser.ParseAll() - if err != nil { - t.Fatal(err) - } - if len(fastas) != len(test.expected) { - t.Errorf("case index %d: got %d fastas, expected %d", testIndex, len(fastas), len(test.expected)) - continue - } - for index, gotFasta := range fastas { - expected := test.expected[index] - if expected != gotFasta { - t.Errorf("got!=expected: %+v != %+v", gotFasta, expected) - } - } - } -} - -func TestParseBytes(t *testing.T) { - // Partial read test. - const testFasta = ">0\nGAT\n>1\nCAC\n" - p := NewParser(strings.NewReader(testFasta), 256) - result1, bytesRead, err := p.ParseByteLimited(1) - if err != nil { - t.Fatal(err) - } - if len(result1) != 1 { - t.Error("expected result of length 1 (partial read)") - } - expectBytesRead := 1 + strings.Index(testFasta[1:], ">") - if int(bytesRead) != expectBytesRead { - t.Errorf("expected %d bytes read, got %d bytes read", expectBytesRead, bytesRead) - } - - // Full read test. - p.Reset(strings.NewReader(testFasta)) - result1, bytesRead, err = p.ParseByteLimited(100) - if err != nil { - t.Fatal(err) - } - if len(result1) != 2 { - t.Error("expected result of length 2 (full read)") - } - expectBytesRead = len(testFasta) - if int(bytesRead) != expectBytesRead { - t.Errorf("expected %d bytes read, got %d bytes read", expectBytesRead, bytesRead) - } -} - -// TestReadEmptyFasta tests that an empty fasta file is parsed correctly. -func TestReadEmptyFasta(t *testing.T) { - fastas, err := Parse(strings.NewReader(emptyFasta)) - if err == nil { - t.Errorf("expected error reading empty fasta stream") - } - if len(fastas) != 0 { - t.Errorf("expected 1 fastas, got %d", len(fastas)) - } -} - -// TestParseBufferFail tests that the parser fails when the buffer is too small. -func TestParseBufferFail(t *testing.T) { - // Error only triggered when there is a line greater than 16 bytes in length. - // This is because the underlying implementation uses bufio.Reader, which auto-sets - // the buffer to 16, the minimum length permissible for the implementation. - // We should not test for this anyways because we may change from using - // bufio.Reader.ReadSlice to bufio.Reader.ReadLine, which allows for incomplete - // line parsing, though this makes parsing more difficult. 
- const testFasta = ">0\n0123456789ABCDEF\n>1\nCAC\n" - parser := NewParser(strings.NewReader(testFasta), 2) - fasta, err := parser.ParseAll() - _ = fasta - if err == nil { - t.Error("expected error, got nil") - } -} - -func TestParseEOFAfterName(t *testing.T) { - const testFasta = ">OK Fasta\nABGABA\n>NotOKFasta\n" - parser := NewParser(strings.NewReader(testFasta), 2) - fasta, err := parser.ParseAll() - _ = fasta - if err == nil { - t.Error("expected error, got nil") - } -} diff --git a/io/fastq/example_test.go b/io/fastq/example_test.go deleted file mode 100644 index 498b1e38..00000000 --- a/io/fastq/example_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package fastq_test - -import ( - _ "embed" - "fmt" - "os" - "strings" - - "github.com/TimothyStiles/poly/io/fastq" -) - -//go:embed data/nanosavseq.fastq -var baseFastq string - -// ExampleRead shows basic usage for Read. -func ExampleRead() { - fastqs, _ := fastq.Read("data/nanosavseq.fastq") - fmt.Println(fastqs[0].Identifier) - //Output: - //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 -} - -// ExampleReadGz shows basic usage for ReadGz. -func ExampleReadGz() { - fastqs, _ := fastq.ReadGz("data/nanosavseq.fastq.gz") - fmt.Println(fastqs[0].Identifier) - //Output: - //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 -} - -// ExampleWrite shows basic usage of the writer. -func ExampleWrite() { - fastqs, _ := fastq.Read("data/nanosavseq.fastq") // get example data - _ = fastq.Write(fastqs, "data/test.fastq") // write it out again - testSequence, _ := fastq.Read("data/test.fastq") // read it in again - - os.Remove("data/test.fastq") // getting rid of test file - - fmt.Println(testSequence[0].Identifier) - fmt.Println(testSequence[0].Sequence) - fmt.Println(testSequence[0].Quality) - //Output: - //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 - //GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT - //$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;, and includes quality -values for a sequence. - -This package provides a parser and writer for working with Fastq formatted -sequencing data. -*/ -package fastq - -import ( - "bufio" - "bytes" - "compress/gzip" - "errors" - "fmt" - "io" - "math" - "os" - "strings" -) - -/****************************************************************************** -March 22, 2023 - -Fastq Parser begins here - -I basically stole everything from the fasta parser, and added a few bits -for parsing out additional data from fastq nanopore files. Mwhahaha, stealing -code! 
- -Keoni - -******************************************************************************/ - -var ( - gzipReaderFn = gzip.NewReader - openFn = os.Open - buildFn = Build -) - -// Fastq is a struct representing a single Fastq file element with an Identifier, its corresponding sequence, its quality score, and any optional pieces of data. -type Fastq struct { - Identifier string `json:"identifier"` - Optionals map[string]string `json:"optionals"` // Nanopore, for example, carries along data like: read=13956 ch=53 start_time=2020-11-11T01:49:01Z - Sequence string `json:"sequence"` - Quality string `json:"quality"` -} - -// Parse parses a given Fastq file into an array of Fastq structs. Internally, it uses ParseFastqConcurrent. -func Parse(r io.Reader) ([]Fastq, error) { - // 32kB is a magic number often used by the Go stdlib for parsing. We multiply it by two. - const maxLineSize = 2 * 32 * 1024 - parser := NewParser(r, maxLineSize) - return parser.ParseAll() -} - -// Parser is a flexible parser that provides ample -// control over reading fastq-formatted sequences. -// It is initialized with NewParser. -type Parser struct { - // reader keeps state of current reader. - reader bufio.Reader - line uint -} - -// NewParser returns a Parser that uses r as the source -// from which to parse fastq formatted sequences. -func NewParser(r io.Reader, maxLineSize int) *Parser { - return &Parser{ - reader: *bufio.NewReaderSize(r, maxLineSize), - } -} - -// ParseAll parses all sequences in underlying reader only returning non-EOF errors. -// It returns all valid fastq sequences up to error if encountered. -func (parser *Parser) ParseAll() ([]Fastq, error) { - return parser.ParseN(math.MaxInt) -} - -// ParseN parses up to maxSequences fastq sequences from the Parser's underlying reader. -// ParseN does not return EOF if encountered. -// If an non-EOF error is encountered it returns it and all correctly parsed sequences up to then. -func (parser *Parser) ParseN(maxSequences int) (fastqs []Fastq, err error) { - for counter := 0; counter < maxSequences; counter++ { - fastq, _, err := parser.ParseNext() - if err != nil { - if errors.Is(err, io.EOF) { - err = nil // EOF not treated as parsing error. - } - return fastqs, err - } - fastqs = append(fastqs, fastq) - } - return fastqs, nil -} - -// ParseNext reads next fastq genome in underlying reader and returns the result -// and the amount of bytes read during the call. -// ParseNext only returns an error if it: -// - Attempts to read and fails to find a valid fastq sequence. -// - Returns reader's EOF if called after reader has been exhausted. -// - If a EOF is encountered immediately after a sequence with no newline ending. -// In this case the Fastq up to that point is returned with an EOF error. -// -// It is worth noting the amount of bytes read are always right up to before -// the next fastq starts which means this function can effectively be used -// to index where fastqs start in a file or string. -// -// ParseNext is simplified for fastq files from fasta files. Unlike fasta -// files, fastq always have 4 lines following each other - not variable with -// a line limit of 80 like fasta files have. So instead of a for loop, you -// can just parse 4 lines at once. -func (parser *Parser) ParseNext() (Fastq, int64, error) { - if _, err := parser.reader.Peek(1); err != nil { - // Early return on error. Probably will be EOF. - return Fastq{}, 0, err - } - - // More general case of error handling. 
- handleErr := func(err error) error { - isEOF := errors.Is(err, io.EOF) - if errors.Is(err, bufio.ErrBufferFull) { - // Buffer size too small to read fastq line. - return fmt.Errorf("line %d too large for buffer, use larger maxLineSize: %w", parser.line+1, err) - } else if isEOF { - return fmt.Errorf("line %d failed: unexepcted EOF encountered", parser.line+1) - } - return err - } - - // Initialization of parser state variables. - var ( - // Parser looks for a line starting with '@' - // that contains the next fastq sequence identifier. - lookingForIdentifier = true - seqIdentifier, quality string - optionals map[string]string - sequence, line []byte - err error - totalRead int64 - ) - - // parse identifier - line, err = parser.reader.ReadSlice('\n') - totalRead += int64(len(line)) - parser.line++ - if handleErr(err) != nil { - return Fastq{}, totalRead, handleErr(err) - } - - line = line[:len(line)-1] // Exclude newline delimiter. - if string(line)[0] == '@' { - lookingForIdentifier = false - } - lineSplits := strings.Split(string(line), " ") - seqIdentifier = lineSplits[0][1:] - optionals = make(map[string]string) - for _, optionalDatum := range lineSplits[1:] { - optionalSplits := strings.Split(optionalDatum, "=") - optionalKey := optionalSplits[0] - optionalValue := optionalSplits[1] - optionals[optionalKey] = optionalValue - } - - // parse sequence - line, err = parser.reader.ReadSlice('\n') - totalRead += int64(len(line)) - parser.line++ - if handleErr(err) != nil { - return Fastq{}, totalRead, handleErr(err) - } - if len(line) <= 1 { // newline delimiter - actually checking for empty line - return Fastq{}, totalRead, fmt.Errorf("empty fastq sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) - } - sequence = line[:len(line)-1] // Exclude newline delimiter. - - // skip + - _, err = parser.reader.ReadSlice('\n') - totalRead += int64(len(line)) - parser.line++ - if handleErr(err) != nil { - return Fastq{}, totalRead, handleErr(err) - } - - // parse quality - line, err = parser.reader.ReadSlice('\n') - totalRead += int64(len(line)) - parser.line++ - if handleErr(err) != nil { - return Fastq{}, totalRead, handleErr(err) - } - if len(line) <= 1 { // newline delimiter - actually checking for empty line - return Fastq{}, totalRead, fmt.Errorf("empty quality sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) - } - quality = string(line[:len(line)-1]) - - // Parsing ended. Check for inconsistencies. - if lookingForIdentifier { - return Fastq{}, totalRead, fmt.Errorf("did not find fastq start '@', got to line %d: %w", parser.line, err) - } - fastq := Fastq{ - Identifier: seqIdentifier, - Optionals: optionals, - Quality: quality, - Sequence: string(sequence), // Stdlib strings.Builder.String() does this so it *should* be safe. - } - // Gotten to this point err is non-nil only in EOF case. - // We report this error to note the fastq may be incomplete/corrupt - // like in the case of using an io.LimitReader wrapping the underlying reader. - return fastq, totalRead, err -} - -// Reset discards all data in buffer and resets state. -func (parser *Parser) Reset(r io.Reader) { - parser.reader.Reset(r) - parser.line = 0 -} - -/****************************************************************************** - -Start of Read functions - -******************************************************************************/ - -// ReadGz reads a gzipped file into an array of Fastq structs. 
-func ReadGz(path string) ([]Fastq, error) { - file, err := openFn(path) - if err != nil { - return nil, err - } - defer file.Close() - reader, err := gzipReaderFn(file) - if err != nil { - return nil, err - } - defer reader.Close() - return Parse(reader) -} - -// Read reads a file into an array of Fastq structs -func Read(path string) ([]Fastq, error) { - file, err := openFn(path) - if err != nil { - return nil, err - } - defer file.Close() - return Parse(file) -} - -/****************************************************************************** - -Start of Write functions - -******************************************************************************/ - -// Build converts a Fastqs array into a byte array to be written to a file. -func Build(fastqs []Fastq) ([]byte, error) { - var fastqString bytes.Buffer - for _, fastq := range fastqs { - fastqString.WriteString("@") - fastqString.WriteString(fastq.Identifier) - for key, val := range fastq.Optionals { - fastqString.WriteString(" ") - fastqString.WriteString(key) - fastqString.WriteString("=") - fastqString.WriteString(val) - } - fastqString.WriteString("\n") - - // fastq doesn't limit at 80 characters, since it is - // mainly reading big ole' sequencing files without - // human input. - fastqString.WriteString(fastq.Sequence) - fastqString.WriteString("\n+\n") - fastqString.WriteString(fastq.Quality) - fastqString.WriteString("\n") - } - return fastqString.Bytes(), nil -} - -// Write writes a fastq array to a file. -func Write(fastqs []Fastq, path string) error { - fastqBytes, _ := buildFn(fastqs) // fastq.Build returns only nil errors. - return os.WriteFile(path, fastqBytes, 0644) -} diff --git a/io/fastq/fastq_test.go b/io/fastq/fastq_test.go deleted file mode 100644 index cc0f4350..00000000 --- a/io/fastq/fastq_test.go +++ /dev/null @@ -1,66 +0,0 @@ -package fastq - -import ( - "os" - "testing" -) - -func TestParseNLow(t *testing.T) { - file, err := os.Open("data/nanosavseq.fastq") - if err != nil { - t.Errorf("Failed to read nanosavseq.fastq. Got error: %s", err) - } - const maxLineSize = 2 * 32 * 1024 - parser := NewParser(file, maxLineSize) - _, err = parser.ParseN(0) - if err != nil { - t.Errorf("Failed to parse 0 fastqs. Got error: %s", err) - } -} - -func TestParseSmallLine(t *testing.T) { - file, _ := os.Open("data/nanosavseq.fastq") - parser := NewParser(file, 10) - _, err := parser.ParseAll() - if err == nil { - t.Errorf("Should have encountered a maxLine error") - } - parser.Reset(file) -} - -func TestRead(t *testing.T) { - _, err := Read("data/doesntexist.fastq") - if err == nil { - t.Errorf("Should have failed to read non-existent file") - } - _, err = ReadGz("data/doesntexist.fastq.gz") - if err == nil { - t.Errorf("Should have failed to read non-existent gz file") - } - _, err = ReadGz("data/nanosavseq.fastq") - if err == nil { - t.Errorf("Should have failed to read a file that is not gz'ed") - } -} - -func testException(t *testing.T, filePath string, errorString string) { - file, err := os.Open(filePath) - if err != nil { - t.Errorf("Failed to read %s. 
Got error: %s", filePath, err) - } - const maxLineSize = 2 * 32 * 1024 - parser := NewParser(file, maxLineSize) - _, err = parser.ParseAll() - if err == nil { - t.Errorf("%s parser should have gotten error: %s", filePath, errorString) - } -} - -func TestParseExceptions(t *testing.T) { - testException(t, "data/nanosavseq_noseq.fastq", "no seq") - testException(t, "data/nanosavseq_noquality.fastq", "no quality") - testException(t, "data/nanosavseq_noidentifier.fastq", "no identifier") - testException(t, "data/nanosavseq_emptyseq.fastq", "empty seq") - testException(t, "data/nanosavseq_noplus.fastq", "no plus EOF") - testException(t, "data/nanosavseq_noquality2.fastq", "no quality EOF") -} diff --git a/io/genbank/example_test.go b/io/genbank/example_test.go deleted file mode 100644 index 0bc8433e..00000000 --- a/io/genbank/example_test.go +++ /dev/null @@ -1,168 +0,0 @@ -package genbank_test - -import ( - "bytes" - "fmt" - "os" - "path/filepath" - - "github.com/TimothyStiles/poly/io/genbank" -) - -// This example shows how to open a genbank file and search for a gene given -// its name. After finding it, notes about the particular gene are read. -func Example_basic() { - sequences, _ := genbank.Read("../../data/puc19.gbk") - for _, feature := range sequences.Features { - if feature.Attributes["gene"] == "bla" { - fmt.Println(feature.Attributes["note"]) - } - } - // Output: confers resistance to ampicillin, carbenicillin, andrelated antibiotics -} - -func ExampleRead() { - sequence, _ := genbank.Read("../../data/puc19.gbk") - fmt.Println(sequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleWrite() { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - fmt.Println(err.Error()) - } - defer os.RemoveAll(tmpDataDir) - - sequences, _ := genbank.Read("../../data/puc19.gbk") - - tmpGbkFilePath := filepath.Join(tmpDataDir, "puc19.gbk") - _ = genbank.Write(sequences, tmpGbkFilePath) - - testSequence, _ := genbank.Read(tmpGbkFilePath) - - fmt.Println(testSequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleBuild() { - sequences, _ := genbank.Read("../../data/puc19.gbk") - gbkBytes, _ := genbank.Build(sequences) - testSequence, _ := genbank.Parse(bytes.NewReader(gbkBytes)) - - fmt.Println(testSequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleParse() { - file, _ := os.Open("../../data/puc19.gbk") - sequence, _ := genbank.Parse(file) - - fmt.Println(sequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleReadMulti() { - sequences, err := genbank.ReadMulti("../../data/multiGbk_test.seq") - if err != nil { - fmt.Println(err.Error()) - } - - fmt.Println(sequences[1].Meta.Locus.ModificationDate) - // Output: 05-FEB-1999 -} - -func ExampleWriteMulti() { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - fmt.Println(err.Error()) - } - - sequences, _ := genbank.ReadMulti("../../data/multiGbk_test.seq") - tmpGbkFilePath := filepath.Join(tmpDataDir, "multiGbk_test.seq") - - err = genbank.WriteMulti(sequences, tmpGbkFilePath) - - if err != nil { - fmt.Println(err.Error()) - } - - testSequences, _ := genbank.ReadMulti(tmpGbkFilePath) - isEqual := sequences[1].Meta.Locus.ModificationDate == testSequences[1].Meta.Locus.ModificationDate - fmt.Println(isEqual) - // Output: true -} - -func ExampleBuildMulti() { - sequences, _ := genbank.ReadMulti("../../data/multiGbk_test.seq") - gbkBytes, _ := genbank.BuildMulti(sequences) - testSequences, _ := 
genbank.ParseMulti(bytes.NewReader(gbkBytes)) - - isEqual := sequences[1].Meta.Locus.ModificationDate == testSequences[1].Meta.Locus.ModificationDate - fmt.Println(isEqual) - // Output: true -} - -func ExampleParseMulti() { - file, _ := os.Open("../../data/multiGbk_test.seq") - sequences, _ := genbank.ParseMulti(file) - fmt.Println(sequences[1].Meta.Locus.ModificationDate) - // Output: 05-FEB-1999 -} - -func ExampleGenbank_AddFeature() { - // Sequence for greenflourescent protein (GFP) that we're using as test data for this example. - gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" - - // initialize sequence and feature structs. - var sequence genbank.Genbank - var feature genbank.Feature - - // set the initialized sequence struct's sequence. - sequence.Sequence = gfpSequence - - // Set the initialized feature name and sequence location. - feature.Description = "Green Fluorescent Protein" - feature.Location = genbank.Location{} - feature.Location.Start = 0 - feature.Location.End = len(sequence.Sequence) - - // Add the GFP feature to the sequence struct. - _ = sequence.AddFeature(&feature) - - // get the GFP feature sequence string from the sequence struct. - featureSequence, _ := feature.GetSequence() - - // check to see if the feature was inserted properly into the sequence. - fmt.Println(gfpSequence == featureSequence) - - // Output: true -} - -func ExampleFeature_GetSequence() { - // Sequence for greenflourescent protein (GFP) that we're using as test data for this example. - gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" - - // initialize sequence and feature structs. - var sequence genbank.Genbank - var feature genbank.Feature - - // set the initialized sequence struct's sequence. - sequence.Sequence = gfpSequence - - // Set the initialized feature name and sequence location. - feature.Description = "Green Fluorescent Protein" - feature.Location.Start = 0 - feature.Location.End = len(sequence.Sequence) - - // Add the GFP feature to the sequence struct. - _ = sequence.AddFeature(&feature) - - // get the GFP feature sequence string from the sequence struct. 
- featureSequence, _ := feature.GetSequence() - - // check to see if the feature was inserted properly into the sequence. - fmt.Println(gfpSequence == featureSequence) - - // Output: true -} diff --git a/io/io.go b/io/io.go deleted file mode 100644 index fbdfac75..00000000 --- a/io/io.go +++ /dev/null @@ -1,9 +0,0 @@ -/* -Package io provides utilities for reading and writing sequence data. -*/ -package io - -/* -This package is supposed to be empty and only exists to provide a doc string. -Otherwise its namespace would collide with Go's native IO package. -*/ diff --git a/io/slow5/example_test.go b/io/slow5/example_test.go deleted file mode 100644 index f51d44b2..00000000 --- a/io/slow5/example_test.go +++ /dev/null @@ -1,32 +0,0 @@ -package slow5_test - -import ( - "fmt" - "os" - - "github.com/TimothyStiles/poly/io/slow5" -) - -func ExampleNewParser() { - // example.slow5 is a file I generated using slow5tools from nanopore fast5 - // run where I was testing using nanopore for doing COVID testing. It - // contains real nanopore data. - file, _ := os.Open("data/example.slow5") - // Set maxLineSize to 64kb. If you expect longer reads, - // make maxLineSize longer! - const maxLineSize = 2 * 32 * 1024 - parser, _, _ := slow5.NewParser(file, maxLineSize) - - var outputReads []slow5.Read - for { - read, err := parser.ParseNext() - if err != nil { - // Break at EOF - break - } - outputReads = append(outputReads, read) - } - - fmt.Println(outputReads[0].RawSignal[0:10]) - // Output: [430 472 463 467 454 465 463 450 450 449] -} diff --git a/seqhash/example_test.go b/seqhash/example_test.go index 02629c85..718d8376 100644 --- a/seqhash/example_test.go +++ b/seqhash/example_test.go @@ -2,8 +2,9 @@ package seqhash_test import ( "fmt" + "os" - "github.com/TimothyStiles/poly/io/genbank" + "github.com/TimothyStiles/poly/bio" "github.com/TimothyStiles/poly/seqhash" ) @@ -31,7 +32,11 @@ func ExampleHash() { } func ExampleRotateSequence() { - sequence, _ := genbank.Read("../data/puc19.gbk") + file, _ := os.Open("../data/puc19.gbk") + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() + sequenceLength := len(sequence.Sequence) testSequence := sequence.Sequence[sequenceLength/2:] + sequence.Sequence[0:sequenceLength/2] diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 5787cc36..27da9535 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -3,9 +3,10 @@ package seqhash import ( "bytes" "fmt" + "os" "testing" - "github.com/TimothyStiles/poly/io/genbank" + "github.com/TimothyStiles/poly/bio" "github.com/sergi/go-diff/diffmatchpatch" ) @@ -66,7 +67,10 @@ func TestHash(t *testing.T) { } func TestLeastRotation(t *testing.T) { - sequence, _ := genbank.Read("../data/puc19.gbk") + file, _ := os.Open("../data/puc19.gbk") + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() var sequenceBuffer bytes.Buffer sequenceBuffer.WriteString(sequence.Sequence) diff --git a/synthesis/codon/codon_test.go b/synthesis/codon/codon_test.go index c5b84625..0374e799 100644 --- a/synthesis/codon/codon_test.go +++ b/synthesis/codon/codon_test.go @@ -6,12 +6,14 @@ import ( "strings" "testing" - "github.com/TimothyStiles/poly/io/genbank" + "github.com/TimothyStiles/poly/bio" "github.com/google/go-cmp/cmp" weightedRand "github.com/mroth/weightedrand" "github.com/stretchr/testify/assert" ) +const puc19path = "../../bio/genbank/data/puc19.gbk" + func TestTranslation(t *testing.T) { gfpTranslation := 
"MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" gfpDnaSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" @@ -77,7 +79,11 @@ func TestTranslationLowerCase(t *testing.T) { func TestOptimize(t *testing.T) { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - sequence, _ := genbank.Read("../../data/puc19.gbk") + file, _ := os.Open("../../bio/genbank/data/puc19.gbk") + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() + codonTable := GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -107,7 +113,10 @@ func TestOptimize(t *testing.T) { func TestOptimizeSameSeed(t *testing.T) { var gfpTranslation = "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - var sequence, _ = genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() var codonTable = GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -137,7 +146,10 @@ func TestOptimizeSameSeed(t *testing.T) { func TestOptimizeDifferentSeed(t *testing.T) { var gfpTranslation = "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - var sequence, _ = genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() var codonTable = GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -273,12 +285,12 @@ JSON related tests begin here. 
******************************************************************************/ func TestWriteCodonJSON(t *testing.T) { - testCodonTable := ReadCodonJSON("../../data/bsub_codon_test.json") - WriteCodonJSON(testCodonTable, "../../data/codon_test1.json") - readTestCodonTable := ReadCodonJSON("../../data/codon_test1.json") + testCodonTable := ReadCodonJSON("../../../data/bsub_codon_test.json") + WriteCodonJSON(testCodonTable, "../../../data/codon_test1.json") + readTestCodonTable := ReadCodonJSON("../../../data/codon_test1.json") // cleaning up test data - os.Remove("../../data/codon_test1.json") + os.Remove("../../../data/codon_test1.json") if diff := cmp.Diff(testCodonTable, readTestCodonTable); diff != "" { t.Errorf(" mismatch (-want +got):\n%s", diff) @@ -293,7 +305,10 @@ Codon Compromise + Add related tests begin here. ***************************************************************************** */ func TestCompromiseCodonTable(t *testing.T) { - sequence, _ := genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() codonTable := GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -313,7 +328,10 @@ func TestCompromiseCodonTable(t *testing.T) { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := codonTable.OptimizeTable(codingRegions) - sequence2, _ := genbank.Read("../../data/phix174.gb") + file2, _ := os.Open("../../data/phix174.gb") + defer file2.Close() + parser2, _ := bio.NewGenbankParser(file2) + sequence2, _ := parser2.Next() codonTable2 := GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -361,7 +379,10 @@ func TestCapitalizationRegression(t *testing.T) { // Tests to make sure that amino acids are capitalized gfpTranslation := "MaSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - sequence, _ := genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() codonTable := GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions diff --git a/synthesis/codon/example_test.go b/synthesis/codon/example_test.go index ce5f9061..213ce650 100644 --- a/synthesis/codon/example_test.go +++ b/synthesis/codon/example_test.go @@ -5,10 +5,13 @@ import ( "os" "strings" - "github.com/TimothyStiles/poly/io/genbank" + "github.com/TimothyStiles/poly/bio" "github.com/TimothyStiles/poly/synthesis/codon" ) +const puc19path = "../../bio/genbank/data/puc19.gbk" +const phix174path = "../../bio/genbank/data/phix174.gb" + func ExampleTranslate() { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" gfpDnaSequence := 
"ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" @@ -21,7 +24,10 @@ func ExampleTranslate() { func ExampleOptimize() { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - sequence, _ := genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() codonTable := codon.GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -99,7 +105,10 @@ func ExampleWriteCodonJSON() { } func ExampleCompromiseCodonTable() { - sequence, _ := genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() codonTable := codon.GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -119,7 +128,11 @@ func ExampleCompromiseCodonTable() { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := codonTable.OptimizeTable(codingRegions) - sequence2, _ := genbank.Read("../../data/phix174.gb") + file2, _ := os.Open(phix174path) + defer file2.Close() + parser2, _ := bio.NewGenbankParser(file2) + sequence2, _ := parser2.Next() + codonTable2 := codon.GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions var codingRegionsBuilder2 strings.Builder @@ -150,7 +163,10 @@ func ExampleCompromiseCodonTable() { } func ExampleAddCodonTable() { - sequence, _ := genbank.Read("../../data/puc19.gbk") + file, _ := os.Open(puc19path) + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + sequence, _ := parser.Next() codonTable := codon.GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions @@ -170,7 +186,10 @@ func ExampleAddCodonTable() { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := codonTable.OptimizeTable(codingRegions) - sequence2, _ := genbank.Read("../../data/phix174.gb") + file2, _ := os.Open(phix174path) + defer file2.Close() + parser2, _ := bio.NewGenbankParser(file2) + sequence2, _ := parser2.Next() codonTable2 := codon.GetCodonTable(11) // a string builder to build a single concatenated string of all coding regions diff --git a/tutorials/001_input_output_test.go b/tutorials/001_input_output_test.go index c44eee48..b0d25200 100644 --- a/tutorials/001_input_output_test.go +++ b/tutorials/001_input_output_test.go @@ -7,7 +7,8 @@ import ( "path/filepath" "testing" - "github.com/TimothyStiles/poly/io/genbank" + "github.com/TimothyStiles/poly/bio" + 
"github.com/TimothyStiles/poly/bio/genbank" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" ) @@ -46,6 +47,13 @@ TTFN, Tim ******************************************************************************/ +// Ignore ParentSequence as that's a pointer which can't be serialized. +func CmpOptions() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence"), + } +} + // if you're using VS-CODE you should see a DEBUG TEST button right below this // comment. Please set break points and use it early and often. func TestFileIOTutorial(t *testing.T) { @@ -53,7 +61,10 @@ func TestFileIOTutorial(t *testing.T) { // backbone puc19. Plasmids are super small rings of "Circular DNA" that are // between 1 and 10 kilobases in length. - puc19, _ := genbank.Read("../data/puc19.gbk") + file, _ := os.Open("../bio/genbank/data/puc19.gbk") + defer file.Close() + parser, _ := bio.NewGenbankParser(file) + puc19, _ := parser.Next() // A plasmid backbone is an empty template designed so that we // can easily insert genes into it to be expressed in our organism of choice. @@ -154,26 +165,30 @@ func TestFileIOTutorial(t *testing.T) { // write the modified plasmid to a GenBank file puc19Path := filepath.Join(tmpDataDir, "pUC19_modified.gb") - err = genbank.Write(puc19, puc19Path) + writeFile, _ := os.OpenFile(puc19Path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) + defer writeFile.Close() + _, err = puc19.WriteTo(writeFile) if err != nil { t.Error(err) } // read the plasmid back in from the GenBank file and make sure it's the same as the original - puc19Copy, err := genbank.Read(puc19Path) + fileCopy, _ := os.Open(puc19Path) + defer fileCopy.Close() + parser2, _ := bio.NewGenbankParser(fileCopy) + puc19Copy, err := parser2.Next() if err != nil { t.Error(err) } // compare our read-in plasmid to the the one we wrote out. - // Ignore ParentSequence as that's a pointer which can't be serialized. - if diff := cmp.Diff(puc19, puc19Copy, []cmp.Option{cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(puc19, puc19Copy, CmpOptions()...); diff != "" { t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(puc19Path), diff) } // write the modified plasmid to a JSON file puc19JSONPath := filepath.Join(tmpDataDir, "pUC19_modified.json") - marshaledPuc19, err := json.MarshalIndent(puc19, "", " ") + marshaledPuc19, err := json.MarshalIndent(*puc19, "", " ") if err != nil { t.Error(err) } @@ -190,7 +205,7 @@ func TestFileIOTutorial(t *testing.T) { if err := json.Unmarshal(jsonContent, &unmarshaledPuc19); err != nil { t.Error(err) } - if diff := cmp.Diff(puc19, unmarshaledPuc19, []cmp.Option{cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(puc19, &unmarshaledPuc19, CmpOptions()...); diff != "" { t.Errorf("Parsing the JSON does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(puc19Path), diff) }