From 54ef7701ef10bd85f686e057a0860edc7dcbfaa9 Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Thu, 7 Dec 2023 09:20:04 -0800 Subject: [PATCH 1/3] Lowercase methylation (#2) * Added methylated cloning * add a little docs * make linter happy * add changelog * add enzyme * Updated changelog --- CHANGELOG.md | 5 +++++ clone/clone.go | 39 ++++++++++++++++++++++++++++----------- clone/clone_test.go | 37 +++++++++++++++++++++++++++---------- clone/example_test.go | 2 +- 4 files changed, 61 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 502c4e3..672c855 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) +- Added a parser and writer for the `pileup` sequence alignment format (#329) +- Added statistics to the `synthesis/codon` package (keeping track of the observed start codon occurrences in a translation table) (#350) +- Added option to fragmenter to fragment with only certain overhangs (#387) +- GoldenGates with methylated sequences using lowercase letters (#391) - Generic parser is now implemented across all parsers for consistent interactions. [(#339)](https://github.com/TimothyStiles/poly/issues/339) - Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package [(#305)](https://github.com/TimothyStiles/poly/issues/305) - Added a parser and writer for the `pileup` sequence alignment format [(#329)](https://github.com/TimothyStiles/poly/issues/329) diff --git a/clone/clone.go b/clone/clone.go index e7f1c3e..bd2f2f5 100644 --- a/clone/clone.go +++ b/clone/clone.go @@ -32,6 +32,11 @@ in a single tube and that is extraordinarily efficient (up to 50 parts) and is p for new modular DNA part toolkits. 
Users can easily simulate GoldenGate assembly reactions with just their input fragments + the enzyme name. +Unlike many other GoldenGate simulators, we support simulating GoldenGate with +methylated DNA sequences, which are represented as lowercased sequences in user +inputted sequences. Normally, this can be turned off, but can be used in the +special case of recursive GoldenGate reactions. + Let's build some DNA! # Keoni @@ -113,8 +118,9 @@ Base cloning functions begin here. // CutWithEnzymeByName cuts a given sequence with an enzyme represented by the // enzyme's name. It is a convenience wrapper around CutWithEnzyme that -// allows us to specify the enzyme by name. -func (enzymeManager EnzymeManager) CutWithEnzymeByName(part Part, directional bool, name string) ([]Fragment, error) { +// allows us to specify the enzyme by name. Set methylated flag to true if +// there is lowercase methylated DNA as part of the sequence. +func (enzymeManager EnzymeManager) CutWithEnzymeByName(part Part, directional bool, name string, methylated bool) ([]Fragment, error) { // Get the enzyme from the enzyme map enzyme, err := enzymeManager.GetEnzymeByName(name) if err != nil { @@ -122,7 +128,7 @@ func (enzymeManager EnzymeManager) CutWithEnzymeByName(part Part, directional bo return []Fragment{}, err } // Cut the sequence with the enzyme - return CutWithEnzyme(part, directional, enzyme), nil + return CutWithEnzyme(part, directional, enzyme, methylated), nil } // GetEnzymeByName gets the enzyme by it's name. If the enzyme manager does not @@ -135,13 +141,21 @@ func (enzymeManager EnzymeManager) GetEnzymeByName(name string) (Enzyme, error) } // CutWithEnzyme cuts a given sequence with an enzyme represented by an Enzyme struct. -func CutWithEnzyme(part Part, directional bool, enzyme Enzyme) []Fragment { +// If there is methylated parts of the target DNA, set the "methylated" flag to +// true and lowercase ONLY methylated DNA. 
+func CutWithEnzyme(part Part, directional bool, enzyme Enzyme, methylated bool) []Fragment { var fragmentSequences []string - var sequence string + + // Setup circular sequences + sequence := part.Sequence if part.Circular { - sequence = strings.ToUpper(part.Sequence + part.Sequence) - } else { - sequence = strings.ToUpper(part.Sequence) + sequence = sequence + sequence + } + + // If unmethylated, set everything to uppercase so that the enzyme regex + // works on all the sequence + if !methylated { + sequence = strings.ToUpper(sequence) } // Check for palindromes @@ -343,11 +357,12 @@ Specific cloning functions begin here. ******************************************************************************/ // GoldenGate simulates a GoldenGate cloning reaction. As of right now, we only -// support BsaI, BbsI, BtgZI, and BsmBI. -func GoldenGate(sequences []Part, cuttingEnzyme Enzyme) (openConstructs []string, infiniteLoops []string) { +// support BsaI, BbsI, BtgZI, and BsmBI. Set methylated flag to true if there +// is lowercase methylated DNA as part of the sequence. +func GoldenGate(sequences []Part, cuttingEnzyme Enzyme, methylated bool) (openConstructs []string, infiniteLoops []string) { var fragments []Fragment for _, sequence := range sequences { - newFragments := CutWithEnzyme(sequence, true, cuttingEnzyme) + newFragments := CutWithEnzyme(sequence, true, cuttingEnzyme, methylated) fragments = append(fragments, newFragments...) 
} openconstructs, infiniteloops := CircularLigate(fragments) @@ -360,5 +375,7 @@ func GetBaseRestrictionEnzymes() []Enzyme { {"BsaI", regexp.MustCompile("GGTCTC"), regexp.MustCompile("GAGACC"), 1, 4, "GGTCTC"}, {"BbsI", regexp.MustCompile("GAAGAC"), regexp.MustCompile("GTCTTC"), 2, 4, "GAAGAC"}, {"BtgZI", regexp.MustCompile("GCGATG"), regexp.MustCompile("CATCGC"), 10, 4, "GCGATG"}, + {"PaqCI", regexp.MustCompile("CACCTGC"), regexp.MustCompile("GCAGGTG"), 4, 4, "CACCTGC"}, + {"BsmBI", regexp.MustCompile("CGTCTC"), regexp.MustCompile("GAGACG"), 1, 4, "CGTCTC"}, } } diff --git a/clone/clone_test.go b/clone/clone_test.go index ee0ca78..34501d1 100644 --- a/clone/clone_test.go +++ b/clone/clone_test.go @@ -9,7 +9,7 @@ var popen = Part{"TAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTG func TestCutWithEnzymeByName(t *testing.T) { enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) - _, err := enzymeManager.CutWithEnzymeByName(popen, true, "EcoFake") + _, err := enzymeManager.CutWithEnzymeByName(popen, true, "EcoFake", false) if err == nil { t.Errorf("CutWithEnzymeByName should have failed when looking for fake restriction enzyme EcoFake") } @@ -25,7 +25,7 @@ func TestCutWithEnzyme(t *testing.T) { // Test case of `<-bsaiComplement bsai-> <-bsaiComplement bsai->` where bsaI cuts off of a linear sequence. This tests the line: // if !sequence.Circular && (overhangSet[len(overhangSet)-1].Position+enzyme.EnzymeSkip+enzyme.EnzymeOverhangLen > len(sequence)) sequence = Part{"ATATATA" + bsaiComplement + bsai + "ATGCATCGATCGACTAGCATG" + bsaiComplement + bsai[:8], false} - fragment, err := enzymeManager.CutWithEnzymeByName(sequence, true, "BsaI") + fragment, err := enzymeManager.CutWithEnzymeByName(sequence, true, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(1). 
Got error: %s", err) } @@ -39,7 +39,7 @@ func TestCutWithEnzyme(t *testing.T) { // test(2) // Now if we take the same sequence and circularize it, we get a different result sequence.Circular = true - fragment, err = enzymeManager.CutWithEnzymeByName(sequence, true, "BsaI") + fragment, err = enzymeManager.CutWithEnzymeByName(sequence, true, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(2). Got error: %s", err) } @@ -57,7 +57,7 @@ func TestCutWithEnzyme(t *testing.T) { // directionality flag to false. This tests the line: // if len(overhangs) == 1 && !directional && !sequence.Circular sequence = Part{"ATATATATATATATAT" + bsai + "GCGCGCGCGCGCGCGCGCGC", false} - fragment, err = enzymeManager.CutWithEnzymeByName(sequence, false, "BsaI") + fragment, err = enzymeManager.CutWithEnzymeByName(sequence, false, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(3). Got error: %s", err) } @@ -73,7 +73,7 @@ func TestCutWithEnzyme(t *testing.T) { // tests the line: // if len(overhangs) == 2 && !directional && sequence.Circular sequence.Circular = true - fragment, err = enzymeManager.CutWithEnzymeByName(sequence, false, "BsaI") + fragment, err = enzymeManager.CutWithEnzymeByName(sequence, false, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(4). Got error: %s", err) } @@ -87,7 +87,7 @@ func TestCutWithEnzyme(t *testing.T) { // test(5) // This tests if we have a fragment where we do not care about directionality // but have more than 1 cut site in our fragment. We can use pOpen for this. - fragment, err = enzymeManager.CutWithEnzymeByName(popen, false, "BbsI") + fragment, err = enzymeManager.CutWithEnzymeByName(popen, false, "BbsI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(5). Got error: %s", err) } @@ -183,7 +183,7 @@ func TestSignalKilledGoldenGate(t *testing.T) { t.Errorf("Error when getting Enzyme. 
Got error: %s", err) } - clones, loopingClones := GoldenGate(fragments, bbsI) + clones, loopingClones := GoldenGate(fragments, bbsI, false) if len(clones) != 1 { t.Errorf("There should be 1 output Got: %d", len(clones)) } @@ -210,7 +210,7 @@ func TestPanicGoldenGate(t *testing.T) { t.Errorf("Error when getting Enzyme. Got error: %s", err) } - _, _ = GoldenGate(fragments, bbsI) + _, _ = GoldenGate(fragments, bbsI, false) } func TestCircularCutRegression(t *testing.T) { @@ -218,7 +218,7 @@ func TestCircularCutRegression(t *testing.T) { // This used to error with 0 fragments since the BsaI cut site is on the other // side of the origin from its recognition site. plasmid1 := Part{"AAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCCGAGaccaagtcgcggccgcgaggtgtcaatcgtcggagtagggataacagggtaatccgctgagcaataactagcataaccccttggggcctctaaacgggtcttgaggggttttttgcatggtcatagctgtttcctgttacgccccgccctgccactcgtcgcagtactgttgtaattcattaagcattctgccgacatggaagccatcacaaacggcatgatgaacctgaatcgccagcggcatcagcaccttgtcgccttgcgtataatatttgcccatggtgaaaacgggggcgaagaagttgtccatattggccacgtttaaatcaaaactggtgaaactcacccagggattggctgacacgaaaaacatattctcaataaaccctttagggaaataggccaggttttcaccgtaacacgccacatcttgcgaatatatgtgtagaaactgccggaaatcgtcgtggtattcactccagagggatgaaaacgtttcagtttgctcatggaaaacggtgtaacaagggtgaacactatcccatatcaccagctcaccatccttcattgccatacgaaattccggatgagcattcatcaggcgggcaagaatgtgaataaaggccggataaaacttgtgcttatttttctttacggtctttaaaaaggccgtaatatccagctgaacggtctggttataggtacattgagcaactgactgaaatgcctcaaaatgttctttacgatgccattgggatatatcaacggtggtatatccagtgatttttttctccattttagcttccttagctcctgaaaatctcgataactcaaaaaatacgcccggtagtgatcttatttcattatggtgaaagttggaacctcttacgtgccgatcatttccataggctccgcccccctgacgagcatcacaaaaatcgacgctcaagtcagaggtggcgaaacccgacaggactataaagataccaggcgtttccccctggaagctccctcgtgcgctctcctgttccgaccctgccgcttaccggatacctgtccgcctttctcccttcgggaagcgtggcgctttctcata
gctcacgctgtaggtatctcagttcggtgtaggtcgttcgctccaagctgggctgtgtgcacgaaccccccgttcagcccgaccgctgcgccttatccggtaactatcgtcttgagtccaacccggtaagacacgacttatcgccactggcagcagccactggtaacaggattagcagagcgaggtatgtaggcggtgctacagagttcttgaagtggtggcctaactacggctacactagaaggacagtatttggtatctgcgctctgctgaagccagttaccttcggaaaaagagttggtagctcttgatccggcaaacaaaccaccgctggtagcggtggtttttttgtttgcaagcagcagattacgcgcagaaaaaaaggatctcaagtaaaacgacggccagtagtcaaaagcctccgaccggaggcttttgacttggttcaggtggagtggcggccgcgacttgGTCTC", true} - newFragments, err := enzymeManager.CutWithEnzymeByName(plasmid1, true, "BsaI") + newFragments, err := enzymeManager.CutWithEnzymeByName(plasmid1, true, "BsaI", false) if err != nil { t.Errorf("Failed to cut: %s", err) } @@ -227,13 +227,30 @@ func TestCircularCutRegression(t *testing.T) { } } +func TestMethylatedGoldenGate(t *testing.T) { + pOpenV3Methylated := Part{"AGGGTAATGGTCTCTCGAGACcAAGTCGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACACACATTAACAAATTTCGTGAGGAGTCTCCAGAAGAATGCCATTAATTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGGCCTACTATTAGCAACAACGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGAACCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTG
CCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGACGGCCAGTGACTTgGTCTCGAGACCTAGGGATA", false} + frag1 := Part{"AGTTGCAGTATCTAACCCGCGGTCTCTGTCTCATCTCACTTAATCTTCTGTACTCTGAAGAGGAGTGGGAAATACCAAGAAAAACATCAAACTCGAATGATTTTCCCAAACCCCTACCACAAGATATTCATCAGCTGCGAGATGAGACCATACTGTAAGAACCACGCGGT", false} + frag2 := Part{"AGTTGCAGTATCTAACCCGCGGTCTCTGAGATAGGCTGATCAGGAGCAAGCTCGTACGAGAAGAAACAAAATGACAAAAAAAATCCTATACTATATAGGTTACAAATAAAAAAGTATCAAAAATGAAGCTGAGACCATACTGTAAGAACCACGCGGTAAAAGACGCTACG", false} + frag3 := Part{"AGTTGCAGTATCTAACCCGCGGTCTCTAAGCCTGCATCTCTCAGGCAAATGGCATTCTGACATCCTCTTGATTACGAGTGAGACCATACTGTAAGAACCACGCGGCTGAACCTCCAGCGGACTCAGTCGCGAAAATACTTACCAAAGGACCGAATTCACCGATCGAACGG", false} + + enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) + bsai, err := enzymeManager.GetEnzymeByName("BsaI") + if err != nil { + t.Errorf("Error when getting Enzyme. Got error: %s", err) + } + clones, _ := GoldenGate([]Part{pOpenV3Methylated, frag1, frag2, frag3}, bsai, true) + if len(clones) != 1 { + t.Errorf("Should have gotten a single clone") + } +} + func benchmarkGoldenGate(b *testing.B, enzymeManager EnzymeManager, parts []Part) { bbsI, err := enzymeManager.GetEnzymeByName("BbsI") if err != nil { b.Errorf("Error when getting Enzyme. 
Got error: %s", err) } for n := 0; n < b.N; n++ { - _, _ = GoldenGate(parts, bbsI) + _, _ = GoldenGate(parts, bbsI, false) } } diff --git a/clone/example_test.go b/clone/example_test.go index 85d12dc..e8f0175 100644 --- a/clone/example_test.go +++ b/clone/example_test.go @@ -24,7 +24,7 @@ func ExampleGoldenGate() { if err != nil { log.Fatalf("Something went wrong when trying to get the enzyme. Got error: %s", err) } - Clones, _ := clone.GoldenGate([]clone.Part{fragment1, fragment2, popen}, bbsI) + Clones, _ := clone.GoldenGate([]clone.Part{fragment1, fragment2, popen}, bbsI, false) fmt.Println(seqhash.RotateSequence(Clones[0])) // Output: AAAAAAAGGATCTCAAGAAGGCCTACTATTAGCAACAACGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGAACCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGACGGCCAGTAGTCAAAAGCCTCCGACCGGAGGCTTTTGACTTGGTTCAGGTGGAGTGGGAGAAACACGTGGCAAACATTCCGGTCTCAAATGGAAAAGAGCAACGAAACCAACGGCTACCTTGACAGCGCTCAAGCCGGCCCTGCAGCTGGCCCGGGC
GCTCCGGGTACCGCCGCGGGTCGTGCACGTCGTTGCGCGGGCTTCCTGCGGCGCCAAGCGCTGGTGCTGCTCACGGTGTCTGGTGTTCTGGCAGGCGCCGGTTTGGGCGCGGCACTGCGTGGGCTCAGCCTGAGCCGCACCCAGGTCACCTACCTGGCCTTCCCCGGCGAGATGCTGCTCCGCATGCTGCGCATGATCATCCTGCCGCTGGTGGTCTGCAGCCTGGTGTCGGGCGCCGCCTCCCTCGATGCCAGCTGCCTCGGGCGTCTGGGCGGTATCGCTGTCGCCTACTTTGGCCTCACCACACTGAGTGCCTCGGCGCTCGCCGTGGCCTTGGCGTTCATCATCAAGCCAGGATCCGGTGCGCAGACCCTTCAGTCCAGCGACCTGGGGCTGGAGGACTCGGGGCCTCCTCCTGTCCCCAAAGAAACGGTGGACTCTTTCCTCGACCTGGCCAGAAACCTGTTTCCCTCCAATCTTGTGGTTGCAGCTTTCCGTACGTATGCAACCGATTATAAAGTCGTGACCCAGAACAGCAGCTCTGGAAATGTAACCCATGAAAAGATCCCCATAGGCACTGAGATAGAAGGGATGAACATTTTAGGATTGGTCCTGTTTGCTCTGGTGTTAGGAGTGGCCTTAAAGAAACTAGGCTCCGAAGGAGAGGACCTCATCCGTTTCTTCAATTCCCTCAACGAGGCGACGATGGTGCTGGTGTCCTGGATTATGTGGTACGTACCTGTGGGCATCATGTTCCTTGTTGGAAGCAAGATCGTGGAAATGAAAGACATCATCGTGCTGGTGACCAGCCTGGGGAAATACATCTTCGCATCTATATTGGGCCACGTCATTCATGGTGGTATCGTCCTGCCGCTGATTTATTTTGTTTTCACACGAAAAAACCCATTCAGATTCCTCCTGGGCCTCCTCGCCCCATTTGCGACAGCATTTGCTACGTGCTCCAGCTCAGCGACCCTTCCCTCTATGATGAAGTGCATTGAAGAGAACAATGGTGTGGACAAGAGGATCTCCAGGTTTATTCTCCCCATCGGGGCCACCGTGAACATGGACGGAGCAGCCATCTTCCAGTGTGTGGCCGCGGTGTTCATTGCGCAACTCAACAACGTAGAGCTCAACGCAGGACAGATTTTCACCATTCTAGTGACTGCCACAGCGTCCAGTGTTGGAGCAGCAGGCGTGCCAGCTGGAGGGGTCCTCACCATTGCCATTATCCTGGAGGCCATTGGGCTGCCTACTCATGATCTGCCTCTGATCCTGGCTGTGGACTGGATTGTGGACCGGACCACCACGGTGGTGAATGTGGAAGGGGATGCCCTGGGTGCAGGCATTCTCCACCACCTGAATCAGAAGGCAACAAAGAAAGGCGAGCAGGAACTTGCTGAGGTGAAAGTGGAAGCCATCCCCAACTGCAAGTCTGAGGAGGAAACCTCGCCCCTGGTGACACACCAGAACCCCGCTGGCCCCGTGGCCAGTGCCCCAGAACTGGAATCCAAGGAGTCGGTTCTGTGAAGAGCTTAGAGACCGACGACTGCCTAAGGACATTCGCTGAGGTGTCAATCGTCGGAGCCGCTGAGCAATAACTAGCATAACCCCTTGGGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCATGGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACACACATTAACAAATTTCGTGAGGAGTCTCCAGAAGAATGCCATTAATTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACC
CCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAG From 395849e5fe825faf0f8baa33e6515421421f1d0c Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Thu, 7 Dec 2023 09:22:23 -0800 Subject: [PATCH 2/3] Seqhash compressed (#3) * Add seqhash v2 * updated changelog * make linter happy * seqhash * Hash2->HashV2 * renamed fwd and rev * Add top level comment * updated changelog --------- Co-authored-by: Timothy Stiles --- CHANGELOG.md | 2 + seqhash/example_test.go | 15 ++- seqhash/seqhash.go | 256 ++++++++++++++++++++++++++++++++++++++-- seqhash/seqhash_test.go | 34 ++++++ 4 files changed, 296 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 672c855..461f3b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Created copy methods for Feature and Location to address concerns raised by [(#342)](https://github.com/TimothyStiles/poly/issues/342) - Created new methods to convert polyjson -> genbank. - Created new `Feature.StoreSequence` method to enable [(#388)](https://github.com/TimothyStiles/poly/issues/388) +- Added seqhash v2 (#398) ### Changed - **Breaking**: Genbank parser uses new custom multimap for `Feature.Attributes`, which allows for duplicate keys. This changes the type of Features.Attributes from `map[string]string` to `MultiMap[string, string]`, an alias for `map[string]string` defined in `multimap.go`. [(#383)](https://github.com/TimothyStiles/poly/issues/383) @@ -30,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Adds functional test and fix for [(#313)](https://github.com/TimothyStiles/poly/issues/313). 
- In addition to expanding the set of genbank files which can be validly parsed, the parser is more vocal when it encounters unusual syntax in the "feature" section. This "fail fast" approach is better as there were cases where inputs triggered a codepath which would neither return a valid Genbank object nor an error, and should help with debugging. - Fixed bug that produced wrong overhang in linear, non-directional, single cut reactions. #408 +>>>>>>> main ## [0.26.0] - 2023-07-22 Oops, we weren't keeping a changelog before this tag! diff --git a/seqhash/example_test.go b/seqhash/example_test.go index 718d837..a1c6082 100644 --- a/seqhash/example_test.go +++ b/seqhash/example_test.go @@ -15,9 +15,9 @@ func Example_basic() { circular := false doubleStranded := true - sequenceSeqhash, _ := seqhash.Hash(sequence, sequenceType, circular, doubleStranded) + sequenceSeqhash, _ := seqhash.EncodeHashV2(seqhash.HashV2(sequence, sequenceType, circular, doubleStranded)) fmt.Println(sequenceSeqhash) - // Output: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + // Output: C_JPQCj5PgjFwjy7jaoYmwqQ== } func ExampleHash() { @@ -43,3 +43,14 @@ func ExampleRotateSequence() { fmt.Println(seqhash.RotateSequence(sequence.Sequence) == seqhash.RotateSequence(testSequence)) // output: true } + +func ExampleHashV2() { + sequence := "ATGC" + sequenceType := seqhash.DNA + circular := false + doubleStranded := true + + sequenceSeqhash, _ := seqhash.HashV2(sequence, sequenceType, circular, doubleStranded) + fmt.Println(sequenceSeqhash) + // Output: [36 244 2 143 147 224 140 92 35 203 184 218 161 137 176 169] +} diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 335a925..5613ff0 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -3,6 +3,9 @@ Package seqhash contains the seqhash algorithm. This package contains the reference seqhash algorithm. +If you are new to using seqhash, use V2. 
V1 should only be used in situations +where full 256 rather than 120 bit hashing is needed. + There is a big problem with current sequence databases - they all use different identifiers and accession numbers. This means cross-referencing databases is a complicated exercise, especially as the quantity of databases increases, or if @@ -39,7 +42,9 @@ only ACDEFGHIKLMNPQRSTVWYUO*BXZ characters are allowed in sequences. Selenocyste (Pyl; O) are included in the protein character set - usually U and O don't occur within protein sequences, but for certain organisms they do, and it is certainly a relevant amino acid for those particular proteins. -A Seqhash is separated into 3 different elements divided by underscores. It looks like the following: +# Seqhash version 1 + +A version 1 seqhash is separated into 3 different elements divided by underscores. It looks like the following: v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9 @@ -50,12 +55,38 @@ not the sequence is circular (C for Circular, L for Linear). The final letter co sequence is double stranded (D for Double stranded, S for Single stranded). The final element is the blake3 hash of the sequence (once rotated and complemented, as stated above). -Seqhash is a simple algorithm that allows for much better indexing of genetic sequences than what is -currently available. +# Seqhash version 2 + +Version 1 seqhashes are rather long, and version 2 seqhashes are built to be +much shorter. The intended use case is handling sequences with LLM systems, +since these systems' context window is a valuable resource, and smaller references +allow the system to be more focused. Version 2 seqhashes are approximately 3x +smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can +also be encoded with base64 to get a hash that can be used as a string across +different systems.
Here is a length comparison: + + version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + version 2: C_JPQCj5PgjFwjy7jaoYmwqQ== + +The metadata is now encoded in a 1 byte flag rather than in a metadata string +of 7 runes like in version 1. Rather than use 256 bits for encoding +the hash, we use 120 bits. Since seqhashes are not meant for security, this +is good enough (50% collision with 1.3x10^18 hashes), while making them +conveniently only 16 bytes long. Additionally, encoded prefixes are added +to the front of the base64 encoded hash as a heuristic device for LLMs while +processing batches of seqhashes. + +In addition, seqhashes can now encode fragments. Fragments are double stranded +DNA that are the result of restriction digestion, with single stranded +overhangs flanking both sides. These fragments can encode genetic parts - and +an important part of any vector containing these parts would be the part +seqhash, rather than the vector seqhash. This enhancement allows you to +identify genetic parts regardless of their context. */ package seqhash import ( + "encoding/base64" "encoding/hex" "errors" "sort" @@ -69,9 +100,10 @@ import ( type SequenceType string const ( - DNA SequenceType = "DNA" - RNA SequenceType = "RNA" - PROTEIN SequenceType = "PROTEIN" + DNA SequenceType = "DNA" + RNA SequenceType = "RNA" + PROTEIN SequenceType = "PROTEIN" + FRAGMENT SequenceType = "FRAGMENT" ) // boothLeastRotation gets the least rotation of a circular string. @@ -137,8 +169,11 @@ func RotateSequence(sequence string) string { return sequence } -// Hash is a function to create Seqhashes, a specific kind of identifier. 
-func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { +// prepareDeterministicSequence prepares input data to be hashed by first running +// all of the checks for sequence typing, then by applying sequence +// manipulations to make a consistent hash for circular and double stranded +// sequences. +func prepareDeterministicSequence(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { // By definition, Seqhashes are of uppercase sequences sequence = strings.ToUpper(sequence) // If RNA, convert to a DNA sequence. The hash itself between a DNA and RNA sequence will not @@ -174,7 +209,6 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran if sequenceType == PROTEIN && doubleStranded { return "", errors.New("Proteins cannot be double stranded") } - // Gets Deterministic sequence based off of metadata + sequence var deterministicSequence string switch { @@ -191,6 +225,15 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran case !circular && !doubleStranded: deterministicSequence = sequence } + return deterministicSequence, nil +} + +// Hash creates a version 1 seqhash. 
+func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { + deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded) + if err != nil { + return "", err + } // Build 3 letter metadata var sequenceTypeLetter string @@ -222,3 +265,198 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran seqhash := "v1" + "_" + sequenceTypeLetter + circularLetter + doubleStrandedLetter + "_" + hex.EncodeToString(newhash[:]) return seqhash, nil } + +// The following consts are for seqhash version 2 +const ( + // Define bit masks for each part of the flag + hash2versionMask byte = 0b11110000 // Version occupies the first 4 bits + hash2circularityMask byte = 0b00001000 // Circularity occupies the 5th bit + hash2doubleStrandedMask byte = 0b00000100 // Double-strandedness occupies the 6th bit + hash2typeMask byte = 0b00000011 // DNA/RNA/PROTEIN occupies the last 2 bits + + // Define shift counts for each part + hash2versionShift = 4 + hash2circularityShift = 3 + hash2doubleStrandedShift = 2 +) + +var ( + // sequenceTypeStringToByteFlagMap converts a sequenceType to a byte + sequenceTypeStringToByteFlagMap = map[SequenceType]byte{ + DNA: 0b00, + RNA: 0b01, + PROTEIN: 0b10, + FRAGMENT: 0b11, + } + // sequenceTypeByteToStringFlagMap converts a byte to a sequenceType + sequenceTypeByteToStringFlagMap = map[byte]SequenceType{ + 0b00: DNA, + 0b01: RNA, + 0b10: PROTEIN, + 0b11: FRAGMENT, + } +) + +// EncodeFlag encodes the version, circularity, double-strandedness, and type into a single byte flag. 
+// Used for seqhash v2 +func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte { + var flag byte + + // Encode the version (assuming version is in the range 0-15) + flag |= (byte(version) << hash2versionShift) + + // Encode the circularity + if circularity { + flag |= (1 << hash2circularityShift) + } + + // Encode the double-strandedness + if doubleStranded { + flag |= (1 << hash2doubleStrandedShift) + } + + // Encode the DNA/RNA/PROTEIN + dnaRnaProtein := sequenceTypeStringToByteFlagMap[sequenceType] + flag |= (dnaRnaProtein & hash2typeMask) + + return flag +} + +// DecodeFlag decodes the single byte flag into its constituent parts. +// Outputs: version, circularity, doubleStranded, dnaRnaProtein. +// Used for seqhash v2 +func DecodeFlag(flag byte) (int, SequenceType, bool, bool) { + version := int((flag & hash2versionMask) >> hash2versionShift) + circularity := (flag & hash2circularityMask) != 0 + doubleStranded := (flag & hash2doubleStrandedMask) != 0 + dnaRnaProtein := flag & hash2typeMask + sequenceType := sequenceTypeByteToStringFlagMap[dnaRnaProtein] + + return version, sequenceType, circularity, doubleStranded +} + +// HashV2 creates a version 2 seqhash. +func HashV2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) { + var result [16]byte + + // First, get the determistic sequence of the hash + deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded) + if err != nil { + return result, err + } + + // Build our byte flag + flag := EncodeFlag(2, sequenceType, circular, doubleStranded) + result[0] = flag + + // Compute BLAKE3, then copy those to the remaining 15 bytes + newhash := blake3.Sum256([]byte(deterministicSequence)) + copy(result[1:], newhash[:15]) + + return result, nil +} + +// HashV2Fragment creates a version 2 fragment seqhash. 
Fragment seqhashes are +// a special kind of seqhash that are used to identify fragments, usually +// released by restriction enzyme digestion, rather than complete DNA +// sequences. This is very useful for tracking genetic parts in a database: as +// abstractions away from their container vectors, so that many fragments in +// different vectors can be identified consistently. +// +// fwdOverhangLength and revOverhangLength are the lengths of both overhangs. +// Hashed sequences are hashed with their overhangs attached. Most of the time, +// both of these will equal 4, as they are released by TypeIIS restriction +// enzymes. +// +// In order to make sure fwdOverhangLength and revOverhangLength fit in the +// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are +// inserted. So the bytes would be: +// +// flag + fwdOverhangLength + revOverhangLength + [13]byte(hash) +// +// fwdOverhangLength and revOverhangLength are both int8, and their negatives +// are considered if the overhang is on the 3prime strand, rather than the +// 5prime strand. +// +// 13 bytes is considered enough, because the number of fragments is limited +// by our ability to physically produce them, while other sequence types +// can be found in nature. +// +// fwdOverhangLength and revOverhangLength are the lengths of the overhangs of the +// input sequence. The hash, however, contains the forward and reverse overhang +// lengths of the deterministic sequence - i.e., the alphabetically less-than +// strand, when comparing the uppercase forward and reverse complement strand. +// This means if the input sequence is not less than its reverse complement (for +// example, GTT is greater than AAC), then the output hash will have the forward +// and reverse overhang lengths of the reverse complement, not the input strand. 
+func HashV2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) { + var result [16]byte + + // First, run checks and get the deterministic sequence to hash + for _, char := range sequence { + if !strings.Contains("ATUGCYRSWKMBDHVNZ", string(char)) { + return result, errors.New("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got letter: " + string(char)) + } + } + sequence = strings.ToUpper(sequence) + var forward, reverse int8 + var deterministicSequence string + reverseComplement := transform.ReverseComplement(sequence) + if sequence > reverseComplement { + // If the reverse complement is smaller, swap the forward and reverse overhang lengths + forward = revOverhangLength + reverse = fwdOverhangLength + deterministicSequence = reverseComplement + } else { + forward = fwdOverhangLength + reverse = revOverhangLength + deterministicSequence = sequence + } + + // Build our byte flag and copy the overhang length bytes + flag := EncodeFlag(2, FRAGMENT, false, false) + result[0] = flag + result[1] = byte(forward) + result[2] = byte(reverse) + + // Compute the BLAKE3 digest, then copy its first 13 bytes into the remaining 13 bytes + newhash := blake3.Sum256([]byte(deterministicSequence)) + copy(result[3:], newhash[:13]) + + return result, nil +} + +// HashV2MetadataKey is a key for a seqhash v2 single letter metadata tag. +type HashV2MetadataKey struct { + SequenceType SequenceType + Circular bool + DoubleStranded bool +} + +// HashV2Metadata contains the seqhash v2 single letter metadata tags.
+var HashV2Metadata = map[HashV2MetadataKey]rune{ + {DNA, true, true}: 'A', + {DNA, true, false}: 'B', + {DNA, false, true}: 'C', + {DNA, false, false}: 'D', + {RNA, true, true}: 'E', + {RNA, true, false}: 'F', + {RNA, false, true}: 'G', + {RNA, false, false}: 'H', + {PROTEIN, false, false}: 'I', + {PROTEIN, true, false}: 'J', + {FRAGMENT, false, false}: 'K', + {FRAGMENT, true, false}: 'L', + {FRAGMENT, false, true}: 'M', + {FRAGMENT, true, true}: 'N', +} + +// EncodeHashV2 encodes HashV2 as a base64 string. It also adds a single +// letter metadata tag that can be used as an easy heuristic for an LLM to +// identify misbehaving code. +func EncodeHashV2(hash [16]byte, err error) (string, error) { + _, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0]) + encoded := base64.StdEncoding.EncodeToString(hash[:]) + + return string(HashV2Metadata[HashV2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err +} diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 27da953..a258366 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -93,3 +93,37 @@ func TestLeastRotation(t *testing.T) { } } } + +func TestFlagEncoding(t *testing.T) { + version := 2 + sequenceType := DNA + circularity := true + doubleStranded := true + flag := EncodeFlag(version, sequenceType, circularity, doubleStranded) + decodedVersion, decodedSequenceType, decodedCircularity, decodedDoubleStranded := DecodeFlag(flag) + if (decodedVersion != version) || (decodedSequenceType != sequenceType) || (decodedCircularity != circularity) || (decodedDoubleStranded != doubleStranded) { + t.Errorf("Got different decoded flag.") + } +} + +func TestHashV2(t *testing.T) { + // Test TNA as sequenceType + _, err := HashV2("ATGGGCTAA", "TNA", true, true) + if err == nil { + t.Errorf("TestHashV2() has failed. 
TNA is not a valid sequenceType.") + } +} + +func TestHashV2Fragment(t *testing.T) { + // Test X failure + _, err := HashV2Fragment("ATGGGCTAX", 4, 4) + if err == nil { + t.Errorf("TestHashV2Fragment() has failed. X is not a valid sequenceType.") + } + // Test actual hash + sqHash, _ := EncodeHashV2(HashV2Fragment("ATGGGCTAA", 4, 4)) + expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw==" + if sqHash != expectedHash { + t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) + } +} From 38185e941b1b6dd193613f3d5cc1225162ef57f3 Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Thu, 7 Dec 2023 09:24:14 -0800 Subject: [PATCH 3/3] Make codon work with json (#4) * Removed update weights * add freqB table --- synthesis/codon/codon.go | 5 +- synthesis/codon/codon_test.go | 18 + synthesis/codon/default_tables/freqB.json | 379 ++++++++++++++++++++++ synthesis/codon/example_test.go | 46 --- 4 files changed, 401 insertions(+), 47 deletions(-) create mode 100644 synthesis/codon/default_tables/freqB.json diff --git a/synthesis/codon/codon.go b/synthesis/codon/codon.go index 85958c5..c0b95de 100644 --- a/synthesis/codon/codon.go +++ b/synthesis/codon/codon.go @@ -156,7 +156,10 @@ func (table *TranslationTable) Optimize(aminoAcids string, randomState ...int) ( } var codons strings.Builder - codonChooser := table.Choosers + codonChooser, err := newAminoAcidChoosers(table.AminoAcids) + if err != nil { + return "", err + } for _, aminoAcid := range aminoAcids { chooser, ok := codonChooser[string(aminoAcid)] diff --git a/synthesis/codon/codon_test.go b/synthesis/codon/codon_test.go index 6638696..02fdcf3 100644 --- a/synthesis/codon/codon_test.go +++ b/synthesis/codon/codon_test.go @@ -1,6 +1,7 @@ package codon import ( + _ "embed" "errors" "os" "strings" @@ -558,3 +559,20 @@ func TestUpdateWeights(t *testing.T) { }) } } + +//go:embed default_tables/freqB.json +var ecoliCodonTable []byte + +func TestCodonJSONRegression(t *testing.T) { + ct := ParseCodonJSON(ecoliCodonTable) + gfp := 
"MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK" + seed := 0 + sequence, err := ct.Optimize(gfp, seed) + if err != nil { + t.Errorf("Failed to optimize with premade table. Got error: %s", err) + } + expectedSequence := `ATGGCATCCAAGGGCGAGGAGTTGTTCACCGGTGTTGTGCCGATCCTGGTGGAGCTGGACGGTGACGTGAACGGTCACAAATTTAGCGTGTCCGGTGAGGGTGAGGGTGATGCTACCTATGGCAAGCTGACCCTGAAATTCATTTGTACCACGGGTAAACTGCCGGTCCCGTGGCCGACGCTGGTGACCACCTTCAGCTATGGTGTGCAGTGTTTCAGCCGCTACCCGGACCACATGAAGCGCCACGACTTTTTCAAGAGCGCGATGCCGGAGGGTTATGTGCAAGAACGTACCATCAGCTTTAAAGATGATGGTAACTATAAGACCCGCGCGGAAGTCAAGTTTGAGGGTGACACGCTGGTGAATCGTATTGAGTTGAAGGGTATTGACTTTAAGGAGGATGGTAATATTTTGGGCCACAAACTGGAGTACAATTACAATAGCCACAATGTTTACATCACGGCAGATAAACAGAAGAACGGTATCAAGGCGAACTTCAAAATTCGTCACAACATTGAGGACGGTTCTGTTCAACTGGCGGACCATTACCAACAGAATACCCCGATCGGTGACGGCCCGGTTCTGCTGCCGGACAACCATTATTTGAGCACCCAGTCCGCCCTGAGCAAGGACCCGAATGAGAAGCGTGATCATATGGTTCTGCTGGAGTTTGTGACCGCGGCGGGCATCACCCACGGCATGGACGAGCTGTACAAG` + if sequence != expectedSequence { + t.Errorf("Failed to output expected sequence.") + } +} diff --git a/synthesis/codon/default_tables/freqB.json b/synthesis/codon/default_tables/freqB.json new file mode 100644 index 0000000..8206e14 --- /dev/null +++ b/synthesis/codon/default_tables/freqB.json @@ -0,0 +1,379 @@ +{ + "start_codons": [ + "TTG", + "CTG", + "ATT", + "ATC", + "ATA", + "ATG", + "GTG" + ], + "stop_codons": [ + "TAA", + "TAG", + "TGA" + ], + "amino_acids": [ + { + "letter": "Y", + "codons": [ + { + "triplet": "TAT", + "weight": 42 + }, + { + "triplet": "TAC", + "weight": 58 + } + ] + }, + { + "letter": "C", + "codons": [ + { + "triplet": "TGT", + "weight": 42 + }, + { + "triplet": "TGC", + "weight": 58 + } + ] + }, + { + "letter": "I", + "codons": [ + { + "triplet": "ATT", + "weight": 49 + }, + { + "triplet": "ATC", + "weight": 51 + }, + { + 
"triplet": "ATA", + "weight": 0 + } + ] + }, + { + "letter": "V", + "codons": [ + { + "triplet": "GTT", + "weight": 35 + }, + { + "triplet": "GTC", + "weight": 28 + }, + { + "triplet": "GTA", + "weight": 3 + }, + { + "triplet": "GTG", + "weight": 34 + } + ] + }, + { + "letter": "G", + "codons": [ + { + "triplet": "GGT", + "weight": 60 + }, + { + "triplet": "GGC", + "weight": 39 + }, + { + "triplet": "GGA", + "weight": 0 + }, + { + "triplet": "GGG", + "weight": 0 + } + ] + }, + { + "letter": "L", + "codons": [ + { + "triplet": "TTA", + "weight": 3 + }, + { + "triplet": "TTG", + "weight": 14 + }, + { + "triplet": "CTT", + "weight": 2 + }, + { + "triplet": "CTC", + "weight": 3 + }, + { + "triplet": "CTA", + "weight": 0 + }, + { + "triplet": "CTG", + "weight": 78 + } + ] + }, + { + "letter": "W", + "codons": [ + { + "triplet": "TGG", + "weight": 1 + } + ] + }, + { + "letter": "K", + "codons": [ + { + "triplet": "AAA", + "weight": 51 + }, + { + "triplet": "AAG", + "weight": 49 + } + ] + }, + { + "letter": "S", + "codons": [ + { + "triplet": "TCT", + "weight": 10 + }, + { + "triplet": "TCC", + "weight": 13 + }, + { + "triplet": "TCA", + "weight": 2 + }, + { + "triplet": "TCG", + "weight": 5 + }, + { + "triplet": "AGT", + "weight": 0 + }, + { + "triplet": "AGC", + "weight": 68 + } + ] + }, + { + "letter": "*", + "codons": [ + { + "triplet": "TAA", + "weight": 2015 + }, + { + "triplet": "TAG", + "weight": 1667 + }, + { + "triplet": "TGA", + "weight": 1300 + } + ] + }, + { + "letter": "Q", + "codons": [ + { + "triplet": "CAA", + "weight": 45 + }, + { + "triplet": "CAG", + "weight": 55 + } + ] + }, + { + "letter": "R", + "codons": [ + { + "triplet": "CGT", + "weight": 62 + }, + { + "triplet": "CGC", + "weight": 35 + }, + { + "triplet": "CGA", + "weight": 0 + }, + { + "triplet": "CGG", + "weight": 0 + }, + { + "triplet": "AGA", + "weight": 3 + }, + { + "triplet": "AGG", + "weight": 0 + } + ] + }, + { + "letter": "A", + "codons": [ + { + "triplet": "GCT", + "weight": 12 + }, + 
{ + "triplet": "GCC", + "weight": 19 + }, + { + "triplet": "GCA", + "weight": 24 + }, + { + "triplet": "GCG", + "weight": 44 + } + ] + }, + { + "letter": "E", + "codons": [ + { + "triplet": "GAA", + "weight": 43 + }, + { + "triplet": "GAG", + "weight": 57 + } + ] + }, + { + "letter": "F", + "codons": [ + { + "triplet": "TTT", + "weight": 45 + }, + { + "triplet": "TTC", + "weight": 55 + } + ] + }, + { + "letter": "P", + "codons": [ + { + "triplet": "CCT", + "weight": 9 + }, + { + "triplet": "CCC", + "weight": 0 + }, + { + "triplet": "CCA", + "weight": 10 + }, + { + "triplet": "CCG", + "weight": 81 + } + ] + }, + { + "letter": "H", + "codons": [ + { + "triplet": "CAT", + "weight": 38 + }, + { + "triplet": "CAC", + "weight": 62 + } + ] + }, + { + "letter": "M", + "codons": [ + { + "triplet": "ATG", + "weight": 1 + } + ] + }, + { + "letter": "T", + "codons": [ + { + "triplet": "ACT", + "weight": 10 + }, + { + "triplet": "ACC", + "weight": 57 + }, + { + "triplet": "ACA", + "weight": 0 + }, + { + "triplet": "ACG", + "weight": 33 + } + ] + }, + { + "letter": "N", + "codons": [ + { + "triplet": "AAT", + "weight": 47 + }, + { + "triplet": "AAC", + "weight": 53 + } + ] + }, + { + "letter": "D", + "codons": [ + { + "triplet": "GAT", + "weight": 46 + }, + { + "triplet": "GAC", + "weight": 54 + } + ] + } + ] +} diff --git a/synthesis/codon/example_test.go b/synthesis/codon/example_test.go index 4c9fd17..eabee56 100644 --- a/synthesis/codon/example_test.go +++ b/synthesis/codon/example_test.go @@ -20,52 +20,6 @@ func ExampleTranslationTable_Translate() { // output: true } -func ExampleTranslationTable_UpdateWeights() { - gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - sequenceWithCustomWeights := 
"ATGGCAAGTAAGGGAGAAGAGCTTTTTACCGGCGTAGTACCAATTCTGGTAGAACTGGATGGTGATGTAAACGGTCACAAATTTAGTGTAAGCGGAGAAGGTGAGGGTGATGCTACCTATGGCAAACTGACCCTAAAGTTTATATGCACGACTGGAAAACTTCCGGTACCGTGGCCAACGTTAGTTACAACGTTTTCTTATGGAGTACAGTGCTTCAGCCGCTACCCAGATCATATGAAACGCCATGATTTCTTTAAGAGCGCCATGCCAGAGGGTTATGTTCAGGAGCGCACGATCTCGTTTAAGGATGATGGTAACTATAAGACTCGTGCTGAGGTGAAGTTCGAAGGCGATACCCTTGTAAATCGTATTGAATTGAAGGGTATAGACTTCAAGGAGGATGGAAATATTCTTGGACATAAGCTGGAATACAATTACAATTCACATAACGTTTATATAACTGCCGACAAGCAAAAAAACGGGATAAAAGCTAATTTTAAAATACGCCACAACATAGAGGACGGGTCGGTGCAACTAGCCGATCATTATCAACAAAACACACCAATCGGCGACGGACCAGTTCTGTTGCCCGATAATCATTACTTATCAACCCAAAGTGCCTTAAGTAAGGATCCGAACGAAAAGCGCGATCATATGGTACTTCTTGAGTTTGTTACCGCTGCAGGCATAACGCATGGCATGGACGAGCTATACAAATAA" - - table := codon.NewTranslationTable(11) - - // this example is using custom weights for different codons for Arginine. Use this if you would rather use your own - // codon weights, they can also be computed for you with `UpdateWeightsWithSequence`. - - err := table.UpdateWeights([]codon.AminoAcid{ - { - Letter: "R", - Codons: []codon.Codon{ - { - Triplet: "CGU", - Weight: 1, - }, - { - Triplet: "CGA", - Weight: 2, - }, - { - Triplet: "CGG", - Weight: 4, - }, - { - Triplet: "AGA", - Weight: 6, - }, - { - Triplet: "AGG", - Weight: 2, - }, - }, - }, - }) - if err != nil { - fmt.Println("Could not update weights in example") - } - - optimizedSequence, _ := table.Optimize(gfpTranslation, 1) - - fmt.Println(optimizedSequence == sequenceWithCustomWeights) - // output: true -} - func ExampleTranslationTable_Optimize() { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*"