Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cloning refactor and data race fix #393

Merged
merged 16 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- `fastq` parser no longer becomes de-aligned when reading (#325)
- `fastq` now handles optionals correctly (#323)
- No more data race in GoldenGate (#276)

### Breaking
- GolenGate and CutWithEnzymeByName are now receivers of EnzymeManager. This is an effort to remove
dependence on some package level global state and build some flexibility managing enzymes over the
lifetime of the program.

## [0.26.0] - 2023-07-22
Oops, we weren't keeping a changelog before this tag!

[unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main
[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0
[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0
194 changes: 95 additions & 99 deletions clone/clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ import (
"regexp"
"sort"
"strings"
"sync"

"github.com/TimothyStiles/poly/checks"
"github.com/TimothyStiles/poly/seqhash"
Expand Down Expand Up @@ -83,15 +82,27 @@ type Enzyme struct {
RegexpFor *regexp.Regexp
RegexpRev *regexp.Regexp
Skip int
OverhangLen int
OverheadLength int
RecognitionSite string
}

// Eventually, we want to get the data for this map from ftp://ftp.neb.com/pub/rebase
var enzymeMap = map[string]Enzyme{
"BsaI": {"BsaI", regexp.MustCompile("GGTCTC"), regexp.MustCompile("GAGACC"), 1, 4, "GGTCTC"},
"BbsI": {"BbsI", regexp.MustCompile("GAAGAC"), regexp.MustCompile("GTCTTC"), 2, 4, "GAAGAC"},
"BtgZI": {"BtgZI", regexp.MustCompile("GCGATG"), regexp.MustCompile("CATCGC"), 10, 4, "GCGATG"},
// EnzymeManager manager for Enzymes. Allows for management of enzymes throughout the lifecyle of your
// program. EnzymeManager is not safe for concurrent use.
type EnzymeManager struct {
// enzymeMap Map of enzymes that exist for the lifetime of the manager. Not safe for concurrent use.
enzymeMap map[string]Enzyme
}

// NewEnzymeManager creates a new EnzymeManager given some enzymes.
func NewEnzymeManager(enzymes []Enzyme) EnzymeManager {
enzymeMap := make(map[string]Enzyme)
for enzymeIndex := range enzymes {
enzymeMap[enzymes[enzymeIndex].Name] = enzymes[enzymeIndex]
}

return EnzymeManager{
enzymeMap: enzymeMap,
}
}

/******************************************************************************
Expand All @@ -100,30 +111,27 @@ Base cloning functions begin here.

******************************************************************************/

func getBaseRestrictionEnzymes() map[string]Enzyme {
return enzymeMap
}

// CutWithEnzymeByName cuts a given sequence with an enzyme represented by the
// enzyme's name. It is a convenience wrapper around CutWithEnzyme that
// allows us to specify the enzyme by name.
func CutWithEnzymeByName(seq Part, directional bool, enzymeStr string) ([]Fragment, error) {
enzymeMap := getBaseRestrictionEnzymes()
if _, ok := enzymeMap[enzymeStr]; !ok {
return []Fragment{}, errors.New("Enzyme " + enzymeStr + " not found in enzymeMap")
func (enzymeManager EnzymeManager) CutWithEnzymeByName(part Part, directional bool, name string) ([]Fragment, error) {
// Get the enzyme from the enzyme map
if enzyme, ok := enzymeManager.enzymeMap[name]; ok {
// Cut the sequence with the enzyme
return CutWithEnzyme(part, directional, enzyme), nil
}
enzyme := enzymeMap[enzymeStr]
return CutWithEnzyme(seq, directional, enzyme), nil
// Return an error if the enzyme is not found
return []Fragment{}, errors.New("Enzyme " + name + " not found")
}

// CutWithEnzyme cuts a given sequence with an enzyme represented by an Enzyme struct.
func CutWithEnzyme(seq Part, directional bool, enzyme Enzyme) []Fragment {
var fragmentSeqs []string
func CutWithEnzyme(part Part, directional bool, enzyme Enzyme) []Fragment {
var fragmentSequences []string
var sequence string
if seq.Circular {
sequence = strings.ToUpper(seq.Sequence + seq.Sequence)
if part.Circular {
sequence = strings.ToUpper(part.Sequence + part.Sequence)
} else {
sequence = strings.ToUpper(seq.Sequence)
sequence = strings.ToUpper(part.Sequence)
}

// Check for palindromes
Expand All @@ -135,20 +143,20 @@ func CutWithEnzyme(seq Part, directional bool, enzyme Enzyme) []Fragment {
var reverseOverhangs []Overhang
forwardCuts := enzyme.RegexpFor.FindAllStringIndex(sequence, -1)
for _, forwardCut := range forwardCuts {
forwardOverhangs = append(forwardOverhangs, Overhang{Length: enzyme.OverhangLen, Position: forwardCut[1] + enzyme.Skip, Forward: true, RecognitionSitePlusSkipLength: len(enzyme.RecognitionSite) + enzyme.Skip})
forwardOverhangs = append(forwardOverhangs, Overhang{Length: enzyme.OverheadLength, Position: forwardCut[1] + enzyme.Skip, Forward: true, RecognitionSitePlusSkipLength: len(enzyme.RecognitionSite) + enzyme.Skip})
}
// Palindromic enzymes won't need reverseCuts
if !palindromic {
reverseCuts := enzyme.RegexpRev.FindAllStringIndex(sequence, -1)
for _, reverseCut := range reverseCuts {
reverseOverhangs = append(reverseOverhangs, Overhang{Length: enzyme.OverhangLen, Position: reverseCut[0] - enzyme.Skip, Forward: false, RecognitionSitePlusSkipLength: len(enzyme.RecognitionSite) + enzyme.Skip})
reverseOverhangs = append(reverseOverhangs, Overhang{Length: enzyme.OverheadLength, Position: reverseCut[0] - enzyme.Skip, Forward: false, RecognitionSitePlusSkipLength: len(enzyme.RecognitionSite) + enzyme.Skip})
}
}

// If, on a linear sequence, the last overhang's position + EnzymeSkip + EnzymeOverhangLen is over the length of the sequence, remove that overhang.
// If, on a linear sequence, the last overhang's position + EnzymeSkip + EnzymeOverhangLength is over the length of the sequence, remove that overhang.
for _, overhangSet := range [][]Overhang{forwardOverhangs, reverseOverhangs} {
if len(overhangSet) > 0 {
if !seq.Circular && (overhangSet[len(overhangSet)-1].Position+enzyme.Skip+enzyme.OverhangLen > len(sequence)) {
if !part.Circular && (overhangSet[len(overhangSet)-1].Position+enzyme.Skip+enzyme.OverheadLength > len(sequence)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could there be some explanation of the logic behind this line and what it's doing?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that is if the recognition sequence is at the very end of a circular DNA, and then cutting on the other side (ie, the beginning of the sequence).

overhangSet = overhangSet[:len(overhangSet)-1]
}
}
Expand All @@ -166,26 +174,26 @@ func CutWithEnzyme(seq Part, directional bool, enzyme Enzyme) []Fragment {
var nextOverhang Overhang
// Linear fragments with 1 cut that are no directional will always give a
// 2 fragments
if len(overhangs) == 1 && !directional && !seq.Circular { // Check the case of a single cut
if len(overhangs) == 1 && !directional && !part.Circular { // Check the case of a single cut
// In the case of a single cut in a linear sequence, we get two fragments with only 1 stick end
fragmentSeq1 := sequence[overhangs[0].Position+overhangs[0].Length:]
fragmentSeq2 := sequence[:overhangs[0].Position]
overhangSeq := sequence[overhangs[0].Position : overhangs[0].Position+overhangs[0].Length]
fragments = append(fragments, Fragment{fragmentSeq1, overhangSeq, ""})
fragments = append(fragments, Fragment{fragmentSeq2, "", overhangSeq})
fragmentSequence1 := sequence[overhangs[0].Position+overhangs[0].Length:]
fragmentSequence2 := sequence[:overhangs[0].Position]
overhangSequence := sequence[overhangs[0].Position : overhangs[0].Position+overhangs[0].Length]
fragments = append(fragments, Fragment{fragmentSequence1, overhangSequence, ""})
fragments = append(fragments, Fragment{fragmentSequence2, "", overhangSequence})
return fragments
}

// Circular fragments with 1 cut will always have 2 overhangs (because of the
// concat earlier). If we don't require directionality, this will always get
// cut into a single fragment
if len(overhangs) == 2 && !directional && seq.Circular {
if len(overhangs) == 2 && !directional && part.Circular {
// In the case of a single cut in a circular sequence, we get one fragment out with sticky overhangs
fragmentSeq1 := sequence[overhangs[0].Position+overhangs[0].Length : len(seq.Sequence)]
fragmentSeq2 := sequence[:overhangs[0].Position]
fragmentSeq := fragmentSeq1 + fragmentSeq2
overhangSeq := sequence[overhangs[0].Position : overhangs[0].Position+overhangs[0].Length]
fragments = append(fragments, Fragment{fragmentSeq, overhangSeq, overhangSeq})
fragmentSequence1 := sequence[overhangs[0].Position+overhangs[0].Length : len(part.Sequence)]
fragmentSequence2 := sequence[:overhangs[0].Position]
fragmentSequence := fragmentSequence1 + fragmentSequence2
overhangSequence := sequence[overhangs[0].Position : overhangs[0].Position+overhangs[0].Length]
fragments = append(fragments, Fragment{fragmentSequence, overhangSequence, overhangSequence})
return fragments
}

Expand All @@ -205,28 +213,28 @@ func CutWithEnzyme(seq Part, directional bool, enzyme Enzyme) []Fragment {
// the basis of GoldenGate assembly.
if directional && !palindromic {
if currentOverhang.Forward && !nextOverhang.Forward {
fragmentSeqs = append(fragmentSeqs, sequence[currentOverhang.Position:nextOverhang.Position])
fragmentSequences = append(fragmentSequences, sequence[currentOverhang.Position:nextOverhang.Position])
}
// We have to subtract RecognitionSitePlusSkipLength in case we have a recognition site on
// one side of the origin of a circular sequence and the cut site on the other side of the origin
if nextOverhang.Position-nextOverhang.RecognitionSitePlusSkipLength > len(seq.Sequence) {
if nextOverhang.Position-nextOverhang.RecognitionSitePlusSkipLength > len(part.Sequence) {
break
}
} else {
fragmentSeqs = append(fragmentSeqs, sequence[currentOverhang.Position:nextOverhang.Position])
if nextOverhang.Position-nextOverhang.RecognitionSitePlusSkipLength > len(seq.Sequence) {
fragmentSequences = append(fragmentSequences, sequence[currentOverhang.Position:nextOverhang.Position])
if nextOverhang.Position-nextOverhang.RecognitionSitePlusSkipLength > len(part.Sequence) {
break
}
}
}
// Convert fragment sequences into fragments
for _, fragment := range fragmentSeqs {
for _, fragmentsequence := range fragmentSequences {
// Minimum lengths (given oligos) for assembly is 8 base pairs
// https://doi.org/10.1186/1756-0500-3-291
if len(fragment) > 8 {
fragmentSequence := fragment[enzyme.OverhangLen : len(fragment)-enzyme.OverhangLen]
forwardOverhang := fragment[:enzyme.OverhangLen]
reverseOverhang := fragment[len(fragment)-enzyme.OverhangLen:]
if len(fragmentsequence) > 8 {
fragmentSequence := fragmentsequence[enzyme.OverheadLength : len(fragmentsequence)-enzyme.OverheadLength]
forwardOverhang := fragmentsequence[:enzyme.OverheadLength]
reverseOverhang := fragmentsequence[len(fragmentsequence)-enzyme.OverheadLength:]
fragments = append(fragments, Fragment{Sequence: fragmentSequence, ForwardOverhang: forwardOverhang, ReverseOverhang: reverseOverhang})
}
}
Expand All @@ -235,12 +243,17 @@ func CutWithEnzyme(seq Part, directional bool, enzyme Enzyme) []Fragment {
return fragments
}

func recurseLigate(wg *sync.WaitGroup, constructs chan string, infiniteLoopingConstructs chan string, seedFragment Fragment, fragmentList []Fragment, usedFragments []Fragment) {
func recurseLigate(seedFragment Fragment, fragmentList []Fragment, usedFragments []Fragment, existingSeqhashes map[string]struct{}) (openConstructs []string, infiniteConstructs []string) {
// Recurse ligate simulates all possible ligations of a series of fragments. Each possible combination begins with a "seed" that fragments from the pool can be added to.
defer wg.Done()
// If the seed ligates to itself, we can call it done with a successful circularization!
if seedFragment.ForwardOverhang == seedFragment.ReverseOverhang {
constructs <- seedFragment.ForwardOverhang + seedFragment.Sequence
construct := seedFragment.ForwardOverhang + seedFragment.Sequence
seqhash, _ := seqhash.Hash(construct, "DNA", true, true)
if _, ok := existingSeqhashes[seqhash]; ok {
return nil, nil
}
existingSeqhashes[seqhash] = struct{}{}
return []string{construct}, nil
} else {
for _, newFragment := range fragmentList {
// If the seedFragment's reverse overhang is ligates to a fragment's forward overhang, we can ligate those together and seed another ligation reaction
Expand All @@ -262,67 +275,40 @@ func recurseLigate(wg *sync.WaitGroup, constructs chan string, infiniteLoopingCo
// If the newFragment's reverse complement already exists in the used fragment list, we need to cancel the recursion.
for _, usedFragment := range usedFragments {
if usedFragment.Sequence == newFragment.Sequence {
infiniteLoopingConstructs <- usedFragment.ForwardOverhang + usedFragment.Sequence + usedFragment.ReverseOverhang
return
infiniteConstruct := usedFragment.ForwardOverhang + usedFragment.Sequence + usedFragment.ReverseOverhang
seqhash, _ := seqhash.Hash(infiniteConstruct, "DNA", false, true)
if _, ok := existingSeqhashes[seqhash]; ok {
return nil, nil
}
existingSeqhashes[seqhash] = struct{}{}
return nil, []string{infiniteConstruct}
}
}
wg.Add(1)
// If everything is clear, append fragment to usedFragments and recurse.
usedFragments = append(usedFragments, newFragment)
go recurseLigate(wg, constructs, infiniteLoopingConstructs, newSeed, fragmentList, usedFragments)
}
}
}
}
oc, ic := recurseLigate(newSeed, fragmentList, usedFragments, existingSeqhashes)

func getConstructs(c chan string, constructSequences chan []string, circular bool) {
var constructs []string
var exists bool
var existingSeqhashes []string
for {
construct, more := <-c
if more {
exists = false
seqhashConstruct, _ := seqhash.Hash(construct, "DNA", circular, true)
// Check if this construct is unique
for _, existingSeqhash := range existingSeqhashes {
if existingSeqhash == seqhashConstruct {
exists = true
}
openConstructs = append(openConstructs, oc...)
infiniteConstructs = append(infiniteConstructs, ic...)
}
if !exists {
constructs = append(constructs, construct)
existingSeqhashes = append(existingSeqhashes, seqhashConstruct)
}
} else {
constructSequences <- constructs
close(constructSequences)
return
}
}

return openConstructs, infiniteConstructs
}

// CircularLigate simulates ligation of all possible fragment combinations into circular plasmids.
func CircularLigate(fragments []Fragment) ([]string, []string, error) {
var wg sync.WaitGroup
func CircularLigate(fragments []Fragment) ([]string, []string) {
var outputConstructs []string
var outputInfiniteLoopingConstructs []string
constructs := make(chan string)
infiniteLoopingConstructs := make(chan string) // sometimes we will get stuck in infinite loops. These are sequences with a recursion break
constructSequences := make(chan []string)
infiniteLoopingConstructSequences := make(chan []string)
existingSeqhashes := make(map[string]struct{})
for _, fragment := range fragments {
wg.Add(1)
go recurseLigate(&wg, constructs, infiniteLoopingConstructs, fragment, fragments, []Fragment{})
openConstructs, infiniteConstructs := recurseLigate(fragment, fragments, []Fragment{}, existingSeqhashes)

outputConstructs = append(outputConstructs, openConstructs...)
outputInfiniteLoopingConstructs = append(outputInfiniteLoopingConstructs, infiniteConstructs...)
}
go getConstructs(constructs, constructSequences, true)
go getConstructs(infiniteLoopingConstructs, infiniteLoopingConstructSequences, false)
wg.Wait()
close(constructs)
close(infiniteLoopingConstructs)
outputConstructs = <-constructSequences
outputInfiniteLoopingConstructs = <-infiniteLoopingConstructSequences
return outputConstructs, outputInfiniteLoopingConstructs, nil
return outputConstructs, outputInfiniteLoopingConstructs
}

/******************************************************************************
Expand All @@ -333,14 +319,24 @@ Specific cloning functions begin here.

// GoldenGate simulates a GoldenGate cloning reaction. As of right now, we only
// support BsaI, BbsI, BtgZI, and BsmBI.
func GoldenGate(sequences []Part, enzymeStr string) ([]string, []string, error) {
func (enzymeManager EnzymeManager) GoldenGate(sequences []Part, enzymeStr string) (openConstructs []string, infiniteLoops []string, err error) {
Copy link
Collaborator

@TimothyStiles TimothyStiles Nov 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should GoldenGate really be a method of EnzymeManager rather than its own function?

Copy link
Contributor Author

@TwFlem TwFlem Nov 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After learning more about how Golden Gate works IRL and a little bit more about SynBio- probably not.

It seems more appropriate for Golden Gate to just accept something like a 'cuttingEnzyme' of type 'Enzyme' and have the caller lookup whatever enzyme they want however they want.

This is also nice because now Golden Gate doesn't have to return an error anymore.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally would like the change of GoldenGate to be its own function, independent of an enzymeManager.

Copy link
Contributor Author

@TwFlem TwFlem Nov 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to no longer be a receiver

var fragments []Fragment
for _, sequence := range sequences {
newFragments, err := CutWithEnzymeByName(sequence, true, enzymeStr)
newFragments, err := enzymeManager.CutWithEnzymeByName(sequence, true, enzymeStr)
if err != nil {
return []string{}, []string{}, err
}
fragments = append(fragments, newFragments...)
}
return CircularLigate(fragments)
oc, il := CircularLigate(fragments)
return oc, il, nil
}

// GetBaseRestrictionEnzymes return a basic slice of common enzymes used in Golden Gate Assembly. Eventually, we want to get the data for this map from ftp://ftp.neb.com/pub/rebase
func GetBaseRestrictionEnzymes() []Enzyme {
return []Enzyme{
{"BsaI", regexp.MustCompile("GGTCTC"), regexp.MustCompile("GAGACC"), 1, 4, "GGTCTC"},
{"BbsI", regexp.MustCompile("GAAGAC"), regexp.MustCompile("GTCTTC"), 2, 4, "GAAGAC"},
{"BtgZI", regexp.MustCompile("GCGATG"), regexp.MustCompile("CATCGC"), 10, 4, "GCGATG"},
}
}
Loading
Loading