From f2310dba2c5e395e747b36daa135704f8f4ecdf6 Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 11 Oct 2023 14:39:24 -0700 Subject: [PATCH] created parser regression test for nested sublocations. (#385) * created parser regression test for nested sublocations. * integrating @abondrn's fix to genbank parser. * fixed lint issues. * added data for regression test. Co-authored-by: Alex Co-authored-by: Alex --- data/parseLocationRegressionTest.json | 1 + io/genbank/genbank.go | 36 ++++++++++++++++----------- io/genbank/genbank_test.go | 27 ++++++++++++++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 data/parseLocationRegressionTest.json diff --git a/data/parseLocationRegressionTest.json b/data/parseLocationRegressionTest.json new file mode 100644 index 00000000..8f599021 --- /dev/null +++ b/data/parseLocationRegressionTest.json @@ -0,0 +1 @@ +{"start":0,"end":0,"complement":false,"join":true,"five_prime_partial":false,"three_prime_partial":false,"gbk_location_string":"join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))","sub_locations":[{"start":5306941,"end":5307394,"complement":true,"join":false,"five_prime_partial":false,"three_prime_partial":false,"gbk_location_string":"join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))","sub_locations":null},{"start":5304400,"end":5305029,"complement":true,"join":false,"five_prime_partial":false,"three_prime_partial":false,"gbk_location_string":"join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))","sub_locations":null},{"start":5303327,"end":5303393,"complement":true,"join":false,"five_prime_partial":false,"three_prime_partial":false,"gbk_location_string":"join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))","sub_locations":null},{"start":5301927,"end":5302004,"complement":true,"join":false,"five_prime_partial":false,"three_prime_partial":false,"gbk_location_string":"join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))","sub_locations":null}]} \ No newline at end of file diff --git a/io/genbank/genbank.go b/io/genbank/genbank.go index 34062b36..e023288b 100644 --- a/io/genbank/genbank.go +++ b/io/genbank/genbank.go @@ -811,8 +811,8 @@ func getSourceOrganism(metadataData []string) (string, string, []string) { func parseLocation(locationString string) (Location, error) { var location Location location.GbkLocationString = locationString - if !(strings.ContainsAny(locationString, "(")) { // Case checks for simple expression of x..x - if !(strings.ContainsAny(locationString, ".")) { //Case checks for simple expression x + if !strings.ContainsAny(locationString, "(") { // Case checks for simple expression of x..x + if !strings.ContainsAny(locationString, ".") { //Case checks for simple expression x position, err := strconv.Atoi(locationString) if err != nil { return Location{}, err @@ -841,26 +841,34 @@ func parseLocation(locationString string) (Location, error) { if strings.ContainsAny(expression, "(") { firstInnerParentheses := strings.Index(expression, "(") ParenthesesCount := 1 - comma := 0 - for i := 1; ParenthesesCount > 0; i++ { // "(" is at 0, so we start at 1 - comma = i - switch expression[firstInnerParentheses+i] { - case []byte("(")[0]: + prevSubLocationStart := 0 + for i := firstInnerParentheses + 1; i < len(expression); i++ { // "(" is at 0, so we start at 1 + switch expression[i] { + case '(': ParenthesesCount++ - case []byte(")")[0]: + case ')': ParenthesesCount-- + case ',': + if ParenthesesCount == 0 { + parsedSubLocation, err := parseLocation(expression[prevSubLocationStart:i]) + if err != nil { + return Location{}, err + } + parsedSubLocation.GbkLocationString = locationString + location.SubLocations = append(location.SubLocations, parsedSubLocation) + prevSubLocationStart = i + 1 + } } } - parseLeftLocation, err := parseLocation(expression[:firstInnerParentheses+comma+1]) - if err != nil { - return Location{}, err + if ParenthesesCount != 0 { + return Location{}, fmt.Errorf("Unbalanced parentheses") } - parseRightLocation, err := parseLocation(expression[2+firstInnerParentheses+comma:]) + parsedSubLocation, err := parseLocation(expression[prevSubLocationStart:]) if err != nil { return Location{}, err } - - location.SubLocations = append(location.SubLocations, parseLeftLocation, parseRightLocation) + parsedSubLocation.GbkLocationString = locationString + location.SubLocations = append(location.SubLocations, parsedSubLocation) } else { // This is the default join(x..x,x..x) for _, numberRange := range strings.Split(expression, ",") { joinLocation, err := parseLocation(numberRange) diff --git a/io/genbank/genbank_test.go b/io/genbank/genbank_test.go index 17090386..9466c6b1 100644 --- a/io/genbank/genbank_test.go +++ b/io/genbank/genbank_test.go @@ -1,7 +1,9 @@ package genbank import ( + "encoding/json" "errors" + "fmt" "io" "os" "path/filepath" @@ -160,6 +162,31 @@ func TestPartialLocationParseRegression(t *testing.T) { } } +func TestSubLocationStringParseRegression(t *testing.T) { + location := "join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))" + parsedLocation, err := parseLocation(location) + if err != nil { + t.Errorf("Failed to parse location string. Got err: %s", err) + } + jsonFile, err := os.Open("../../data/parseLocationRegressionTest.json") + // if we os.Open returns an error then handle it + if err != nil { + fmt.Println(err) + } + defer jsonFile.Close() + + byteValue, _ := io.ReadAll(jsonFile) + var testParsedLocation Location + err = json.Unmarshal(byteValue, &testParsedLocation) + if err != nil { + t.Errorf("Failed to unmarshal json. Got err: %s", err) + } + + if diff := cmp.Diff(parsedLocation, testParsedLocation); diff != "" { + t.Errorf("Failed to parse sublocation string. Got this diff:\n%s", diff) + } +} + func TestSnapgeneGenbankRegression(t *testing.T) { snapgene, err := Read("../../data/puc19_snapgene.gb")