From a103f17f78af5591b37d90e1987dfb0ff61a8170 Mon Sep 17 00:00:00 2001
From: Stephen Hurwitz
Date: Mon, 16 Oct 2023 17:56:26 -0700
Subject: [PATCH] Implements Scanner type for tokenizing nginx configs

Implemented `crossplane.Scanner`, which follows the example of other
"scanner" types in the Go stdlib.

The existing `Lex` uses concurrency to make tokens available to the
caller while managing "state". I think this design cue was taken from
Rob Pike's 2011 talk on
[Lexical Scanning in Go](https://go.dev/talks/2011/lex.slide). If you
look at examples from the Go stdlib-- such as `bufio.Scanner`, which
`Lex` depends on-- you'd find that this isn't the strategy being
employed. Instead, there is a struct that manages the state of the
scanner and a method that is used by the caller to advance the scanner
and obtain tokens. After a bit of Internet archeology, I found
[this](https://groups.google.com/g/golang-nuts/c/q--5t2cxv78/m/Vkr9bNuhP5sJ)
post on `golang-nuts` from Rob Pike himself:

> That talk was about a lexer, but the deeper purpose was to demonstrate
> how concurrency can make programs nice even without obvious parallelism
> in the problem. And like many such uses of concurrency, the code is
> pretty but not necessarily fast.
>
> I think it's a fine approach to a lexer if you don't care about
> performance. It is significantly slower than some other approaches but
> is very easy to adapt. I used it in ivy, for example, but just so you
> know, I'm probably going to replace the one in ivy with a more
> traditional model to avoid some issues with the lexer accessing global
> state. You don't care about that for your application, I'm sure.
> So: It's pretty and nice to work on, but you'd probably not choose that
> approach for a production compiler.

An implementation of a "scanner" using the more "traditional" model--
much of the logic is the same as or very close to `Lex`-- seems to
support the above statement:

```
go test -benchmem -run=^$ -bench "^BenchmarkScan|BenchmarkLex$" github.com/nginxinc/nginx-go-crossplane -count=1 -v
goos: darwin
goarch: arm64
pkg: github.com/nginxinc/nginx-go-crossplane
BenchmarkLex
BenchmarkLex/simple
BenchmarkLex/simple-10                   70982    16581 ns/op   102857 B/op    37 allocs/op
BenchmarkLex/with-comments
BenchmarkLex/with-comments-10            64125    18366 ns/op   102921 B/op    43 allocs/op
BenchmarkLex/messy
BenchmarkLex/messy-10                    28171    42697 ns/op   104208 B/op   166 allocs/op
BenchmarkLex/quote-behavior
BenchmarkLex/quote-behavior-10           83667    14154 ns/op   102768 B/op    24 allocs/op
BenchmarkLex/quoted-right-brace
BenchmarkLex/quoted-right-brace-10       48022    24799 ns/op   103369 B/op    52 allocs/op
BenchmarkScan
BenchmarkScan/simple
BenchmarkScan/simple-10                 179712     6660 ns/op     4544 B/op    34 allocs/op
BenchmarkScan/with-comments
BenchmarkScan/with-comments-10          133178     7628 ns/op     4608 B/op    40 allocs/op
BenchmarkScan/messy
BenchmarkScan/messy-10                   49251    24106 ns/op     5896 B/op   163 allocs/op
BenchmarkScan/quote-behavior
BenchmarkScan/quote-behavior-10         240026     4854 ns/op     4456 B/op    21 allocs/op
BenchmarkScan/quoted-right-brace
BenchmarkScan/quoted-right-brace-10      87468    13534 ns/op     5056 B/op    49 allocs/op
PASS
ok      github.com/nginxinc/nginx-go-crossplane 13.676s
```

This alternative to `Lex` is probably a micro-optimization for many use
cases. Still, as the size and number of NGINX configurations that need
to be analyzed grow, the performance gain can be a good thing, as can
an API that feels familiar to Go developers who might use this tool for
their own purposes.
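To make the shape of the new API concrete, here is a sketch of the
caller side (an illustrative snippet, not part of the diff below; the
config literal is invented):

```
package main

import (
	"fmt"
	"io"
	"log"
	"strings"

	"github.com/nginxinc/nginx-go-crossplane"
)

func main() {
	conf := `http { server { listen 80; } }`
	s := crossplane.NewScanner(strings.NewReader(conf))
	for {
		tok, err := s.Scan()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err) // scanner errors carry the offending line; see LineNumber
		}
		fmt.Printf("line %d: %q\n", tok.Line, tok.Text)
	}
}
```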
Next steps:

- Use `Scanner` to "parse" NGINX configurations. I think this should be
  done in place so that the existing API works as is, but we should
  also expose a way to allow the caller to provide the scanner.
- Deprecate `Lex` in favor of `Scanner`. If we leave `Lex` in place
  then I don't think we would need a `v2` of the crossplane package
  (yet).
---
 lex_test.go     |  57 +++++++++---
 scanner.go      | 230 ++++++++++++++++++++++++++++++++++++++++++++++++
 scanner_test.go | 110 +++++++++++++++++++++++
 util.go         |   2 +
 4 files changed, 386 insertions(+), 13 deletions(-)
 create mode 100644 scanner.go
 create mode 100644 scanner_test.go

diff --git a/lex_test.go b/lex_test.go
index 1b9504d1..8880b270 100644
--- a/lex_test.go
+++ b/lex_test.go
@@ -252,22 +252,53 @@ func TestLex(t *testing.T) {
 	}
 }
 
-func TestLex_unhappy(t *testing.T) {
-	t.Parallel()
+var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimized away
+
+func BenchmarkLex(b *testing.B) {
+	var t NgxToken
 
-	testcases := map[string]string{
-		"unbalanced open brance":                  `http {{}`,
-		"unbalanced closing brace":                `http {}}`,
-		"multiple open braces":                    `http {{server {}}`,
-		"multiple closing braces after block end": `http {server {}}}`,
-		"multiple semicolons":                     `server { listen 80;; }`,
-		"semicolon afer closing brace":            `server { listen 80; };`,
-		"open brace after semicolon":              `server { listen 80; {}`,
-		"braces with no directive":                `http{}{}`,
-		"missing final brace":                     `http{`,
+	for _, bm := range lexFixtures {
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				b.Fatal(err)
+			}
+			defer file.Close()
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				if _, err := file.Seek(0, 0); err != nil {
+					b.Fatal(err)
+				}
+
+				for tok := range Lex(file) {
+					t = tok
+				}
+			}
+		})
 	}
 
-	for name, c := range testcases {
+	lexToken = t
+}
+
+//nolint:gochecknoglobals
+var unhappyFixtures = map[string]string{
+	"unbalanced open brace":                   `http {{}`,
+	"unbalanced closing brace":                `http {}}`,
+	"multiple open braces":                    `http {{server {}}`,
+	"multiple closing braces after block end": `http {server {}}}`,
+	"multiple semicolons":                     `server { listen 80;; }`,
+	"semicolon after closing brace":           `server { listen 80; };`,
+	"open brace after semicolon":              `server { listen 80; {}`,
+	"braces with no directive":                `http{}{}`,
+	"missing final brace":                     `http{`,
+}
+
+func TestLex_unhappy(t *testing.T) {
+	t.Parallel()
+
+	for name, c := range unhappyFixtures {
 		c := c
 		t.Run(name, func(t *testing.T) {
 			t.Parallel()

diff --git a/scanner.go b/scanner.go
new file mode 100644
index 00000000..cae32bf0
--- /dev/null
+++ b/scanner.go
@@ -0,0 +1,230 @@
+package crossplane
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// Token is a lexical token of the NGINX configuration syntax.
+type Token struct {
+	// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual
+	// token sequence in order to support defining directives in modules other than the core NGINX module set.
+	Text string
+	// Line is the source starting line number of the token within a file.
+	Line int
+	// IsQuoted signifies if the token is wrapped by quotes (", '). Quotes are not usually necessary in an NGINX
+	// configuration and mostly serve to help make the config less ambiguous.
+	IsQuoted bool
+}
+
+type scannerError struct {
+	msg  string
+	line int
+}
+
+func (e *scannerError) Error() string { return e.msg }
+func (e *scannerError) Line() int     { return e.line }
+
+func newScannerErrf(line int, format string, a ...any) *scannerError {
+	return &scannerError{line: line, msg: fmt.Sprintf(format, a...)}
+}
+
+// LineNumber reports the line on which the error occurred by finding the first error in
+// err's chain that returns a line number. Otherwise, it returns 0, false.
+//
+// An error type should provide a Line() int method to return a line number.
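+//
+// For example, a caller that wants to report the position of a syntax
+// error might do (illustrative sketch):
+//
+//	if _, err := s.Scan(); err != nil && err != io.EOF {
+//		if line, ok := LineNumber(err); ok {
+//			fmt.Printf("config error on line %d: %v\n", line, err)
+//		}
+//	}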
+func LineNumber(err error) (int, bool) {
+	var e interface{ Line() int }
+	if !errors.As(err, &e) {
+		return 0, false
+	}
+
+	return e.Line(), true
+}
+
+// Scanner provides an interface for tokenizing an NGINX configuration. Successive calls to the Scan method will step
+// through the 'tokens' of an NGINX configuration.
+//
+// Scanning stops unrecoverably at EOF, the first I/O error, or an unexpected token.
+//
+// Use NewScanner to construct a Scanner.
+type Scanner struct {
+	scanner           *bufio.Scanner
+	lineno            int
+	tokenStartLine    int
+	tokenDepth        int
+	repeatSpecialChar bool // only '}' can be repeated
+	prev              string
+}
+
+// NewScanner returns a new Scanner to read from r.
+func NewScanner(r io.Reader) *Scanner {
+	s := &Scanner{
+		scanner:           bufio.NewScanner(r),
+		lineno:            1,
+		tokenStartLine:    1,
+		tokenDepth:        0,
+		repeatSpecialChar: false,
+	}
+
+	s.scanner.Split(bufio.ScanRunes)
+
+	return s
+}
+
+// Scan reads the next token from the source and returns it. It returns io.EOF at the end of the source. Scanner
+// errors are returned when encountered.
+func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
+	var tok strings.Builder
+
+	lexState := skipSpace
+	newToken := false
+	readNext := true
+	esc := false
+
+	var r, quote string
+
+	for {
+		switch {
+		case s.prev != "":
+			r = s.prev
+			s.prev = ""
+		case readNext:
+			if !s.scanner.Scan() {
+				if tok.Len() > 0 {
+					return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: lexState == inQuote}, nil
+				}
+
+				if s.tokenDepth > 0 {
+					return Token{}, &scannerError{line: s.tokenStartLine, msg: "unexpected end of file, expecting }"}
+				}
+
+				return Token{}, io.EOF
+			}
+
+			nextRune := s.scanner.Text()
+			r = nextRune
+			if isEOL(r) {
+				s.lineno++
+			}
+		default:
+			readNext = true
+		}
+
+		// skip CRs
+		if r == "\r" || r == "\\\r" {
+			continue
+		}
+
+		if r == "\\" && !esc {
+			esc = true
+			continue
+		}
+
+		if esc {
+			esc = false
+			r = "\\" + r
+		}
+
+		switch lexState {
+		case skipSpace:
+			if !isSpace(r) {
+				lexState = inWord
+				newToken = true
+				readNext = false // re-eval
+				s.tokenStartLine = s.lineno
+			}
+			continue
+
+		case inWord:
+			if newToken {
+				newToken = false
+				if r == "#" {
+					tok.WriteString(r)
+					lexState = inComment
+					s.tokenStartLine = s.lineno
+					continue
+				}
+			}
+
+			if isSpace(r) {
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+
+			// parameter expansion syntax (ex: "${var[@]}")
+			if tok.Len() > 0 && strings.HasSuffix(tok.String(), "$") && r == "{" {
+				tok.WriteString(r)
+				lexState = inVar
+				s.repeatSpecialChar = false
+				continue
+			}
+
+			// add entire quoted string to the token buffer
+			if r == `"` || r == "'" {
+				if tok.Len() > 0 {
+					// if a quote is inside a token, treat it like any other char
+					tok.WriteString(r)
+				} else {
+					quote = r
+					lexState = inQuote
+					s.tokenStartLine = s.lineno
+				}
+				s.repeatSpecialChar = false
+				continue
+			}
+
+			// special characters treated as full tokens
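+			// ({, }, and ;). A pending word such as "foo" in "foo{" is returned
+			// first and the brace is stashed in s.prev so that the next call to
+			// Scan re-reads it as its own token.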
+			if isSpecialChar(r) {
+				if tok.Len() > 0 {
+					s.prev = r
+					return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+				}
+
+				// only } can be repeated
+				if s.repeatSpecialChar && r != "}" {
+					return Token{}, newScannerErrf(s.tokenStartLine, "unexpected %q", r)
+				}
+
+				s.repeatSpecialChar = true
+				if r == "{" {
+					s.tokenDepth++
+				}
+
+				if r == "}" {
+					s.tokenDepth--
+					if s.tokenDepth < 0 {
+						return Token{}, &scannerError{line: s.tokenStartLine, msg: `unexpected "}"`}
+					}
+				}
+
+				tok.WriteString(r)
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+
+			s.repeatSpecialChar = false
+			tok.WriteString(r)
+		case inComment:
+			if isEOL(r) {
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+			tok.WriteString(r)
+		case inVar:
+			tok.WriteString(r)
+			if r != "}" && !isSpace(r) {
+				continue
+			}
+			lexState = inWord
+		case inQuote:
+			if r == quote {
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+			if r == "\\"+quote {
+				r = quote
+			}
+			tok.WriteString(r)
+		}
+	}
+}

diff --git a/scanner_test.go b/scanner_test.go
new file mode 100644
index 00000000..30d06bbe
--- /dev/null
+++ b/scanner_test.go
@@ -0,0 +1,110 @@
+package crossplane
+
+import (
+	"io"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestScanner(t *testing.T) {
+	t.Parallel()
+
+	for _, f := range lexFixtures {
+		f := f
+		t.Run(f.name, func(t *testing.T) {
+			t.Parallel()
+
+			path := getTestConfigPath(f.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer file.Close()
+
+			s := NewScanner(file)
+
+			i := 0
+			for {
+				got, err := s.Scan()
+				if err == io.EOF {
+					if i < len(f.tokens) {
+						t.Fatal("unexpected end of file")
+					}
+					return
+				}
+
+				if err != nil {
+					t.Fatalf("unexpected error: %v", err)
+				}
+
+				want := f.tokens[i]
+				require.Equal(t, want.value, got.Text)
+				require.Equal(t, want.line, got.Line)
+				i++
+			}
+		})
+	}
+}
+
+func TestScanner_unhappy(t *testing.T) {
+	t.Parallel()
+
+	for name, c := range unhappyFixtures {
+		c := c
+		t.Run(name, func(t *testing.T) {
+			t.Parallel()
+
+			s := NewScanner(strings.NewReader(c))
+			for {
+				_, err := s.Scan()
+				if err == io.EOF {
+					t.Fatal("reached end of string without an error")
+				}
+
+				if err != nil {
+					t.Logf("got error: %v", err)
+					return
+				}
+			}
+		})
+	}
+}
+
+var scanToken Token //nolint: gochecknoglobals // trying to avoid return value being optimized away
+
+func BenchmarkScan(b *testing.B) {
+	for _, bm := range lexFixtures {
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				b.Fatal(err)
+			}
+			defer file.Close()
+
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				if _, err := file.Seek(0, 0); err != nil {
+					b.Fatal(err)
+				}
+
+				s := NewScanner(file)
+
+				for {
+					tok, err := s.Scan()
+					if err == io.EOF {
+						break
+					}
+					if err != nil {
+						b.Fatal(err)
+					}
+					scanToken = tok
+				}
+			}
+		})
+	}
+}

diff --git a/util.go b/util.go
index d2e84ade..186afeb5 100644
--- a/util.go
+++ b/util.go
@@ -35,6 +35,8 @@ func isEOL(s string) bool {
 	return strings.HasSuffix(s, "\n")
 }
 
+func isSpecialChar(s string) bool { return s == "{" || s == "}" || s == ";" }
+
 func repr(s string) string {
 	q := fmt.Sprintf("%q", s)
 	for _, char := range s {