diff --git a/lex_test.go b/lex_test.go
index 1b9504d1..8880b270 100644
--- a/lex_test.go
+++ b/lex_test.go
@@ -252,22 +252,53 @@ func TestLex(t *testing.T) {
 	}
 }
 
-func TestLex_unhappy(t *testing.T) {
-	t.Parallel()
+var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimized away
+
+func BenchmarkLex(b *testing.B) {
+	var t NgxToken
 
-	testcases := map[string]string{
-		"unbalanced open brance":                  `http {{}`,
-		"unbalanced closing brace":                `http {}}`,
-		"multiple open braces":                    `http {{server {}}`,
-		"multiple closing braces after block end": `http {server {}}}`,
-		"multiple semicolons":                     `server { listen 80;; }`,
-		"semicolon afer closing brace":            `server { listen 80; };`,
-		"open brace after semicolon":              `server { listen 80; {}`,
-		"braces with no directive":                `http{}{}`,
-		"missing final brace":                     `http{`,
+	for _, bm := range lexFixtures {
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				b.Fatal(err)
+			}
+			defer file.Close()
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				if _, err := file.Seek(0, 0); err != nil {
+					b.Fatal(err)
+				}
+
+				for tok := range Lex(file) {
+					t = tok
+				}
+			}
+		})
 	}
 
-	for name, c := range testcases {
+	lexToken = t
+}
+
+//nolint:gochecknoglobals
+var unhappyFixtures = map[string]string{
+	"unbalanced open brace":                   `http {{}`,
+	"unbalanced closing brace":                `http {}}`,
+	"multiple open braces":                    `http {{server {}}`,
+	"multiple closing braces after block end": `http {server {}}}`,
+	"multiple semicolons":                     `server { listen 80;; }`,
+	"semicolon after closing brace":           `server { listen 80; };`,
+	"open brace after semicolon":              `server { listen 80; {}`,
+	"braces with no directive":                `http{}{}`,
+	"missing final brace":                     `http{`,
+}
+
+func TestLex_unhappy(t *testing.T) {
+	t.Parallel()
+
+	for name, c := range unhappyFixtures {
 		c := c
 		t.Run(name, func(t *testing.T) {
 			t.Parallel()
diff --git a/scanner.go b/scanner.go
new file mode 100644
index 00000000..cae32bf0
--- /dev/null
+++ b/scanner.go
@@ -0,0 +1,230 @@
+package crossplane
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// Token is a lexical token of the NGINX configuration syntax.
+type Token struct {
+	// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
+	// sequence in order to support defining directives in modules other than the core NGINX module set.
+	Text string
+	// Line is the source starting line number of the token within a file.
+	Line int
+	// IsQuoted signifies if the token is wrapped by quotes (", '). Quotes are not usually necessary in an NGINX
+	// configuration and mostly serve to help make the config less ambiguous.
+	IsQuoted bool
+}
+
+type scannerError struct {
+	msg  string
+	line int
+}
+
+func (e *scannerError) Error() string { return e.msg }
+func (e *scannerError) Line() int     { return e.line }
+
+func newScannerErrf(line int, format string, a ...any) *scannerError {
+	return &scannerError{line: line, msg: fmt.Sprintf(format, a...)}
+}
+
+// LineNumber reports the line on which the error occurred by finding the first error in
+// err's chain that returns a line number. Otherwise, it returns 0, false.
+//
+// An error type should provide a Line() int method to return a line number.
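+//
+// A hypothetical usage sketch (the malformed config literal is illustrative only):
+//
+//	s := NewScanner(strings.NewReader(`http {`))
+//	for {
+//		if _, err := s.Scan(); err != nil {
+//			if line, ok := LineNumber(err); ok {
+//				fmt.Printf("error on line %d: %v\n", line, err)
+//			}
+//			break
+//		}
+//	}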
+func LineNumber(err error) (int, bool) {
+	var e interface{ Line() int }
+	if !errors.As(err, &e) {
+		return 0, false
+	}
+
+	return e.Line(), true
+}
+
+// Scanner provides an interface for tokenizing an NGINX configuration. Successive calls to the Scan method will step
+// through the tokens of an NGINX configuration.
+//
+// Scanning stops unrecoverably at EOF, the first I/O error, or an unexpected token.
+//
+// Use NewScanner to construct a Scanner.
+type Scanner struct {
+	scanner           *bufio.Scanner
+	lineno            int
+	tokenStartLine    int
+	tokenDepth        int
+	repeatSpecialChar bool // only '}' can be repeated
+	prev              string
+}
+
+// NewScanner returns a new Scanner to read from r.
+func NewScanner(r io.Reader) *Scanner {
+	s := &Scanner{
+		scanner:           bufio.NewScanner(r),
+		lineno:            1,
+		tokenStartLine:    1,
+		tokenDepth:        0,
+		repeatSpecialChar: false,
+	}
+
+	s.scanner.Split(bufio.ScanRunes)
+
+	return s
+}
+
+// Scan reads the next token from the source and returns it. It returns io.EOF at the end of the source. Scanner errors
+// are returned when encountered.
+func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
+	var tok strings.Builder
+
+	lexState := skipSpace
+	newToken := false
+	readNext := true
+	esc := false
+
+	var r, quote string
+
+	for {
+		switch {
+		case s.prev != "":
+			// consume the rune pushed back by the previous call
+			r = s.prev
+			s.prev = ""
+		case readNext:
+			if !s.scanner.Scan() {
+				if tok.Len() > 0 {
+					return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: lexState == inQuote}, nil
+				}
+
+				if s.tokenDepth > 0 {
+					return Token{}, &scannerError{line: s.tokenStartLine, msg: "unexpected end of file, expecting }"}
+				}
+
+				return Token{}, io.EOF
+			}
+
+			nextRune := s.scanner.Text()
+			r = nextRune
+			if isEOL(r) {
+				s.lineno++
+			}
+		default:
+			readNext = true
+		}
+
+		// skip CRs
+		if r == "\r" || r == "\\\r" {
+			continue
+		}
+
+		if r == "\\" && !esc {
+			esc = true
+			continue
+		}
+
+		if esc {
+			esc = false
+			r = "\\" + r
+		}
+
+		switch lexState {
+		case skipSpace:
+			if !isSpace(r) {
+				lexState = inWord
+				newToken = true
+				readNext = false // re-evaluate r as the start of the token
+				s.tokenStartLine = s.lineno
+			}
+			continue
+
+		case inWord:
+			if newToken {
+				newToken = false
+				if r == "#" {
+					tok.WriteString(r)
+					lexState = inComment
+					s.tokenStartLine = s.lineno
+					continue
+				}
+			}
+
+			if isSpace(r) {
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+
+			// parameter expansion syntax (ex: "${var[@]}")
+			if tok.Len() > 0 && strings.HasSuffix(tok.String(), "$") && r == "{" {
+				tok.WriteString(r)
+				lexState = inVar
+				s.repeatSpecialChar = false
+				continue
+			}
+
+			// add entire quoted string to the token buffer
+			if r == `"` || r == "'" {
+				if tok.Len() > 0 {
+					// if a quote is inside a token, treat it like any other char
+					tok.WriteString(r)
+				} else {
+					quote = r
+					lexState = inQuote
+					s.tokenStartLine = s.lineno
+				}
+				s.repeatSpecialChar = false
+				continue
+			}
+
+			// special characters treated as full tokens
+			if isSpecialChar(r) {
+				if tok.Len() > 0 {
+					s.prev = r
+					return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+				}
+
+				// only } can be repeated
+				if s.repeatSpecialChar && r != "}" {
+					return Token{}, newScannerErrf(s.tokenStartLine, "unexpected %q", r)
+				}
+
+				s.repeatSpecialChar = true
+				if r == "{" {
+					s.tokenDepth++
+				}
+
+				if r == "}" {
+					s.tokenDepth--
+					if s.tokenDepth < 0 {
+						return Token{}, &scannerError{line: s.tokenStartLine, msg: `unexpected "}"`}
+					}
+				}
+
+				tok.WriteString(r)
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+
+			s.repeatSpecialChar = false
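+			// any other rune inside a word accumulates into the token buffer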
+			tok.WriteString(r)
+		case inComment:
+			if isEOL(r) {
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+			tok.WriteString(r)
+		case inVar:
+			tok.WriteString(r)
+			if r != "}" && !isSpace(r) {
+				continue
+			}
+			lexState = inWord
+		case inQuote:
+			if r == quote {
+				return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: true}, nil
+			}
+			if r == "\\"+quote {
+				r = quote
+			}
+			tok.WriteString(r)
+		}
+	}
+}
diff --git a/scanner_test.go b/scanner_test.go
new file mode 100644
index 00000000..30d06bbe
--- /dev/null
+++ b/scanner_test.go
@@ -0,0 +1,110 @@
+package crossplane
+
+import (
+	"io"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestScanner(t *testing.T) {
+	t.Parallel()
+
+	for _, f := range lexFixtures {
+		f := f
+		t.Run(f.name, func(t *testing.T) {
+			t.Parallel()
+
+			path := getTestConfigPath(f.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer file.Close()
+
+			s := NewScanner(file)
+
+			i := 0
+			for {
+				got, err := s.Scan()
+				if err == io.EOF {
+					if i < len(f.tokens) {
+						t.Fatal("unexpected end of file")
+					}
+					return
+				}
+
+				if err != nil {
+					t.Fatalf("unexpected error: %v", err)
+				}
+
+				want := f.tokens[i]
+				require.Equal(t, want.value, got.Text)
+				require.Equal(t, want.line, got.Line)
+				i++
+			}
+		})
+	}
+}
+
+func TestScanner_unhappy(t *testing.T) {
+	t.Parallel()
+
+	for name, c := range unhappyFixtures {
+		c := c
+		t.Run(name, func(t *testing.T) {
+			t.Parallel()
+
+			s := NewScanner(strings.NewReader(c))
+			for {
+				_, err := s.Scan()
+				if err == io.EOF {
+					t.Fatal("reached end of input without an error")
+				}
+
+				if err != nil {
+					t.Logf("got error: %v", err)
+					return
+				}
+			}
+		})
+	}
+}
+
+var scanToken Token //nolint: gochecknoglobals // trying to avoid return value being optimized away
+
+func BenchmarkScan(b *testing.B) {
+	for _, bm := range lexFixtures {
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				b.Fatal(err)
+			}
+			defer file.Close()
+
+			b.ResetTimer()
+
+			for i := 0; i < b.N; i++ {
+				if _, err := file.Seek(0, 0); err != nil {
+					b.Fatal(err)
+				}
+
+				s := NewScanner(file)
+
+				for {
+					tok, err := s.Scan()
+					if err == io.EOF {
+						break
+					}
+					if err != nil {
+						b.Fatal(err)
+					}
+					scanToken = tok
+				}
+			}
+		})
+	}
+}
diff --git a/util.go b/util.go
index d2e84ade..186afeb5 100644
--- a/util.go
+++ b/util.go
@@ -35,6 +35,8 @@ func isEOL(s string) bool {
 	return strings.HasSuffix(s, "\n")
 }
 
+func isSpecialChar(s string) bool { return s == "{" || s == "}" || s == ";" }
+
 func repr(s string) string {
 	q := fmt.Sprintf("%q", s)
 	for _, char := range s {
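
As a usage sketch for reviewers, the loop below shows how the new Scanner is meant to be driven from client code, mirroring BenchmarkScan above. This is illustrative only: the module import path and the sample config literal are assumptions, not part of this change.

package main

import (
	"fmt"
	"io"
	"log"
	"strings"

	crossplane "github.com/nginxinc/nginx-go-crossplane" // assumed module path
)

func main() {
	// Scan a small in-memory config; any io.Reader works (the tests above use files).
	s := crossplane.NewScanner(strings.NewReader("user nginx;\nworker_processes auto;\n"))
	for {
		tok, err := s.Scan()
		if err == io.EOF {
			break // clean end of input
		}
		if err != nil {
			log.Fatal(err) // scanner errors carry a line number; see LineNumber
		}
		fmt.Printf("line %d: %q\n", tok.Line, tok.Text)
	}
}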