Skip to content

Commit

Permalink
Switch to github.com/dlclark/regexp2.
Browse files Browse the repository at this point in the history
This makes translating Pygments lexers much much simpler (and possible).
  • Loading branch information
alecthomas committed Sep 18, 2017
1 parent 86bda70 commit a10fd0a
Show file tree
Hide file tree
Showing 139 changed files with 9,588 additions and 875 deletions.
19 changes: 19 additions & 0 deletions COPYING
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright (C) 2017 Alec Thomas

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
31 changes: 17 additions & 14 deletions _tools/pygments2chroma.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
import os
import functools
import importlib
import json
import os
import re
import sys
import types
import json

import pystache
from pygments import lexer as pygments_lexer
Expand Down Expand Up @@ -47,13 +48,11 @@
'''


def go_regex(s):
    """Render a Python regex pattern as a Go string literal (see go_string)."""
    return go_string(s)


def go_string(s):
    """Render a Python string as a Go string literal.

    Prefers a raw back-quoted literal; falls back to a JSON-escaped
    (double-quoted) form when the string itself contains a backtick.
    Emits a warning for lookbehind constructs, which need manual review.
    """
    # TODO: Search for substring ranges and convert them to character classes.
    # This seems to commonly occur with Unicode character classes, which
    # presumably aren't supported by Python's regex engine.
    if '(?<' in s:
        warning('perl regex found in %r' % s)
    return '`' + s + '`' if '`' not in s else json.dumps(s)
Expand Down Expand Up @@ -105,6 +104,8 @@ def resolve_emitter(emitter):


def process_state_action(action):
if isinstance(action, tuple):
return functools.reduce(lambda a, b: a + b, (process_state_action(a) for a in action))
if action.startswith('#'):
action = action[1:]
if action== 'pop':
Expand All @@ -119,7 +120,7 @@ def process_state_action(action):
raise ValueError('unsupported action %r' % (action,))
else:
action = 'Push("%s")' % action
return action
return (action,)


def translate_rules(rules):
Expand All @@ -128,16 +129,18 @@ def translate_rules(rules):
if isinstance(rule, tuple):
regex = rule[0]
if isinstance(regex, str):
regex = go_string(regex)
regex = go_regex(regex)
elif isinstance(regex, pygments_lexer.words):
regex = go_string('`%s(?:%s)%s`' % (regex.prefix, '|'.join(regex.words), regex.suffix))
regex = 'Words(%s, %s, %s)' % (go_string(regex.prefix),
go_string(regex.suffix),
', '.join(go_string(w) for w in regex.words))
else:
raise ValueError('expected regex string but got %r' % regex)
emitter = resolve_emitter(rule[1])
if len(rule) == 2:
modifier = 'nil'
elif type(rule[2]) is str:
modifier = process_state_action(rule[2])
modifier = process_state_action(rule[2])[0]
elif isinstance(rule[2], pygments_lexer.combined):
modifier = 'Combined("%s")' % '", "'.join(rule[2])
elif type(rule[2]) is tuple:
Expand All @@ -148,7 +151,7 @@ def translate_rules(rules):
elif isinstance(rule, pygments_lexer.include):
out.append('Include("{}")'.format(rule))
elif isinstance(rule, pygments_lexer.default):
out.append('Default({})'.format(process_state_action(rule.state)))
out.append('Default({})'.format(', '.join(process_state_action(rule.state))))
else:
raise ValueError('unsupported rule %r' % (rule,))
return out
Expand Down
6 changes: 3 additions & 3 deletions cmd/chroma/main.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package main

import (
"bufio"
"fmt"
"io"
"io/ioutil"
Expand Down Expand Up @@ -54,8 +53,9 @@ func main() {
}()
defer pprof.StopCPUProfile()
}
w := bufio.NewWriterSize(os.Stdout, 16384)
defer w.Flush()
// w := bufio.NewWriterSize(os.Stdout, 16384)
w := os.Stdout
// defer w.Flush()
if *formatterFlag == "html" {
options := []html.Option{}
if *htmlPrefixFlag != "" {
Expand Down
72 changes: 54 additions & 18 deletions lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"regexp"
"strings"
"sync"

"github.com/dlclark/regexp2"
)
Expand Down Expand Up @@ -54,6 +55,9 @@ type Config struct {

// If given and greater than 0, expand tabs in the input.
// TabSize int

// Whether to track how long rules take to process.
TimeRules bool
}

// Token output to formatter.
Expand Down Expand Up @@ -153,11 +157,11 @@ func UsingSelf(state string) Emitter {
}

// Words creates a regex that matches any of the given literal words,
// bracketed by the given prefix and suffix patterns. The input slice is
// not modified.
func Words(prefix, suffix string, words ...string) string {
	// Quote into a copy so a caller-supplied slice is never mutated.
	quoted := make([]string, len(words))
	for i, word := range words {
		quoted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(quoted, `|`) + `)` + suffix
}

// Rules maps from state to a sequence of Rules.
Expand Down Expand Up @@ -186,7 +190,6 @@ func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
compiledRules := map[string][]CompiledRule{}
for state, rules := range rules {
for _, rule := range rules {
crule := CompiledRule{Rule: rule}
flags := ""
if !config.NotMultiline {
flags += "m"
Expand All @@ -197,12 +200,7 @@ func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
if config.DotAll {
flags += "s"
}
re, err := regexp2.Compile("^(?"+flags+")(?:"+rule.Pattern+")", 0)
if err != nil {
return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
}
crule.Regexp = re
compiledRules[state] = append(compiledRules[state], crule)
compiledRules[state] = append(compiledRules[state], CompiledRule{Rule: rule, flags: flags})
}
}
return &RegexLexer{
Expand All @@ -215,25 +213,39 @@ func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
// CompiledRule is a Rule together with its compiled regex and the regex
// flags derived from the lexer Config. Regexp is nil until compilation,
// which is deferred to first use (see maybeCompile).
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps lexer state names to their compiled rules.
type CompiledRules map[string][]CompiledRule

type LexerState struct {
Text string
Text []rune
Pos int
Rules map[string][]CompiledRule
Stack []string
State string
Rule int
// Group matches.
Groups []string
// Custum context for mutators.
MutatorContext map[interface{}]interface{}
}

// Set stores a value in the lexer's mutator context under key.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get returns the value previously stored under key via Set, or nil if
// no value has been stored.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// RegexLexer is a regex-based Lexer implementation.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32

	mu       sync.Mutex // guards compiled and lazy compilation of rules
	compiled bool
	rules    map[string][]CompiledRule
}

// SetAnalyser sets the analyser function used to perform content inspection.
Expand All @@ -253,21 +265,45 @@ func (r *RegexLexer) Config() *Config {
return r.config
}

// maybeCompile lazily compiles the lexer's rule regexes on first use.
// Compilation is deferred until the lexer is used to avoid significant
// init() time costs. Safe for concurrent use: the mutex guards both the
// compiled flag and the rules map.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0)
				if err != nil {
					// Include the pattern and state so a bad rule is locatable.
					return fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
				}
			}
			// Write back so the compiled regexp is retained in the map's slice.
			rules[i] = rule
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
r.maybeCompile()
if options == nil {
options = defaultOptions
}
state := &LexerState{
Text: text,
Stack: []string{options.State},
Rules: r.rules,
Text: []rune(text),
Stack: []string{options.State},
Rules: r.rules,
MutatorContext: map[interface{}]interface{}{},
}
for state.Pos < len(text) && len(state.Stack) > 0 {
for state.Pos < len(state.Text) && len(state.Stack) > 0 {
state.State = state.Stack[len(state.Stack)-1]
ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
// No match.
if groups == nil {
out(&Token{Error, state.Text[state.Pos : state.Pos+1]})
out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
state.Pos++
continue
}
Expand All @@ -294,9 +330,9 @@ func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, err
return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
}

func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []string) {
func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
for i, rule := range rules {
match, err := rule.Regexp.FindStringMatch(text)
match, err := rule.Regexp.FindRunesMatch(text)
if match != nil && err == nil {
groups := []string{}
for _, g := range match.Groups() {
Expand Down
37 changes: 37 additions & 0 deletions lexers/abnf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package lexers

import (
. "github.com/alecthomas/chroma" // nolint
)

// Abnf lexer for ABNF (Augmented Backus-Naur Form) grammars.
var Abnf = Register(MustNewLexer(
	&Config{
		Name:      "ABNF",
		Aliases:   []string{"abnf"},
		Filenames: []string{"*.abnf"},
		MimeTypes: []string{"text/x-abnf"},
	},
	Rules{
		"root": {
			// Comments run from ";" to end of line.
			{`;.*$`, CommentSingle, nil},
			{`(%[si])?"[^"]*"`, Literal, nil},
			// Terminal values: %b binary, %d decimal, %x hex — as ranges
			// ("-") or concatenated sequences (".").
			{`%b[01]+\-[01]+\b`, Literal, nil},
			{`%b[01]+(\.[01]+)*\b`, Literal, nil},
			{`%d[0-9]+\-[0-9]+\b`, Literal, nil},
			{`%d[0-9]+(\.[0-9]+)*\b`, Literal, nil},
			{`%x[0-9a-fA-F]+\-[0-9a-fA-F]+\b`, Literal, nil},
			{`%x[0-9a-fA-F]+(\.[0-9a-fA-F]+)*\b`, Literal, nil},
			// Repetition operators: n*m, n*, n, *.
			{`\b[0-9]+\*[0-9]+`, Operator, nil},
			{`\b[0-9]+\*`, Operator, nil},
			{`\b[0-9]+`, Operator, nil},
			{`\*`, Operator, nil},
			// Core rule names defined by the ABNF specification.
			{Words(``, `\b`, `ALPHA`, `BIT`, `CHAR`, `CR`, `CRLF`, `CTL`, `DIGIT`, `DQUOTE`, `HEXDIG`, `HTAB`, `LF`, `LWSP`, `OCTET`, `SP`, `VCHAR`, `WSP`), Keyword, nil},
			{`[a-zA-Z][a-zA-Z0-9-]+\b`, NameClass, nil},
			{`(=/|=|/)`, Operator, nil},
			{`[\[\]()]`, Punctuation, nil},
			{`\s+`, Text, nil},
			{`.`, Text, nil},
		},
	},
))
38 changes: 38 additions & 0 deletions lexers/actionscript.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package lexers

import (
. "github.com/alecthomas/chroma" // nolint
)

// Actionscript lexer.
var Actionscript = Register(MustNewLexer(
	&Config{
		Name:         "ActionScript",
		Aliases:      []string{"as", "actionscript"},
		Filenames:    []string{"*.as"},
		MimeTypes:    []string{"application/x-actionscript", "text/x-actionscript", "text/actionscript"},
		NotMultiline: true,
		DotAll:       true,
	},
	Rules{
		"root": {
			{`\s+`, Text, nil},
			{`//.*?\n`, CommentSingle, nil},
			{`/\*.*?\*/`, CommentMultiline, nil},
			// Regex literals with optional g/i/m flags.
			{`/(\\\\|\\/|[^/\n])*/[gim]*`, LiteralStringRegex, nil},
			{`[~^*!%&<>|+=:;,/?\\-]+`, Operator, nil},
			{`[{}\[\]();.]+`, Punctuation, nil},
			{Words(``, `\b`, `case`, `default`, `for`, `each`, `in`, `while`, `do`, `break`, `return`, `continue`, `if`, `else`, `throw`, `try`, `catch`, `var`, `with`, `new`, `typeof`, `arguments`, `instanceof`, `this`, `switch`), Keyword, nil},
			{Words(``, `\b`, `class`, `public`, `final`, `internal`, `native`, `override`, `private`, `protected`, `static`, `import`, `extends`, `implements`, `interface`, `intrinsic`, `return`, `super`, `dynamic`, `function`, `const`, `get`, `namespace`, `package`, `set`), KeywordDeclaration, nil},
			{`(true|false|null|NaN|Infinity|-Infinity|undefined|Void)\b`, KeywordConstant, nil},
			// Built-in class/type names (word list carried over from the
			// Pygments ActionScript lexer).
			{Words(``, `\b`, `Accessibility`, `AccessibilityProperties`, `ActionScriptVersion`, `ActivityEvent`, `AntiAliasType`, `ApplicationDomain`, `AsBroadcaster`, `Array`, `AsyncErrorEvent`, `AVM1Movie`, `BevelFilter`, `Bitmap`, `BitmapData`, `BitmapDataChannel`, `BitmapFilter`, `BitmapFilterQuality`, `BitmapFilterType`, `BlendMode`, `BlurFilter`, `Boolean`, `ByteArray`, `Camera`, `Capabilities`, `CapsStyle`, `Class`, `Color`, `ColorMatrixFilter`, `ColorTransform`, `ContextMenu`, `ContextMenuBuiltInItems`, `ContextMenuEvent`, `ContextMenuItem`, `ConvultionFilter`, `CSMSettings`, `DataEvent`, `Date`, `DefinitionError`, `DeleteObjectSample`, `Dictionary`, `DisplacmentMapFilter`, `DisplayObject`, `DisplacmentMapFilterMode`, `DisplayObjectContainer`, `DropShadowFilter`, `Endian`, `EOFError`, `Error`, `ErrorEvent`, `EvalError`, `Event`, `EventDispatcher`, `EventPhase`, `ExternalInterface`, `FileFilter`, `FileReference`, `FileReferenceList`, `FocusDirection`, `FocusEvent`, `Font`, `FontStyle`, `FontType`, `FrameLabel`, `FullScreenEvent`, `Function`, `GlowFilter`, `GradientBevelFilter`, `GradientGlowFilter`, `GradientType`, `Graphics`, `GridFitType`, `HTTPStatusEvent`, `IBitmapDrawable`, `ID3Info`, `IDataInput`, `IDataOutput`, `IDynamicPropertyOutputIDynamicPropertyWriter`, `IEventDispatcher`, `IExternalizable`, `IllegalOperationError`, `IME`, `IMEConversionMode`, `IMEEvent`, `int`, `InteractiveObject`, `InterpolationMethod`, `InvalidSWFError`, `InvokeEvent`, `IOError`, `IOErrorEvent`, `JointStyle`, `Key`, `Keyboard`, `KeyboardEvent`, `KeyLocation`, `LineScaleMode`, `Loader`, `LoaderContext`, `LoaderInfo`, `LoadVars`, `LocalConnection`, `Locale`, `Math`, `Matrix`, `MemoryError`, `Microphone`, `MorphShape`, `Mouse`, `MouseEvent`, `MovieClip`, `MovieClipLoader`, `Namespace`, `NetConnection`, `NetStatusEvent`, `NetStream`, `NewObjectSample`, `Number`, `Object`, `ObjectEncoding`, `PixelSnapping`, `Point`, `PrintJob`, `PrintJobOptions`, `PrintJobOrientation`, `ProgressEvent`,
				`Proxy`, `QName`, `RangeError`, `Rectangle`, `ReferenceError`, `RegExp`, `Responder`, `Sample`, `Scene`, `ScriptTimeoutError`, `Security`, `SecurityDomain`, `SecurityError`, `SecurityErrorEvent`, `SecurityPanel`, `Selection`, `Shape`, `SharedObject`, `SharedObjectFlushStatus`, `SimpleButton`, `Socket`, `Sound`, `SoundChannel`, `SoundLoaderContext`, `SoundMixer`, `SoundTransform`, `SpreadMethod`, `Sprite`, `StackFrame`, `StackOverflowError`, `Stage`, `StageAlign`, `StageDisplayState`, `StageQuality`, `StageScaleMode`, `StaticText`, `StatusEvent`, `String`, `StyleSheet`, `SWFVersion`, `SyncEvent`, `SyntaxError`, `System`, `TextColorType`, `TextField`, `TextFieldAutoSize`, `TextFieldType`, `TextFormat`, `TextFormatAlign`, `TextLineMetrics`, `TextRenderer`, `TextSnapshot`, `Timer`, `TimerEvent`, `Transform`, `TypeError`, `uint`, `URIError`, `URLLoader`, `URLLoaderDataFormat`, `URLRequest`, `URLRequestHeader`, `URLRequestMethod`, `URLStream`, `URLVariabeles`, `VerifyError`, `Video`, `XML`, `XMLDocument`, `XMLList`, `XMLNode`, `XMLNodeType`, `XMLSocket`, `XMLUI`), NameBuiltin, nil},
			{Words(``, `\b`, `decodeURI`, `decodeURIComponent`, `encodeURI`, `escape`, `eval`, `isFinite`, `isNaN`, `isXMLName`, `clearInterval`, `fscommand`, `getTimer`, `getURL`, `getVersion`, `parseFloat`, `parseInt`, `setInterval`, `trace`, `updateAfterEvent`, `unescape`), NameFunction, nil},
			{`[$a-zA-Z_]\w*`, NameOther, nil},
			{`[0-9][0-9]*\.[0-9]+([eE][0-9]+)?[fd]?`, LiteralNumberFloat, nil},
			{`0x[0-9a-f]+`, LiteralNumberHex, nil},
			{`[0-9]+`, LiteralNumberInteger, nil},
			{`"(\\\\|\\"|[^"])*"`, LiteralStringDouble, nil},
			{`'(\\\\|\\'|[^'])*'`, LiteralStringSingle, nil},
		},
	},
))
Loading

0 comments on commit a10fd0a

Please sign in to comment.