Skip to content

Commit

Permalink
Switch to github.com/dlclark/regexp2.
Browse files Browse the repository at this point in the history
This makes translating Pygments lexers much much simpler (and possible).
  • Loading branch information
alecthomas committed Sep 18, 2017
1 parent 86bda70 commit a10fd0a
Show file tree
Hide file tree
Showing 139 changed files with 9,588 additions and 875 deletions.
19 changes: 19 additions & 0 deletions COPYING
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright (C) 2017 Alec Thomas

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
31 changes: 17 additions & 14 deletions _tools/pygments2chroma.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
import os
import functools
import importlib
import json
import os
import re
import sys
import types
import json

import pystache
from pygments import lexer as pygments_lexer
Expand Down Expand Up @@ -47,13 +48,11 @@
'''


def go_regex(s):
    """Render a Python regex pattern as a Go string literal (see go_string)."""
    return go_string(s)


def go_string(s):
    """Render a Python string as a Go string literal.

    Prefers a raw back-quoted literal; falls back to a JSON-escaped
    (double-quoted) form when the string itself contains a backtick.
    Emits a warning for lookbehind constructs, which need manual review.
    """
    # TODO: Search for substring ranges and convert them to character classes.
    # This seems to commonly occur with Unicode character classes, which
    # presumably aren't supported by Python's regex engine.
    if '(?<' in s:
        warning('perl regex found in %r' % s)
    return '`' + s + '`' if '`' not in s else json.dumps(s)
Expand Down Expand Up @@ -105,6 +104,8 @@ def resolve_emitter(emitter):


def process_state_action(action):
if isinstance(action, tuple):
return functools.reduce(lambda a, b: a + b, (process_state_action(a) for a in action))
if action.startswith('#'):
action = action[1:]
if action== 'pop':
Expand All @@ -119,7 +120,7 @@ def process_state_action(action):
raise ValueError('unsupported action %r' % (action,))
else:
action = 'Push("%s")' % action
return action
return (action,)


def translate_rules(rules):
Expand All @@ -128,16 +129,18 @@ def translate_rules(rules):
if isinstance(rule, tuple):
regex = rule[0]
if isinstance(regex, str):
regex = go_string(regex)
regex = go_regex(regex)
elif isinstance(regex, pygments_lexer.words):
regex = go_string('`%s(?:%s)%s`' % (regex.prefix, '|'.join(regex.words), regex.suffix))
regex = 'Words(%s, %s, %s)' % (go_string(regex.prefix),
go_string(regex.suffix),
', '.join(go_string(w) for w in regex.words))
else:
raise ValueError('expected regex string but got %r' % regex)
emitter = resolve_emitter(rule[1])
if len(rule) == 2:
modifier = 'nil'
elif type(rule[2]) is str:
modifier = process_state_action(rule[2])
modifier = process_state_action(rule[2])[0]
elif isinstance(rule[2], pygments_lexer.combined):
modifier = 'Combined("%s")' % '", "'.join(rule[2])
elif type(rule[2]) is tuple:
Expand All @@ -148,7 +151,7 @@ def translate_rules(rules):
elif isinstance(rule, pygments_lexer.include):
out.append('Include("{}")'.format(rule))
elif isinstance(rule, pygments_lexer.default):
out.append('Default({})'.format(process_state_action(rule.state)))
out.append('Default({})'.format(', '.join(process_state_action(rule.state))))
else:
raise ValueError('unsupported rule %r' % (rule,))
return out
Expand Down
6 changes: 3 additions & 3 deletions cmd/chroma/main.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package main

import (
"bufio"
"fmt"
"io"
"io/ioutil"
Expand Down Expand Up @@ -54,8 +53,9 @@ func main() {
}()
defer pprof.StopCPUProfile()
}
w := bufio.NewWriterSize(os.Stdout, 16384)
defer w.Flush()
// w := bufio.NewWriterSize(os.Stdout, 16384)
w := os.Stdout
// defer w.Flush()
if *formatterFlag == "html" {
options := []html.Option{}
if *htmlPrefixFlag != "" {
Expand Down
72 changes: 54 additions & 18 deletions lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"regexp"
"strings"
"sync"

"github.com/dlclark/regexp2"
)
Expand Down Expand Up @@ -54,6 +55,9 @@ type Config struct {

// If given and greater than 0, expand tabs in the input.
// TabSize int

// Whether to track how long rules take to process.
TimeRules bool
}

// Token output to formatter.
Expand Down Expand Up @@ -153,11 +157,11 @@ func UsingSelf(state string) Emitter {
}

// Words creates a regex that matches any of the given literal words,
// bracketed by the given prefix and suffix patterns. The input slice is
// not modified.
func Words(prefix, suffix string, words ...string) string {
	// Quote into a copy so a caller-supplied slice is never mutated.
	quoted := make([]string, len(words))
	for i, word := range words {
		quoted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(quoted, `|`) + `)` + suffix
}

// Rules maps from state to a sequence of Rules.
Expand Down Expand Up @@ -186,7 +190,6 @@ func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
compiledRules := map[string][]CompiledRule{}
for state, rules := range rules {
for _, rule := range rules {
crule := CompiledRule{Rule: rule}
flags := ""
if !config.NotMultiline {
flags += "m"
Expand All @@ -197,12 +200,7 @@ func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
if config.DotAll {
flags += "s"
}
re, err := regexp2.Compile("^(?"+flags+")(?:"+rule.Pattern+")", 0)
if err != nil {
return nil, fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
}
crule.Regexp = re
compiledRules[state] = append(compiledRules[state], crule)
compiledRules[state] = append(compiledRules[state], CompiledRule{Rule: rule, flags: flags})
}
}
return &RegexLexer{
Expand All @@ -215,25 +213,39 @@ func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
// CompiledRule is a Rule together with its compiled regex and the regex
// flags derived from the lexer Config. Regexp is nil until compilation,
// which is deferred to first use (see maybeCompile).
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps lexer state names to their compiled rules.
type CompiledRules map[string][]CompiledRule

type LexerState struct {
Text string
Text []rune
Pos int
Rules map[string][]CompiledRule
Stack []string
State string
Rule int
// Group matches.
Groups []string
// Custum context for mutators.
MutatorContext map[interface{}]interface{}
}

// Set stores a value in the lexer's mutator context under key.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get returns the value previously stored under key via Set, or nil if
// no value has been stored.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// RegexLexer is a regex-based Lexer implementation.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32

	mu       sync.Mutex // guards compiled and lazy compilation of rules
	compiled bool
	rules    map[string][]CompiledRule
}

// SetAnalyser sets the analyser function used to perform content inspection.
Expand All @@ -253,21 +265,45 @@ func (r *RegexLexer) Config() *Config {
return r.config
}

// maybeCompile lazily compiles the lexer's rule regexes on first use.
// Compilation is deferred until the lexer is used to avoid significant
// init() time costs. Safe for concurrent use: the mutex guards both the
// compiled flag and the rules map.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0)
				if err != nil {
					// Include the pattern and state so a bad rule is locatable.
					return fmt.Errorf("invalid regex %q for state %q: %s", rule.Pattern, state, err)
				}
			}
			// Write back so the compiled regexp is retained in the map's slice.
			rules[i] = rule
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
r.maybeCompile()
if options == nil {
options = defaultOptions
}
state := &LexerState{
Text: text,
Stack: []string{options.State},
Rules: r.rules,
Text: []rune(text),
Stack: []string{options.State},
Rules: r.rules,
MutatorContext: map[interface{}]interface{}{},
}
for state.Pos < len(text) && len(state.Stack) > 0 {
for state.Pos < len(state.Text) && len(state.Stack) > 0 {
state.State = state.Stack[len(state.Stack)-1]
ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
// No match.
if groups == nil {
out(&Token{Error, state.Text[state.Pos : state.Pos+1]})
out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
state.Pos++
continue
}
Expand All @@ -294,9 +330,9 @@ func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, err
return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
}

func matchRules(text string, rules []CompiledRule) (int, CompiledRule, []string) {
func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
for i, rule := range rules {
match, err := rule.Regexp.FindStringMatch(text)
match, err := rule.Regexp.FindRunesMatch(text)
if match != nil && err == nil {
groups := []string{}
for _, g := range match.Groups() {
Expand Down
37 changes: 37 additions & 0 deletions lexers/abnf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package lexers

import (
. "github.com/alecthomas/chroma" // nolint
)

// Abnf lexer for ABNF (Augmented Backus-Naur Form) grammars.
var Abnf = Register(MustNewLexer(
	&Config{
		Name:      "ABNF",
		Aliases:   []string{"abnf"},
		Filenames: []string{"*.abnf"},
		MimeTypes: []string{"text/x-abnf"},
	},
	Rules{
		"root": {
			// Comments run from ";" to end of line.
			{`;.*$`, CommentSingle, nil},
			{`(%[si])?"[^"]*"`, Literal, nil},
			// Terminal values: %b binary, %d decimal, %x hex — as ranges
			// ("-") or concatenated sequences (".").
			{`%b[01]+\-[01]+\b`, Literal, nil},
			{`%b[01]+(\.[01]+)*\b`, Literal, nil},
			{`%d[0-9]+\-[0-9]+\b`, Literal, nil},
			{`%d[0-9]+(\.[0-9]+)*\b`, Literal, nil},
			{`%x[0-9a-fA-F]+\-[0-9a-fA-F]+\b`, Literal, nil},
			{`%x[0-9a-fA-F]+(\.[0-9a-fA-F]+)*\b`, Literal, nil},
			// Repetition operators: n*m, n*, n, *.
			{`\b[0-9]+\*[0-9]+`, Operator, nil},
			{`\b[0-9]+\*`, Operator, nil},
			{`\b[0-9]+`, Operator, nil},
			{`\*`, Operator, nil},
			// Core rule names defined by the ABNF specification.
			{Words(``, `\b`, `ALPHA`, `BIT`, `CHAR`, `CR`, `CRLF`, `CTL`, `DIGIT`, `DQUOTE`, `HEXDIG`, `HTAB`, `LF`, `LWSP`, `OCTET`, `SP`, `VCHAR`, `WSP`), Keyword, nil},
			{`[a-zA-Z][a-zA-Z0-9-]+\b`, NameClass, nil},
			{`(=/|=|/)`, Operator, nil},
			{`[\[\]()]`, Punctuation, nil},
			{`\s+`, Text, nil},
			{`.`, Text, nil},
		},
	},
))
38 changes: 38 additions & 0 deletions lexers/actionscript.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package lexers

import (
. "github.com/alecthomas/chroma" // nolint
)

// Actionscript lexer.
var Actionscript = Register(MustNewLexer(
	&Config{
		Name:         "ActionScript",
		Aliases:      []string{"as", "actionscript"},
		Filenames:    []string{"*.as"},
		MimeTypes:    []string{"application/x-actionscript", "text/x-actionscript", "text/actionscript"},
		NotMultiline: true,
		DotAll:       true,
	},
	Rules{
		"root": {
			{`\s+`, Text, nil},
			{`//.*?\n`, CommentSingle, nil},
			{`/\*.*?\*/`, CommentMultiline, nil},
			// Regex literals with optional g/i/m flags.
			{`/(\\\\|\\/|[^/\n])*/[gim]*`, LiteralStringRegex, nil},
			{`[~^*!%&<>|+=:;,/?\\-]+`, Operator, nil},
			{`[{}\[\]();.]+`, Punctuation, nil},
			{Words(``, `\b`, `case`, `default`, `for`, `each`, `in`, `while`, `do`, `break`, `return`, `continue`, `if`, `else`, `throw`, `try`, `catch`, `var`, `with`, `new`, `typeof`, `arguments`, `instanceof`, `this`, `switch`), Keyword, nil},
			{Words(``, `\b`, `class`, `public`, `final`, `internal`, `native`, `override`, `private`, `protected`, `static`, `import`, `extends`, `implements`, `interface`, `intrinsic`, `return`, `super`, `dynamic`, `function`, `const`, `get`, `namespace`, `package`, `set`), KeywordDeclaration, nil},
			{`(true|false|null|NaN|Infinity|-Infinity|undefined|Void)\b`, KeywordConstant, nil},
			// Built-in class/type names (word list carried over from the
			// Pygments ActionScript lexer).
			{Words(``, `\b`, `Accessibility`, `AccessibilityProperties`, `ActionScriptVersion`, `ActivityEvent`, `AntiAliasType`, `ApplicationDomain`, `AsBroadcaster`, `Array`, `AsyncErrorEvent`, `AVM1Movie`, `BevelFilter`, `Bitmap`, `BitmapData`, `BitmapDataChannel`, `BitmapFilter`, `BitmapFilterQuality`, `BitmapFilterType`, `BlendMode`, `BlurFilter`, `Boolean`, `ByteArray`, `Camera`, `Capabilities`, `CapsStyle`, `Class`, `Color`, `ColorMatrixFilter`, `ColorTransform`, `ContextMenu`, `ContextMenuBuiltInItems`, `ContextMenuEvent`, `ContextMenuItem`, `ConvultionFilter`, `CSMSettings`, `DataEvent`, `Date`, `DefinitionError`, `DeleteObjectSample`, `Dictionary`, `DisplacmentMapFilter`, `DisplayObject`, `DisplacmentMapFilterMode`, `DisplayObjectContainer`, `DropShadowFilter`, `Endian`, `EOFError`, `Error`, `ErrorEvent`, `EvalError`, `Event`, `EventDispatcher`, `EventPhase`, `ExternalInterface`, `FileFilter`, `FileReference`, `FileReferenceList`, `FocusDirection`, `FocusEvent`, `Font`, `FontStyle`, `FontType`, `FrameLabel`, `FullScreenEvent`, `Function`, `GlowFilter`, `GradientBevelFilter`, `GradientGlowFilter`, `GradientType`, `Graphics`, `GridFitType`, `HTTPStatusEvent`, `IBitmapDrawable`, `ID3Info`, `IDataInput`, `IDataOutput`, `IDynamicPropertyOutputIDynamicPropertyWriter`, `IEventDispatcher`, `IExternalizable`, `IllegalOperationError`, `IME`, `IMEConversionMode`, `IMEEvent`, `int`, `InteractiveObject`, `InterpolationMethod`, `InvalidSWFError`, `InvokeEvent`, `IOError`, `IOErrorEvent`, `JointStyle`, `Key`, `Keyboard`, `KeyboardEvent`, `KeyLocation`, `LineScaleMode`, `Loader`, `LoaderContext`, `LoaderInfo`, `LoadVars`, `LocalConnection`, `Locale`, `Math`, `Matrix`, `MemoryError`, `Microphone`, `MorphShape`, `Mouse`, `MouseEvent`, `MovieClip`, `MovieClipLoader`, `Namespace`, `NetConnection`, `NetStatusEvent`, `NetStream`, `NewObjectSample`, `Number`, `Object`, `ObjectEncoding`, `PixelSnapping`, `Point`, `PrintJob`, `PrintJobOptions`, `PrintJobOrientation`, `ProgressEvent`,
				`Proxy`, `QName`, `RangeError`, `Rectangle`, `ReferenceError`, `RegExp`, `Responder`, `Sample`, `Scene`, `ScriptTimeoutError`, `Security`, `SecurityDomain`, `SecurityError`, `SecurityErrorEvent`, `SecurityPanel`, `Selection`, `Shape`, `SharedObject`, `SharedObjectFlushStatus`, `SimpleButton`, `Socket`, `Sound`, `SoundChannel`, `SoundLoaderContext`, `SoundMixer`, `SoundTransform`, `SpreadMethod`, `Sprite`, `StackFrame`, `StackOverflowError`, `Stage`, `StageAlign`, `StageDisplayState`, `StageQuality`, `StageScaleMode`, `StaticText`, `StatusEvent`, `String`, `StyleSheet`, `SWFVersion`, `SyncEvent`, `SyntaxError`, `System`, `TextColorType`, `TextField`, `TextFieldAutoSize`, `TextFieldType`, `TextFormat`, `TextFormatAlign`, `TextLineMetrics`, `TextRenderer`, `TextSnapshot`, `Timer`, `TimerEvent`, `Transform`, `TypeError`, `uint`, `URIError`, `URLLoader`, `URLLoaderDataFormat`, `URLRequest`, `URLRequestHeader`, `URLRequestMethod`, `URLStream`, `URLVariabeles`, `VerifyError`, `Video`, `XML`, `XMLDocument`, `XMLList`, `XMLNode`, `XMLNodeType`, `XMLSocket`, `XMLUI`), NameBuiltin, nil},
			{Words(``, `\b`, `decodeURI`, `decodeURIComponent`, `encodeURI`, `escape`, `eval`, `isFinite`, `isNaN`, `isXMLName`, `clearInterval`, `fscommand`, `getTimer`, `getURL`, `getVersion`, `parseFloat`, `parseInt`, `setInterval`, `trace`, `updateAfterEvent`, `unescape`), NameFunction, nil},
			{`[$a-zA-Z_]\w*`, NameOther, nil},
			{`[0-9][0-9]*\.[0-9]+([eE][0-9]+)?[fd]?`, LiteralNumberFloat, nil},
			{`0x[0-9a-f]+`, LiteralNumberHex, nil},
			{`[0-9]+`, LiteralNumberInteger, nil},
			{`"(\\\\|\\"|[^"])*"`, LiteralStringDouble, nil},
			{`'(\\\\|\\'|[^'])*'`, LiteralStringSingle, nil},
		},
	},
))
Loading

0 comments on commit a10fd0a

Please sign in to comment.