New Grammars Compiler (#3915)

* grammars: Update several grammars with compat issues

* [WIP] Add new grammar conversion tools

* Wrap in a Docker script

* Proper Dockerfile support

* Add Javadoc grammar

* Remove NPM package.json

* Remove superfluous test

This is now always checked by the grammars compiler

* Update JSyntax grammar to new submodule

* Approve Javadoc license

* grammars: Remove checked-in dependencies

* grammars: Add regex checks to the compiler

* grammars: Point Oz to its actual submodule

* grammars: Refactor compiler to group errors by repo

* grammars: Cleanups to error reporting
This commit is contained in:
Vicent Martí
2017-11-30 16:15:48 +01:00
committed by GitHub
parent 4f46155c05
commit e335d48625
37 changed files with 1445 additions and 416 deletions

View File

@@ -0,0 +1,227 @@
package compiler
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"runtime"
"sort"
"strings"
"sync"
grammar "github.com/github/linguist/tools/grammars/proto"
"github.com/golang/protobuf/proto"
pb "gopkg.in/cheggaaa/pb.v1"
yaml "gopkg.in/yaml.v2"
)
// Converter drives the grammar conversion pipeline: it loads every
// grammar source listed in grammars.yml (concurrently), tracks the
// scopes each source provides, and serializes the results to
// protobuf, JSON, or back to grammars.yml.
type Converter struct {
	root string // directory that contains grammars.yml and the vendored grammar checkouts

	modified bool                // whether grammars.yml needs re-writing (see WriteGrammarList)
	grammars map[string][]string // source -> expected scopes, as read from grammars.yml

	Loaded map[string]*Repository // source -> repository loaded from it

	progress *pb.ProgressBar
	wg       sync.WaitGroup
	queue    chan string // sources awaiting a worker goroutine
	mu       sync.Mutex  // guards Loaded while workers write to it
}
// Load fetches a grammar repository either over HTTP(S) or from the
// local filesystem, depending on the shape of src.
func (conv *Converter) Load(src string) *Repository {
	isRemote := strings.HasPrefix(src, "http://") || strings.HasPrefix(src, "https://")
	if isRemote {
		return LoadFromURL(src)
	}
	return LoadFromFilesystem(conv.root, src)
}
// work is a worker goroutine body: it drains the source queue, loads
// each repository, records it under the mutex, and advances the
// progress bar. It marks the WaitGroup done when the queue closes.
func (conv *Converter) work() {
	defer conv.wg.Done()

	for src := range conv.queue {
		loaded := conv.Load(src)

		conv.mu.Lock()
		conv.Loaded[src] = loaded
		conv.mu.Unlock()

		conv.progress.Increment()
	}
}
// AddGrammar loads the grammar source at `source`, registers its
// scopes in the converter's grammar list, and prints the newly added
// scopes to stdout. Sources with no grammar files are rejected.
func (conv *Converter) AddGrammar(source string) error {
	repo := conv.Load(source)
	if len(repo.Files) == 0 {
		return fmt.Errorf("source '%s' contains no grammar files", source)
	}

	scopes := repo.Scopes()
	conv.grammars[source] = scopes
	conv.modified = true

	fmt.Printf("OK! added grammar source '%s'\n", source)
	for scope := range repo.Files {
		fmt.Printf("\tnew scope: %s\n", scope)
	}
	return nil
}
// ScopeMap builds a scope -> repository index across every loaded
// repository. When two repositories declare the same scope, the
// second one is flagged with a DuplicateScopeError and the first
// keeps ownership of the scope.
func (conv *Converter) ScopeMap() map[string]*Repository {
	index := make(map[string]*Repository)

	for _, repo := range conv.Loaded {
		for scope := range repo.Files {
			if owner := index[scope]; owner != nil {
				repo.Fail(&DuplicateScopeError{owner, scope})
				continue
			}
			index[scope] = repo
		}
	}
	return index
}
// ConvertGrammars loads every known grammar source concurrently, one
// worker per CPU. When update is true, the in-memory grammar list is
// rebuilt from what was actually loaded; otherwise each repository is
// checked against the scopes recorded in grammars.yml.
func (conv *Converter) ConvertGrammars(update bool) error {
	conv.Loaded = make(map[string]*Repository)
	conv.queue = make(chan string, 128)

	conv.progress = pb.New(len(conv.grammars))
	conv.progress.Start()

	workers := runtime.NumCPU()
	for n := 0; n < workers; n++ {
		conv.wg.Add(1)
		go conv.work()
	}

	for source := range conv.grammars {
		conv.queue <- source
	}
	close(conv.queue)
	conv.wg.Wait()

	conv.progress.FinishPrint(
		fmt.Sprintf("done! processed %d grammars\n", len(conv.Loaded)))

	if update {
		// Start from a clean slate; the list is rebuilt below from
		// the repositories that actually loaded.
		conv.grammars = make(map[string][]string)
		conv.modified = true
	}

	knownScopes := conv.ScopeMap()
	for source, repo := range conv.Loaded {
		repo.FixRules(knownScopes)
		if update {
			conv.grammars[source] = repo.Scopes()
		} else {
			repo.CompareScopes(conv.grammars[source])
		}
	}
	return nil
}
// WriteProto serializes every loaded grammar rule into a single
// grammar.Library protobuf and writes it to path.
func (conv *Converter) WriteProto(path string) error {
	library := grammar.Library{
		Grammars: make(map[string]*grammar.Rule),
	}

	for _, repo := range conv.Loaded {
		for scope, file := range repo.Files {
			library.Grammars[scope] = file.Rule
		}
	}

	// Named `blob` rather than `pb`: the original local shadowed the
	// imported progress-bar package `pb`.
	blob, err := proto.Marshal(&library)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path, blob, 0666)
}
// writeJSONFile pretty-prints a single grammar rule as JSON at path.
// The Close error is checked explicitly: for a freshly written file a
// failed close can mean lost data, so it must not be swallowed by a
// bare defer.
func (conv *Converter) writeJSONFile(path string, rule *grammar.Rule) error {
	j, err := os.Create(path)
	if err != nil {
		return err
	}

	enc := json.NewEncoder(j)
	enc.SetIndent("", " ")
	if err := enc.Encode(rule); err != nil {
		j.Close()
		return err
	}
	return j.Close()
}
// WriteJSON dumps every loaded grammar file as `<scope>.json` inside
// rulePath, creating the directory first when needed.
func (conv *Converter) WriteJSON(rulePath string) error {
	if err := os.MkdirAll(rulePath, os.ModePerm); err != nil {
		return err
	}

	for _, repo := range conv.Loaded {
		for scope, file := range repo.Files {
			target := path.Join(rulePath, scope+".json")
			if err := conv.writeJSONFile(target, file.Rule); err != nil {
				return err
			}
		}
	}
	return nil
}
// WriteGrammarList persists the in-memory grammar list back to
// grammars.yml, but only when it was actually modified.
func (conv *Converter) WriteGrammarList() error {
	if !conv.modified {
		return nil
	}

	serialized, err := yaml.Marshal(conv.grammars)
	if err != nil {
		return err
	}
	return ioutil.WriteFile(path.Join(conv.root, "grammars.yml"), serialized, 0666)
}
// Report writes a Markdown task-list of every repository that failed
// conversion, sorted by source name, with one entry per error.
func (conv *Converter) Report(w io.Writer) {
	var failed []*Repository
	for _, repo := range conv.Loaded {
		if len(repo.Errors) == 0 {
			continue
		}
		failed = append(failed, repo)
	}

	sort.Slice(failed, func(a, b int) bool {
		return failed[a].Source < failed[b].Source
	})

	for _, repo := range failed {
		fmt.Fprintf(w, "- [ ] %s (%d errors)\n", repo, len(repo.Errors))
		for _, e := range repo.Errors {
			fmt.Fprintf(w, " - [ ] %s\n", e)
		}
		fmt.Fprintf(w, "\n")
	}
}
// NewConverter reads grammars.yml under root and returns a Converter
// initialized with its contents.
func NewConverter(root string) (*Converter, error) {
	conv := &Converter{root: root}

	data, err := ioutil.ReadFile(path.Join(root, "grammars.yml"))
	if err != nil {
		return nil, err
	}
	if err := yaml.Unmarshal(data, &conv.grammars); err != nil {
		return nil, err
	}
	return conv, nil
}

View File

@@ -0,0 +1,21 @@
package compiler
import (
"bytes"
"os/exec"
)
// ConvertCSON converts CSON to JSON by piping data through the
// external `csonc` helper, which must be available on $PATH.
func ConvertCSON(data []byte) ([]byte, error) {
	var out bytes.Buffer

	cmd := exec.Command("csonc")
	cmd.Stdin = bytes.NewReader(data)
	cmd.Stdout = &out

	if err := cmd.Run(); err != nil {
		return nil, err
	}
	return out.Bytes(), nil
}

View File

@@ -0,0 +1,29 @@
package compiler
// GrammarAliases maps scope names that grammars reference to the
// scope that actually provides them (e.g. upstream renames). An empty
// value ("text.plain") causes the include to resolve to nothing —
// see walker.checkInclude, which rewrites includes through this table.
var GrammarAliases = map[string]string{
	"source.erb":         "text.html.erb",
	"source.cpp":         "source.c++",
	"source.less":        "source.css.less",
	"text.html.markdown": "source.gfm",
	"text.md":            "source.gfm",
	"source.php":         "text.html.php",
	"text.plain":         "",
	"source.asciidoc":    "text.html.asciidoc",
	"source.perl6":       "source.perl6fe",
	"source.css.scss":    "source.scss",
}
// KnownFields lists grammar-file keys the converter deliberately
// ignores when decoding into the proto schema. Unknown keys outside
// this list are surfaced through filterUnusedKeys and reported as an
// UnknownKeysError.
var KnownFields = map[string]bool{
	"comment":            true,
	"uuid":               true,
	"author":             true,
	"comments":           true,
	"macros":             true,
	"fileTypes":          true,
	"firstLineMatch":     true,
	"keyEquivalent":      true,
	"foldingStopMarker":  true,
	"foldingStartMarker": true,
	"foldingEndMarker":   true,
	"limitLineLength":    true,
}

View File

@@ -0,0 +1,85 @@
package compiler
import "fmt"
import "strings"
// ConversionError reports a grammar file that could not be parsed
// into the proto schema.
type ConversionError struct {
	Path string
	Err  error
}

func (err *ConversionError) Error() string {
	const format = "Grammar conversion failed. File `%s` failed to parse: %s"
	return fmt.Sprintf(format, err.Path, err.Err)
}
// DuplicateScopeError reports a scope that two repositories both try
// to define; Original is the repository that claimed it first.
type DuplicateScopeError struct {
	Original  *Repository
	Duplicate string
}

func (err *DuplicateScopeError) Error() string {
	const format = "Duplicate scope in repository: scope `%s` was already defined in %s"
	return fmt.Sprintf(format, err.Duplicate, err.Original)
}
// MissingScopeError reports a scope listed in grammars.yml for which
// no grammar file was actually loaded.
type MissingScopeError struct {
	Scope string
}

func (err *MissingScopeError) Error() string {
	return "Missing scope in repository: `" + err.Scope +
		"` is listed in grammars.yml but cannot be found"
}
// UnexpectedScopeError reports a scope found in a grammar file that
// grammars.yml does not list for its repository.
type UnexpectedScopeError struct {
	File  *LoadedFile
	Scope string
}

func (err *UnexpectedScopeError) Error() string {
	const format = "Unexpected scope in repository: `%s` declared in %s was not listed in grammars.yml"
	return fmt.Sprintf(format, err.Scope, err.File)
}
// MissingIncludeError reports a grammar rule that includes a scope
// which no loaded repository provides.
type MissingIncludeError struct {
	File    *LoadedFile
	Include string
}

func (err *MissingIncludeError) Error() string {
	const format = "Missing include in grammar: %s attempts to include `%s` but the scope cannot be found"
	return fmt.Sprintf(format, err.File, err.Include)
}
// UnknownKeysError reports a grammar file carrying top-level keys
// that neither the proto schema nor KnownFields recognizes.
type UnknownKeysError struct {
	File *LoadedFile
	Keys []string
}

func (err *UnknownKeysError) Error() string {
	quoted := make([]string, 0, len(err.Keys))
	for _, key := range err.Keys {
		quoted = append(quoted, "`"+key+"`")
	}
	return fmt.Sprintf(
		"Unknown keys in grammar: %s contains invalid keys (%s)",
		err.File, strings.Join(quoted, ", "))
}
// InvalidRegexError reports a grammar file containing a regex that
// failed PCRE validation.
type InvalidRegexError struct {
	File *LoadedFile
	Err  error
}

func (err *InvalidRegexError) Error() string {
	const format = "Invalid regex in grammar: %s contains a malformed regex (%s)"
	return fmt.Sprintf(format, err.File, err.Err)
}

View File

@@ -0,0 +1,124 @@
package compiler
import (
"fmt"
"os"
"path/filepath"
"sort"
"strings"
grammar "github.com/github/linguist/tools/grammars/proto"
)
// LoadedFile is a single grammar file that was successfully converted
// into a proto rule; repositories key these by the scope the rule
// declares.
type LoadedFile struct {
	Path string
	Rule *grammar.Rule
}

// String renders the file as "`scope` (in `path`)" for error messages.
func (f *LoadedFile) String() string {
	return fmt.Sprintf("`%s` (in `%s`)", f.Rule.ScopeName, f.Path)
}
// Repository is a single grammar source — a directory on disk or a
// remote URL — together with the files loaded from it and every error
// encountered along the way.
type Repository struct {
	Source   string                 // source identifier as listed in grammars.yml
	Upstream string                 // git remote URL, when it could be resolved
	Files    map[string]*LoadedFile // loaded grammar files, keyed by scope name
	Errors   []error                // accumulated load/conversion errors
}
// newRepository returns an empty Repository for the given source with
// its file map ready for use.
func newRepository(src string) *Repository {
	repo := &Repository{Source: src}
	repo.Files = make(map[string]*LoadedFile)
	return repo
}
// String describes the repository, mentioning its upstream when known.
func (repo *Repository) String() string {
	if repo.Upstream == "" {
		return fmt.Sprintf("repository `%s`", repo.Source)
	}
	return fmt.Sprintf("repository `%s` (from %s)", repo.Source, repo.Upstream)
}
// Fail records err against the repository so that all conversion
// problems can be reported together at the end of the run.
func (repo *Repository) Fail(err error) {
	repo.Errors = append(repo.Errors, err)
}
// AddFile registers a converted grammar file under its scope name and
// records an UnknownKeysError when the file carried unrecognized keys.
func (repo *Repository) AddFile(path string, rule *grammar.Rule, uk []string) {
	loaded := &LoadedFile{Path: path, Rule: rule}
	repo.Files[rule.ScopeName] = loaded

	if len(uk) != 0 {
		repo.Fail(&UnknownKeysError{loaded, uk})
	}
}
// toMap converts a list of strings into a membership set.
func toMap(slice []string) map[string]bool {
	set := make(map[string]bool, len(slice))
	for _, item := range slice {
		set[item] = true
	}
	return set
}
// CompareScopes cross-checks the scopes actually loaded against the
// list grammars.yml expects, failing on both extras (UnexpectedScope)
// and absences (MissingScope).
func (repo *Repository) CompareScopes(scopes []string) {
	expected := toMap(scopes)

	for scope, file := range repo.Files {
		if !expected[scope] {
			repo.Fail(&UnexpectedScopeError{file, scope})
		}
	}

	for scope := range expected {
		if _, loaded := repo.Files[scope]; !loaded {
			repo.Fail(&MissingScopeError{scope})
		}
	}
}
// FixRules walks every loaded rule tree, resolving cross-grammar
// includes against knownScopes and validating regexes; each walker's
// errors are folded into the repository's error list.
func (repo *Repository) FixRules(knownScopes map[string]*Repository) {
	for _, file := range repo.Files {
		wk := walker{
			File:    file,
			Known:   knownScopes,
			Missing: make(map[string]bool),
		}

		wk.walk(file.Rule)
		repo.Errors = append(repo.Errors, wk.Errors...)
	}
}
// Scopes returns the scope names loaded in this repository, sorted
// alphabetically. The result is nil when no files were loaded.
func (repo *Repository) Scopes() []string {
	var scopes []string
	for scope := range repo.Files {
		scopes = append(scopes, scope)
	}
	sort.Strings(scopes)
	return scopes
}
// isValidGrammar reports whether path looks like a grammar definition
// worth converting. Plist files count only inside a `Syntaxes`
// directory and CSON/JSON files only inside a `grammars` directory;
// tmLanguage files are accepted anywhere. Directories never match.
func isValidGrammar(path string, info os.FileInfo) bool {
	if info.IsDir() {
		return false
	}

	dir := filepath.Dir(path)

	switch strings.ToLower(filepath.Ext(path)) {
	case ".plist":
		return strings.HasSuffix(dir, "/Syntaxes")
	case ".tmlanguage", ".yaml-tmlanguage":
		return true
	case ".cson", ".json":
		return strings.HasSuffix(dir, "/grammars")
	}
	return false
}

View File

@@ -0,0 +1,80 @@
package compiler
import (
"io/ioutil"
"os"
"os/exec"
"path"
"path/filepath"
"strings"
)
// fsLoader loads every grammar file beneath a local directory,
// accumulating results and errors into its embedded Repository.
type fsLoader struct {
	*Repository
	abspath string // on-disk path of the grammar source (root joined with src)
}
// findGrammars walks the source tree and collects every path that
// looks like a grammar file. Entries that cannot be visited are
// skipped silently; the walk itself never aborts.
func (l *fsLoader) findGrammars() ([]string, error) {
	var files []string

	err := filepath.Walk(l.abspath,
		func(path string, info os.FileInfo, err error) error {
			if err != nil {
				return nil // best-effort: skip entries we cannot stat
			}
			if isValidGrammar(path, info) {
				files = append(files, path)
			}
			return nil
		})
	return files, err
}
// load reads, converts and registers every grammar file found under
// the loader's path. Per-file failures are recorded on the repository
// without stopping the remaining files; when two files declare the
// same scope, the first one loaded wins.
func (l *fsLoader) load() {
	grammars, err := l.findGrammars()
	if err != nil {
		l.Fail(err)
		return
	}

	for _, abs := range grammars {
		data, err := ioutil.ReadFile(abs)
		if err != nil {
			l.Fail(err)
			continue
		}

		// Report paths relative to the repository root when possible.
		filePath := abs
		if rel, err := filepath.Rel(l.abspath, abs); err == nil {
			filePath = rel
		}

		rule, unknown, err := ConvertProto(filepath.Ext(filePath), data)
		if err != nil {
			l.Fail(&ConversionError{filePath, err})
			continue
		}

		if _, seen := l.Files[rule.ScopeName]; seen {
			continue
		}
		l.AddFile(filePath, rule, unknown)
	}
}
// gitRemoteName asks git for the URL of the `origin` remote of the
// checkout at path, with surrounding whitespace trimmed.
func gitRemoteName(path string) (string, error) {
	out, err := exec.Command(
		"git", "-C", path, "remote", "get-url", "origin").Output()
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(out)), nil
}
// LoadFromFilesystem loads the grammar source src from a checkout
// under root. The git `origin` URL is attached as Upstream when it
// can be resolved; failing to resolve it is not an error.
func LoadFromFilesystem(root, src string) *Repository {
	loader := fsLoader{newRepository(src), path.Join(root, src)}
	loader.load()

	if upstream, err := gitRemoteName(loader.abspath); err == nil {
		loader.Repository.Upstream = upstream
	}
	return loader.Repository
}

View File

@@ -0,0 +1,93 @@
package compiler
import (
	"archive/tar"
	"compress/gzip"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"path/filepath"
	"strings"
)
// urlLoader loads grammar files from a remote URL — either a single
// grammar file or a .tar.gz archive of a whole repository.
type urlLoader struct {
	*Repository
}
// loadTarball streams a gzipped tarball from r, converting every
// entry that looks like a grammar file. Per-file conversion failures
// are recorded and skipped; read errors abort the scan. When two
// entries declare the same scope, the first one found wins.
func (l *urlLoader) loadTarball(r io.Reader) {
	gzf, err := gzip.NewReader(r)
	if err != nil {
		l.Fail(err)
		return
	}
	defer gzf.Close()

	tarReader := tar.NewReader(gzf)
	for { // idiomatic infinite loop; was `for true`
		header, err := tarReader.Next()
		if err == io.EOF {
			return // end of archive, not a failure
		}
		if err != nil {
			l.Fail(err)
			return
		}

		if !isValidGrammar(header.Name, header.FileInfo()) {
			continue
		}

		data, err := ioutil.ReadAll(tarReader)
		if err != nil {
			l.Fail(err)
			return
		}

		rule, unknown, err := ConvertProto(filepath.Ext(header.Name), data)
		if err != nil {
			l.Fail(&ConversionError{header.Name, err})
			continue
		}

		if _, ok := l.Files[rule.ScopeName]; ok {
			continue
		}
		l.AddFile(header.Name, rule, unknown)
	}
}
// load downloads the repository's source URL and converts its
// contents: a `.tar.gz` URL is scanned as a tarball, anything else is
// treated as a single grammar file.
func (l *urlLoader) load() {
	res, err := http.Get(l.Source)
	if err != nil {
		l.Fail(err)
		return
	}
	defer res.Body.Close()

	// Without this check a failed download (404, 500, ...) would be
	// fed to the grammar parser as if it were file contents.
	if res.StatusCode != http.StatusOK {
		l.Fail(fmt.Errorf("HTTP error downloading '%s': %s", l.Source, res.Status))
		return
	}

	if strings.HasSuffix(l.Source, ".tar.gz") {
		l.loadTarball(res.Body)
		return
	}

	data, err := ioutil.ReadAll(res.Body)
	if err != nil {
		l.Fail(err)
		return
	}

	filename := filepath.Base(l.Source)
	rule, unknown, err := ConvertProto(filepath.Ext(l.Source), data)
	if err != nil {
		l.Fail(&ConversionError{filename, err})
		return
	}
	l.AddFile(filename, rule, unknown)
}
// LoadFromURL downloads and converts the grammar source at src.
func LoadFromURL(src string) *Repository {
	loader := urlLoader{Repository: newRepository(src)}
	loader.load()
	return loader.Repository
}

View File

@@ -0,0 +1,68 @@
package compiler
import (
"fmt"
"github.com/github/linguist/tools/grammars/pcre"
)
// replacement describes one in-place substitution inside a regex
// string: `len` bytes starting at `pos` are replaced with `val`.
type replacement struct {
	pos int
	len int
	val string
}

// fixRegex rewrites the Oniguruma-only `\h` escape into the
// PCRE-compatible `[[:xdigit:]]` character class. The second return
// value reports whether the regex contains numeric backreferences
// (`\1`...`\9`), which cannot be validated in isolation.
func fixRegex(re string) (string, bool) {
	var (
		subs        []replacement
		escaped     bool
		hasBackRefs bool
	)

	for i, ch := range re {
		if escaped {
			switch {
			case ch == 'h':
				// Replace the two-byte `\h` escape, starting one byte back.
				subs = append(subs, replacement{i - 1, 2, "[[:xdigit:]]"})
			case '0' <= ch && ch <= '9':
				hasBackRefs = true
			}
		}
		// A backslash only escapes when it is not itself escaped.
		escaped = !escaped && ch == '\\'
	}

	if len(subs) == 0 {
		return re, hasBackRefs
	}

	out := []byte(re)
	offset := 0
	for _, sub := range subs {
		out = append(
			out[:offset+sub.pos],
			append([]byte(sub.val), out[offset+sub.pos+sub.len:]...)...)
		offset += len(sub.val) - sub.len
	}
	return string(out), hasBackRefs
}
// CheckPCRE validates a regex against PCRE after applying fixRegex,
// returning the fixed-up form. Empty regexes pass trivially and
// definitions larger than 32 KiB are rejected outright. Regexes
// containing backreferences skip PCRE validation, since they cannot
// be checked out of context.
func CheckPCRE(re string) (string, error) {
	if re == "" {
		return "", nil
	}
	if len(re) > 32*1024 {
		return "", fmt.Errorf(
			"regex %s: definition too long (%d bytes)",
			pcre.RegexPP(re), len(re))
	}

	fixed, hasBackRefs := fixRegex(re)
	if hasBackRefs {
		return fixed, nil
	}
	if err := pcre.CheckRegexp(fixed, pcre.DefaultFlags); err != nil {
		return "", err
	}
	return fixed, nil
}

View File

@@ -0,0 +1,27 @@
package compiler
import (
"testing"
)
// Test_fixRegex exercises the `\h` rewriting across escaping corner
// cases. Failure messages now include the failing input, which the
// original message omitted.
func Test_fixRegex(t *testing.T) {
	tests := []struct {
		re   string
		want string
	}{
		{"foobar", "foobar"},
		{`testing\h`, "testing[[:xdigit:]]"},
		{`\htest`, `[[:xdigit:]]test`},
		{`abc\hdef`, `abc[[:xdigit:]]def`},
		{`\\\htest`, `\\[[:xdigit:]]test`},
		{`\\htest`, `\\htest`},
		{`\h\h\h\h`, `[[:xdigit:]][[:xdigit:]][[:xdigit:]][[:xdigit:]]`},
		{`abc\hdef\hghi\h`, `abc[[:xdigit:]]def[[:xdigit:]]ghi[[:xdigit:]]`},
	}
	for _, tt := range tests {
		got, _ := fixRegex(tt.re)
		if got != tt.want {
			t.Errorf("fixRegex(%q) = %q, want %q", tt.re, got, tt.want)
		}
	}
}

View File

@@ -0,0 +1,96 @@
package compiler
import (
"encoding/json"
"fmt"
"reflect"
"strings"
grammar "github.com/github/linguist/tools/grammars/proto"
"github.com/groob/plist"
"github.com/mitchellh/mapstructure"
yaml "gopkg.in/yaml.v2"
)
// looseDecoder is a mapstructure decode hook that coerces scalar
// values into booleans, presumably because grammar sources encode
// flags inconsistently (true, 1, 1.0, "1", ...). Numeric values map
// to `value != 0`; the strings "1" and "0" map to true and false.
// Anything else — including other strings and non-bool targets —
// passes through untouched.
func looseDecoder(f reflect.Kind, t reflect.Kind, data interface{}) (interface{}, error) {
	if t != reflect.Bool {
		return data, nil
	}

	dataVal := reflect.ValueOf(data)
	switch f {
	case reflect.Bool:
		return dataVal.Bool(), nil
	case reflect.Float32, reflect.Float64:
		// Compare the float directly: the previous int() truncation
		// turned small non-zero values (e.g. 0.5) into false.
		return dataVal.Float() != 0, nil
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return dataVal.Int() != 0, nil
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return dataVal.Uint() != 0, nil
	case reflect.String:
		switch dataVal.String() {
		case "1":
			return true, nil
		case "0":
			return false, nil
		}
	}
	return data, nil
}
// filterUnusedKeys strips from keys the entries whose final dotted
// component is a known, deliberately ignored grammar field, keeping
// only the genuinely unexpected ones.
func filterUnusedKeys(keys []string) (out []string) {
	for _, key := range keys {
		// The field name is everything after the last dot (or the
		// whole key when there is none).
		field := key[strings.LastIndex(key, ".")+1:]
		if !KnownFields[field] {
			out = append(out, key)
		}
	}
	return
}
// ConvertProto parses a grammar file in any supported format — plist,
// tmLanguage, YAML, CSON or JSON, chosen by extension — and decodes
// it into a grammar.Rule. It returns the rule together with the list
// of unrecognized top-level keys seen while decoding.
func ConvertProto(ext string, data []byte) (*grammar.Rule, []string, error) {
	var raw map[string]interface{}
	var err error

	switch strings.ToLower(ext) {
	case ".plist", ".tmlanguage":
		err = plist.Unmarshal(data, &raw)
	case ".yaml-tmlanguage":
		err = yaml.Unmarshal(data, &raw)
	case ".cson":
		var converted []byte
		if converted, err = ConvertCSON(data); err == nil {
			err = json.Unmarshal(converted, &raw)
		}
	case ".json":
		err = json.Unmarshal(data, &raw)
	default:
		err = fmt.Errorf("grammars: unsupported extension '%s'", ext)
	}
	if err != nil {
		return nil, nil, err
	}

	var rule grammar.Rule
	var md mapstructure.Metadata
	decoder, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
		Result:     &rule,
		Metadata:   &md,
		DecodeHook: looseDecoder,
	})
	if err != nil {
		return nil, nil, err
	}
	if err := decoder.Decode(raw); err != nil {
		return nil, nil, err
	}
	return &rule, filterUnusedKeys(md.Unused), nil
}

View File

@@ -0,0 +1,79 @@
package compiler
import (
"strings"
grammar "github.com/github/linguist/tools/grammars/proto"
)
// checkInclude normalizes a rule's cross-grammar include: local
// ("#...") and "$..."-style references are left alone, aliased scopes
// are rewritten through GrammarAliases, and includes pointing at
// unknown scopes are cleared (each missing scope is reported at most
// once per file).
func (w *walker) checkInclude(rule *grammar.Rule) {
	include := rule.Include
	if include == "" || include[0] == '#' || include[0] == '$' {
		return
	}

	if alias, aliased := GrammarAliases[include]; aliased {
		rule.Include = alias
		return
	}

	// Drop any "#fragment" suffix; only the scope itself must resolve.
	if idx := strings.Index(include, "#"); idx >= 0 {
		include = include[:idx]
	}

	if _, known := w.Known[include]; known {
		return
	}
	if !w.Missing[include] {
		w.Missing[include] = true
		w.Errors = append(w.Errors, &MissingIncludeError{w.File, include})
	}
	rule.Include = ""
}
// checkRegexps runs every regex field of the rule through CheckPCRE,
// replacing each with its fixed-up form and recording invalid ones.
func (w *walker) checkRegexps(rule *grammar.Rule) {
	validate := func(re string) string {
		fixed, err := CheckPCRE(re)
		if err != nil {
			w.Errors = append(w.Errors, &InvalidRegexError{w.File, err})
		}
		return fixed
	}

	rule.Match = validate(rule.Match)
	rule.Begin = validate(rule.Begin)
	rule.While = validate(rule.While)
	rule.End = validate(rule.End)
}
// walk applies include and regex fix-ups to a rule and then recurses
// into every nested rule container.
func (w *walker) walk(rule *grammar.Rule) {
	w.checkInclude(rule)
	w.checkRegexps(rule)

	for _, child := range rule.Patterns {
		w.walk(child)
	}
	for _, child := range rule.Captures {
		w.walk(child)
	}
	for _, child := range rule.BeginCaptures {
		w.walk(child)
	}
	for _, child := range rule.WhileCaptures {
		w.walk(child)
	}
	for _, child := range rule.EndCaptures {
		w.walk(child)
	}
	for _, child := range rule.Repository {
		w.walk(child)
	}
	for _, child := range rule.Injections {
		w.walk(child)
	}
}
// walker traverses a single grammar file's rule tree, fixing includes
// and regexes as it goes and collecting any errors it finds.
type walker struct {
	File    *LoadedFile            // the file whose rule tree is being walked
	Known   map[string]*Repository // every scope loaded across all repositories
	Missing map[string]bool        // scopes already reported missing (dedup)
	Errors  []error
}