Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build index of compiled parsers and add api for file-type resolution #145

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions tests/test_tree_sitter.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,33 @@
# pylint: disable=missing-docstring

import re
import os
from unittest import TestCase
from os import path
from tree_sitter import Language, Parser

# Location of the combined shared library, and the grammar checkouts that
# are compiled into it. LIB_LANGS is reused by the tests in this module so
# the library and the parser index are built from the same repositories.
LIB_PATH = path.join("build", "languages.so")
LIB_LANGS = [
    path.join("tests", "fixtures", "tree-sitter-python"),
    path.join("tests", "fixtures", "tree-sitter-javascript"),
]

# The repository list is passed exactly once: a second positional list
# would be bound to build_library's `index` parameter.
Language.build_library(LIB_PATH, LIB_LANGS)

PYTHON = Language(LIB_PATH, "python")
JAVASCRIPT = Language(LIB_PATH, "javascript")

class TestLanguage(TestCase):
    """Checks for the parser index produced while building the library."""

    def test_build_library_index(self):
        # build_library fills the dict handed in as `index` in place.
        parser_index = {}
        Language.build_library(LIB_PATH, LIB_LANGS, index=parser_index)
        python_entries = parser_index["python"]
        self.assertEqual(python_entries[0]["scope"], "source.python")

    def test_lookup_language(self):
        # A ".py" file name must resolve to the "python" parser.
        parser_index = {}
        Language.build_library(LIB_PATH, LIB_LANGS, index=parser_index)
        resolved = Language.lookup_language_name_for_file(parser_index, "foo.py")
        self.assertEqual(resolved, "python")

class TestParser(TestCase):
def test_set_language(self):
Expand Down
83 changes: 82 additions & 1 deletion tree_sitter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from os import path
from platform import system
from tempfile import TemporaryDirectory
from glob import glob
import re
import json
from tree_sitter.binding import _language_field_id_for_name, _language_query
from tree_sitter.binding import Node, Parser, Tree, TreeCursor # noqa: F401

Expand All @@ -14,7 +17,7 @@ class Language:
"""A tree-sitter language"""

@staticmethod
def build_library(output_path, repo_paths):
def build_library(output_path, repo_paths, index=dict()):
"""
Build a dynamic library at the given path, based on the parser
repositories at the given paths.
Expand Down Expand Up @@ -42,6 +45,56 @@ def build_library(output_path, repo_paths):
path.getmtime(path_) for path_ in source_paths
]

# Fill `index` in place: parser symbol name -> list of "tree-sitter"
# entries taken from the grammar's package.json (an empty list when no
# metadata is available for that parser).
if index is not None:
    for repo_path in repo_paths:
        # Find the symbol name of the parser to use for dlopen later.
        # It doesn't always match the scope, repo name, or file types.
        parser = None
        parser_c_path = path.join(repo_path, "src", "parser.c")
        # Only one ASCII declaration line is needed, so undecodable bytes
        # elsewhere in the file are replaced instead of raising.
        with open(parser_c_path, 'r', encoding='utf-8', errors='replace') as file:
            for line in file:
                if not line.startswith("extern const TSLanguage *tree_sitter_"):
                    continue
                symbol = re.search(r"tree_sitter_(.+?)\(", line)
                if symbol:
                    parser = symbol.group(1)
                    break
        if parser is None:
            print("ERROR: failed to find parser name in", repo_path)
            continue

        # package.json is required, but may be missing.
        # Find the json file, and parse out the tree-sitter section.
        package_json_path = path.join(repo_path, 'package.json')
        if not path.isfile(package_json_path):
            print("NOTE: missing package.json in", repo_path)
            index[parser] = []
            continue
        with open(package_json_path, 'r', encoding='utf-8') as file:
            package_json = json.load(file)

        # We may also be nested in a repo with multiple parsers
        # (typescript, ocaml); a "main" that points at the parent
        # directory is taken as the sign of that layout, and the parent's
        # package.json is used instead.
        main_entry = package_json.get('main')
        nested = isinstance(main_entry, str) and main_entry.startswith('../')
        if nested:
            package_json_path = path.join(repo_path, '..', 'package.json')
            with open(package_json_path, 'r', encoding='utf-8') as file:
                package_json = json.load(file)

        if 'tree-sitter' not in package_json:
            print("NOTE: missing tree-sitter section in package.json from", repo_path)
            index[parser] = []
            continue

        # The tree-sitter section can contain multiple entries.
        # If nested, attempt to find the one that matches this parser.
        entries = package_json['tree-sitter']
        if not nested:
            index[parser] = entries
            continue
        for entry in entries:
            if entry.get('scope', '').endswith(parser) or entry.get('path') == parser:
                index[parser] = [entry]
                break
        else:
            # No entry matched. Always store the full list, even if `index`
            # already held this parser from an earlier build.
            index[parser] = entries

compiler = new_compiler()
if isinstance(compiler, UnixCCompiler):
compiler.compiler_cxx[0] = "c++"
Expand Down Expand Up @@ -73,6 +126,34 @@ def build_library(output_path, repo_paths):
)
return True

@staticmethod
def lookup_language_name_for_file(index, file_name, file_contents=None):
matching_keys = []
for key, entries in index.items():
for entry in entries:
if 'file-types' not in entry:
continue
for ft in entry['file-types']:
if file_name == ft or file_name.endswith(ft):
matching_keys.append(key)

if file_contents is None or len(matching_keys) <= 1:
return matching_keys[0] if matching_keys else None

best_score = -1
best_key = None
for key in matching_keys:
for entry in index[key]:
if 'content-regex' in entry and file_contents is not None:
match = re.search(entry['content-regex'], file_contents)
if match:
score = match.end() - match.start()
if score > best_score:
best_score = score
best_key = key

return best_key if best_key else matching_keys[0]

def __init__(self, library_path, name):
"""
Load the language with the given name from the dynamic library
Expand Down
Loading