From ece9431711a6bea8f0afdc2169b7215c16bd4d03 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Fri, 28 Jul 2023 10:03:56 -0700 Subject: [PATCH 1/2] build.py: create an index of the languages compiled into the library --- tests/test_tree_sitter.py | 15 ++++++++--- tree_sitter/__init__.py | 55 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/tests/test_tree_sitter.py b/tests/test_tree_sitter.py index e4841c2..cfeb412 100644 --- a/tests/test_tree_sitter.py +++ b/tests/test_tree_sitter.py @@ -1,21 +1,28 @@ # pylint: disable=missing-docstring import re +import os from unittest import TestCase from os import path from tree_sitter import Language, Parser LIB_PATH = path.join("build", "languages.so") +LIB_LANGS = [ + path.join("tests", "fixtures", "tree-sitter-python"), + path.join("tests", "fixtures", "tree-sitter-javascript"), +] Language.build_library( LIB_PATH, - [ - path.join("tests", "fixtures", "tree-sitter-python"), - path.join("tests", "fixtures", "tree-sitter-javascript"), - ], + LIB_LANGS, ) PYTHON = Language(LIB_PATH, "python") JAVASCRIPT = Language(LIB_PATH, "javascript") +class TestLanguage(TestCase): + def test_build_library_index(self): + index = dict() + Language.build_library(LIB_PATH, LIB_LANGS, index=index) + self.assertEqual(index["python"][0]["scope"], "source.python") class TestParser(TestCase): def test_set_language(self): diff --git a/tree_sitter/__init__.py b/tree_sitter/__init__.py index 5f7cb48..888de1c 100644 --- a/tree_sitter/__init__.py +++ b/tree_sitter/__init__.py @@ -6,6 +6,9 @@ from os import path from platform import system from tempfile import TemporaryDirectory +from glob import glob +import re +import json from tree_sitter.binding import _language_field_id_for_name, _language_query from tree_sitter.binding import Node, Parser, Tree, TreeCursor # noqa: F401 @@ -14,7 +17,7 @@ class Language: """A tree-sitter language""" @staticmethod - def build_library(output_path, repo_paths): + def build_library(output_path, repo_paths, index=dict()): """ Build a dynamic library at the given path, based on the parser repositories at the given paths. @@ -42,6 +45,56 @@ def build_library(output_path, repo_paths): path.getmtime(path_) for path_ in source_paths ] + if index is not None: + for repo_path in repo_paths: + # find the symbol name of the parser to use for dlopen later. + # doesn't always match the scope, repo name, or file types. + parser = None + with open(path.join(repo_path, "src", "parser.c"), 'r') as file: + for line in file: + if line.startswith("extern const TSLanguage *tree_sitter_"): + parser = re.search(r"tree_sitter_(.+?)\(", line).group(1) + break + if parser is None: + print("ERROR: failed to find parser name in", repo_path) + continue + + # package.json is required, but may be missing. + # find the json file, and parse out the tree-sitter section. + package_json_path = path.join(repo_path, 'package.json') + if not path.isfile(package_json_path): + print("NOTE: missing package.json in", repo_path) + index[parser] = {} + continue + with open(package_json_path, 'r') as file: + package_json = json.load(file) + + # we may also be nested in a repo with multiple parsers (typescript, ocaml). + nested = False + if 'main' in package_json and package_json['main'].startswith('../'): + nested = True + package_json_path = path.join(repo_path, '..', 'package.json') + with open(package_json_path, 'r') as file: + package_json = json.load(file) + + if 'tree-sitter' not in package_json: + print("NOTE: missing tree-sitter section in package.json from", repo_path) + index[parser] = {} + continue + + # tree-sitter section can contain multiple entries. + # if nested, attempt to find the one that matches this parser. + entries = package_json['tree-sitter'] + if not nested: + index[parser] = entries + continue + for entry in entries: + if entry['scope'].endswith(parser) or ('path' in entry and entry['path'] == parser): + index[parser] = [entry] + break + if parser not in index: + index[parser] = entries + compiler = new_compiler() if isinstance(compiler, UnixCCompiler): compiler.compiler_cxx[0] = "c++" From 976d5df7917ab34b559ca7edde32dfc80f88e558 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Fri, 28 Jul 2023 10:19:00 -0700 Subject: [PATCH 2/2] add Language.lookup_language_name_for_file --- tests/test_tree_sitter.py | 5 +++++ tree_sitter/__init__.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/tests/test_tree_sitter.py b/tests/test_tree_sitter.py index cfeb412..7d21778 100644 --- a/tests/test_tree_sitter.py +++ b/tests/test_tree_sitter.py @@ -24,6 +24,11 @@ def test_build_library_index(self): Language.build_library(LIB_PATH, LIB_LANGS, index=index) self.assertEqual(index["python"][0]["scope"], "source.python") + def test_lookup_language(self): + index = dict() + Language.build_library(LIB_PATH, LIB_LANGS, index=index) + self.assertEqual(Language.lookup_language_name_for_file(index, "foo.py"), "python") + class TestParser(TestCase): def test_set_language(self): parser = Parser() diff --git a/tree_sitter/__init__.py b/tree_sitter/__init__.py index 888de1c..f17d1bb 100644 --- a/tree_sitter/__init__.py +++ b/tree_sitter/__init__.py @@ -126,6 +126,34 @@ def build_library(output_path, repo_paths, index=dict()): ) return True + @staticmethod + def lookup_language_name_for_file(index, file_name, file_contents=None): + matching_keys = [] + for key, entries in index.items(): + for entry in entries: + if 'file-types' not in entry: + continue + for ft in entry['file-types']: + if file_name == ft or file_name.endswith(ft): + matching_keys.append(key) + + if file_contents is None or len(matching_keys) <= 1: + return matching_keys[0] if matching_keys else None + + best_score = -1 + best_key = None + for key in matching_keys: + for entry in index[key]: + if 'content-regex' in entry and file_contents is not None: + match = re.search(entry['content-regex'], file_contents) + if match: + score = match.end() - match.start() + if score > best_score: + best_score = score + best_key = key + + return best_key if best_key else matching_keys[0] + def __init__(self, library_path, name): """ Load the language with the given name from the dynamic library