From 979dfd3474b0ebb7af5ea148191061898fd7f3d7 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Tue, 9 Jan 2024 22:43:02 +0000 Subject: [PATCH] org export: massive speedup for checking --- .ci/run | 2 +- exobrain/src/build.py | 8 +- exobrain/src/check_org.py | 164 +++++++++++++++++++++++++------------- 3 files changed, 118 insertions(+), 56 deletions(-) diff --git a/.ci/run b/.ci/run index 89ae70e..992a98e 100755 --- a/.ci/run +++ b/.ci/run @@ -39,4 +39,4 @@ fi cd exobrain $PY_BIN -m pip install more-itertools beautifulsoup4 orgparse lxml click psutil loguru pytest -$PY_BIN -m pytest src/test.py src/fixup_org.py +$PY_BIN -m pytest src/test.py src/fixup_org.py src/check_org.py diff --git a/exobrain/src/build.py b/exobrain/src/build.py index ced9c2e..39036ed 100755 --- a/exobrain/src/build.py +++ b/exobrain/src/build.py @@ -149,10 +149,12 @@ def compile_org_to_org(ctx: Context, paths: list[Path]) -> None: output_dir = ctx.output_dir output_dir = output_dir.absolute() # emacs seems unhappy if we don't do it + batch = list(map(str, rpaths)) + logger.info(f'exporting {batch}') + for rpath in rpaths: # create target dirs (emacs struggles without them) (output_dir / rpath).parent.mkdir(parents=True, exist_ok=True) - logger.debug(f'exporting {list(map(str, rpaths))} to {output_dir}') check_call( [ 'emacs', '--batch', '-l', @@ -164,6 +166,7 @@ def compile_org_to_org(ctx: Context, paths: list[Path]) -> None: stderr=DEVNULL, ) + logger.debug(f'fixing up {batch}') from fixup_org import fixup for rpath in rpaths: @@ -172,6 +175,7 @@ def compile_org_to_org(ctx: Context, paths: list[Path]) -> None: # TODO hiding tags from export (e.g. 'refile') -- will need to be implemented manually? # TODO need to test it! + logger.debug(f'checking {batch}') from check_org import check_one for rpath in rpaths: @@ -179,6 +183,8 @@ def compile_org_to_org(ctx: Context, paths: list[Path]) -> None: errors = list(check_one(path)) assert len(errors) == 0, (path, errors) + logger.info(f'finished {batch}') + def compile_org_to_html(ctx: Context, paths: list[Path]) -> None: if len(paths) == 0: diff --git a/exobrain/src/check_org.py b/exobrain/src/check_org.py index 58c0327..58cc44a 100644 --- a/exobrain/src/check_org.py +++ b/exobrain/src/check_org.py @@ -1,77 +1,133 @@ -from concurrent.futures import ProcessPoolExecutor +from __future__ import annotations + +from dataclasses import dataclass import os from pathlib import Path -from subprocess import run -from typing import Iterator, List +import re +from typing import Iterator, Optional import orgparse -# TODO !! implement a test for this (all of params) -def search(*args) -> Iterator[Exception]: - res = run(['rg', '--follow', '-i', *args]) - if res.returncode == 1: - return # ok, nothin is found - else: - yield RuntimeError(res) +@dataclass +class Config: + f_checks: list[str] # exact matches + word_checks: list[str] + tag_checks: set[str] -def check_one(path: Path) -> Iterator[Exception]: - if 'CI' not in os.environ: - from checks import F_CHECKS, WORD_CHECKS, TAG_CHECKS +def get_config() -> Config: + if 'CI' in os.environ: + return Config(f_checks=[], word_checks=[], tag_checks=set()) else: - F_CHECKS = [] - WORD_CHECKS = [] - TAG_CHECKS = set() - - for x in F_CHECKS: - # TODO might be too many calls, maybe do it in a single regex? - yield from search( - '-F', - x, - path, - ) - for x in WORD_CHECKS: - yield from search( - '--word-regexp', - x, - path, - ) - - # TODO not sure if should rely on a unit test? + from checks import F_CHECKS, WORD_CHECKS, TAG_CHECKS + + return Config(f_checks=F_CHECKS, word_checks=WORD_CHECKS, tag_checks=TAG_CHECKS) + + +def check_one(path: Path, *, cfg: Optional[Config] = None) -> Iterator[Exception]: + if cfg is None: + cfg = get_config() + + text = path.read_text() + + ## find exact matches + if len(cfg.f_checks) > 0: + rgx = '|'.join(re.escape(x) for x in cfg.f_checks) + m = re.search(rgx, text) + if m is not None: + yield RuntimeError('found occurence', rgx, m.group()) + ## + + ## find 'words' + if len(cfg.word_checks) > 0: + rgx = r'\b(' + '|'.join(re.escape(x) for x in cfg.word_checks) + r')\b' + m = re.search(rgx, text) + if m is not None: + yield RuntimeError('found occurence', rgx, m.group()) + ## + + ### detect timestamps with time ts = orgparse.date.TIMESTAMP_RE - for line in path.read_text().splitlines(): - m = ts.search(line) + allowed = { + 'inactive_year', + 'inactive_month', + 'inactive_day', + } + pos = 0 + while pos < len(text): + m = ts.search(text, pos=pos) if m is None: - continue - allowed = { - 'inactive_year', - 'inactive_month', - 'inactive_day', + break + d = { + k: v + for k, v in m.groupdict().items() + if v is not None and k not in allowed } - d = {k: v for k, v in m.groupdict().items() if v is not None and k not in allowed} if len(d) != 0: - yield RuntimeError(d, line) + yield RuntimeError(d, m.group()) + pos = m.start() + 1 + ### - o = orgparse.loads(path.read_text()) + ### find forbidden tags + o = orgparse.loads(text) for n in o: - found = n.tags.intersection(TAG_CHECKS) + found = n.tags.intersection(cfg.tag_checks) if len(found) > 0: yield RuntimeError(path, n.heading, found) + ### + + +def test_checks(tmp_path: Path) -> None: + cfg = Config( + f_checks=[ + 'web.telegram.org', + 'mail.google.com', + ], + word_checks=['some', 'noth'], + tag_checks=set(), + ) + + def do(text: str) -> list[Exception]: + p = tmp_path / 'file.org' + p.write_text(text) + return list(check_one(p, cfg=cfg)) + + assert len(do(''' +* nothing +** [2019-11-02 Sat] wrong +with + +* this file :sometag: +''')) == 0 + assert len(do(''' +somethings +whoopsweb.telegram.org/accidental link +more text +''')) == 1 -def _check_one(path: Path) -> List[str]: - # helper for multiprocessing.. - return list(map(str, check_one(path))) + assert len(do(''' +somethings +oh +mail.google.com/somelink +more text +''')) == 1 + assert len(do(''' +* bad link in body [2019-10-17 Thu] +alala mail.google.com/whatever +* but also bad timestamp + [2019-10-18 Fri 02:06] +''')) == 2 -def check_all(path: Path) -> None: - # TODO not sure about org? - org_files = sorted(path.rglob('*.org')) + assert len(do(''' +* I am ok +* I contain some forbidden words +''')) == 1 - with ProcessPoolExecutor() as pool: - for f, res in zip(org_files, pool.map(_check_one, org_files)): - for x in res: - # TODO collect errors, report once? - raise RuntimeError(x) + assert len(do(''' +* I end with forbidden noth +* but I am ok +''')) == 1