From 0968b57c067847a63f12faaab506296718db9b3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=B8rre=20Gaup?= Date: Wed, 21 Feb 2024 14:32:19 +0100 Subject: [PATCH] =?UTF-8?q?We=20don=E2=80=99t=20need=20scripts=20here,=20t?= =?UTF-8?q?hey=20have=20been=20moved=20to=20another=20repo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/gramcheck-test.py | 281 -------- scripts/gramcheck_comparator.py | 998 ----------------------------- scripts/make_grammarchecker_zip.py | 39 -- 3 files changed, 1318 deletions(-) delete mode 100755 scripts/gramcheck-test.py delete mode 100755 scripts/gramcheck_comparator.py delete mode 100755 scripts/make_grammarchecker_zip.py diff --git a/scripts/gramcheck-test.py b/scripts/gramcheck-test.py deleted file mode 100755 index 9e41d903..00000000 --- a/scripts/gramcheck-test.py +++ /dev/null @@ -1,281 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- - -# Copyright © 2020-2023 UiT The Arctic University of Norway -# License: GPL3 -# Author: Børre Gaup -"""Check if grammarchecker tests pass.""" - -import sys -from pathlib import Path -import io - -import libdivvun -import yaml -from lxml import etree - -from corpustools import errormarkup -from gramcheck_comparator import COLORS, UI, GramChecker, GramTest - - -class YamlGramChecker(GramChecker): - def __init__(self, config): - super().__init__() - self.config = config - self.checker = self.app() - - @staticmethod - def print_error(string): - print(string, file=sys.stderr) - - def get_variant(self, checker_spec): - for variant in self.config.get("variants"): - if checker_spec.hasPipe(variant): - return variant - - def app(self): - spec_file = self.config.get("spec") - - checker_spec = ( - libdivvun.ArCheckerSpec(spec_file.as_posix()) - if spec_file.suffix == ".zcheck" - else libdivvun.CheckerSpec(str(spec_file)) - ) - if self.config.get("variants") is None: - return checker_spec.getChecker( - pipename=checker_spec.defaultPipe(), - verbose=False, - ) - else: - variant = self.get_variant(checker_spec) - if variant is not None: - return checker_spec.getChecker( - pipename=variant, - verbose=False, - ) - else: - self.print_error( - "Error in section Variant of the yaml file.\n" - "There is no pipeline named " - f"{variant} in {spec_file}" - ) - available_names = "\n".join(checker_spec.pipeNames()) - self.print_error("Available pipelines are\n" f"{available_names}") - sys.exit(5) - - -class YamlGramTest(GramTest): - explanations = { - "tp": "GramDivvun found marked up error and has the suggested correction", - "fp1": "GramDivvun found manually marked up error, but corrected wrongly", - "fp2": "GramDivvun found error which is not manually marked up", - "fn1": "GramDivvun found manually marked up error, but has no correction", - "fn2": "GramDivvun did not find manually marked up error", - } - - def __init__(self, args): - super().__init__() - self.config = self.load_config(args) - - def load_config(self, args): - config = {} - - if args.silent: - config["out"] = GramTest.NoOutput(args) - else: - config["out"] = { - "normal": GramTest.NormalOutput, - "terse": GramTest.TerseOutput, - "compact": GramTest.CompactOutput, - "silent": GramTest.NoOutput, - "final": GramTest.FinalOutput, - }.get(args.output, lambda x: None)(args) - - config["test_file"] = Path(args.test_files[0]) - - if not args.colour: - for key in list(COLORS.keys()): - COLORS[key] = "" - - yaml_settings = self.yaml_reader(config["test_file"]) - - config["spec"] = ( - config["test_file"].parent / yaml_settings.get("Config").get("Spec") - if not args.spec - else Path(args.spec) - ) - config["variants"] = ( - yaml_settings.get("Config").get("Variants") - if not args.variant - else [args.variant] - ) - config["tests"] = yaml_settings.get("Tests", []) - - if args.total and len(args.test_files) == 1: - notfixed = ( - config["test_file"].parent / f"{config['test_file'].stem}.notfixed.yaml" - ) - tests = self.yaml_reader(notfixed).get("Tests") - if notfixed.is_file() and tests: - config["tests"].extend(tests) - - if len(args.test_files) > 1: - for test_file in args.test_files[1:]: - tests = self.yaml_reader(Path(test_file)).get("Tests") - if tests: - config["tests"].extend(tests) - - return config - - @staticmethod - def yaml_reader(test_file): - with test_file.open() as test_file: - return yaml.load(test_file, Loader=yaml.FullLoader) - - def make_error_markup(self, text): - para = etree.Element("p") - try: - para.text = text - errormarkup.convert_to_errormarkupxml(para) - except TypeError: - print(f'Error in {self.config["test_file"]}') - print(text, "is not a string") - return para - - @property - def paragraphs(self): - grammarchecker = YamlGramChecker(self.config) - - return ( - ( - grammarchecker.get_data( - str(self.config["test_file"]), self.make_error_markup(text) - ) - for text in self.config["tests"] - ) - if self.config["tests"] - else [] - ) - - def move_passes_from_fail(self): - if "FAIL" in self.config["test_file"].name and any(self.test_results): - passing_tests = [ - self.config["tests"][index] - for (index, test_result) in enumerate(self.test_results) - if test_result - ] - - pass_path = Path(str(self.config["test_file"]).replace("FAIL", "PASS")) - with pass_path.open("a") as pass_stream: - print( - "\n".join([f' - "{this_test}"' for this_test in passing_tests]), - file=pass_stream, - ) - - with io.StringIO() as temp_stream: - with self.config["test_file"].open("r") as input: - temp_stream.write( - "".join( - [ - line - for line in input - if not any( - [ - passing_test in line.strip() - for passing_test in passing_tests - ] - ) - ] - ) - ) - self.config["test_file"].open("w").write(temp_stream.getvalue()) - - def run(self): - failed_or_not = super().run() - - self.move_passes_from_fail() - - return failed_or_not - - -class YamlUI(UI): - def __init__(self): - super().__init__() - - self.description = "Test errormarkuped up sentences" - self.add_argument( - "-o", - "--output", - choices=["normal", "compact", "terse", "final"], - dest="output", - default="normal", - help="""Desired output style (Default: normal)""", - ) - self.add_argument( - "-q", - "--silent", - dest="silent", - action="store_true", - help="Hide all output; exit code only", - ) - self.add_argument( - "-p", - "--hide-passes", - dest="hide_pass", - action="store_true", - help="Suppresses passes to make finding fails easier", - ) - self.add_argument( - "-s", - "--spec", - dest="spec", - required=False, - help="""Path to the pipeline.xml spec file. Usefull when doing out - of tree builds""", - ) - self.add_argument( - "-V", - "--variant", - dest="variant", - required=False, - help="""Which variant should be used.""", - ) - self.add_argument( - "-t", - "--total", - dest="total", - action="store_true", - required=False, - help="""Merge tests from x.yaml and x.notfixed.yaml""", - ) - self.add_argument( - "-v", - "--verbose", - dest="verbose", - action="store_true", - help="More verbose output.", - ) - self.add_argument("test_files", nargs="+", help="YAML files with test rules") - - self.test = YamlGramTest(self.parse_args()) - - -def main(): - try: - ui = YamlUI() - ui.start() - except KeyboardInterrupt: - sys.exit(130) - - -if __name__ == "__main__": - try: - main() - except ( - FileNotFoundError, - yaml.scanner.ScannerError, - yaml.parser.ParserError, - errormarkup.ErrorMarkupError, - ) as error: - print(str(error), file=sys.stderr) - raise SystemExit(1) diff --git a/scripts/gramcheck_comparator.py b/scripts/gramcheck_comparator.py deleted file mode 100755 index a3ee5612..00000000 --- a/scripts/gramcheck_comparator.py +++ /dev/null @@ -1,998 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- - -# Copyright © 2020-2023 UiT The Arctic University of Norway -# License: GPL3 -# Author: Børre Gaup -"""Write report on differences on manual markup and gramdivvun markup""" -import ctypes -import io -import os -import sys -import tempfile -from argparse import ArgumentParser -from collections import Counter -from contextlib import contextmanager -from io import StringIO -from pathlib import Path - -import libdivvun -from corpustools import ccat, errormarkup -from lxml import etree - - -@contextmanager -def stderr_redirector(stream): - """Catch errors from libdivvun""" - libc = ctypes.CDLL(None) - c_stderr = ( - ctypes.c_void_p.in_dll(libc, "__stderrp") - if sys.platform == "darwin" - else ctypes.c_void_p.in_dll(libc, "stderr") - ) - - # The original fd stdout points to. Usually 1 on POSIX systems. - original_stderr_fd = sys.stderr.fileno() - - def _redirect_stderr(to_fd): - """Redirect stderr to the given file descriptor.""" - # Flush the C-level buffer stderr - libc.fflush(c_stderr) - # Flush and close sys.stderr - also closes the file descriptor (fd) - sys.stderr.close() - # Make original_stderr_fd point to the same file as to_fd - os.dup2(to_fd, original_stderr_fd) - # Create a new sys.stderr that points to the redirected fd - sys.stderr = io.TextIOWrapper(os.fdopen(original_stderr_fd, "wb")) - - # Save a copy of the original stderr fd in saved_stderr_fd - saved_stderr_fd = os.dup(original_stderr_fd) - try: - # Create a temporary file and redirect stderr to it - tfile = tempfile.TemporaryFile(mode="w+b") - _redirect_stderr(tfile.fileno()) - # Yield to caller, then redirect stderr back to the saved fd - yield - _redirect_stderr(saved_stderr_fd) - # Copy contents of temporary file to the given stream - tfile.flush() - tfile.seek(0, io.SEEK_SET) - stream.write(tfile.read()) - finally: - tfile.close() - os.close(saved_stderr_fd) - - -COLORS = { - "red": "\033[1;31m", - "green": "\033[0;32m", - "orange": "\033[0;33m", - "yellow": "\033[1;33m", - "blue": "\033[0;34m", - "light_blue": "\033[0;36m", - "reset": "\033[m", -} - - -def extract_correction(child): - """Replace error element with correction attribute.""" - correct = child.find("./correct") - parts = [correct.text if correct is not None and correct.text is not None else ""] - if child.tail: - parts.append(child.tail) - - return "".join(parts) - - -def colourise(string, *args, **kwargs): - kwargs.update(COLORS) - return string.format(*args, **kwargs) - - -class GramChecker: - def __init__(self, ignore_typos=False): - self.ignore_typos = ignore_typos - - def check_grammar(self, sentence): - f = io.BytesIO() - - with stderr_redirector(f): # catch stderr from libdivvun - d_errors = libdivvun.proc_errs_bytes(self.checker, sentence) - - errs = [ - [ - d_error.form, - d_error.beg, - d_error.end, - d_error.err, - d_error.dsc, - [ - rep - if d_error.form != d_error.form.capitalize() - else rep.capitalize() - for rep in d_error.rep - ], - d_error.msg, - ] - for d_error in d_errors - ] - - return {"text": sentence, "errs": errs} - - @staticmethod - def remove_dupes(double_spaces, d_errors): - for removable_error in [ - d_error - for double_space in double_spaces - for d_error in d_errors - if double_space[1:2] == d_error[1:2] - ]: - d_errors.remove(removable_error) - - @staticmethod - def sortByRange(error): - return error[1:2] - - def add_part(self, part, start, end, d_errors): - res = self.check_grammar(part) - errors = res["errs"] - for error in [error for error in errors if error]: - candidate = [error[0], start, end, error[3], error[4], error[5], error[6]] - if candidate not in d_errors: - d_errors.append(candidate) - - def fix_no_space_before_parent_start(self, space_error, d_errors): - for dupe in [ - d_error for d_error in d_errors if d_error[1:2] == space_error[1:2] - ]: - d_errors.remove(dupe) - - parenthesis = space_error[0].find("(") - d_errors.append( - [ - space_error[0][parenthesis:], - space_error[1] + parenthesis, - space_error[2], - space_error[3], - space_error[4], - [" ("], - space_error[6], - ] - ) - part1 = space_error[0][:parenthesis] - start = space_error[1] - end = space_error[1] + len(part1) - if part1: - self.add_part(part1, start, end, d_errors) - - part2 = space_error[0][parenthesis + 1 :] - start = space_error[1] + parenthesis + 1 - end = space_error[1] + parenthesis + 1 + len(part2) - if part2: - self.add_part(part2, start, end, d_errors) - - d_errors.sort(key=self.sortByRange) - - def fix_aistton_left(self, d_error, d_errors, position): - sentence = d_error[0][1:] - d_error[0] = d_error[0][0] - d_error[5] = ["”"] - d_error[2] = d_error[1] + 1 - - res = self.check_grammar(sentence) - new_d_error = res["errs"] - if new_d_error: - new_d_error[0][1] = d_error[1] + 1 - new_d_error[0][2] = d_error[1] + 1 + len(sentence) - d_errors.insert(position + 1, new_d_error[0]) - - def fix_aistton_right(self, d_error, d_errors, position): - sentence = d_error[0][:-1] - d_error[0] = d_error[0][-1] - d_error[5] = ["”"] - d_error[1] = d_error[2] - 1 - - res = self.check_grammar(sentence) - new_d_error = res["errs"] - if new_d_error: - new_d_error[0][1] = d_error[1] - len(sentence) - new_d_error[0][2] = d_error[1] - d_errors.insert(position, new_d_error[0]) - - def fix_hidden_by_aistton_both(self, d_errors): - """Make the index, error and suggestions match the manual errormarkup.""" - - def is_hidden_error(error): - return (error[1], error[2]) in aistton_both_ranges and error[ - 3 - ] != "punct-aistton-both" - - def fix_hidden_error(error): - return [ - error[0][1:-1], - error[1] + 1, - error[2] - 1, - error[3], - error[4], - [suggestion[1:-1] for suggestion in error[5]], - ] - - aistton_both_ranges = [ - (error[1], error[2]) - for error in d_errors - if error[3] == "punct-aistton-both" - ] - return [ - fix_hidden_error(error) if is_hidden_error(error) else error - for error in d_errors - ] - - def fix_aistton_both(self, d_error, d_errors, position): - if d_error[0][-1] != "”": - right_error = [part for part in d_error] - right_error[0] = right_error[0][-1] - right_error[5] = ["”"] - right_error[1] = right_error[2] - 1 - right_error[3] = "punct-aistton-both" - d_errors.insert(position + 1, right_error) - - d_error[0] = d_error[0][0] - d_error[5] = ["”"] - d_error[2] = d_error[1] + 1 - - def fix_aistton(self, d_errors): - aistton_fixers = { - "punct-aistton-left": self.fix_aistton_left, - "punct-aistton-right": self.fix_aistton_right, - "punct-aistton-both": self.fix_aistton_both, - } - - for position, d_error in enumerate(d_errors): - if ( - d_error[3] in aistton_fixers - and len(d_error[0]) > 1 - and len(d_error[5]) == 1 - ): - aistton_fixers[d_error[3]](d_error, d_errors, position) - - def get_error_corrections(self, para): - parts = [] - if para.text is not None: - parts.append(para.text) - for child in para: - if child.tag != "correct": - correct = child.find("./correct") - parts.append(correct.text if correct.text is not None else "") - for grandchild in child: - if grandchild.tag != "correct": - parts.append(self.get_error_corrections(grandchild)) - - if not len(para) and para.tail: - parts.append(para.tail) - - return "".join(parts) - - @staticmethod - def is_non_nested_error(para): - """Check if the only children are correct elements.""" - return all(child.tag == "correct" for child in para) - - def extract_error_info(self, parts, errors, para): - """Only collect unnested errors.""" - info = ["", "", "", "", "", ""] - if para.tag.startswith("error"): - info[0] = self.get_error_corrections(para) if len(para) else para.text - info[1] = len("".join(parts)) - info[3] = para.tag - correct = para.find("./correct") - info[4] = correct.attrib.get("errorinfo", default="") - info[5] = [ - correct.text if correct.text is not None else "" - for correct in para.xpath("./correct") - ] - - if para.text: - parts.append(para.text) - - for child in para: - if child.tag != "correct": - if self.is_non_nested_error(child): - errors.append(self.extract_error_info(parts, errors, child)) - else: - self.extract_error_info(parts, errors, child) - - if para.tag.startswith("error"): - info[2] = len("".join(parts)) - - if para.tail: - parts.append(para.tail) - - return info - - def fix_all_errors(self, d_errors): - """Remove errors that cover the same area of the typo and msyn types.""" - - def report_dupes(errors): - found_errors = set() - index_set = set() - for error1 in errors: - for error2 in errors: - if error1[:3] == error2[:3] and error1 != error2: - if ( - str(error1) not in found_errors - and str(error2) not in found_errors - ): - found_errors.add(str(error1)) - found_errors.add(str(error2)) - index_set.add(errors.index(error1)) - - for pos in sorted(index_set, reverse=True): - del errors[pos] - - d_errors = self.fix_hidden_by_aistton_both(d_errors) - self.fix_aistton(d_errors) - for d_error in d_errors: - if d_error[3] == "no-space-before-parent-start": - self.fix_no_space_before_parent_start(d_error, d_errors) - - report_dupes(d_errors) - - return d_errors - - def check_sentence(self, sentence): - res = self.check_grammar(sentence) - - return self.fix_all_errors(res["errs"]) - - def normalise_error_markup(self, errors): - for error in errors: - if ( - error[3] == "errorformat" - and error[4] == "notspace" - and " " in error[0] - ): - d_pos = error[0].find(" ") - error[1] = error[1] + d_pos - error[2] = error[1] + 3 - error[0] = error[0][error[1] : error[2]] - - def normalise_grammar_markup(self, errors): - for error in errors: - if error[3] == "double-space-before": - d_pos = error[0].find(" ") - error[1] = error[1] + d_pos - error[2] = error[1] + 3 - error[0] = error[0][error[1] : error[2]] - - def remove_non_hits(self, errors, d_errors): - """Find the d_errors that correspond with errors.""" - return [ - d_error - for error in errors - for d_error in d_errors - if d_error[1:2] == error[1:2] - ] - - def correct_lowest_level(self, para): - """Replace error markup of zero length with correction.""" - new_para = etree.Element(para.tag) - new_para.text = para.text if para.text else "" - - for child in para: - if child.tag.startswith("error"): - if self.is_non_nested_error(child): - if len(new_para): - new_para[-1].tail += extract_correction(child) - else: - new_para.text += extract_correction(child) - else: - new_para.append(self.correct_lowest_level(child)) - else: - new_para.append(child) - - new_para.tail = para.tail if para.tail else "" - - return new_para - - def nested_errors(self, para): - """Grammarcheck a level at a time.""" - while True: - para = self.correct_lowest_level(para) - if self.is_non_nested_error(para): - break - _, errors, d_errors = self.error_extractor(para) - yield errors, self.remove_non_hits(errors, d_errors) - - def error_extractor(self, para): - """Extract sentence, markup errors and grammarchecker errors.""" - parts = [] - errors = [] - self.extract_error_info(parts, errors, para) - self.normalise_error_markup(errors) - - sentence = "".join(parts) - d_errors = self.check_sentence(sentence) - self.normalise_grammar_markup(d_errors) - - return sentence, errors, d_errors - - def remove_foreign(self, marked_errors, found_errors): - """Remove foreign language error elements.""" - foreign_ranges = [ - (marked_error[1], marked_error[2]) - for marked_error in marked_errors - if marked_error[3] == "errorlang" - ] - return ( - [ - marked_error - for marked_error in marked_errors - if marked_error[3] != "errorlang" - ], - [ - found_error - for found_error in found_errors - if not any( - foreign_range[0] <= found_error[1] < foreign_range[1] - and found_error[2] <= foreign_range[1] - for foreign_range in foreign_ranges - ) - ], - ) - - def remove_typo(self, marked_errors, found_errors): - """Remove foreign language error elements.""" - return ( - [ - marked_error - for marked_error in marked_errors - if marked_error[3] != "errorort" - ], - [found_error for found_error in found_errors if found_error[3] != "typo"], - ) - - def get_data(self, filename, para): - """Extract data for reporting from a paragraph.""" - sentence, errors, d_errors = self.error_extractor(para) - - for next_errors, next_d_errors in self.nested_errors(para): - errors.extend(next_errors) - d_errors.extend(next_d_errors) - - errors, d_errors = self.remove_foreign(errors, d_errors) - if self.ignore_typos: - errors, d_errors = self.remove_typo(errors, d_errors) - - return { - "uncorrected": sentence, - "expected_errors": errors, - "gramcheck_errors": d_errors, - "filename": filename, - } - - -class CorpusGramChecker(GramChecker): - """Check for grammarerrors in errormarkup files from a Giella corpus.""" - - def __init__(self, archive, ignore_typos): - super().__init__(ignore_typos) - self.archive = archive - self.checker = self.app() - - def app(self): - def print_error(string): - print(string, file=sys.stderr) - - archive_file = Path(self.archive) - if archive_file.is_file(): - spec = libdivvun.ArCheckerSpec(str(archive_file)) - pipename = spec.defaultPipe() - verbose = False - return spec.getChecker(pipename, verbose) - else: - print_error( - "Error in section Archive of the yaml file.\n" - + f"The file {archive_file} does not exist" - ) - sys.exit(2) - - -class GramTest: - class AllOutput: - def __init__(self, args): - self._io = StringIO() - self.args = args - - def __str__(self): - return self._io.getvalue() - - def write(self, data): - self._io.write(data) - - def info(self, data): - self.write(data) - - def title(self, *args): - pass - - def success(self, *args): - pass - - def failure(self, *args): - pass - - def false_positive_1(self, *args): - pass - - def result(self, *args): - pass - - def final_result(self, count): - passes = count["tp"] - fails = sum([count[key] for key in count if key != "tp"]) - self.write( - colourise( - "Total passes: {green}{passes}{reset}, " - + "Total fails: {red}{fails}{reset}, " - + "Total: {light_blue}{total}{reset}\n", - passes=passes, - fails=fails, - total=fails + passes, - ) - ) - - class NormalOutput(AllOutput): - def title(self, index, length, test_case): - self.write(f'{colourise("{light_blue}")}') - self.write("-" * 10) - self.write(f"\nTest {index}/{length}: {test_case}\n") - self.write("-" * 10) - self.write(f'{colourise("{reset}")}\n') - - def success(self, case, total, type, expected_error, gramcheck_error, filename): - self.write(filename + "\n") - errorinfo = f", ({expected_error[4]})" - x = colourise( - ( - "[{light_blue}{case:>%d}/{total}{reset}]" - + "[{green}PASS {type}{reset}] " - + "{error}:{correction} ({expectected_type}) {blue}=>{reset} " - + "{gramerr}:{errlist} ({gram_type})\n" - ) - % len(str(total)), - type=type, - error=expected_error[0], - correction=", ".join(expected_error[5]), - expectected_type=f"{expected_error[4]}{errorinfo}", - case=case, - total=total, - gramerr=gramcheck_error[0], - errlist=f'[{", ".join(gramcheck_error[5])}]', - gram_type=gramcheck_error[3], - ) - self.write(x) - - def failure(self, case, total, type, expected_error, gramcheck_error, filename): - self.write(filename + "\n") - errorinfo = f", ({expected_error[4]})" - x = colourise( - ( - "[{light_blue}{case:>%d}/{total}{reset}][{red}FAIL {type}" - "{reset}] {error}:{correction} ({expectected_type}) " - + "{blue}=>{reset} {gramerr}:{errlist} ({gram_type})\n" - ) - % len(str(total)), - type=type, - error=expected_error[0], - correction=", ".join(expected_error[5]), - expectected_type=f"{expected_error[4]}{errorinfo}", - case=case, - total=total, - gramerr=gramcheck_error[0], - errlist=f'[{", ".join(gramcheck_error[5])}]', - gram_type=gramcheck_error[3], - ) - self.write(x) - - def result(self, number, count, test_case): - passes = sum([count[key] for key in count if key.startswith("t")]) - fails = sum([count[key] for key in count if key.startswith("f")]) - text = colourise( - "Test {number} - Passes: {green}{passes}{reset}, " - + "Fails: {red}{fails}{reset}, " - + "Total: {light_blue}{total}{reset}\n\n", - number=number, - passes=passes, - fails=fails, - total=passes + fails, - ) - self.write(text) - - def final_result(self, count): - passes = sum([count[key] for key in count if key.startswith("t")]) - fails = sum([count[key] for key in count if key.startswith("f")]) - self.write( - colourise( - "Total passes: {green}{passes}{reset}, " - + "Total fails: {red}{fails}{reset}, " - + "Total: {light_blue}{total}{reset}\n", - passes=passes, - fails=fails, - total=fails + passes, - ) - ) - self.precision(count) - - def precision(self, count): - try: - true_positives = count["tp"] - false_positives = count["fp1"] + count["fp2"] - false_negatives = count["fn1"] + count["fn2"] - - prec = true_positives / (true_positives + false_positives) - recall = true_positives / (true_positives + false_negatives) - f1score = 2 * prec * recall / (prec + recall) - - self.write( - colourise( - "True positive: {green}{true_positive}{reset}\n" - + "True negative: {green}{true_negative}{reset}\n" - + "False positive 1: {red}{fp1}{reset}\n" - + "False positive 2: {red}{fp2}{reset}\n" - + "False negative 1: {red}{fn1}{reset}\n" - + "False negative 2: {red}{fn2}{reset}\n" - + "Precision: {prec:.1f}%\n" - + "Recall: {recall:.1f}%\n" - + "F₁ score: {f1score:.1f}%\n", - true_positive=count["tp"], - true_negative=count["tn"], - fp1=count["fp1"], - fp2=count["fp2"], - fn1=count["fn1"], - fn2=count["fn2"], - prec=prec * 100, - recall=recall * 100, - f1score=f1score * 100, - ) - ) - except ZeroDivisionError: - pass - - class CompactOutput(AllOutput): - def result(self, number, count, test_case): - passes = sum([count[key] for key in count if key.startswith("t")]) - fails = sum([count[key] for key in count if key.startswith("f")]) - out = f"{test_case} {passes}/{fails}/{passes + fails}" - if fails: - self.write(colourise("[{red}FAIL{reset}] {}\n", out)) - else: - self.write(colourise("[{green}PASS{reset}] {}\n", out)) - - class TerseOutput(AllOutput): - def success(self, *args): - self.write(colourise("{green}.{reset}")) - - def failure(self, *args): - self.write(colourise("{red}!{reset}")) - - def result(self, *args): - self.write("\n") - - def final_result(self, count): - fails = sum([count[key] for key in count if key != "tp"]) - if fails: - self.write(colourise("{red}FAIL{reset}\n")) - else: - self.write(colourise("{green}PASS{reset}\n")) - - class FinalOutput(AllOutput): - def final_result(self, count): - passes = sum([count[key] for key in count if key.startswith("t")]) - fails = sum([count[key] for key in count if key.startswith("f")]) - self.write(f"{passes}/{fails}/{passes+fails}") - - class NoOutput(AllOutput): - def final_result(self, *args): - pass - - def __init__(self): - self.count = Counter() - - def run_tests(self): - tests = self.tests - self.test_results = [ - self.run_test(item, len(tests)) - for item in enumerate(tests.items(), start=1) - ] - - self.config.get("out").final_result(self.count) - - def run_test(self, item, length): - count = Counter() - - out = self.config.get("out") - out.title(item[0], length, item[1][0]) - - expected_errors = item[1][1]["expected_errors"] - gramcheck_errors = item[1][1]["gramcheck_errors"] - filename = item[1][1]["filename"] - - for true_positive in self.has_true_positives(expected_errors, gramcheck_errors): - count["tp"] += 1 - out.success( - item[0], length, "tp", true_positive[0], true_positive[1], filename - ) - - for true_negative in self.has_true_negatives(expected_errors, gramcheck_errors): - count["tn"] += 1 - out.success( - item[0], length, "tn", true_negative[0], true_negative[1], filename - ) - - for false_positive_1 in self.has_false_positives_1( - expected_errors, gramcheck_errors - ): - count["fp1"] += 1 - out.failure( - item[0], - length, - "fp1", - false_positive_1[0], - false_positive_1[1], - filename, - ) - - expected_error = ["", "", "", "", "", ""] - for false_positive_2 in self.has_false_positives_2( - expected_errors, gramcheck_errors - ): - count["fp2"] += 1 - out.failure( - item[0], length, "fp2", expected_error, false_positive_2, filename - ) - - for false_negative_1 in self.has_false_negatives_1( - expected_errors, gramcheck_errors - ): - count["fn1"] += 1 - out.failure( - item[0], - length, - "fn1", - false_negative_1[0], - false_negative_1[1], - filename, - ) - - for false_negative_2 in self.has_false_negatives_2( - expected_errors, gramcheck_errors - ): - gramcheck_error = ["", "", "", "", "", []] - count["fn2"] += 1 - out.failure( - item[0], length, "fn2", false_negative_2, gramcheck_error, filename - ) - - out.result(item[0], count, item[1][0]) - - for key in count: - self.count[key] += count[key] - - # Did this test sentence as a whole pass or not - return all(key.startswith("t") for key in count.keys()) - - def has_same_range_and_error(self, c_error, d_error): - """Check if the errors have the same range and error""" - if d_error[3] == "double-space-before": - return c_error[1:2] == d_error[1:2] - else: - return c_error[:3] == d_error[:3] - - def has_suggestions_with_hit(self, c_error, d_error): - """Check if markup error correction exists in grammarchecker error.""" - return ( - len(d_error[5]) > 0 - and self.has_same_range_and_error(c_error, d_error) - and any([correct in d_error[5] for correct in c_error[5]]) - ) - - def has_true_negatives(self, correct, dc): - if not correct and not dc: - return [(["", "", "", "", "", ""], ["", "", "", "", "", ""])] - - return [] - - def has_true_positives(self, correct, dc): - return [ - (c_error, d_error) - for c_error in correct - for d_error in dc - if self.has_suggestions_with_hit(c_error, d_error) - ] - - def has_false_positives_1(self, correct, dc): - return [ - (c_error, d_error) - for c_error in correct - for d_error in dc - if self.has_suggestions_without_hit(c_error, d_error) - ] - - def has_suggestions_without_hit(self, c_error, d_error): - return ( - self.has_same_range_and_error(c_error, d_error) - and d_error[5] - and not any([correct in d_error[5] for correct in c_error[5]]) - ) - - def has_false_positives_2(self, correct, dc): - return [ - d_error - for d_error in dc - if not any( - self.has_same_range_and_error(c_error, d_error) for c_error in correct - ) - ] - - def has_false_negatives_2(self, c_errors, d_errors): - corrects = [] - for c_error in c_errors: - for d_error in d_errors: - if self.has_same_range_and_error(c_error, d_error): - break - else: - corrects.append(c_error) - - return corrects - - def has_false_negatives_1(self, correct, dc): - return [ - (c_error, d_error) - for c_error in correct - for d_error in dc - if self.has_no_suggestions(c_error, d_error) - ] - - def has_no_suggestions(self, c_error, d_error): - return self.has_same_range_and_error(c_error, d_error) and not d_error[5] - - def run(self): - self.run_tests() - - return 1 if any([key.startswith("f") for key in self.count]) else 0 - - def __str__(self): - return str(self.config.get("out")) - - @property - def tests(self): - return {test["uncorrected"]: test for test in self.paragraphs} - - -class CorpusGramTest(GramTest): - def __init__(self, args): - super().__init__() - self.ignore_typos = args.ignore_typos - self.archive = args.archive - self.targets = args.targets - self.config = {"out": GramTest.NormalOutput(args)} - if not args.colour: - for key in list(COLORS.keys()): - COLORS[key] = "" - - def flatten_para(self, para): - """Convert non-error xml elements into plain text.""" - if not (para.tag.startswith("error") or para.tag == "correct"): - text = para.text if para.text else "" - - if para.tail: - text += para.tail - - parent = para.getparent() - if parent is not None: - parent.remove(para) - if parent.text: - parent.text = parent.text + text - else: - parent.text = text - - for child in para: - self.flatten_para(child) - - def keep_url(self, root): - """Keep url as plain text.""" - for url in root.xpath('.//errorlang[@correct="url"]'): - parent = url.getparent() - previous = url.getprevious() - if previous is not None: - if url.text is not None: - if previous.tail is not None: - previous.tail += url.text - else: - previous.tail = url.text - if url.tail is not None: - if previous.tail is not None: - previous.tail += url.tail - else: - previous.tail = url.tail - else: - if url.text is not None: - if parent.text is not None: - parent.text += url.text - else: - parent.text = url.text - if url.tail is not None: - if parent.text is not None: - parent.text += url.tail - else: - parent.text = url.tail - - parent.remove(url) - - @property - def paragraphs(self): - grammarchecker = CorpusGramChecker(self.archive, self.ignore_typos) - - for filename in ccat.find_files(self.targets, ".xml"): - root = etree.parse(filename) - self.keep_url(root) - for para in root.iter("p"): - # the xml:lang attribute indicates that the sentence is not the expected - # language. These paragraphs are not included in the test. - if not para.get("{http://www.w3.org/XML/1998/namespace}lang"): - self.flatten_para(para) - yield grammarchecker.get_data(filename, para) - - -class UI(ArgumentParser): - def __init__(self): - super().__init__() - self.add_argument( - "-c", - "--colour", - dest="colour", - action="store_true", - help="Colours the output", - ) - self.test = None - - def start(self): - ret = self.test.run() - sys.stdout.write(str(self.test)) - sys.exit(ret) - - -class CorpusUI(UI): - def __init__(self): - super().__init__() - self.add_argument( - "--ignore-typos", - dest="ignore_typos", - action="store_true", - help="Pretend as if typos are correct", - ) - self.add_argument("archive", help="The grammarchecker archive") - self.add_argument( - "targets", - nargs="+", - help="""Name of the file or directories to process. - If a directory is given, all files in this directory - and its subdirectories will be listed.""", - ) - - self.test = CorpusGramTest(self.parse_args()) - - -def main(): - try: - ui = CorpusUI() - ui.start() - except KeyboardInterrupt: - sys.exit(130) - - -if __name__ == "__main__": - try: - main() - except (FileNotFoundError, errormarkup.ErrorMarkupError) as error: - raise SystemExit(error) diff --git a/scripts/make_grammarchecker_zip.py b/scripts/make_grammarchecker_zip.py deleted file mode 100755 index fcd4cf16..00000000 --- a/scripts/make_grammarchecker_zip.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- - -# Copyright © 2023 UiT The Arctic University of Norway -# License: GPL3 -# Author: Børre Gaup -"""Make a grammarchecker zip archive without '-dev' variants""" -from lxml import etree -from zipfile import ZipFile -import sys - - -def get_pipespec(spec_file): - """Remove all '-dev' pipelines.""" - pipespec = etree.parse(spec_file) - for pipeline in pipespec.iter("pipeline"): - if pipeline.xpath(".//*[contains(@n, './')]"): - pipeline.getparent().remove(pipeline) - - return pipespec - - -def make_archive(specfile, archive_name): - """Make grammarchecker archive without '-dev' variants.""" - pipespec = get_pipespec(specfile) - with ZipFile(archive_name, "w") as archive_zip: - archive_zip.writestr("pipespec.xml", etree.tostring(pipespec)) - - for filename in { - element.attrib.get("n") for element in pipespec.xpath(".//*[@n]") - }: - archive_zip.write(filename) - - -if __name__ == "__main__": - try: - make_archive(specfile=sys.argv[1], archive_name=sys.argv[2]) - except FileNotFoundError as error: - raise SystemExit(error)