From bc164b5d9692b71622ed97ec5c78619ef805fd9e Mon Sep 17 00:00:00 2001
From: Frankie Robertson
Date: Sat, 26 May 2018 10:40:57 +0300
Subject: [PATCH] Run black

---
 finntk/__init__.py     | 14 +++++++-------
 finntk/omor/anlys.py   | 30 +++++++++++++++++-------------
 finntk/omor/extract.py |  9 ++++-----
 finntk/omor/inst.py    |  3 ++-
 finntk/omor/seg.py     | 10 +++++-----
 finntk/omor/tok.py     |  4 ++--
 setup.cfg              |  5 +++++
 setup.py               | 19 ++++++-------------
 8 files changed, 48 insertions(+), 46 deletions(-)
 create mode 100644 setup.cfg

diff --git a/finntk/__init__.py b/finntk/__init__.py
index d6c0e54..a9987b7 100644
--- a/finntk/__init__.py
+++ b/finntk/__init__.py
@@ -2,13 +2,13 @@
 from .omor.inst import get_omorfi
 from .omor.anlys import analysis_to_subword_dicts
-from .omor.extract import (
-    extract_lemmas,
-    extract_lemmas_combs,
-    extract_lemmas_recurs)
+from .omor.extract import extract_lemmas, extract_lemmas_combs, extract_lemmas_recurs
 
 __all__ = [
-    'get_token_positions', 'get_omorfi',
-    'analysis_to_subword_dicts', 'extract_lemmas',
-    'extract_lemmas_combs', 'extract_lemmas_recurs'
+    "get_token_positions",
+    "get_omorfi",
+    "analysis_to_subword_dicts",
+    "extract_lemmas",
+    "extract_lemmas_combs",
+    "extract_lemmas_recurs",
 ]
 
diff --git a/finntk/omor/anlys.py b/finntk/omor/anlys.py
index dac8f7d..4ec200e 100644
--- a/finntk/omor/anlys.py
+++ b/finntk/omor/anlys.py
@@ -2,9 +2,10 @@
 import re
 from itertools import product
 
+
 def analysis_to_pairs(ana):
-    for bit in ana.split(']['):
-        k, v = bit.strip('[]').split('=', 1)
+    for bit in ana.split("]["):
+        k, v = bit.strip("[]").split("=", 1)
         yield k, v
 
 
@@ -17,14 +18,15 @@ def analysis_to_dict(ana):
 
 
 def dict_to_analysis(d):
-    return "[{}]".format(']['.join(
-        ['{}={}'.format(k.upper(), v) for k, v in d.items()]))
+    return "[{}]".format(
+        "][".join(["{}={}".format(k.upper(), v) for k, v in d.items()])
+    )
 
 
 def chunk_subwords(it):
+
     def is_cmp_bound(kv):
-        return (kv[0] == 'BOUNDARY' and
-                kv[1] == 'COMPOUND')
+        return (kv[0] == "BOUNDARY" and kv[1] == "COMPOUND")
 
     return split_at(it, is_cmp_bound)
 
@@ -40,25 +42,27 @@ def analysis_to_subword_dicts(ana):
 
 def generate_dict(ana):
     from .inst import get_omorfi
+
     omor = get_omorfi()
     ana_cp = ana.copy()
-    if 'weight' in ana_cp:
-        del ana_cp['weight']
+    if "weight" in ana_cp:
+        del ana_cp["weight"]
     ana_txt = dict_to_analysis(ana_cp)
-    return {gen['surf'] for gen in omor.generate(ana_txt)}
+    return {gen["surf"] for gen in omor.generate(ana_txt)}
 
 
 def generate_or_passthrough(ana):
-    return {ana['word_id'] if s.startswith('[') else s
-            for s in generate_dict(ana)}
+    return {ana["word_id"] if s.startswith("[") else s for s in generate_dict(ana)}
 
 
 def lemmas_of_subword_dicts(subword_dicts):
     subword_dicts = list(subword_dicts)
     return [
-        ''.join(prefixes) + norm_word_id(subword_dicts[-1]['word_id'])
+        "".join(prefixes) + norm_word_id(subword_dicts[-1]["word_id"])
         for prefixes in product(
-            *(generate_or_passthrough(d) for d in subword_dicts[:-1]))]
+            *(generate_or_passthrough(d) for d in subword_dicts[:-1])
+        )
+    ]
 
 
 EXTRA_WORD_ID = re.compile("_\d$")
diff --git a/finntk/omor/extract.py b/finntk/omor/extract.py
index df06711..ca07340 100644
--- a/finntk/omor/extract.py
+++ b/finntk/omor/extract.py
@@ -15,7 +15,7 @@ def _extract_lemmas(word_form, get_slices):
     analyses = omorfi.analyse(word_form)
     res = set()
     for analysis in analyses:
-        analysis_dicts = analysis_to_subword_dicts(analysis['anal'])
+        analysis_dicts = analysis_to_subword_dicts(analysis["anal"])
         for analysis_slice in get_slices(analysis_dicts):
             for lemma in lemmas_of_subword_dicts(analysis_slice):
                 res.add(lemma)
@@ -24,9 +24,8 @@
 
 def extract_lemmas(word_form):
     return _extract_lemmas(
-        word_form,
-        lambda analysis_dicts:
-        [[d] for d in analysis_dicts])
+        word_form, lambda analysis_dicts: [[d] for d in analysis_dicts]
+    )
 
 
 def extract_lemmas_combs(word_form):
@@ -40,7 +39,7 @@ def extract_lemmas_recurs(word_form):
         word_form = expand_queue.pop()
        new_lemmas = extract_lemmas_combs(word_form)
         novel_lemmas = new_lemmas - res
-        print('novel_lemmas', novel_lemmas)
+        print("novel_lemmas", novel_lemmas)
         expand_queue.extend(novel_lemmas)
         for lemma in novel_lemmas:
             res.add(lemma)
diff --git a/finntk/omor/inst.py b/finntk/omor/inst.py
index f6b558a..1c2873f 100644
--- a/finntk/omor/inst.py
+++ b/finntk/omor/inst.py
@@ -21,5 +21,6 @@ def get_omorfi():
             segment=True,
             labelsegment=True,
             guesser=True,
-            udpipe=True)
+            udpipe=True,
+        )
     return _omorfi
diff --git a/finntk/omor/seg.py b/finntk/omor/seg.py
index e1a8693..33ee941 100644
--- a/finntk/omor/seg.py
+++ b/finntk/omor/seg.py
@@ -1,11 +1,11 @@
 import re
 from more_itertools import split_at
 
-LABELSEGMENT_RE = r'''
+LABELSEGMENT_RE = r"""
 \{ (?P<seg> [^\}]* ) \} |
 \[ (?P<annot> [^\]]* ) \] |
 (?P<surf> [^\[\{]+ )
-'''
+"""
 
 _labelsegment_lex = None
@@ -26,15 +26,15 @@ def labelsegment_to_tokens(labelsegmented):
 
 
 def tokens_to_subword_tokens(it):
+
     def is_cmp_bound(kv):
-        return (kv[0] == 'seg' and
-                kv[1] == 'wB')
+        return (kv[0] == "seg" and kv[1] == "wB")
 
     return split_at(it, is_cmp_bound)
 
 
 def tokens_to_surf(it):
-    return "".join(v for (t, v) in it if t == 'surf')
+    return "".join(v for (t, v) in it if t == "surf")
 
 
 def labelsegment_to_subword_tokens(labelsegmented):
diff --git a/finntk/omor/tok.py b/finntk/omor/tok.py
index c2f8789..af694ca 100644
--- a/finntk/omor/tok.py
+++ b/finntk/omor/tok.py
@@ -6,7 +6,7 @@ def get_token_positions(tokenised, text):
     starts = []
     start = 0
     for token in tokenised:
-        start = text.index(token['surf'], start)
+        start = text.index(token["surf"], start)
         starts.append(start)
     return starts
 
@@ -15,4 +15,4 @@ def form_of_tok(token):
     if isinstance(token, str):
         return token.lower()
     else:
-        return token['surf'].lower()
+        return token["surf"].lower()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..dc6ce33
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,5 @@
+[metadata]
+description-file = README.md
+
+[flake8]
+max-line-length = 88
diff --git a/setup.py b/setup.py
index 5038f45..96c3c3e 100644
--- a/setup.py
+++ b/setup.py
@@ -4,22 +4,15 @@
     name="finntk",
     version="0.0.2",
     url="https://github.com/frankier/finntk",
-
     author="Frankie Robertson",
-
     description="Finnish NLP toolkit",
-    long_description=open('README.md').read(),
-
+    long_description=open("README.md").read(),
     packages=setuptools.find_packages(),
-
-    install_requires=[
-        "more_itertools>=4.1.0"
-    ],
-
+    install_requires=["more_itertools>=4.1.0"],
     classifiers=[
-        'Development Status :: 2 - Pre-Alpha',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
+        "Development Status :: 2 - Pre-Alpha",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
     ],
 )