Skip to content

Commit

Permalink
Run black
Browse files Browse the repository at this point in the history
  • Loading branch information
frankier committed May 26, 2018
1 parent 266ba85 commit bc164b5
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 46 deletions.
14 changes: 7 additions & 7 deletions finntk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from .omor.inst import get_omorfi
from .omor.anlys import analysis_to_subword_dicts

from .omor.extract import (
extract_lemmas,
extract_lemmas_combs,
extract_lemmas_recurs)
from .omor.extract import extract_lemmas, extract_lemmas_combs, extract_lemmas_recurs

# Public API of the finntk package, re-exported from the omor submodules.
__all__ = [
    "get_token_positions",
    "get_omorfi",
    "analysis_to_subword_dicts",
    "extract_lemmas",
    "extract_lemmas_combs",
    "extract_lemmas_recurs",
]
30 changes: 17 additions & 13 deletions finntk/omor/anlys.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import re
from itertools import product


def analysis_to_pairs(ana):
    """Yield (key, value) pairs parsed from an omorfi analysis string.

    `ana` has the form "[K1=V1][K2=V2]...". Each bracketed segment is
    split on the first "=" only, so values may themselves contain "=".
    """
    for bit in ana.split("]["):
        k, v = bit.strip("[]").split("=", 1)
        yield k, v


Expand All @@ -17,14 +18,15 @@ def analysis_to_dict(ana):


def dict_to_analysis(d):
    """Inverse of analysis_to_pairs: render dict `d` back into an omorfi
    analysis string "[K1=V1][K2=V2]..." with upper-cased keys.

    Relies on dicts preserving insertion order (Python 3.7+).
    """
    return "[{}]".format(
        "][".join(["{}={}".format(k.upper(), v) for k, v in d.items()])
    )


def chunk_subwords(it):
    """Split an iterable of (key, value) analysis pairs into per-subword
    chunks at compound-word boundaries.

    A ("BOUNDARY", "COMPOUND") pair marks the seam between two parts of a
    compound word; the marker pair itself is dropped by split_at.
    """

    def is_cmp_bound(kv):
        return kv[0] == "BOUNDARY" and kv[1] == "COMPOUND"

    return split_at(it, is_cmp_bound)

Expand All @@ -40,25 +42,27 @@ def analysis_to_subword_dicts(ana):

def generate_dict(ana):
    """Generate surface forms for the analysis dict `ana` via omorfi.

    Works on a copy with the "weight" key removed (omorfi's generator
    does not accept it), leaving the caller's dict untouched.
    Returns the set of generated surface strings.
    """
    # Imported lazily to avoid paying omorfi start-up cost at module import.
    from .inst import get_omorfi

    omor = get_omorfi()
    ana_cp = ana.copy()
    if "weight" in ana_cp:
        del ana_cp["weight"]
    ana_txt = dict_to_analysis(ana_cp)
    return {gen["surf"] for gen in omor.generate(ana_txt)}


def generate_or_passthrough(ana):
    """Like generate_dict, but substitute the analysis' own word_id for
    any result that is still an analysis string (starts with "["), i.e.
    when omorfi failed to generate a surface form for it."""
    return {ana["word_id"] if s.startswith("[") else s for s in generate_dict(ana)}


def lemmas_of_subword_dicts(subword_dicts):
    """Build candidate lemmas from a sequence of subword analysis dicts.

    Every combination (Cartesian product) of generated surface forms for
    the non-final subwords is prefixed to the normalised word_id of the
    final subword, which stays in lemma form.
    """
    subword_dicts = list(subword_dicts)
    return [
        "".join(prefixes) + norm_word_id(subword_dicts[-1]["word_id"])
        for prefixes in product(
            *(generate_or_passthrough(d) for d in subword_dicts[:-1])
        )
    ]


EXTRA_WORD_ID = re.compile("_\d$")
Expand Down
9 changes: 4 additions & 5 deletions finntk/omor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def _extract_lemmas(word_form, get_slices):
analyses = omorfi.analyse(word_form)
res = set()
for analysis in analyses:
analysis_dicts = analysis_to_subword_dicts(analysis['anal'])
analysis_dicts = analysis_to_subword_dicts(analysis["anal"])
for analysis_slice in get_slices(analysis_dicts):
for lemma in lemmas_of_subword_dicts(analysis_slice):
res.add(lemma)
Expand All @@ -24,9 +24,8 @@ def _extract_lemmas(word_form, get_slices):

def extract_lemmas(word_form):
    """Extract lemmas of `word_form`, treating each subword analysis
    individually (no combining of adjacent subwords — compare
    extract_lemmas_combs)."""
    return _extract_lemmas(
        word_form, lambda analysis_dicts: [[d] for d in analysis_dicts]
    )


def extract_lemmas_combs(word_form):
Expand All @@ -40,7 +39,7 @@ def extract_lemmas_recurs(word_form):
word_form = expand_queue.pop()
new_lemmas = extract_lemmas_combs(word_form)
novel_lemmas = new_lemmas - res
print('novel_lemmas', novel_lemmas)
print("novel_lemmas", novel_lemmas)
expand_queue.extend(novel_lemmas)
for lemma in novel_lemmas:
res.add(lemma)
Expand Down
3 changes: 2 additions & 1 deletion finntk/omor/inst.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ def get_omorfi():
segment=True,
labelsegment=True,
guesser=True,
udpipe=True)
udpipe=True,
)
return _omorfi
10 changes: 5 additions & 5 deletions finntk/omor/seg.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import re
from more_itertools import split_at

# Verbose (re.VERBOSE) regex lexing omorfi labelsegmenter output into
# three token kinds: {seg} segment markers, [tag] tags, and runs of bare
# surface text.
LABELSEGMENT_RE = r"""
\{ (?P<seg> [^\}]* ) \} |
\[ (?P<tag> [^\]]* ) \] |
(?P<surf> [^\[\{]+ )
"""

_labelsegment_lex = None

Expand All @@ -26,15 +26,15 @@ def labelsegment_to_tokens(labelsegmented):


def tokens_to_subword_tokens(it):
    """Split a stream of (type, value) labelsegment tokens into runs of
    per-subword tokens at word-boundary markers.

    A ("seg", "wB") token marks a word boundary; the marker itself is
    dropped by split_at.
    """

    def is_cmp_bound(kv):
        return kv[0] == "seg" and kv[1] == "wB"

    return split_at(it, is_cmp_bound)


def tokens_to_surf(it):
    """Concatenate the values of all "surf" tokens in `it`, discarding
    tag and segment-marker tokens."""
    return "".join(v for (t, v) in it if t == "surf")


def labelsegment_to_subword_tokens(labelsegmented):
Expand Down
4 changes: 2 additions & 2 deletions finntk/omor/tok.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def get_token_positions(tokenised, text):
starts = []
start = 0
for token in tokenised:
start = text.index(token['surf'], start)
start = text.index(token["surf"], start)
starts.append(start)
return starts

Expand All @@ -15,4 +15,4 @@ def form_of_tok(token):
if isinstance(token, str):
return token.lower()
else:
return token['surf'].lower()
return token["surf"].lower()
5 changes: 5 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[metadata]
description-file = README.md

[flake8]
max-line-length = 88
19 changes: 6 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,15 @@
name="finntk",
version="0.0.2",
url="https://github.com/frankier/finntk",

author="Frankie Robertson",

description="Finnish NLP toolkit",
long_description=open('README.md').read(),

long_description=open("README.md").read(),
packages=setuptools.find_packages(),

install_requires=[
"more_itertools>=4.1.0"
],

install_requires=["more_itertools>=4.1.0"],
classifiers=[
'Development Status :: 2 - Pre-Alpha',
'Programming Language :: Python',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
"Development Status :: 2 - Pre-Alpha",
"Programming Language :: Python",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
],
)

0 comments on commit bc164b5

Please sign in to comment.