From 61ada6d717cddd94f5bc18e4a3d0cf22dbeb23ff Mon Sep 17 00:00:00 2001
From: Mario Graff
Date: Thu, 7 Nov 2024 08:13:09 -0600
Subject: [PATCH] EncExp extra token (2)

---
 encexp/build_encexp.py     | 31 +++++++++++++------------------
 encexp/tests/test_build.py | 18 ++++--------------
 encexp/utils.py            | 15 +--------------
 3 files changed, 18 insertions(+), 46 deletions(-)

diff --git a/encexp/build_encexp.py b/encexp/build_encexp.py
index 3b7fd64..179e897 100644
--- a/encexp/build_encexp.py
+++ b/encexp/build_encexp.py
@@ -56,7 +56,8 @@ def update_tokens(seq, tokens=None):
     return seq
 
 
-def encode(vocabulary, fname, tokens=None, limit=None):
+def encode(vocabulary: dict, fname: str, tokens: list=None,
+           limit: int=None):
     """Encode file"""
     limit = np.inf if limit is None else limit
     loop = count() if limit == np.inf else range(limit)
@@ -75,9 +76,9 @@ def encode(vocabulary, fname, tokens=None, limit=None):
     return output, cnt
 
 
-def feasible_tokens(vocabulary, count,
-                    tokens=None,
-                    min_pos=512):
+def feasible_tokens(vocabulary: dict, count: dict,
+                    tokens: list=None,
+                    min_pos: int=512):
     """Feasible tokens"""
     seq = SeqTM(vocabulary=vocabulary)
     tokens = seq.names if tokens is None else tokens
@@ -94,10 +95,9 @@ def build_encexp_token(index, vocabulary,
                        precision=np.float16,
                        transform=None,
                        estimator_kwargs=None,
-                       tokens=None):
+                       label=None):
     """Build token classifier"""
     seq = SeqTM(vocabulary=vocabulary)
-    label = seq.names[index] if tokens is None else tokens[index][1]
     output_fname = encode_output(fname, prefix=f'{index}')
     POS = []
     NEG = []
@@ -141,16 +141,11 @@ def build_encexp_token(index, vocabulary,
     return output_fname
 
 
-def build_encexp(vocabulary,
-                 fname, output,
-                 min_pos=512,
-                 max_pos=2**13,
-                 n_jobs = -1,
-                 precision=np.float16,
-                 estimator_kwargs=None,
-                 limit=None,
-                 transform=None,
-                 tokens=None):
+def build_encexp(vocabulary, fname, output,
+                 min_pos: int=512, max_pos: int=2**13,
+                 n_jobs: int = -1, precision=np.float16,
+                 estimator_kwargs: dict=None, limit: int=None,
+                 transform=None, tokens: list=None):
     """Build EncExp"""
     encode_fname, cnt = encode(vocabulary, fname,
                                tokens=tokens, limit=limit)
@@ -163,8 +158,8 @@ def build_encexp(vocabulary,
                                  max_pos=max_pos,
                                  estimator_kwargs=estimator_kwargs,
                                  transform=transform,
-                                 tokens=tokens)
+                                 label=label)
-             for index, _ in progress_bar(tokens,
+             for index, label in progress_bar(tokens,
                                           desc=output,
                                           total=len(tokens)))
     with gzip.open(output, 'wb') as fpt:
diff --git a/encexp/tests/test_build.py b/encexp/tests/test_build.py
index 1b52158..f8ebcea 100644
--- a/encexp/tests/test_build.py
+++ b/encexp/tests/test_build.py
@@ -50,16 +50,6 @@ def test_build_voc():
     os.unlink('t.json.gz')
 
 
-def test_build_voc_stats():
-    """Test build voc statistics"""
-    samples()
-    statistics = []
-    build_voc('es-mx-sample.json', output='t.json.gz',
-              voc_size_exponent=10, statistics=statistics)
-    assert statistics[:3] == [78037, 75690, 72900]
-    os.unlink('t.json.gz')
-
-
 def test_encexp_encode():
     """Test encode method"""
     samples()
@@ -104,8 +94,8 @@ def test_build_encexp_token():
     output, cnt = encode(voc, 'es-mx-sample.json')
     tokens = feasible_tokens(voc, cnt)
     index, token = tokens[-3]
-    fname = build_encexp_token(index, voc, output)
-    assert fname == '559-encode-es-mx-sample.json'
+    fname = build_encexp_token(index, voc, output, label=token)
+    assert fname == f'{index}-encode-es-mx-sample.json'
     os.unlink('encode-es-mx-sample.json')
     data = next(tweet_iterator(fname))
     assert data['label'] == token
@@ -199,7 +189,7 @@ def test_build_encexp_tokens():
     tokens = feasible_tokens(voc, cnt,
                              tokens=words, min_pos=8)
     fname = build_encexp_token(0, voc, output,
                                precision=np.float16,
-                               tokens=tokens)
+                               label=tokens[0][1])
     assert isfile(fname)
     assert next(tweet_iterator(fname))['label'] == tokens[0][1]
@@ -207,7 +197,7 @@ def test_build_encexp_tokens():
     # assert output == 'encode-es-mx-sample.json'
     # os.unlink('encode-es-mx-sample.json')
     build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
-                 tokens=words, min_pos=8, n_jobs=1)
+                 tokens=words, min_pos=8)
     assert isfile('encexp-es-mx.json.gz')
     enc = EncExp(lang=None, voc_source=None,
                  EncExp_filename='encexp-es-mx.json.gz',
diff --git a/encexp/utils.py b/encexp/utils.py
index 7b05ed3..8fc9d68 100644
--- a/encexp/utils.py
+++ b/encexp/utils.py
@@ -151,8 +151,7 @@ def compute_b4msa_vocabulary(filename, limit=None, lang='es',
 def compute_seqtm_vocabulary(instance, vocabulary,
                              filename, limit=None,
                              voc_size_exponent=13,
-                             prefix_suffix=False,
-                             statistics=None):
+                             prefix_suffix=False):
     """Compute SeqTM"""
 
     def current_lost_words():
@@ -193,18 +192,6 @@ def optimize_vocabulary():
         _ = {token: base_voc[word] for token in tokens}
         cnt.update(_)
         current = [k for k, v in cnt.most_common(n=2**voc_size_exponent)]
-        if statistics is not None:
-            _ = Counter(dict(cnt.most_common(n=2**voc_size_exponent)),
-                        update_calls=base_voc.update_calls)
-            _ = dict(params=vocabulary['params'],
-                     counter=_)
-            tok2 = instance(vocabulary=_).tokenize
-            tot = 0
-            for token in words:
-                _ = ''.join([tok.replace('~', '').replace('q:', '')
-                             for tok in set(tok2(token))])
-                tot += base_voc[token] * (len(token) - len(_))
-            statistics.append(tot)
         return cnt.most_common(n=2**voc_size_exponent)
 
     limit = np.inf if limit is None else limit
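
Usage sketch of the revised API, mirroring the updated tests: build_encexp_token no longer derives the classifier label from a tokens list; the caller passes label= explicitly, and build_encexp unpacks (index, label) pairs from feasible_tokens. The sample file name and the vocabulary voc below are assumptions borrowed from the test suite, not part of the patch.

# Sketch only -- follows test_build_encexp_token; the vocabulary source is
# elided because it is not shown in this patch.
from encexp.build_encexp import encode, feasible_tokens, build_encexp_token

voc = ...  # SeqTM vocabulary dict, built as in the test suite
output, cnt = encode(voc, 'es-mx-sample.json')
tokens = feasible_tokens(voc, cnt)  # list of (index, token) pairs
index, token = tokens[-3]
# The label is now passed explicitly instead of being recovered inside
# build_encexp_token from a tokens argument.
fname = build_encexp_token(index, voc, output, label=token)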