EncExp extra token (2)

INGEOTEC · Nov 7, 2024 · 61ada6d
1 parent 2fa679c
commit 61ada6d
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 46 deletions.
diff --git a/encexp/build_encexp.py b/encexp/build_encexp.py
@@ -56,7 +56,8 @@ def update_tokens(seq, tokens=None):
     return seq
 
 
-def encode(vocabulary, fname, tokens=None, limit=None):
+def encode(vocabulary:dict, fname: str, tokens: list=None,
+           limit: int=None):
     """Encode file"""
     limit = np.inf if limit is None else limit
     loop = count() if limit == np.inf else range(limit)    
@@ -75,9 +76,9 @@ def encode(vocabulary, fname, tokens=None, limit=None):
     return output, cnt
 
 
-def feasible_tokens(vocabulary, count,
-                    tokens=None,
-                    min_pos=512):
+def feasible_tokens(vocabulary: dict, count: dict,
+                    tokens: list=None,
+                    min_pos: int=512):
     """Feasible tokens"""
     seq = SeqTM(vocabulary=vocabulary)
     tokens = seq.names if tokens is None else tokens
@@ -94,10 +95,9 @@ def build_encexp_token(index, vocabulary,
                        precision=np.float16,
                        transform=None,
                        estimator_kwargs=None,
-                       tokens=None):
+                       label=None):
     """Build token classifier"""
     seq = SeqTM(vocabulary=vocabulary)
-    label = seq.names[index] if tokens is None else tokens[index][1]
     output_fname = encode_output(fname, prefix=f'{index}')
     POS = []
     NEG = []
@@ -141,16 +141,11 @@ def build_encexp_token(index, vocabulary,
     return output_fname
 
 
-def build_encexp(vocabulary,
-                 fname, output,
-                 min_pos=512,
-                 max_pos=2**13,
-                 n_jobs = -1,
-                 precision=np.float16,
-                 estimator_kwargs=None,
-                 limit=None,
-                 transform=None,
-                 tokens=None):
+def build_encexp(vocabulary, fname, output,
+                 min_pos: int=512, max_pos: int=2**13,
+                 n_jobs: int = -1, precision=np.float16,
+                 estimator_kwargs: dict=None, limit: int=None,
+                 transform=None, tokens: list=None):
     """Build EncExp"""
     encode_fname, cnt = encode(vocabulary, fname, tokens=tokens, 
                                limit=limit)
@@ -163,8 +158,8 @@ def build_encexp(vocabulary,
                                                                  max_pos=max_pos,
                                                                  estimator_kwargs=estimator_kwargs,
                                                                  transform=transform,
-                                                                 tokens=tokens)
-                                     for index, _ in progress_bar(tokens,
+                                                                 label=label)
+                                     for index, label in progress_bar(tokens,
                                                                   desc=output,
                                                                   total=len(tokens)))
     with gzip.open(output, 'wb') as fpt:

diff --git a/encexp/tests/test_build.py b/encexp/tests/test_build.py
@@ -50,16 +50,6 @@ def test_build_voc():
     os.unlink('t.json.gz')
 
 
-def test_build_voc_stats():
-    """Test build voc statistics"""
-    samples()
-    statistics = []
-    build_voc('es-mx-sample.json', output='t.json.gz',
-              voc_size_exponent=10, statistics=statistics)
-    assert statistics[:3] == [78037, 75690, 72900]
-    os.unlink('t.json.gz')
-
-
 def test_encexp_encode():
     """Test encode method"""
     samples()
@@ -104,8 +94,8 @@ def test_build_encexp_token():
     output, cnt = encode(voc, 'es-mx-sample.json')
     tokens = feasible_tokens(voc, cnt)
     index, token = tokens[-3]
-    fname = build_encexp_token(index, voc, output)
-    assert fname == '559-encode-es-mx-sample.json'
+    fname = build_encexp_token(index, voc, output, label=token)
+    assert fname == f'{index}-encode-es-mx-sample.json'
     os.unlink('encode-es-mx-sample.json')
     data = next(tweet_iterator(fname))
     assert data['label'] == token
@@ -199,15 +189,15 @@ def test_build_encexp_tokens():
     tokens = feasible_tokens(voc, cnt, tokens=words,
                              min_pos=8)
     fname = build_encexp_token(0, voc, output, precision=np.float16,
-                               tokens=tokens)
+                               label=tokens[0][1])
     assert isfile(fname)
     assert next(tweet_iterator(fname))['label'] == tokens[0][1]
 
     # assert isfile(output)
     # assert output == 'encode-es-mx-sample.json'
     # os.unlink('encode-es-mx-sample.json')
     build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
-                 tokens=words, min_pos=8, n_jobs=1)
+                 tokens=words, min_pos=8)
     assert isfile('encexp-es-mx.json.gz')
     enc = EncExp(lang=None, voc_source=None,
                  EncExp_filename='encexp-es-mx.json.gz',

diff --git a/encexp/utils.py b/encexp/utils.py
@@ -151,8 +151,7 @@ def compute_b4msa_vocabulary(filename, limit=None, lang='es',
 def compute_seqtm_vocabulary(instance, vocabulary,
                              filename, limit=None,
                              voc_size_exponent=13,
-                             prefix_suffix=False,
-                             statistics=None):
+                             prefix_suffix=False):
     """Compute SeqTM"""
 
     def current_lost_words():
@@ -193,18 +192,6 @@ def optimize_vocabulary():
                 _ = {token: base_voc[word] for token in tokens}
                 cnt.update(_)
             current = [k for k, v in cnt.most_common(n=2**voc_size_exponent)]
-            if statistics is not None:
-                _ = Counter(dict(cnt.most_common(n=2**voc_size_exponent)),
-                                 update_calls=base_voc.update_calls)
-                _ = dict(params=vocabulary['params'],
-                         counter=_)
-                tok2 = instance(vocabulary=_).tokenize 
-                tot = 0
-                for token in words:
-                    _ = ''.join([tok.replace('~', '').replace('q:', '')
-                                 for tok in set(tok2(token))])
-                    tot += base_voc[token] * (len(token) - len(_))
-                statistics.append(tot)
         return cnt.most_common(n=2**voc_size_exponent)
 
     limit = np.inf if limit is None else limit