Skip to content

Commit

Permalink
EncExp extra token (2)
Browse files (browse the repository at this point in the history)
  • Loading branch information
mgraffg committed Nov 7, 2024
1 parent 2fa679c commit 61ada6d
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 46 deletions.
31 changes: 13 additions & 18 deletions encexp/build_encexp.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def update_tokens(seq, tokens=None):
return seq


- def encode(vocabulary, fname, tokens=None, limit=None):
+ def encode(vocabulary:dict, fname: str, tokens: list=None,
+ limit: int=None):
"""Encode file"""
limit = np.inf if limit is None else limit
loop = count() if limit == np.inf else range(limit)
Expand All @@ -75,9 +76,9 @@ def encode(vocabulary, fname, tokens=None, limit=None):
return output, cnt


- def feasible_tokens(vocabulary, count,
- tokens=None,
- min_pos=512):
+ def feasible_tokens(vocabulary: dict, count: dict,
+ tokens: list=None,
+ min_pos: int=512):
"""Feasible tokens"""
seq = SeqTM(vocabulary=vocabulary)
tokens = seq.names if tokens is None else tokens
Expand All @@ -94,10 +95,9 @@ def build_encexp_token(index, vocabulary,
precision=np.float16,
transform=None,
estimator_kwargs=None,
- tokens=None):
+ label=None):
"""Build token classifier"""
seq = SeqTM(vocabulary=vocabulary)
- label = seq.names[index] if tokens is None else tokens[index][1]
output_fname = encode_output(fname, prefix=f'{index}')
POS = []
NEG = []
Expand Down Expand Up @@ -141,16 +141,11 @@ def build_encexp_token(index, vocabulary,
return output_fname


- def build_encexp(vocabulary,
- fname, output,
- min_pos=512,
- max_pos=2**13,
- n_jobs = -1,
- precision=np.float16,
- estimator_kwargs=None,
- limit=None,
- transform=None,
- tokens=None):
+ def build_encexp(vocabulary, fname, output,
+ min_pos: int=512, max_pos: int=2**13,
+ n_jobs: int = -1, precision=np.float16,
+ estimator_kwargs: dict=None, limit: int=None,
+ transform=None, tokens: list=None):
"""Build EncExp"""
encode_fname, cnt = encode(vocabulary, fname, tokens=tokens,
limit=limit)
Expand All @@ -163,8 +158,8 @@ def build_encexp(vocabulary,
max_pos=max_pos,
estimator_kwargs=estimator_kwargs,
transform=transform,
- tokens=tokens)
- for index, _ in progress_bar(tokens,
+ label=label)
+ for index, label in progress_bar(tokens,
desc=output,
total=len(tokens)))
with gzip.open(output, 'wb') as fpt:
Expand Down
18 changes: 4 additions & 14 deletions encexp/tests/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,6 @@ def test_build_voc():
os.unlink('t.json.gz')


- def test_build_voc_stats():
- """Test build voc statistics"""
- samples()
- statistics = []
- build_voc('es-mx-sample.json', output='t.json.gz',
- voc_size_exponent=10, statistics=statistics)
- assert statistics[:3] == [78037, 75690, 72900]
- os.unlink('t.json.gz')


def test_encexp_encode():
"""Test encode method"""
samples()
Expand Down Expand Up @@ -104,8 +94,8 @@ def test_build_encexp_token():
output, cnt = encode(voc, 'es-mx-sample.json')
tokens = feasible_tokens(voc, cnt)
index, token = tokens[-3]
- fname = build_encexp_token(index, voc, output)
- assert fname == '559-encode-es-mx-sample.json'
+ fname = build_encexp_token(index, voc, output, label=token)
+ assert fname == f'{index}-encode-es-mx-sample.json'
os.unlink('encode-es-mx-sample.json')
data = next(tweet_iterator(fname))
assert data['label'] == token
Expand Down Expand Up @@ -199,15 +189,15 @@ def test_build_encexp_tokens():
tokens = feasible_tokens(voc, cnt, tokens=words,
min_pos=8)
fname = build_encexp_token(0, voc, output, precision=np.float16,
- tokens=tokens)
+ label=tokens[0][1])
assert isfile(fname)
assert next(tweet_iterator(fname))['label'] == tokens[0][1]

# assert isfile(output)
# assert output == 'encode-es-mx-sample.json'
# os.unlink('encode-es-mx-sample.json')
build_encexp(voc, 'es-mx-sample.json', 'encexp-es-mx.json.gz',
- tokens=words, min_pos=8, n_jobs=1)
+ tokens=words, min_pos=8)
assert isfile('encexp-es-mx.json.gz')
enc = EncExp(lang=None, voc_source=None,
EncExp_filename='encexp-es-mx.json.gz',
Expand Down
15 changes: 1 addition & 14 deletions encexp/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,7 @@ def compute_b4msa_vocabulary(filename, limit=None, lang='es',
def compute_seqtm_vocabulary(instance, vocabulary,
filename, limit=None,
voc_size_exponent=13,
- prefix_suffix=False,
- statistics=None):
+ prefix_suffix=False):
"""Compute SeqTM"""

def current_lost_words():
Expand Down Expand Up @@ -193,18 +192,6 @@ def optimize_vocabulary():
_ = {token: base_voc[word] for token in tokens}
cnt.update(_)
current = [k for k, v in cnt.most_common(n=2**voc_size_exponent)]
- if statistics is not None:
- _ = Counter(dict(cnt.most_common(n=2**voc_size_exponent)),
- update_calls=base_voc.update_calls)
- _ = dict(params=vocabulary['params'],
- counter=_)
- tok2 = instance(vocabulary=_).tokenize
- tot = 0
- for token in words:
- _ = ''.join([tok.replace('~', '').replace('q:', '')
- for tok in set(tok2(token))])
- tot += base_voc[token] * (len(token) - len(_))
- statistics.append(tot)
return cnt.most_common(n=2**voc_size_exponent)

limit = np.inf if limit is None else limit
Expand Down

0 comments on commit 61ada6d

Please sign in to comment.