test.py

import sys
import json
import h5py
import numpy as np
from elmoformanylangs import Embedder

elmo = Embedder('./zhs.model')

def cache_dataset(data_path,  out_file):
  with open(data_path) as in_file:
    for doc_num, line in enumerate(in_file.readlines()):
      example = json.loads(line)
      sentences = example["sentences"]
      max_sentence_length = max(len(s) for s in sentences)
      tokens = [[""] * max_sentence_length for _ in sentences]
      text_len = np.array([len(s) for s in sentences])
      for i, sentence in enumerate(sentences):
        for j, word in enumerate(sentence):
          tokens[i][j] = word
      tokens = np.array(tokens)
      file_key = example["doc_key"].replace("/", ":")
      group = out_file.create_group(file_key)
      tf_lm_emb = elmo.sents2elmo(sentences)

      print("\ntf_lm_emb.shape=", tf_lm_emb.shape, "\n")
      for i, (e, l) in enumerate(zip(tf_lm_emb, text_len)):
        e = e[:l, :, :]
        print(e.shape)
        group[str(i)] = e
      if doc_num % 10 == 0:
        print("Cached {} documents in {}".format(doc_num + 1, data_path))


if __name__ == '__main__':

    with h5py.File("elmo_cache0.hdf5", "w") as out_file:
        for json_filename in sys.argv[1:]:
            cache_dataset(json_filename, out_file)

# sents = [['今', '天'], ['今', '天', '天气', '真', "好", '好']]
# the list of lists which store the sentences
# after segment if necessary.