cleanup and minor fixes
matiaslindgren committed Apr 3, 2020
1 parent 1818a91 commit b1f89c8
Showing 3 changed files with 26 additions and 69 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -12,12 +12,12 @@ Or feel free to just copy paste useful parts into your own program if you want.
Install TensorFlow 2.1 or 2.2 (tested with both).

Clone the repo and install `lidbox` as a Python package in setuptools develop mode (`pip install --editable`).
-This makes it easier to experiment with the code.
+This makes it easier to experiment with the code since there's no need to reinstall the package after making changes.
```
git clone --depth 1 https://github.com/matiaslindgren/lidbox.git
pip install --editable ./lidbox
```
-Check that the entry point is working
+Check that the command line entry point is working
```
lidbox -h
```
15 changes: 11 additions & 4 deletions lidbox/schemas/config.yaml
@@ -88,7 +88,7 @@ definitions:
  size_multiplier:
    type: integer
    exclusiveMinimum: 0
- copy_cache_to_tmp:
+ cache_prepared_dataset_to_tmpdir:
    type: boolean
  group_by_sequence_length:
    type: object
@@ -143,16 +143,23 @@ definitions:
  length:
    type: object
    additionalProperties: false
+   required:
+     - max
+     - min
+     - num_bins
    properties:
-     min:
-       type: integer
-       minimum: 0
      max:
        type: integer
        exclusiveMinimum: 0
+     min:
+       type: integer
+       minimum: 0
+     num_bins:
+       type: integer
+       exclusiveMinimum: 0
  min_overlap:
    type: number
    exclusiveMaximum: 1.0

properties:
  datasets:
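The `required` addition above makes all three chunk-length keys mandatory instead of optional. A minimal sketch of what that buys, using the standalone `jsonschema` package against a hand-copied fragment of the schema (the package choice and the config values are illustrative assumptions, not part of this commit):
```
import jsonschema

# Fragment of the 'length' subschema as it looks after this commit.
length_schema = {
    "type": "object",
    "additionalProperties": False,
    "required": ["max", "min", "num_bins"],
    "properties": {
        "max": {"type": "integer", "exclusiveMinimum": 0},
        "min": {"type": "integer", "minimum": 0},
        "num_bins": {"type": "integer", "exclusiveMinimum": 0},
    },
}

# Passes: all three required keys are present.
jsonschema.validate({"min": 50, "max": 400, "num_bins": 8}, length_schema)

try:
    # Fails after this commit: 'num_bins' is now required.
    jsonschema.validate({"min": 50, "max": 400}, length_schema)
except jsonschema.ValidationError as e:
    print(e.message)
```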
76 changes: 13 additions & 63 deletions lidbox/tf_data.py
@@ -207,6 +207,7 @@ def chunk_timedim_randomly(features):
        begin = tf.boolean_mask(begin, begin < num_total_frames)
        end = begin + tf.boolean_mask(rand_chunk_lengths, begin < num_total_frames)
        end = tf.math.minimum(num_total_frames, end)
+       # TODO gather is overkill here since we only want several slices
        chunk_indices = tf.ragged.range(begin, end)
        return tf.gather(features, chunk_indices)
    return chunk_timedim_randomly
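For readers unfamiliar with the ragged ops in this hunk: `tf.ragged.range` builds one row of frame indices per chunk, and `tf.gather` then pulls those rows out of the feature matrix, producing a ragged batch of variable-length chunks. This is the gather the new TODO calls overkill for a handful of slices. A self-contained sketch with made-up chunk boundaries (not lidbox code):
```
import tensorflow as tf

# Features: 10 frames of 3-dimensional feature vectors.
features = tf.reshape(tf.range(30, dtype=tf.float32), [10, 3])

# Two chunks along the time dimension: frames [0, 4) and [4, 9).
begin = tf.constant([0, 4])
end = tf.constant([4, 9])

# One ragged row of frame indices per chunk.
chunk_indices = tf.ragged.range(begin, end)
# Gathering with ragged indices yields a ragged batch of chunks.
chunks = tf.gather(features, chunk_indices)
print(chunks.shape)  # (2, None, 3)
```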
@@ -253,6 +254,8 @@ def to_frames(feats, *meta):
    if verbosity > 1:
        print("Using random chunk config:")
        yaml_pprint(frame_config)
+   if "cache_prepared_dataset_to_tmpdir" not in config:
+       print("Warning: 'cache_prepared_dataset_to_tmpdir' not given, the dataset will contain different samples at every epoch due to 'random_frames', this will break assumptions made by Keras")
    tmp_fn = make_random_frame_chunker_fn(frame_config["length"])
    frame_chunker_fn = lambda feats, *args: (tmp_fn(feats), *args)
    ds = ds.map(frame_chunker_fn, num_parallel_calls=TF_AUTOTUNE)
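The new warning exists because a `tf.data` map containing random ops is re-executed on every pass over the dataset, so each epoch sees different chunks unless the prepared dataset is cached downstream of this map. A toy sketch of the effect (made-up dataset; lidbox caches to a file under `$TMPDIR`, while this sketch uses an in-memory cache):
```
import tensorflow as tf

# A map with a random op, analogous to the random frame chunker above.
ds = tf.data.Dataset.range(3).map(
    lambda x: tf.cast(x, tf.float32) + tf.random.uniform([]))

print([float(x) for x in ds])  # one realization of the randomness
print([float(x) for x in ds])  # a different realization on the next pass

# Caching freezes the first realization, so every epoch repeats it.
cached = ds.cache()
_ = list(cached)                   # first full pass fills the cache
print([float(x) for x in cached])  # identical from now on
print([float(x) for x in cached])
```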
@@ -337,7 +340,7 @@ def to_model_input(feats, str_meta, *rest):
config["group_by_sequence_length"]["max_batch_size"],
verbosity=verbosity,
sequence_dim=0)
if config.get("copy_cache_to_tmp", False):
if config.get("cache_prepared_dataset_to_tmpdir", False):
tmp_cache_path = "{}/tensorflow-cache/{}/training-prepared_{}_{}".format(
os.environ.get("TMPDIR", "/tmp"),
model_id,
@@ -396,7 +399,8 @@ def inspect_batches(batch_idx, batch):
        tf.summary.histogram("input_labels", labels, step=batch_idx)
        tf.summary.image(features_name, image, step=batch_idx, max_outputs=max_outputs)
        tf.debugging.assert_equal(tf.expand_dims(wavs.sample_rate[0], 0), wavs.sample_rate, message="All utterances in a batch must have the same sample rate")
-       tf.summary.audio("utterances", tf.expand_dims(wavs.audio, -1), wavs.sample_rate[0], step=batch_idx, max_outputs=max_outputs)
+       if tf.reduce_all(tf.size(wavs.audio) > 0) and tf.reduce_all(wavs.sample_rate != 0):
+           tf.summary.audio("utterances", tf.expand_dims(wavs.audio, -1), wavs.sample_rate[0], step=batch_idx, max_outputs=max_outputs)
        enumerated_uttids = tf.strings.reduce_join(
            (tf.strings.as_string(tf.range(1, max_outputs + 1)), uttids[:max_outputs]),
            axis=0,
@@ -679,67 +683,13 @@ def pack_wav_tuples(signal, sample_rate, meta):
    def wav_batch_to_features(wavs, *meta):
        return extract_features(wavs, *feat_extract_args), (*meta, wavs)
    return wavs_batched.map(wav_batch_to_features, num_parallel_calls=TF_AUTOTUNE)
-   ## This was for debugging the tf normalization implementation
-   # if "mean_var_norm_numpy" in feat_config:
-   #     window_len = tf.constant(feat_config["mean_var_norm_numpy"]["window_len"], tf.int32)
-   #     normalize_variance = tf.constant(feat_config["mean_var_norm_numpy"].get("normalize_variance", True), tf.bool)
-   #     if verbosity:
-   #         tf_util.tf_print("Using numpy to apply mean_var_norm sliding window of length", window_len, "without padding. Will also normalize variance:", normalize_variance)
-   #     def apply_mean_var_norm_numpy(feats, *rest):
-   #         normalized = tf.numpy_function(
-   #             mean_var_norm_nopad_slide_numpy,
-   #             [feats, window_len, normalize_variance],
-   #             feats.dtype)
-   #         normalized.set_shape(feats.shape.as_list())
-   #         return (normalized, *rest)
-   #     features = features.map(apply_mean_var_norm_numpy, num_parallel_calls=TF_AUTOTUNE)
-   #     return features
-
-def parse_sparsespeech_features(feat_config, enc_path, feat_path, seg2utt, utt2meta):
-    utt2label = {u: d["label"] for u, d in utt2meta.items()}
-    ss_input = kaldiio.load_scp(feat_path)
-    with open(enc_path, "rb") as f:
-        ss_encoding = np.load(f, fix_imports=False, allow_pickle=True).item()
-    assert set(ss_input.keys()) == set(ss_encoding.keys()), "missing utterances, maybe all were not encoded?"
-    keys = list(ss_encoding.keys())
-    assert all(ss_encoding[k].ndim == ss_input[k].ndim for k in keys), "ss input and output must have the same dimensions"
-    assert all(ss_encoding[k].shape[0] == ss_input[k].shape[0] for k in keys), "mismatching amount of time steps in ss input and the output encoding"
-    encodingtype = tf.as_dtype(ss_encoding[keys[0]].dtype)
-    noise_mean = feat_config.get("noise_mean", 0.0)
-    noise_stddev = feat_config.get("noise_stddev", 0.01)
-    feat_scale_kwargs = feat_config.get("sample_minmax_scaling", {})
-    labels_only = feat_config.get("labels_only", False)
-    def datagen():
-        for seg_id, onehot_enc in ss_encoding.items():
-            label = utt2label[seg2utt[seg_id]]
-            noise = tf.random.normal(onehot_enc.shape, mean=noise_mean, stddev=noise_stddev, dtype=encodingtype)
-            input_feat = ss_input[seg_id]
-            output_feat = onehot_enc + noise
-            # Apply feature scaling separately
-            if feat_scale_kwargs:
-                input_feat = feature_scaling(input_feat, **feat_scale_kwargs)
-                output_feat = feature_scaling(output_feat, **feat_scale_kwargs)
-            if labels_only:
-                out = output_feat
-            else:
-                # Stack input and output features
-                out = tf.concat((input_feat, output_feat), 1)
-            yield out, seg_id, label
-    #FIXME metadata into tuple and add dummy signals
-    return tf.data.Dataset.from_generator(
-        datagen,
-        (encodingtype, tf.string, tf.string),
-        (tf.TensorShape(feat_config["shape_after_concat"]),
-         tf.TensorShape([]),
-         tf.TensorShape([])))
-
def parse_kaldi_features(utterance_list, features_path, utt2meta, expected_shape, feat_conf):
    utt2label = {u: d["label"] for u, d in utt2meta.items()}
    utt2feats = kaldiio.load_scp(features_path)
    feat_conf = dict(feat_conf)
    normalize_mean_axis = feat_conf.pop("normalize_mean_axis", None)
    normalize_stddev_axis = feat_conf.pop("normalize_stddev_axis", None)
    assert not feat_conf, "feat_conf contains unrecognized keys: {}".format(','.join(str(k) for k in feat_conf))
    def assert_shape(shape):
        shape_str = "{} vs {}".format(shape, expected_shape)
        assert len(shape) == len(expected_shape), shape_str
@@ -758,12 +708,12 @@ def datagen():
            if normalize_stddev_axis is not None:
                stddev = tf.math.reduce_std(feats, axis=normalize_stddev_axis, keepdims=True)
                normalized = tf.math.divide_no_nan(normalized, stddev)
-           yield normalized, utt, utt2label[utt]
+           yield normalized, (utt, utt2label[utt])
    ds = tf.data.Dataset.from_generator(
        datagen,
-       (tf.float32, tf.string, tf.string),
-       (tf.TensorShape(expected_shape), tf.TensorShape([]), tf.TensorShape([])))
-   # Group metadata into tuple, note that this is not the same as using a 2-element tf.string tensor for metadata
-   # Also add empty signals (since we don't know the origin of these features)
-   ds = ds.map(lambda feats, utt, label: (feats, (utt, label, audio_feat.Wav(tf.zeros([1]), 16000))))
-   return ds.batch(1)
+       (tf.float32, tf.string),
+       (tf.TensorShape(expected_shape), tf.TensorShape([2])))
+   # Add empty signals (since we don't know the origin of these features)
+   empty_wav = audio_feat.Wav(tf.zeros([0]), 0)
+   add_empty_signal = lambda feats, meta: (feats, (meta, empty_wav))
+   return ds.map(add_empty_signal).batch(1)
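The deleted comment points at the subtlety this hunk resolves: the old generator yielded the utterance id and label as two scalar string components, while the new one packs the `(utt, label)` pair into a single 2-element `tf.string` tensor, declared via `tf.TensorShape([2])`. A standalone sketch of the new convention (toy feature shape and metadata, not lidbox code):
```
import numpy as np
import tensorflow as tf

def datagen():
    # Fake 3x2 feature matrix plus (utterance id, label) metadata.
    yield np.zeros([3, 2], np.float32), ("utt-0001", "english")

ds = tf.data.Dataset.from_generator(
    datagen,
    (tf.float32, tf.string),
    (tf.TensorShape([3, 2]), tf.TensorShape([2])))

for feats, meta in ds:
    print(meta.shape)        # (2,): one string tensor, not two scalars
    print(meta[0], meta[1])  # utterance id and label
```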
