Skip to content

Commit

Permalink
streaming_decode.py, relax the audio range from [-1,+1] to [-10,+10]
Browse files Browse the repository at this point in the history
- some AudioTransform classes produce audio signals outside the range [-1,+1]
   - e.g. Resample produced a value of 1.0079
   - The range [-10,+10] was chosen so that the check can still reliably
     distinguish a normalized signal from an un-normalized [-32k,+32k] signal...
- this is related to: lhotse-speech/lhotse#1254
  • Loading branch information
KarelVesely84 committed Jan 4, 2024
1 parent 8136ad7 commit 849a93a
Show file tree
Hide file tree
Showing 18 changed files with 78 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
10 changes: 4 additions & 6 deletions egs/aishell/ASR/zipformer/streaming_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,12 +597,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
if audio.max() > 1:
logging.warning(
f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}."
f"Clipping to [-1, 1]."
)
audio = np.clip(audio, -1, 1)
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
5 changes: 4 additions & 1 deletion egs/gigaspeech/ASR/zipformer/streaming_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,12 @@ def decode_dataset(
assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,12 @@ def decode_dataset(
assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,8 +673,12 @@ def decode_dataset(
assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,8 +673,12 @@ def decode_dataset(
assert len(audio.shape) == 2
assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)
feature = fbank(samples)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
5 changes: 4 additions & 1 deletion egs/librispeech/ASR/zipformer/streaming_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
assert audio.max() <= 1, "Should be normalized to [-1, 1])"
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,12 @@ def decode_dataset(
assert audio.shape[0] == 1, "Should be single channel"
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

fbank = Fbank(opts)
Expand Down
10 changes: 4 additions & 6 deletions egs/wenetspeech/ASR/zipformer/streaming_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,12 +597,10 @@ def decode_dataset(
assert audio.dtype == np.float32, audio.dtype

# The trained model is using normalized samples
if audio.max() > 1:
logging.warning(
f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}."
f"Clipping to [-1, 1]."
)
audio = np.clip(audio, -1, 1)
# - this is to avoid sending [-32k,+32k] signal in...
# - some lhotse AudioTransform classes can make the signal
# be out of range [-1, 1], hence the tolerance 10
assert np.abs(audio).max() <= 10, "Should be normalized to [-1, 1], 10 for tolerance..."

samples = torch.from_numpy(audio).squeeze(0)

Expand Down

0 comments on commit 849a93a

Please sign in to comment.