wav.py

import logging
import cv2
import numpy as np
from chunk import Chunk
import struct
import math
from time import time
import os.path
from common import SushiError, clip

WAVE_FORMAT_PCM = 0x0001
WAVE_FORMAT_EXTENSIBLE = 0xFFFE


class DownmixedWavFile(object):
    _file = None

    def __init__(self, path):
        super(DownmixedWavFile, self).__init__()
        self._file = open(path, 'rb')
        try:
            riff = Chunk(self._file, bigendian=False)
            if riff.getname() != 'RIFF':
                raise SushiError('File does not start with RIFF id')
            if riff.read(4) != 'WAVE':
                raise SushiError('Not a WAVE file')

            fmt_chunk_read = False
            data_chink_read = False
            file_size = os.path.getsize(path)

            while True:
                try:
                    chunk = Chunk(self._file, bigendian=False)
                except EOFError:
                    break

                if chunk.getname() == 'fmt ':
                    self._read_fmt_chunk(chunk)
                    fmt_chunk_read = True
                elif chunk.getname() == 'data':
                    if file_size > 0xFFFFFFFF:
                        # large broken wav
                        self.frames_count = (file_size - self._file.tell()) // self.frame_size
                    else:
                        self.frames_count = chunk.chunksize // self.frame_size
                    data_chink_read = True
                    break
                chunk.skip()
            if not fmt_chunk_read or not data_chink_read:
                raise SushiError('Invalid WAV file')
        except:
            self.close()
            raise

    def __del__(self):
        self.close()

    def close(self):
        if self._file:
            self._file.close()
            self._file = None

    def readframes(self, count):
        if not count:
            return ''
        data = self._file.read(count * self.frame_size)
        if self.sample_width == 2:
            unpacked = np.fromstring(data, dtype=np.int16)
        elif self.sample_width == 3:
            raw_bytes = np.ndarray(len(data), 'int8', data)
            unpacked = np.zeros(len(data) / 3, np.int16)
            unpacked.view(dtype='int8')[0::2] = raw_bytes[1::3]
            unpacked.view(dtype='int8')[1::2] = raw_bytes[2::3]
        else:
            raise SushiError('Unsupported sample width: {0}'.format(self.sample_width))

        unpacked = unpacked.astype('float32')

        if self.channels_count == 1:
            return unpacked
        else:
            min_length = len(unpacked) // self.channels_count
            real_length = len(unpacked) / float(self.channels_count)
            if min_length != real_length:
                logging.error("Length of audio channels didn't match. This might result in broken output")

            channels = (unpacked[i::self.channels_count] for i in xrange(self.channels_count))
            data = reduce(lambda a, b: a[:min_length]+b[:min_length], channels)
            data /= float(self.channels_count)
            return data

    def _read_fmt_chunk(self, chunk):
        wFormatTag, self.channels_count, self.framerate, dwAvgBytesPerSec, wBlockAlign = struct.unpack('<HHLLH',
                                                                                                       chunk.read(14))
        if wFormatTag == WAVE_FORMAT_PCM or wFormatTag == WAVE_FORMAT_EXTENSIBLE:  # ignore the rest
            bits_per_sample = struct.unpack('<H', chunk.read(2))[0]
            self.sample_width = (bits_per_sample + 7) // 8
        else:
            raise SushiError('unknown format: {0}'.format(wFormatTag))
        self.frame_size = self.channels_count * self.sample_width


class WavStream(object):
    READ_CHUNK_SIZE = 1  # one second, seems to be the fastest
    PADDING_SECONDS = 10

    def __init__(self, path, sample_rate=12000, sample_type='uint8'):
        if sample_type not in ('float32', 'uint8'):
            raise SushiError('Unknown sample type of WAV stream, must be uint8 or float32')

        stream = DownmixedWavFile(path)
        total_seconds = stream.frames_count / float(stream.framerate)
        downsample_rate = sample_rate / float(stream.framerate)

        self.sample_count = math.ceil(total_seconds * sample_rate)
        self.sample_rate = sample_rate
        # pre-allocating the data array and some place for padding
        self.data = np.empty((1, int(self.PADDING_SECONDS * 2 * stream.framerate + self.sample_count)), np.float32)
        self.padding_size = 10 * stream.framerate
        before_read = time()
        try:
            seconds_read = 0
            samples_read = self.padding_size
            while seconds_read < total_seconds:
                data = stream.readframes(int(self.READ_CHUNK_SIZE * stream.framerate))
                new_length = int(round(len(data) * downsample_rate))

                dst_view = self.data[0][samples_read:samples_read+new_length]

                if downsample_rate != 1:
                    data = data.reshape((1, len(data)))
                    data = cv2.resize(data, (new_length, 1), interpolation=cv2.INTER_NEAREST)[0]

                np.copyto(dst_view, data, casting='no')
                samples_read += new_length
                seconds_read += self.READ_CHUNK_SIZE

            # padding the audio from both sides
            self.data[0][0:self.padding_size].fill(self.data[0][self.padding_size])
            self.data[0][-self.padding_size:].fill(self.data[0][-self.padding_size-1])

            # normalizing
            # also clipping the stream by 3*median value from both sides of zero
            max_value = np.median(self.data[self.data >= 0], overwrite_input=True) * 3
            min_value = np.median(self.data[self.data <= 0], overwrite_input=True) * 3

            np.clip(self.data, min_value, max_value, out=self.data)

            self.data -= min_value
            self.data /= (max_value - min_value)

            if sample_type == 'uint8':
                self.data *= 255.0
                self.data += 0.5
                self.data = self.data.astype('uint8')

        except Exception as e:
            raise SushiError('Error while loading {0}: {1}'.format(path, e))
        finally:
            stream.close()
        logging.info('Done reading WAV {0} in {1}s'.format(path, time() - before_read))

    @property
    def duration_seconds(self):
        return self.sample_count / self.sample_rate

    def get_substream(self, start, end):
        start_off = self._get_sample_for_time(start)
        end_off = self._get_sample_for_time(end)
        return self.data[:, start_off:end_off]

    def _get_sample_for_time(self, timestamp):
        # this function gets REAL sample for time, taking padding into account
        return int(self.sample_rate * timestamp) + self.padding_size

    def find_substream(self, pattern, window_center, window_size):
        start_time = clip(window_center - window_size, -self.PADDING_SECONDS, self.duration_seconds)
        end_time = clip(window_center + window_size, 0, self.duration_seconds + self.PADDING_SECONDS)

        start_sample = self._get_sample_for_time(start_time)
        end_sample = self._get_sample_for_time(end_time) + len(pattern[0])

        search_source = self.data[:, start_sample:end_sample]
        result = cv2.matchTemplate(search_source, pattern, cv2.TM_SQDIFF_NORMED)
        min_idx = result.argmin(axis=1)[0]

        return result[0][min_idx], start_time + (min_idx / float(self.sample_rate))