From 3441e1a59f66b82360c53da60aee0dd407f1de67 Mon Sep 17 00:00:00 2001 From: thesamovar Date: Fri, 17 Jan 2025 17:44:30 +0000 Subject: [PATCH] added comparison to ML and limitations of approach --- paper/paper.bib | 16 ++++++++++++++++ paper/sections/science.md | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index e6ea930..ec423aa 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -537,4 +537,20 @@ @inproceedings{PeiYe2021 booktitle={Advances in Neural Information Processing Systems (NeurIPS), Track on Datasets and Benchmarks}, year={2021}, url={https://arxiv.org/abs/2109.04463} +} + +@article{Grumiaux2022, + author = {Grumiaux, Pierre-Amaury and Kitić, Srđan and Girin, Laurent and Guérin, Alexandre}, + title = {A survey of sound source localization with deep learning methods}, + journal = {The Journal of the Acoustical Society of America}, + volume = {152}, + number = {1}, + pages = {107--151}, + year = {2022}, + month = jul, + abstract = {This article is a survey of deep learning methods for single and multiple sound source localization, with a focus on sound source localization in indoor environments, where reverberation and diffuse noise are present. We provide an extensive topography of the neural network-based sound source localization literature in this context, organized according to the neural network architecture, the type of input features, the output strategy (classification or regression), the types of data used for model training and evaluation, and the model training strategy. 
Tables summarizing the literature survey are provided at the end of the paper, allowing a quick search of methods with a given set of target characteristics.}, + issn = {0001-4966}, + doi = {10.1121/10.0011809}, + url = {https://doi.org/10.1121/10.0011809}, + eprint = {https://pubs.aip.org/asa/jasa/article-pdf/152/1/107/16525693/107\_1\_online.pdf}, } \ No newline at end of file diff --git a/paper/sections/science.md b/paper/sections/science.md index 116336d..4b9875c 100644 --- a/paper/sections/science.md +++ b/paper/sections/science.md @@ -7,7 +7,7 @@ Animals localise sounds by detecting location- or direction-specific cues in the The classic model of ITD sensitivity is the delay line model of {cite:t}`Jeffress1948` in which an array of binaural coincidence detector neurons receive inputs from the two ears with different delays. When a neuron's delays exactly match the acoustic delays induced by the sound location, it will be maximally active. Therefore, the identity of the most active neuron indicates the direction of the sound. This model is widely accepted, though was shown to be inefficient with respect to neural noise by {cite:t}`McAlpine2003`, who proposed an alternative model based on the two binaural hemispheres average firing rates - which is optimally robust to neural noise. However, {cite:t}`goodman_decoding_2013` showed that these models perform too poorly to account for behavioural data, especially in situations where sounds had complex and unknown spectral properties, or in the presence of background noise, and proposed an alternative based on a perceptron-like neural network - which is both robust to neural noise and performed well across a range of conditions. -Building on this literature, and our Cosyne tutorial, the starting point of our project was to ask: what solutions would you find if you directly optimised a spiking neural network to localise sounds? 
How would those solutions depend on the available neural mechanisms and statistics of the sound? Could we understand the solutions found? What properties would the solution have in terms of robustness to noise, generalisation, and so forth? And could the solutions found by optimisation throw light on features found in the auditory systems of different animals? +Building on this literature, and our Cosyne tutorial, the starting point of our project was to ask: what solutions would you find if you directly optimised a spiking neural network to localise sounds? Two things are worth noting here. Firstly, this may be very different to the optimal solutions you would find in the unconstrained space of computational solutions, or even of neural network-based solutions. This literature is reviewed in {cite:t}`Grumiaux2022`, including classical engineering approaches such as beamforming, and deep learning approaches including convolutional neural networks, recurrent neural networks and increasingly attention-based networks as used in large language models. Secondly, our setup is very limited in terms of the available cues and network structure: we only use pure tones so we have no spectral or cross-frequency cues, we fix the level so we have no interaural level differences, etc. We would not necessarily expect this approach to explain a wide range of observed phenomena, but it may still throw light on some fundamental aspects of interaural time or phase difference circuits. With these restrictions in place, we can ask several questions about the optimal spike-based solutions we find. How would those solutions depend on the available neural mechanisms and statistics of the sound? Could we understand the solutions found? What properties would the solution have in terms of robustness to noise, generalisation, and so forth? And could the solutions found by optimisation throw light on features found in the auditory systems of different animals? 
## A simple spiking neural network model