@article {5014, title = {Deep neural network models of sound localization reveal how perception is adapted to real-world environments}, journal = {Nature Human Behaviour}, volume = {6}, year = {2022}, month = {01/2022}, pages = {111{\textendash}133}, abstract = {

Mammals localize sounds using information from their two ears. Localization in real-world conditions is challenging, as echoes provide erroneous information and noises mask parts of target sounds. To better understand real-world localization, we equipped a deep neural network with human ears and trained it to localize sounds in a virtual environment. The resulting model localized accurately in realistic conditions with noise and reverberation. In simulated experiments, the model exhibited many features of human spatial hearing: sensitivity to monaural spectral cues and interaural time and level differences, integration across frequency, biases for sound onsets and limits on localization of concurrent sources. But when trained in unnatural environments without reverberation, noise or natural sounds, these performance characteristics deviated from those of humans. The results show how biological hearing is adapted to the challenges of real-world environments and illustrate how artificial neural networks can reveal the real-world constraints that shape perception.

}, doi = {10.1038/s41562-021-01244-z}, url = {https://www.nature.com/articles/s41562-021-01244-z}, author = {Andrew Francl and Josh H. McDermott} } @article {5087, title = {Harmonicity aids hearing in noise}, journal = {Attention, Perception, \& Psychophysics}, year = {2022}, month = {01/2022}, abstract = {

Hearing in noise is a core problem in audition, and a challenge for hearing-impaired listeners, yet the underlying mechanisms are poorly understood. We explored whether harmonic frequency relations, a signature property of many communication sounds, aid hearing in noise for normal hearing listeners. We measured detection thresholds in noise for tones and speech synthesized to have harmonic or inharmonic spectra. Harmonic signals were consistently easier to detect than otherwise identical inharmonic signals. Harmonicity also improved discrimination of sounds in noise. The largest benefits were observed for two-note up-down {\textquotedblleft}pitch{\textquotedblright} discrimination and melodic contour discrimination, both of which could be performed equally well with harmonic and inharmonic tones in quiet, but which showed large harmonic advantages in noise. The results show that harmonicity facilitates hearing in noise, plausibly by providing a noise-robust pitch cue that aids detection and discrimination.

}, issn = {1943-3921}, doi = {10.3758/s13414-021-02376-0}, url = {https://link.springer.com/10.3758/s13414-021-02376-0}, author = {McPherson, Malinda J. and Grace, River C. and Josh H. McDermott} } @article {5027, title = {A neural population selective for song in human auditory cortex}, journal = {Current Biology}, year = {2022}, month = {02/2022}, abstract = {

How is music represented in the brain? While neuroimaging has revealed some spatial segregation between responses to music versus other sounds, little is known about the neural code for music itself. To address this question, we developed a method to infer canonical response components of human auditory cortex using intracranial responses to natural sounds, and further used the superior coverage of fMRI to map their spatial distribution. The inferred components replicated many prior findings, including distinct neural selectivity for speech and music, but also revealed a novel component that responded nearly exclusively to music with singing. Song selectivity was not explainable by standard acoustic features, was located near speech and music-selective responses, and was also evident in individual electrodes. These results suggest that representations of music are fractionated into subpopulations selective for different types of music, one of which is specialized for the analysis of song.

}, issn = {09609822}, doi = {10.1016/j.cub.2022.01.069}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0960982222001312}, author = {Norman-Haignere, Sam V. and Jenelle Feather and Boebinger, Dana and Brunner, Peter and Ritaccio, Anthony and Josh H. McDermott and Schalk, Gerwin and Nancy Kanwisher} } @article {4823, title = {Causal inference in environmental sound recognition}, journal = {Cognition}, year = {2021}, month = {03/2021}, abstract = {

Sound is caused by physical events in the world. Do humans infer these causes when recognizing sound sources? We tested whether the recognition of common environmental sounds depends on the inference of a basic physical variable -- the source intensity (i.e. the power that produces a sound). A source{\textquoteright}s intensity can be inferred from the intensity it produces at the ear and its distance, which is normally conveyed by reverberation. Listeners could thus use intensity at the ear and reverberation to constrain recognition by inferring the underlying source intensity. Alternatively, listeners might separate these acoustic cues from their representation of a sound{\textquoteright}s identity in the interest of invariant recognition. We compared these two hypotheses by measuring recognition accuracy for sounds with typically low or high source intensity (e.g. pepper grinders vs. trucks) that were presented across a range of intensities at the ear or with reverberation cues to distance. The recognition of low-intensity sources (e.g. pepper grinders) was impaired by high presentation intensities or reverberation that conveyed distance, either of which imply high source intensity. Neither effect occurred for high-intensity sources. The results suggest that listeners implicitly use the intensity at the ear along with distance cues to infer a source{\textquoteright}s power and constrain its identity. The recognition of real-world sounds thus appears to depend upon the inference of their physical generative parameters, even generative parameters whose cues might otherwise be separated from the representation of a sound{\textquoteright}s identity.

}, doi = {10.1016/j.cognition.2021.104627}, author = {James Traer and Sam Norman-Haignere and Josh H. McDermott} } @article {4990, title = {Deep neural network models reveal interplay of peripheral coding and stimulus statistics in pitch perception}, journal = {Nature Communications}, volume = {12}, year = {2021}, month = {12/2021}, abstract = {

Perception is thought to be shaped by the environments for which organisms are optimized. These influences are difficult to test in biological organisms but may be revealed by machine perceptual systems optimized under different conditions. We investigated environmental and physiological influences on pitch perception, whose properties are commonly linked to peripheral neural coding limits. We first trained artificial neural networks to estimate fundamental frequency from biologically faithful cochlear representations of natural sounds. The best-performing networks replicated many characteristics of human pitch judgments. To probe the origins of these characteristics, we then optimized networks given altered cochleae or sound statistics. Human-like behavior emerged only when cochleae had high temporal fidelity and when models were optimized for naturalistic sounds. The results suggest pitch perception is critically shaped by the constraints of natural environments in addition to those of the cochlea, illustrating the use of artificial neural networks to reveal underpinnings of behavior.

}, doi = {10.1038/s41467-021-27366-6}, url = {https://www.nature.com/articles/s41467-021-27366-6}, author = {Saddler, Mark R. and Gonzalez, Ray and Josh H. McDermott} } @conference {4539, title = {Segregation from Noise as Outlier Detection }, booktitle = {Association for Research in Otolaryngology}, year = {2020}, month = {01/2020}, address = {San Jose, CA, USA}, author = {Jarrod M Hicks and Josh H. McDermott} } @article {4632, title = {ThreeDWorld: A Platform for Interactive Multi-Modal Physical Simulation}, journal = {arXiv}, year = {2020}, month = {07/2020}, type = {Preprint}, abstract = {

We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) real-time near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments; 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquids, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.

}, url = {https://arxiv.org/abs/2007.04954}, author = {Chuang Gan and Jeremy Schwartz and Seth Alter and Martin Schrimpf and James Traer and Julian De Freitas and Jonas Kubilius and Abhishek Bhandwaldar and Nick Haber and Megumi Sano and Kuno Kim and Elias Wang and Damian Mrowca and Michael Lingelbach and Aidan Curtis and Kevin Feigelis and Daniel Bear and Dan Gutfreund and David Cox and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins} } @article {4633, title = {ThreeDWorld (TDW): A High-Fidelity, Multi-Modal Platform for Interactive Physical Simulation}, year = {2020}, month = {07/2020}, abstract = {

TDW is a 3D virtual world simulation platform utilizing state-of-the-art video game engine technology.

A TDW simulation consists of two components: a) the Build, a compiled executable running on the Unity3D Engine, which is responsible for image rendering, audio synthesis, and physics simulations; and b) the Controller, an external Python interface used to communicate with the Build.

Researchers write Controllers that send commands to the Build, which executes those commands and returns a broad range of data types representing the state of the virtual world.
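To make this command-and-data loop concrete, the sketch below shows what a minimal Controller script might look like. It is a hypothetical illustration only: the Controller class, the communicate() call, and the command names ("create_empty_environment", "terminate") are assumptions about the TDW Python API and may differ from the released library.

from tdw.controller import Controller

# Launch the Build and open a socket connection to it
# (assumed default behavior of the Controller constructor).
c = Controller()

# Send a list of command dictionaries; the Build executes them and
# returns serialized output data describing the state of the world.
resp = c.communicate([{"$type": "create_empty_environment"}])

# Shut down the Build when the simulation is finished.
c.communicate({"$type": "terminate"})

In practice, a Controller script would go on to add objects and avatars, request specific sensor and physics outputs, and advance the simulation frame by frame through this same send-commands/receive-data loop.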


TDW is being used on a daily basis in multiple labs, supporting research that sits at the nexus of neuroscience, cognitive science and artificial intelligence.

Find out more about ThreeDWorld on the project website using the link below.

}, url = {http://www.threedworld.org/}, author = {Jeremy Schwartz and Seth Alter and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins and Dan Gutfreund and Chuang Gan and James Traer and Jonas Kubilius and Martin Schrimpf and Abhishek Bhandwaldar and Julian De Freitas and Damian Mrowca and Michael Lingelbach and Megumi Sano and Daniel Bear and Kuno Kim and Nick Haber and Chaofei Fan} } @article {4819, title = {Time-dependent discrimination advantages for harmonic sounds suggest efficient coding for memory}, journal = {Proceedings of the National Academy of Sciences}, volume = {117}, year = {2020}, month = {12/2020}, pages = {32169 - 32180}, abstract = {

Perceptual systems have finite memory resources and must store incoming signals in compressed formats. To explore whether representations of a sound{\textquoteright}s pitch might derive from this need for compression, we compared discrimination of harmonic and inharmonic sounds across delays. In contrast to inharmonic spectra, harmonic spectra can be summarized, and thus compressed, using their fundamental frequency (f0). Participants heard two sounds and judged which was higher. Despite being comparable for sounds presented back-to-back, discrimination was better for harmonic than inharmonic stimuli when sounds were separated in time, implicating memory representations unique to harmonic sounds. Patterns of individual differences (correlations between thresholds in different conditions) indicated that listeners use different representations depending on the time delay between sounds, directly comparing the spectra of temporally adjacent sounds, but transitioning to comparing f0s across delays. The need to store sound in memory appears to determine reliance on f0-based pitch, and may explain its importance in music, in which listeners must extract relationships between notes separated in time.

}, issn = {0027-8424}, doi = {10.1073/pnas.2008956117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2008956117}, author = {McPherson, Malinda J. and Josh H. McDermott} } @article {4187, title = {Deep neural network models of sensory systems: windows onto the role of task constraints}, journal = {Current Opinion in Neurobiology}, volume = {55}, year = {2019}, month = {01/2019}, pages = {121 - 132}, abstract = {

Sensory neuroscience aims to build models that predict neural responses and perceptual behaviors, and that provide insight into the principles that give rise to them. For decades, artificial neural networks trained to perform perceptual tasks have attracted interest as potential models of neural computation. Only recently, however, have such systems begun to perform at human levels on some real-world tasks. The recent engineering successes of deep learning have led to renewed interest in artificial neural networks as models of the brain. Here we review applications of deep learning to sensory neuroscience, discussing potential limitations and future directions. We highlight the potential uses of deep neural networks to reveal how task performance may constrain neural systems and behavior. In particular, we consider how task-optimized networks can generate hypotheses about neural representations and functional organization in ways that are analogous to traditional ideal observer models.

}, issn = {09594388}, doi = {10.1016/j.conb.2019.02.003}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438818302034}, author = {Alexander J. E. Kell and Josh H. McDermott} } @article {4255, title = {Divergence in the functional organization of human and macaque auditory cortex revealed by fMRI responses to harmonic tones}, journal = {Nature Neuroscience}, year = {2019}, month = {06/10/2019}, abstract = {

We report a difference between humans and macaque monkeys in the functional organization of cortical regions implicated in pitch perception. Humans but not macaques showed regions with a strong preference for harmonic sounds compared to noise, measured with both synthetic tones and macaque vocalizations. In contrast, frequency-selective tonotopic maps were similar between the two species. This species difference may be driven by the unique demands of speech and music perception in humans.

}, issn = {1097-6256}, doi = {10.1038/s41593-019-0410-7}, url = {https://www.nature.com/articles/s41593-019-0410-7}, author = {Sam V Norman-Haignere and Nancy Kanwisher and Josh H. McDermott and B. R. Conway} } @article {4510, title = {Ecological origins of perceptual grouping principles in the auditory system}, journal = {Proceedings of the National Academy of Sciences}, volume = {116}, year = {2019}, month = {12/2019}, pages = {25355 - 25364}, abstract = {

Events and objects in the world must be inferred from sensory signals to support behavior. Because sensory measurements are temporally and spatially local, the estimation of an object or event can be viewed as the grouping of these measurements into representations of their common causes. Perceptual grouping is believed to reflect internalized regularities of the natural environment, yet grouping cues have traditionally been identified using informal observation and investigated using artificial stimuli. The relationship of grouping to natural signal statistics has thus remained unclear, and additional or alternative cues remain possible. Here, we develop a general methodology for relating grouping to natural sensory signals and apply it to derive auditory grouping cues from natural sounds. We first learned local spectrotemporal features from natural sounds and measured their co-occurrence statistics. We then learned a small set of stimulus properties that could predict the measured feature co-occurrences. The resulting cues included established grouping cues, such as harmonic frequency relationships and temporal coincidence, but also revealed previously unappreciated grouping principles. Human perceptual grouping was predicted by natural feature co-occurrence, with humans relying on the derived grouping cues in proportion to their informativity about co-occurrence in natural sounds. The results suggest that auditory grouping is adapted to natural stimulus statistics, show how these statistics can reveal previously unappreciated grouping phenomena, and provide a framework for studying grouping in natural signals.

}, issn = {0027-8424}, doi = {10.1073/pnas.1903887116}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1903887116}, author = {M{\l}ynarski, Wiktor and Josh H. McDermott} } @article {4509, title = {Illusory sound texture reveals multi-second statistical completion in auditory scene analysis}, journal = {Nature Communications}, volume = {10}, year = {2019}, month = {11/2019}, abstract = {

Sound sources in the world are experienced as stable even when intermittently obscured, implying perceptual completion mechanisms that {\textquotedblleft}fill in{\textquotedblright} missing sensory information. We demonstrate a filling-in phenomenon in which the brain extrapolates the statistics of background sounds (textures) over periods of several seconds when they are interrupted by another sound, producing vivid percepts of illusory texture. The effect differs from previously described completion effects in that 1) the extrapolated sound must be defined statistically given the stochastic nature of texture, and 2) the effect lasts much longer, enabling introspection and facilitating assessment of the underlying representation. Illusory texture biases subsequent texture statistic estimates indistinguishably from actual texture, suggesting that it is represented similarly to actual texture. The illusion appears to represent an inference about whether the background is likely to continue during concurrent sounds, providing a stable statistical representation of the ongoing environment despite unstable sensory evidence.

}, doi = {10.1038/s41467-019-12893-0}, url = {http://www.nature.com/articles/s41467-019-12893-0}, author = {McWalter, Richard and Josh H. McDermott} } @article {4507, title = {Invariance to background noise as a signature of non-primary auditory cortex}, journal = {Nature Communications}, volume = {10}, year = {2019}, month = {09/2019}, abstract = {

Despite well-established anatomical differences between primary and non-primary auditory cortex, the associated representational transformations have remained elusive. Here we show that primary and non-primary auditory cortex are differentiated by their invariance to real-world background noise. We measured fMRI responses to natural sounds presented in isolation and in real-world noise, quantifying invariance as the correlation between the two responses for individual voxels. Non-primary areas were substantially more noise-invariant than primary areas. This primary-nonprimary difference occurred both for speech and non-speech sounds and was unaffected by a concurrent demanding visual task, suggesting that the observed invariance is not specific to speech processing and is robust to inattention. The difference was most pronounced for real-world background noise {\textendash} both primary and non-primary areas were relatively robust to simple types of synthetic noise. Our results suggest a general representational transformation between auditory cortical stages, illustrating a representational consequence of hierarchical organization in the auditory system.

}, doi = {10.1038/s41467-019-11710-y}, url = {http://www.nature.com/articles/s41467-019-11710-y}, author = {Alexander J. E. Kell and Josh H. McDermott} } @proceedings {4373, title = {Metamers of neural networks reveal divergence from human perceptual systems}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

Deep neural networks have been embraced as models of sensory systems, instantiating representational transformations that appear to resemble those in the visual and auditory systems. To more thoroughly investigate their similarity to biological systems, we synthesized model metamers {\textendash} stimuli that produce the same responses at some stage of a network{\textquoteright}s representation. We generated model metamers for natural stimuli by performing gradient descent on a noise signal, matching the responses of individual layers of image and audio networks to a natural image or speech signal. The resulting signals reflect the invariances instantiated in the network up to the matched layer. We then measured whether model metamers were recognizable to human observers {\textendash} a necessary condition for the model representations to replicate those of humans. Although model metamers from early network layers were recognizable to humans, those from deeper layers were not. Auditory model metamers became more human-recognizable with architectural modifications that reduced aliasing from pooling operations, but those from the deepest layers remained unrecognizable. We also used the metamer test to compare model representations. Cross-model metamer recognition dropped off for deeper layers, roughly at the same point that human recognition deteriorated, indicating divergence across model representations. The results reveal discrepancies between model and human representations, but also show how metamers can help guide model refinement and elucidate model representations.

}, url = {https://papers.nips.cc/paper/9198-metamers-of-neural-networks-reveal-divergence-from-human-perceptual-systems}, author = {Jenelle Feather and Alex Durango and Ray Gonzalez and Josh H. McDermott} } @article {4500, title = {A perceptually inspired generative model of rigid-body contact sounds}, journal = {Proceedings of the 22nd International Conference on Digital Audio Effects (DAFx-19)}, year = {2019}, month = {09/2019}, abstract = {

Contact between rigid-body objects produces a diversity of impact and friction sounds. These sounds can be synthesized with detailed simulations of the motion, vibration and sound radiation of the objects, but such synthesis is computationally expensive and prohibitively slow for many applications. Moreover, detailed physical simulations may not be necessary for perceptually compelling synthesis; humans infer ecologically relevant causes of sound, such as material categories, but not with arbitrary precision. We present a generative model of impact sounds which summarizes the effect of physical variables on acoustic features via statistical distributions fit to empirical measurements of object acoustics. Perceptual experiments show that sampling from these distributions allows efficient synthesis of realistic impact and scraping sounds that convey material, mass, and motion.

}, author = {James Traer and Maddie Cusimano and Josh H. McDermott} } @conference {4526, title = {Scrape, rub, and roll: causal inference in the perception of sustained contact sounds }, booktitle = {Cognitive Science}, year = {2019}, month = {07/2019}, address = {Montreal, Qu{\'e}bec, Canada}, author = {Maddie Cusimano and James Traer and Josh H. McDermott} } @article {4313, title = {Universal and Non-universal Features of Musical Pitch Perception Revealed by Singing}, journal = {Current Biology}, year = {2019}, month = {09/2019}, abstract = {

Musical pitch perception is argued to result from nonmusical biological constraints and thus to have similar characteristics across cultures, but its universality remains unclear. We probed pitch representations in residents of the Bolivian Amazon{\textemdash}the Tsimane{\textquoteright}, who live in relative isolation from Western culture{\textemdash}as well as US musicians and non-musicians. Participants sang back tone sequences presented in different frequency ranges. Sung responses of Amazonian and US participants approximately replicated heard intervals on a logarithmic scale, even for tones outside the singing range. Moreover, Amazonian and US reproductions both deteriorated for high-frequency tones even though they were fully audible. But whereas US participants tended to reproduce notes an integer number of octaves above or below the heard tones, Amazonians did not, ignoring the note {\textquotedblleft}chroma{\textquotedblright} (C, D, etc.). Chroma matching in US participants was more pronounced in US musicians than non-musicians, was not affected by feedback, and was correlated with similarity-based measures of octave equivalence as well as the ability to match the absolute f0 of a stimulus in the singing range. The results suggest the cross-cultural presence of logarithmic scales for pitch, and biological constraints on the limits of pitch, but indicate that octave equivalence may be culturally contingent, plausibly dependent on pitch representations that develop from experience with particular musical systems.

}, keywords = {absolute pitch, bio-musicology, cross-cultural psychology, mental scales, music cognition, octave equivalence, pitch, relative pitch, singing, Tsimane{\textquoteright}}, issn = {09609822}, doi = {10.1016/j.cub.2019.08.020}, url = {https://linkinghub.elsevier.com/retrieve/pii/S096098221931036X}, author = {Jacoby, Nori and Undurraga, Eduardo A. and McPherson, Malinda J. and Vald{\'e}s, Joaqu{\'\i}n and Ossand{\'o}n, Tom{\'a}s and Josh H. McDermott} } @proceedings {4389, title = {Untangling in Invariant Speech Recognition}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

Encouraged by the success of deep convolutional neural networks on a variety of visual tasks, much theoretical and experimental work has been aimed at understanding and interpreting how vision networks operate. At the same time, deep neural networks have also achieved impressive performance in audio processing applications, both as sub-components of larger systems and as complete end-to-end systems by themselves. Despite their empirical successes, comparatively little is understood about how these audio models accomplish these tasks. In this work, we employ a recently developed statistical mechanical theory that connects geometric properties of network representations and the separability of classes to probe how information is untangled within neural networks trained to recognize speech. We observe that speaker-specific nuisance variations are discarded by the network{\textquoteright}s hierarchy, whereas task-relevant properties such as words and phonemes are untangled in later layers. Higher level concepts such as parts-of-speech and context dependence also emerge in the later layers of the network. Finally, we find that the deep representations carry out significant temporal untangling by efficiently extracting task-relevant features at each time step of the computation. Taken together, these findings shed light on how deep auditory models process their time-dependent input signals to carry out invariant speech recognition, and show how different concepts emerge through the layers of the network.

}, author = {Cory Stephenson and Jenelle Feather and Suchismita Padhy and Oguz Elibol and Hanlin Tang and Josh H. McDermott and SueYeon Chung} } @article {3565, title = {Co-occurrence statistics of natural sound features predict perceptual grouping}, year = {2018}, month = {03/2018}, address = {Denver, Colorado}, url = {http://www.cosyne.org/c/index.php?title=Cosyne_18}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {3578, title = {Co-occurrence statistics of natural sound features predict perceptual grouping}, year = {2018}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {3576, title = {Human inference of force from impact sounds: Perceptual evidence for inverse physics}, volume = {143}, year = {2018}, month = {03/2018}, abstract = {

An impact sound is determined both by material properties of the objects involved (e.g., mass, density, shape, and rigidity) and by the force of the collision. Human listeners can typically estimate the force of an impact as well as the material which has been struck. To investigate the underlying auditory mechanisms we played listeners audio recordings of two boards being struck and measured their ability to identify the board struck with more force. Listeners significantly outperformed models based on simple acoustic features (e.g., signal power or spectral centroid). We repeated the experiment with synthetic sounds generated from simulated object resonant modes and simulated contact forces derived from a spring model. Listeners could not distinguish synthetic from real recordings and successfully estimated simulated impact force. When the synthetic modes were altered (e.g., to simulate a harder material) listeners altered their judgments of both material and impact force, consistent with the physical implications of the alteration. The results suggest that humans use resonant modes to infer object material, and use this knowledge to estimate the impact force, explaining away material contributions to the sound.

}, doi = {10.1121/1.5035721}, url = {https://asa.scitation.org/doi/abs/10.1121/1.5035721}, author = {James Traer and Josh H. McDermott} } @article {3577, title = {Human recognition of environmental sounds is not always robust to reverberation}, volume = {143}, year = {2018}, journal = {The Journal of the Acoustical Society of America}, abstract = {

Reverberation is ubiquitous in natural environments, but its effect on the recognition of non-speech sounds is poorly documented. To evaluate human robustness to reverberation, we measured its effect on the recognizability of everyday sounds. Listeners identified a diverse set of recorded environmental sounds (footsteps, animal vocalizations, vehicles moving, hammering, etc.) in an open set recognition task. For each participant, half of the sounds (randomly assigned) were presented in reverberation. We found the effect of reverberation to depend on the typical listening conditions for a sound. Sounds that are typically loud and heard in indoor environments, and which thus should often be accompanied by reverberation, were recognized robustly, with only a small impairment for reverberant conditions. In contrast, sounds that are either typically quiet or typically heard outdoors, for which reverberation should be less pronounced, produced a large recognition decrement in reverberation. These results demonstrate that humans can be remarkably robust to the distortion induced by reverberation, but that this robustness disappears when the reverberation is not consistent with the expected source properties. The results are consistent with the idea that listeners perceptually separate sound sources from reverberation, constrained by the likelihood of source-environment pairings.

}, doi = {10.1121/1.5035960}, url = {https://asa.scitation.org/doi/abs/10.1121/1.5035960}, author = {James Traer and Josh H. McDermott} } @article {3562, title = {Learning Mid-Level Auditory Codes from Natural Sound Statistics}, journal = {Neural Computation}, volume = {30}, year = {2018}, month = {03/2018}, pages = {631-669}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {3573, title = {A task-optimized neural network replicates human auditory behavior, predicts brain responses, and reveals a cortical processing hierarchy}, journal = {Neuron}, volume = {98}, year = {2018}, month = {04/2018}, abstract = {

A core goal of auditory neuroscience is to build quantitative models that predict cortical responses to natural sounds. Reasoning that a complete model of auditory cortex must solve ecologically relevant tasks, we optimized hierarchical neural networks for speech and music recognition. The best-performing network contained separate music and speech pathways following early shared processing, potentially replicating human cortical organization. The network performed both tasks as well as humans and exhibited human-like errors despite not being optimized to do so, suggesting common constraints on network and human performance. The network predicted fMRI voxel responses substantially better than traditional spectrotemporal filter models throughout auditory cortex. It also provided a quantitative signature of cortical representational hierarchy{\textemdash}primary and non-primary responses were best predicted by intermediate and late network layers, respectively. The results suggest that task optimization provides a powerful set of tools for modeling sensory systems.

}, keywords = {auditory cortex, convolutional neural network, deep learning, deep neural network, encoding models, fMRI, Hierarchy, human auditory cortex, natural sounds, word recognition}, doi = {10.1016/j.neuron.2018.03.044}, url = {https://www.sciencedirect.com/science/article/pii/S0896627318302502}, author = {Alexander J. E. Kell and Daniel L K Yamins and Erica N Shook and Sam V Norman-Haignere and Josh H. McDermott} } @article {2668, title = {Adaptive Compression of Statistically Homogenous Sensory Signals}, year = {2017}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2749, title = {Auditory Perception of Material and Force from Impact Sounds}, year = {2017}, author = {James Traer and Josh H. McDermott} } @conference {3575, title = {Generative modeling of audible shapes for object perception}, booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

Humans infer rich knowledge of objects from both auditory and visual cues. Building a machine of such competency, however, is very challenging, due to the great difficulty in capturing large-scale, clean data of objects with both their appearance and the sound they make. In this paper, we present a novel, open-source pipeline that generates audio-visual data, purely from 3D object shapes and their physical properties. Through comparison with audio recordings and human behavioral studies, we validate the accuracy of the sounds it generates. Using this generative model, we are able to construct a synthetic audio-visual dataset, namely Sound-20K, for object perception tasks. We demonstrate that auditory and visual information play complementary roles in object perception, and further, that the representation learned on synthetic audio-visual data can transfer to real-world scenarios.

}, url = {http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html}, author = {Zhoutong Zhang and Jiajun Wu and Qiujia Li and Zhengjia Huang and James Traer and Josh H. McDermott and Joshua B. Tenenbaum and William T. Freeman} } @article {2751, title = {Investigating audition with a generative model of impact sounds}, year = {2017}, author = {James Traer and Josh H. McDermott} } @article {2386, title = {Learning Mid-Level Auditory Codes from Natural Sound Statistics}, year = {2017}, month = {01/2017}, abstract = {

Interaction with the world requires an organism to transform sensory signals into representations in which behaviorally meaningful properties of the environment are made explicit. These representations are derived through cascades of neuronal processing stages in which neurons at each stage recode the output of preceding stages. Explanations of sensory coding may thus involve understanding how low-level patterns are combined into more complex structures. Although models exist in the visual domain to explain how mid-level features such as junctions and curves might be derived from oriented filters in early visual cortex, little is known about analogous grouping principles for mid-level auditory representations. We propose a hierarchical generative model of natural sounds that learns combinations of spectrotemporal features from natural stimulus statistics. In the first layer the model forms a sparse convolutional code of spectrograms using a dictionary of learned spectrotemporal kernels. To generalize from specific kernel activation patterns, the second layer encodes patterns of time-varying magnitude of multiple first layer coefficients. Because second-layer features are sensitive to combinations of spectrotemporal features, the representation they support encodes more complex acoustic patterns than the first layer. When trained on corpora of speech and environmental sounds, some second-layer units learned to group spectrotemporal features that occur together in natural sounds. Others instantiate opponency between dissimilar sets of spectrotemporal features. Such groupings might be instantiated by neurons in the auditory cortex, providing a hypothesis for mid-level neuronal computation.

}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2666, title = {Learning Mid-Level Codes for Natural Sounds}, year = {2017}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2750, title = {A library of real-world reverberation and a toolbox for its analysis and measurement}, year = {2017}, author = {James Traer and Josh H. McDermott} } @article {2667, title = {Lossy Compression of Uninformative Stimuli in the Auditory System}, year = {2017}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2748, title = {Environmental statistics enable perceptual separation of sound and space}, year = {2016}, abstract = {

The sound that reaches our ears from colliding objects (i.e., bouncing, scraping, rolling, etc.) is structured both by the physical characteristics of the sound source and by environmental reverberation. The inference of any one single parameter (mass, size, material, motion, room size, distance) is ill-posed, yet humans can simultaneously identify properties of sound sources and environments from the resulting sound, via mechanisms that remain unclear. We investigate whether our ability to recognize sound sources and spaces reflects an ability to separately infer how physical factors affect sound, and whether any such separation is enabled by statistical regularities of real-world sounds and real-world reverberation. To first determine whether such statistical regularities exist, we measured impulse responses (IRs) of both solid objects and environmental spaces sampled from the distribution encountered by humans during daily life. Both the objects and the sampled spaces were diverse, but their IRs were tightly constrained, exhibiting exponential decay at frequency-dependent rates. Object IRs showed sharp spectral peaks due to strong resonances, and environmental IRs showed broad frequency variation: mid frequencies reverberated longest while higher and lower frequencies decayed more rapidly, presumably due to absorptive properties of materials and air. To test whether humans utilize these regularities to separate reverberation from sources, we manipulated environmental IR characteristics in simulated reverberant audio. Listeners could discriminate sound sources and environments from these signals, but we found that their abilities degraded when reverberation characteristics deviated from those of real-world environments. Subjectively, atypical IRs were mistaken for sound sources. The results suggest the brain separates sound into contributions from the source and the environment, constrained by a prior on natural reverberation. This separation process may contribute to robust recognition while providing information about spaces around us.

}, author = {James Traer and Josh H. McDermott} } @article {1797, title = {Learning mid-level codes for natural sounds}, year = {2016}, month = {02/2016}, address = {Salt Lake City, UT}, abstract = {

Auditory perception depends critically on abstract and behaviorally meaningful representations of natural auditory scenes. These representations are implemented by cascades of neuronal processing stages in which neurons at each stage recode outputs of preceding units. Explanations of auditory coding strategies must thus involve understanding how low-level acoustic patterns are combined into more complex structures. While models exist in the visual domain to explain how phase invariance is achieved by V1 complex cells, and how curvature representations emerge in V2, little is known about analogous grouping principles for mid-level auditory representations.

We propose a hierarchical, generative model of natural sounds that learns combinations of spectrotemporal features from natural stimulus statistics. In the first layer the model forms a sparse, convolutional code of spectrograms. Features learned on speech and environmental sounds resemble spectrotemporal receptive fields (STRFs) of mid-brain and cortical neurons, consistent with previous findings [1]. To generalize from specific STRF activation patterns, the second layer encodes patterns of time-varying magnitude (i.e. variance) of multiple first layer coefficients. Because it forms a code of a non-stationary distribution of STRF activations, it is partially invariant to their specific values. Moreover, because second-layer features are sensitive to STRF combinations, the representation they support is more selective to complex acoustic patterns. The second layer substantially improved the model{\textquoteright}s performance on a denoising task, implying a closer match to the natural stimulus distribution.

Quantitative hypotheses emerge from the model regarding selectivity of auditory neurons characterized by multidimensional STRFs [2] and sensitivity to increasingly more abstract structure [3]. The model also predicts that the auditory system constructs representations progressively more invariant to noise, consistent with recent experimental findings [4]. Our results suggest that mid-level auditory representations may be derived from high-order stimulus dependencies present in the natural environment.

}, url = {http://www.cosyne.org/c/index.php?title=Cosyne2016_posters_2}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2664, title = {Learning Mid-Level Codes for Natural Sounds}, year = {2016}, author = {Wiktor Mlynarski and Josh H. McDermott} } @proceedings {2618, title = {Ambient Sound Provides Supervision for Visual Learning}, booktitle = {Computer Vision {\textendash} ECCV 2016 (Lecture Notes in Computer Science)}, year = {2016}, month = {10/2016}, pages = {801 - 816}, address = {Cham}, abstract = {

The sound of crashing waves, the roar of fast-moving cars {\textendash} sound conveys important information about the objects in our surroundings. In this work, we show that ambient sounds can be used as a supervisory signal for learning visual models. To demonstrate this, we train a convolutional neural network to predict a statistical summary of the sound associated with a video frame. We show that, through this process, the network learns a representation that conveys information about objects and scenes. We evaluate this representation on several recognition tasks, finding that its performance is comparable to that of other state-of-the-art unsupervised learning methods. Finally, we show through visualizations that the network learns units that are selective to objects that are often associated with characteristic sounds.

}, keywords = {convolutional networks, Sound, unsupervised learning}, isbn = {978-3-319-46447-3}, issn = {0302-9743}, doi = {10.1007/978-3-319-46448-0_48}, url = {http://link.springer.com/10.1007/978-3-319-46448-0}, author = {Owens, Andrew and Isola, P. and Josh H. McDermott and William T. Freeman and Torralba, Antonio} } @article {2665, title = {Lossy Compression of Sound Texture by the Human Auditory System}, year = {2016}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2617, title = {Statistics of natural reverberation enable perceptual separation of sound and space}, journal = {Proceedings of the National Academy of Sciences}, volume = {113}, year = {2016}, month = {09/2016}, pages = {E7856 - E7865}, abstract = {

In everyday listening, sound reaches our ears directly from a source as well as indirectly via reflections known as reverberation. Reverberation profoundly distorts the sound from a source, yet humans can both identify sound sources and distinguish environments from the resulting sound, via mechanisms that remain unclear. The core computational challenge is that the acoustic signatures of the source and environment are combined in a single signal received by the ear. Here we ask whether our recognition of sound sources and spaces reflects an ability to separate their effects and whether any such separation is enabled by statistical regularities of real-world reverberation. To first determine whether such statistical regularities exist, we measured impulse responses (IRs) of 271 spaces sampled from the distribution encountered by humans during daily life. The sampled spaces were diverse, but their IRs were tightly constrained, exhibiting exponential decay at frequency-dependent rates: Mid frequencies reverberated longest whereas higher and lower frequencies decayed more rapidly, presumably due to absorptive properties of materials and air. To test whether humans leverage these regularities, we manipulated IR decay characteristics in simulated reverberant audio. Listeners could discriminate sound sources and environments from these signals, but their abilities degraded when reverberation characteristics deviated from those of real-world environments. Subjectively, atypical IRs were mistaken for sound sources. The results suggest the brain separates sound into contributions from the source and the environment, constrained by a prior on natural reverberation. This separation process may contribute to robust recognition while providing information about spaces around us.

}, keywords = {auditory scene analysis, environmental acoustics, natural scene statistics, psychoacoustics, Psychophysics}, issn = {0027-8424}, doi = {10.1073/pnas.1612524113}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1612524113}, author = {James Traer and Josh H. McDermott} } @conference {2747, title = {Visually indicated sounds}, booktitle = {Conference on Computer Vision and Pattern Recognition}, year = {2016}, month = {06/2016}, author = {Owens, Andrew and Isola, P. and Josh H. McDermott and Torralba, Antonio and Adelson, Edward H. and William T. Freeman} }