@article{4823,
  title    = {Causal inference in environmental sound recognition},
  journal  = {Cognition},
  year     = {2021},
  month    = mar,
  abstract = {Sound is caused by physical events in the world. Do humans infer these causes when recognizing sound sources? We tested whether the recognition of common environmental sounds depends on the inference of a basic physical variable -- the source intensity (i.e. the power that produces a sound). A source{\textquoteright}s intensity can be inferred from the intensity it produces at the ear and its distance, which is normally conveyed by reverberation. Listeners could thus use intensity at the ear and reverberation to constrain recognition by inferring the underlying source intensity. Alternatively, listeners might separate these acoustic cues from their representation of a sound{\textquoteright}s identity in the interest of invariant recognition. We compared these two hypotheses by measuring recognition accuracy for sounds with typically low or high source intensity (e.g. pepper grinders vs. trucks) that were presented across a range of intensities at the ear or with reverberation cues to distance. The recognition of low-intensity sources (e.g. pepper grinders) was impaired by high presentation intensities or reverberation that conveyed distance, either of which imply high source intensity. Neither effect occurred for high-intensity sources. The results suggest that listeners implicitly use the intensity at the ear along with distance cues to infer a source{\textquoteright}s power and constrain its identity. The recognition of real-world sounds thus appears to depend upon the inference of their physical generative parameters, even generative parameters whose cues might otherwise be separated from the representation of a sound{\textquoteright}s identity.},
  doi      = {10.1016/j.cognition.2021.104627},
  author   = {Traer, James and Norman-Haignere, Sam and McDermott, Josh H.}
}

@article{4632,
  title         = {{ThreeDWorld}: A Platform for Interactive Multi-Modal Physical Simulation},
  journal       = {arXiv},
  year          = {2020},
  month         = jul,
  type          = {Preprint},
  eprint        = {2007.04954},
  archiveprefix = {arXiv},
  abstract      = {We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) realtime near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquid, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.},
  url           = {https://arxiv.org/abs/2007.04954},
  author        = {Gan, Chuang and Schwartz, Jeremy and Alter, Seth and Schrimpf, Martin and Traer, James and De Freitas, Julian and Kubilius, Jonas and Bhandwaldar, Abhishek and Haber, Nick and Sano, Megumi and Kim, Kuno and Wang, Elias and Mrowca, Damian and Lingelbach, Michael and Curtis, Aidan and Feigelis, Kevin and Bear, Daniel and Gutfreund, Dan and Cox, David and DiCarlo, James J. and McDermott, Josh H. and Tenenbaum, Joshua B. and Yamins, Daniel L K}
}

@misc{4633,
  title    = {{ThreeDWorld} ({TDW}): A High-Fidelity, Multi-Modal Platform for Interactive Physical Simulation},
  year     = {2020},
  month    = jul,
  abstract = {A TDW simulation consists of two components: a) the Build, a compiled executable running on the Unity3D Engine, which is responsible for image rendering, audio synthesis and physics simulations; and b) the Controller, an external Python interface to communicate with the build. Researchers write Controllers that send commands to the Build, which executes those commands and returns a broad range of data types representing the state of the virtual world. TDW provides researchers with: TDW is being used on a daily basis in multiple labs, supporting research that sits at the nexus of neuroscience, cognitive science and artificial intelligence. Find out more about ThreeDWorld on the project website using the link below.},
  url      = {http://www.threedworld.org/},
  author   = {Schwartz, Jeremy and Alter, Seth and DiCarlo, James J. and McDermott, Josh H. and Tenenbaum, Joshua B. and Yamins, Daniel L K and Gutfreund, Dan and Gan, Chuang and Traer, James and Kubilius, Jonas and Schrimpf, Martin and Bhandwaldar, Abhishek and De Freitas, Julian and Mrowca, Damian and Lingelbach, Michael and Sano, Megumi and Bear, Daniel and Kim, Kuno and Haber, Nick and Fan, Chaofei}
}

@inproceedings{4500,
  title     = {A perceptually inspired generative model of rigid-body contact sounds},
  booktitle = {Proceedings of the 22nd International Conference on Digital Audio Effects (DAFx-19)},
  year      = {2019},
  month     = sep,
  abstract  = {Contact between rigid-body objects produces a diversity of impact and friction sounds. These sounds can be synthesized with detailed simulations of the motion, vibration and sound radiation of the objects, but such synthesis is computationally expensive and prohibitively slow for many applications. Moreover, detailed physical simulations may not be necessary for perceptually compelling synthesis; humans infer ecologically relevant causes of sound, such as material categories, but not with arbitrary precision. We present a generative model of impact sounds which summarizes the effect of physical variables on acoustic features via statistical distributions fit to empirical measurements of object acoustics. Perceptual experiments show that sampling from these distributions allows efficient synthesis of realistic impact and scraping sounds that convey material, mass, and motion.},
  author    = {Traer, James and Cusimano, Maddie and McDermott, Josh H.}
}

@inproceedings{4526,
  title     = {Scrape, rub, and roll: causal inference in the perception of sustained contact sounds},
  booktitle = {Cognitive Science},
  year      = {2019},
  month     = jul,
  address   = {Montreal, Qu{\'e}bec, Canada},
  author    = {Cusimano, Maddie and Traer, James and McDermott, Josh H.}
}

@article{3576,
  title    = {Human inference of force from impact sounds: Perceptual evidence for inverse physics},
  journal  = {The Journal of the Acoustical Society of America},
  volume   = {143},
  year     = {2018},
  month    = mar,
  abstract = {An impact sound is determined both by material properties of the objects involved (e.g., mass, density, shape, and rigidity) and by the force of the collision. Human listeners can typically estimate the force of an impact as well as the material which has been struck. To investigate the underlying auditory mechanisms we played listeners audio recordings of two boards being struck and measured their ability to identify the board struck with more force. Listeners significantly outperformed models based on simple acoustic features (e.g., signal power or spectral centroid). We repeated the experiment with synthetic sounds generated from simulated object resonant modes and simulated contact forces derived from a spring model. Listeners could not distinguish synthetic from real recordings and successfully estimated simulated impact force. When the synthetic modes were altered (e.g., to simulate a harder material) listeners altered their judgments of both material and impact force, consistent with the physical implications of the alteration. The results suggest that humans use resonant modes to infer object material, and use this knowledge to estimate the impact force, explaining away material contributions to the sound.},
  doi      = {10.1121/1.5035721},
  url      = {https://asa.scitation.org/doi/abs/10.1121/1.5035721},
  author   = {Traer, James and McDermott, Josh H.}
}

@article{3577,
  title    = {Human recognition of environmental sounds is not always robust to reverberation},
  journal  = {The Journal of the Acoustical Society of America},
  volume   = {143},
  year     = {2018},
  abstract = {Reverberation is ubiquitous in natural environments, but its effect on the recognition of non-speech sounds is poorly documented. To evaluate human robustness to reverberation, we measured its effect on the recognizability of everyday sounds. Listeners identified a diverse set of recorded environmental sounds (footsteps, animal vocalizations, vehicles moving, hammering, etc.) in an open set recognition task. For each participant, half of the sounds (randomly assigned) were presented in reverberation. We found the effect of reverberation to depend on the typical listening conditions for a sound. Sounds that are typically loud and heard in indoor environments, and which thus should often be accompanied by reverberation, were recognized robustly, with only a small impairment for reverberant conditions. In contrast, sounds that are either typically quiet or typically heard outdoors, for which reverberation should be less pronounced, produced a large recognition decrement in reverberation. These results demonstrate that humans can be remarkably robust to the distortion induced by reverberation, but that this robustness disappears when the reverberation is not consistent with the expected source properties. The results are consistent with the idea that listeners perceptually separate sound sources from reverberation, constrained by the likelihood of source-environment pairings.},
  doi      = {10.1121/1.5035960},
  url      = {https://asa.scitation.org/doi/abs/10.1121/1.5035960},
  author   = {Traer, James and McDermott, Josh H.}
}

@misc{2749,
  title  = {Auditory Perception of Material and Force from Impact Sounds},
  year   = {2017},
  author = {Traer, James and McDermott, Josh H.}
}

@inproceedings{3575,
  title     = {Generative modeling of audible shapes for object perception},
  booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
  year      = {2017},
  month     = oct,
  address   = {Venice, Italy},
  abstract  = {Humans infer rich knowledge of objects from both auditory and visual cues. Building a machine of such competency, however, is very challenging, due to the great difficulty in capturing large-scale, clean data of objects with both their appearance and the sound they make. In this paper, we present a novel, open-source pipeline that generates audio-visual data, purely from 3D object shapes and their physical properties. Through comparison with audio recordings and human behavioral studies, we validate the accuracy of the sounds it generates. Using this generative model, we are able to construct a synthetic audio-visual dataset, namely Sound-20K, for object perception tasks. We demonstrate that auditory and visual information play complementary roles in object perception, and further, that the representation learned on synthetic audio-visual data can transfer to real-world scenarios.},
  url       = {http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html},
  author    = {Zhang, Zhoutong and Wu, Jiajun and Li, Qiujia and Huang, Zhengjia and Traer, James and McDermott, Josh H. and Tenenbaum, Joshua B. and Freeman, William T.}
}

@misc{2751,
  title  = {Investigating audition with a generative model of impact sounds},
  year   = {2017},
  author = {Traer, James and McDermott, Josh H.}
}

@misc{2750,
  title  = {A library of real-world reverberation and a toolbox for its analysis and measurement},
  year   = {2017},
  author = {Traer, James and McDermott, Josh H.}
}

@misc{2748,
  title    = {Environmental statistics enable perceptual separation of sound and space},
  year     = {2016},
  abstract = {The sound that reaches our ears from colliding objects (i.e. bouncing, scraping, rolling etc.) is structured, both by the physical characteristics of the sound source and by environmental reverberation. The inference of any one single parameter (mass, size, material, motion, room size, distance) is ill-posed, yet humans can simultaneously identify properties of sound sources and environments from the resulting sound, via mechanisms that remain unclear. We investigate whether our ability to recognize sound sources and spaces reflects an ability to separately infer how physical factors affect sound, and whether any such separation is enabled by statistical regularities of real-world sounds and real-world reverberation. To first determine whether such statistical regularities exist, we measured impulse responses (IRs) of both solid objects and environmental spaces sampled from the distribution encountered by humans during daily life. Both the objects and the sampled spaces were diverse, but their IRs were tightly constrained, exhibiting exponential decay at frequency-dependent rates. Object IRs showed sharp spectral peaks due to strong resonances and environmental IRs showed broad frequency variation: mid frequencies reverberated longest while higher and lower frequencies decayed more rapidly, presumably due to absorptive properties of materials and air. To test whether humans utilize these regularities to separate reverberation from sources, we manipulated environmental IR characteristics in simulated reverberant audio. Listeners could discriminate sound sources and environments from these signals, but we found that their abilities degraded when reverberation characteristics deviated from those of real-world environments. Subjectively, atypical IRs were mistaken for sound sources. The results suggest the brain separates sound into contributions from the source and the environment, constrained by a prior on natural reverberation. This separation process may contribute to robust recognition while providing information about spaces around us.},
  author   = {Traer, James and McDermott, Josh H.}
}

@article{2617,
  title    = {Statistics of natural reverberation enable perceptual separation of sound and space},
  journal  = {Proceedings of the National Academy of Sciences},
  volume   = {113},
  year     = {2016},
  month    = sep,
  pages    = {E7856--E7865},
  abstract = {In everyday listening, sound reaches our ears directly from a source as well as indirectly via reflections known as reverberation. Reverberation profoundly distorts the sound from a source, yet humans can both identify sound sources and distinguish environments from the resulting sound, via mechanisms that remain unclear. The core computational challenge is that the acoustic signatures of the source and environment are combined in a single signal received by the ear. Here we ask whether our recognition of sound sources and spaces reflects an ability to separate their effects and whether any such separation is enabled by statistical regularities of real-world reverberation. To first determine whether such statistical regularities exist, we measured impulse responses (IRs) of 271 spaces sampled from the distribution encountered by humans during daily life. The sampled spaces were diverse, but their IRs were tightly constrained, exhibiting exponential decay at frequency-dependent rates: Mid frequencies reverberated longest whereas higher and lower frequencies decayed more rapidly, presumably due to absorptive properties of materials and air. To test whether humans leverage these regularities, we manipulated IR decay characteristics in simulated reverberant audio. Listeners could discriminate sound sources and environments from these signals, but their abilities degraded when reverberation characteristics deviated from those of real-world environments. Subjectively, atypical IRs were mistaken for sound sources. The results suggest the brain separates sound into contributions from the source and the environment, constrained by a prior on natural reverberation. This separation process may contribute to robust recognition while providing information about spaces around us.},
  keywords = {auditory scene analysis, environmental acoustics, natural scene statistics, psychoacoustics, psychophysics},
  issn     = {0027-8424},
  doi      = {10.1073/pnas.1612524113},
  url      = {http://www.pnas.org/lookup/doi/10.1073/pnas.1612524113},
  author   = {Traer, James and McDermott, Josh H.}
}