% Conference paper published in the ECCV 2016 proceedings (Springer LNCS series).
% The original export fused series, proceedings title and paper title into
% `title`, and fused the volume DOI with the chapter DOI; they are split here.
@inproceedings{2618,
  author    = {Owens, Andrew and Isola, P. and McDermott, Josh H. and Freeman, William T. and Torralba, Antonio},
  title     = {Ambient Sound Provides Supervision for Visual Learning},
  booktitle = {Computer Vision -- {ECCV} 2016},
  series    = {Lecture Notes in Computer Science},
  pages     = {801--816},
  publisher = {Springer},
  address   = {Cham},
  year      = {2016},
  month     = oct,
  isbn      = {978-3-319-46447-3},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-319-46448-0_48},
  url       = {http://link.springer.com/10.1007/978-3-319-46448-0_48},
  keywords  = {convolutional networks, Sound, unsupervised learning},
  abstract  = {The sound of crashing waves, the roar of fast-moving cars -- sound conveys important information about the objects in our surroundings. In this work, we show that ambient sounds can be used as a supervisory signal for learning visual models. To demonstrate this, we train a convolutional neural network to predict a statistical summary of the sound associated with a video frame. We show that, through this process, the network learns a representation that conveys information about objects and scenes. We evaluate this representation on several recognition tasks, finding that its performance is comparable to that of other state-of-the-art unsupervised learning methods. Finally, we show through visualizations that the network learns units that are selective to objects that are often associated with characteristic sounds.},
}