@conference {3575, title = {Generative modeling of audible shapes for object perception}, booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

Humans infer rich knowledge of objects from both auditory and visual cues. Building a machine with such competency, however, is very challenging due to the great difficulty of capturing large-scale, clean data of objects with both their appearance and the sound they make. In this paper, we present a novel, open-source pipeline that generates audio-visual data purely from 3D object shapes and their physical properties. Through comparison with audio recordings and human behavioral studies, we validate the accuracy of the sounds it generates. Using this generative model, we construct a synthetic audio-visual dataset, Sound-20K, for object perception tasks. We demonstrate that auditory and visual information play complementary roles in object perception, and further, that the representation learned on synthetic audio-visual data can transfer to real-world scenarios.

}, url = {http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html}, author = {Zhoutong Zhang and Jiajun Wu and Qiujia Li and Zhengjia Huang and James Traer and Josh H. McDermott and Joshua B. Tenenbaum and William T. Freeman} } @proceedings {3240, title = {Learning to See Physics via Visual De-animation}, year = {2017}, month = {12/2017}, pages = {152{\textendash}163}, abstract = {
We introduce a paradigm for understanding physical scenes without human annotations. At the core of our system is a physical world representation that is first recovered by a perception module and then utilized by physics and graphics engines. During training, the perception module and the generative models learn by visual de-animation {\textemdash} interpreting and reconstructing the visual information stream. During testing, the system first recovers the physical world state, and then uses the generative models for reasoning and future prediction. Even more so than forward simulation, inverting a physics or graphics engine is a computationally hard problem; we overcome this challenge by using a convolutional inversion network. Our system quickly recognizes the physical world state from appearance and motion cues, and has the flexibility to incorporate both differentiable and non-differentiable physics and graphics engines. We evaluate our system on both synthetic and real datasets involving multiple physical scenes, and demonstrate that our system performs well on both physical state estimation and reasoning problems. We further show that the knowledge learned on the synthetic dataset generalizes to constrained real images.
}, url = {http://papers.nips.cc/paper/6620-learning-to-see-physics-via-visual-de-animation.pdf}, author = {Jiajun Wu and Lu, Erika and Kohli, Pushmeet and William T. Freeman and Joshua B. Tenenbaum}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @proceedings {3241, title = {MarrNet: 3D Shape Reconstruction via 2.5D Sketches}, year = {2017}, month = {12/2017}, pages = {540{\textendash}550}, publisher = {Curran Associates, Inc.}, address = {Long Beach, CA}, abstract = {

3D object reconstruction from a single image is a highly under-determined problem, requiring strong prior knowledge of plausible 3D shapes. This poses a challenge for learning-based approaches, as 3D object annotations in real images are scarce. Previous work chose to train on synthetic data with ground truth 3D information, but suffered from the domain adaptation issue when tested on real data. In this work, we propose an end-to-end trainable framework that sequentially estimates 2.5D sketches and 3D object shapes. Our disentangled, two-step formulation has three advantages. First, compared to full 3D shapes, 2.5D sketches are much easier to recover from a 2D image and to transfer from synthetic to real data. Second, for 3D reconstruction from the 2.5D sketches, we can easily transfer the model learned on synthetic data to real images, as rendered 2.5D sketches are invariant to object appearance variations in real images, including lighting, texture, etc. This further alleviates the domain adaptation problem. Third, we derive differentiable projective functions from 3D shape to 2.5D sketches, making the framework end-to-end trainable on real images and requiring no real-image annotations. Our framework achieves state-of-the-art performance on 3D shape reconstruction.

}, url = {http://papers.nips.cc/paper/6657-marrnet-3d-shape-reconstruction-via-25d-sketches.pdf}, author = {Jiajun Wu and Wang, Yifan and Xue, Tianfan and Sun, Xingyuan and William T. Freeman and Joshua B. Tenenbaum}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @proceedings {3242, title = {Shape and Material from Sound}, year = {2017}, month = {12/2017}, pages = {1278{\textendash}1288}, address = {Long Beach, CA}, abstract = {

What can we infer from hearing an object falling onto the ground? Based on knowledge of the physical world, humans are able to infer rich information from such limited data: the rough shape of the object, its material, the height from which it fell, etc. In this paper, we aim to approximate such competency. We first mimic human knowledge of the physical world using a fast physics-based generative model. Then, we present an analysis-by-synthesis approach to infer properties of the falling object. We further approximate human past experience by directly mapping audio to object properties using deep learning with self-supervision. We evaluate our method through behavioral studies, comparing human predictions with ours on inferring object shape, material, and initial falling height. Results show that our method achieves near-human performance without any annotations.

}, url = {http://papers.nips.cc/paper/6727-shape-and-material-from-sound.pdf}, author = {Zhoutong Zhang and Qiujia Li and Zhengjia Huang and Jiajun Wu and Joshua B. Tenenbaum and William T. Freeman}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @proceedings {2618, title = {Ambient Sound Provides Supervision for Visual Learning}, booktitle = {Computer Vision {\textendash} ECCV 2016}, series = {Lecture Notes in Computer Science}, year = {2016}, month = {10/2016}, pages = {801{\textendash}816}, address = {Cham}, abstract = {

The sound of crashing waves, the roar of fast-moving cars {\textendash} sound conveys important information about the objects in our surroundings. In this work, we show that ambient sounds can be used as a supervisory signal for learning visual models. To demonstrate this, we train a convolutional neural network to predict a statistical summary of the sound associated with a video frame. We show that, through this process, the network learns a representation that conveys information about objects and scenes. We evaluate this representation on several recognition tasks, finding that its performance is comparable to that of other state-of-the-art unsupervised learning methods. Finally, we show through visualizations that the network learns units that are selective to objects that are often associated with characteristic sounds.

}, keywords = {convolutional networks, sound, unsupervised learning}, isbn = {978-3-319-46447-3}, issn = {0302-9743}, doi = {10.1007/978-3-319-46448-0_48}, url = {http://link.springer.com/10.1007/978-3-319-46448-0}, author = {Owens, Andrew and Isola, P. and Josh H. McDermott and William T. Freeman and Torralba, Antonio} } @conference {2747, title = {Visually indicated sounds}, booktitle = {Conference on Computer Vision and Pattern Recognition}, year = {2016}, month = {06/2016}, author = {Owens, Andrew and Isola, P. and Josh H. McDermott and Torralba, Antonio and Adelson, Edward H. and William T. Freeman} } @conference {1825, title = {Galileo: Perceiving physical object properties by integrating a physics engine with deep learning}, booktitle = {NIPS 2015}, year = {2015}, address = {Montr{\'e}al, Canada}, abstract = {
Humans demonstrate remarkable abilities to predict physical events in dynamic scenes, and to infer the physical properties of objects from static images. We propose a generative model for solving these problems of physical scene understanding from real-world videos and images. At the core of our generative model is a 3D physics engine, operating on an object-based representation of physical properties, including mass, position, 3D shape, and friction. We can infer these latent properties using relatively brief runs of MCMC, which drive simulations in the physics engine to fit key features of visual observations. We further explore directly mapping visual inputs to physical properties, inverting a part of the generative process using deep learning. We name our model Galileo, and evaluate it on a video dataset with simple yet physically rich scenarios. Results show that Galileo is able to infer the physical properties of objects and predict the outcome of a variety of physical events, with an accuracy comparable to human subjects. Our study points towards an account of human vision with generative physical knowledge at its core, and various recognition models as helpers leading to efficient inference.
}, url = {https://papers.nips.cc/paper/5780-galileo-perceiving-physical-object-properties-by-integrating-a-physics-engine-with-deep-learning}, author = {Jiajun Wu and Ilker Yildirim and Joseph J. Lim and William T. Freeman and Joshua B. Tenenbaum} }