@article{5316,
  title = {NOPA: Neurally-guided Online Probabilistic Assistance for Building Socially Intelligent Home Assistants},
  journal = {arXiv},
  year = {2023},
  month = {01/2023},
  abstract = {

In this work, we study how to build socially intelligent robots to assist people in their homes. In particular, we focus on assistance with online goal inference, where robots must simultaneously infer humans{\textquoteright} goals and how to help them achieve those goals. Prior assistance methods either lack the adaptivity to adjust helping strategies (i.e., when and how to help) in response to uncertainty about goals or the scalability to conduct fast inference in a large goal space. Our NOPA (Neurally-guided Online Probabilistic Assistance) method addresses both of these challenges. NOPA consists of (1) an online goal inference module combining neural goal proposals with inverse planning and particle filtering for robust inference under uncertainty, and (2) a helping planner that discovers valuable subgoals to help with and is aware of the uncertainty in goal inference. We compare NOPA against multiple baselines in a new embodied AI assistance challenge: Online Watch-And-Help, in which a helper agent needs to simultaneously watch a main agent{\textquoteright}s action, infer its goal, and help perform a common household task faster in realistic virtual home environments. Experiments show that our helper agent robustly updates its goal inference and adapts its helping plans to the changing level of uncertainty.

  },
  url = {https://arxiv.org/abs/2301.05223},
  author = {Puig, Xavier and Shu, Tianmin and Tenenbaum, Joshua B. and Torralba, Antonio}
}

@inproceedings{2618,
  title = {Ambient Sound Provides Supervision for Visual Learning},
  booktitle = {Computer Vision {\textendash} ECCV 2016},
  series = {Lecture Notes in Computer Science},
  year = {2016},
  month = {10/2016},
  pages = {801--816},
  address = {Cham},
  abstract = {

The sound of crashing waves, the roar of fast-moving cars {\textendash} sound conveys important information about the objects in our surroundings. In this work, we show that ambient sounds can be used as a supervisory signal for learning visual models. To demonstrate this, we train a convolutional neural network to predict a statistical summary of the sound associated with a video frame. We show that, through this process, the network learns a representation that conveys information about objects and scenes. We evaluate this representation on several recognition tasks, finding that its performance is comparable to that of other state-of-the-art unsupervised learning methods. Finally, we show through visualizations that the network learns units that are selective to objects that are often associated with characteristic sounds.

  },
  keywords = {convolutional networks, Sound, unsupervised learning},
  isbn = {978-3-319-46447-3},
  issn = {0302-9743},
  doi = {10.1007/978-3-319-46448-0_48},
  url = {http://link.springer.com/10.1007/978-3-319-46448-0},
  author = {Owens, Andrew and Isola, P. and McDermott, Josh H. and Freeman, William T. and Torralba, Antonio}
}

@conference{2747,
  title = {Visually indicated sounds},
  booktitle = {Conference on Computer Vision and Pattern Recognition},
  year = {2016},
  month = {06/2016},
  author = {Owens, Andrew and Isola, P. and McDermott, Josh H. and Torralba, Antonio and Adelson, Edward H. and Freeman, William T.}
}