@article {5316, title = {NOPA: Neurally-guided Online Probabilistic Assistance for Building Socially Intelligent Home Assistants}, journal = {arXiv}, year = {2023}, month = {01/2023}, abstract = {

In this work, we study how to build socially intelligent robots to assist people in their homes. In particular, we focus on assistance with online goal inference, where robots must simultaneously infer humans{\textquoteright} goals and how to help them achieve those goals. Prior assistance methods either lack the adaptivity to adjust helping strategies (i.e., when and how to help) in response to uncertainty about goals or the scalability to conduct fast inference in a large goal space. Our NOPA (Neurally-guided Online Probabilistic Assistance) method addresses both of these challenges. NOPA consists of (1) an online goal inference module combining neural goal proposals with inverse planning and particle filtering for robust inference under uncertainty, and (2) a helping planner that discovers valuable subgoals to help with and is aware of the uncertainty in goal inference. We compare NOPA against multiple baselines in a new embodied AI assistance challenge: Online Watch-And-Help, in which a helper agent needs to simultaneously watch a main agent{\textquoteright}s action, infer its goal, and help perform a common household task faster in realistic virtual home environments. Experiments show that our helper agent robustly updates its goal inference and adapts its helping plans to the changing level of uncertainty.

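To make the inference scheme concrete, here is a minimal sketch of how neural proposals, inverse planning, and particle filtering might combine for online goal inference. It is an illustration only, not the NOPA implementation; `propose_goals` (the neural proposal network) and `action_likelihood` (the inverse-planning likelihood) are assumed stand-ins.

```python
# A minimal sketch (not the NOPA implementation) of neurally-guided particle filtering
# for online goal inference: neural proposals seed goal hypotheses, inverse planning
# scores how well each hypothesis explains the observed actions, and resampling keeps
# the filter focused. `propose_goals` and `action_likelihood` are assumed stand-ins.
import random

def update_goal_particles(particles, weights, observation, actions,
                          propose_goals, action_likelihood, n_proposals=5):
    # Inject fresh neural proposals so inference can recover if all hypotheses are poor.
    for goal in propose_goals(observation, n_proposals):
        particles.append(goal)
        weights.append(1.0 / n_proposals)
    # Reweight every goal hypothesis by how well a rational planner pursuing it
    # would explain the main agent's observed actions (inverse planning).
    weights = [w * action_likelihood(goal, actions) for goal, w in zip(particles, weights)]
    total = sum(weights) or 1.0
    weights = [w / total for w in weights]
    # Resample to concentrate particles on plausible goals.
    particles = random.choices(particles, weights=weights, k=len(particles))
    return particles, [1.0 / len(particles)] * len(particles)
```

An uncertainty-aware helper can then favor subgoals that are valuable under many of the surviving particles, rather than committing to a single inferred goal.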
}, url = {https://arxiv.org/abs/2301.05223}, author = {Xavier Puig and Tianmin Shu and Joshua B. Tenenbaum and Torralba, Antonio} } @conference {5322, title = {Zero-shot linear combinations of grounded social interactions with Linear Social MDPs}, booktitle = {Proceedings of the 37th AAAI Conference on Artificial Intelligence (AAAI)}, year = {2023}, month = {02/2023}, abstract = {

Humans and animals engage in rich social interactions. It is often theorized that a relatively small number of basic social interactions give rise to the full range of behavior observed. But no computational theory explaining how social interactions combine together has been proposed before. We do so here. We take a model, the Social MDP, which is able to express a range of social interactions, and extend it to represent linear combinations of social interactions. Practically, for robotics applications, such models can now express not just that an agent should help another agent, but also goal-centric social interactions. Perhaps an agent is helping someone get dressed, but preventing them from falling, and is happy to exchange stories in the meantime. How an agent responds socially should depend on what it thinks the other agent is doing at that point in time. To encode this notion, we take linear combinations of social interactions as defined in Social MDPs, and compute the weights on those combinations on the fly depending on the estimated goals of other agents. This new model, the Linear Social MDP, enables zero-shot reasoning about complex social interactions, provides a mathematical basis for the long-standing intuition that social interactions should compose, and leads to interesting new behaviors that we validate using human observers. Complex social interactions are part of the future of intelligent agents, and having principled mathematical models built on a foundation like MDPs will make it possible to bring social interactions to every robotic application.

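As a rough illustration of weighting basic social interactions by beliefs about the other agent's goal, the sketch below combines interaction terms linearly; the interaction functions and weighting scheme are assumptions for illustration, not the paper's model.

```python
# Hedged sketch of a linearly-combined social reward in the spirit of Linear Social MDPs.
# The interaction functions and weighting scheme are illustrative assumptions.

def social_reward(own_reward, other_reward_estimate, goal_posterior, interactions):
    """
    own_reward: scalar task reward for this agent
    other_reward_estimate: estimated reward the other agent receives
    goal_posterior: hypothesized goal of the other agent -> P(goal | observations)
    interactions: goal -> function of the other's reward (e.g., help = +r, hinder = -r)
    """
    social_term = 0.0
    for goal, prob in goal_posterior.items():
        # Weight each basic social interaction by the current belief about the other's goal.
        social_term += prob * interactions[goal](other_reward_estimate)
    return own_reward + social_term

# Example: mostly help while the other agent seems to be getting dressed,
# but hinder the (unlikely) "fall" outcome.
interactions = {"get_dressed": lambda r: 0.8 * r, "fall": lambda r: -1.0 * r}
posterior = {"get_dressed": 0.9, "fall": 0.1}
print(social_reward(1.0, 2.0, posterior, interactions))
```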
}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @article {5050, title = {Incorporating Rich Social Interactions Into MDPs}, year = {2022}, abstract = {

Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other{\textquoteright}s hidden rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions zero-shot in new environments; like humans it can engage socially in novel ways even without a single example of that social interaction. Moreover, the judgments of these Social MDPs align closely with those of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics, MDPs.

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @article {5296, title = {Mental Jenga: A counterfactual simulation model of causal judgments about physical support}, journal = {PsyArXiv}, year = {2022}, month = {02/2022}, abstract = {

From building towers to picking an orange from a stack of fruit, assessing support is critical for successfully interacting with the physical world. But how do people determine whether one object supports another? In this paper, we develop the Counterfactual Simulation Model (CSM) of causal judgments about physical support. The CSM predicts that people judge physical support by mentally simulating what would happen to a scene if the object of interest were removed. Three experiments test the model by asking one group of participants to judge what would happen to a tower if one of the blocks were removed, and another group of participants how responsible that block was for the tower{\textquoteright}s stability. The CSM accurately captures participants{\textquoteright} predictions by running noisy simulations that incorporate different sources of uncertainty. Participants{\textquoteright} responsibility judgments are closely related to counterfactual predictions: a block is more responsible when many other blocks would fall if it were removed. By construing physical support as preventing from falling, the CSM provides a unified account of how causal judgments in dynamic and static physical scenes arise from the process of counterfactual simulation.

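A minimal sketch of the counterfactual computation described above, assuming a stand-in `simulate` function for a noisy physics engine: responsibility for stability is the average number of other blocks that fall when the block of interest is removed.

```python
# Hedged sketch of a counterfactual-simulation account of support judgments.
# `simulate` is an assumed stand-in for a noisy physics rollout, not the paper's code.

def responsibility(tower, block, simulate, n_samples=100, noise=0.01):
    """
    Judge how responsible `block` is for the tower's stability: the average number of
    other blocks that fall across noisy simulations in which `block` is removed.
    simulate(counterfactual_tower, noise) is assumed to return the set of fallen blocks.
    """
    counterfactual = [b for b in tower if b is not block]
    fallen_counts = []
    for _ in range(n_samples):
        fallen = simulate(counterfactual, noise)  # noisy rollout of the counterfactual scene
        fallen_counts.append(len(fallen))
    return sum(fallen_counts) / n_samples
```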
}, url = {https://psyarxiv.com/4a5uh}, author = {Liang Zhou and Kevin Smith and Joshua B. Tenenbaum and Tobias Gerstenberg} } @article {5065, title = { AGENT: A Benchmark for Core Psychological Reasoning}, year = {2021}, month = {07/2021}, author = {Tianmin Shu and Abhishek Bhandwaldar and Chuang Gan and Kevin A Smith and Shari Liu and Dan Gutfreund and Elizabeth S Spelke and Joshua B. Tenenbaum and Tomer D. Ullman} } @article {5077, title = {Meta-strategy learning in physical problem solving: the effect of embodied experience}, journal = {bioRxiv}, year = {2021}, month = {08/2021}, abstract = {

Embodied cognition suggests that our experience in our bodies -- including our motor experiences -- shapes our cognitive and perceptual capabilities broadly. Much work has studied how differences in the physical body (either natural or manipulated) can impact people's cognitive and perceptual capacities, but often these judgments relate directly to those body differences. Here we focus instead on how natural embodied experience affects what kinds of abstract physical problem-solving strategies people use in a virtual task. We compare how groups with different embodied experience -- children and adults with congenital limb differences versus those born with two hands -- perform on this task, and find that while there is no difference in overall accuracy or time to complete the task, the groups use different meta-strategies to come to solutions. Specifically, both children and adults born with limb differences take a longer time to think before acting, and as a result take fewer overall actions to reach solutions to physical reasoning problems. Conversely, the process of development affects the particular actions children use as they age regardless of how many hands they were born with, as well as their persistence with their current strategy. Taken together, our findings suggest that differences in embodied experience drive the acquisition of different meta-strategies for balancing acting with thinking, deciding what kinds of actions to try, and deciding how persistent to be with a current action plan.

}, author = {Kelsey Allen and Kevin A Smith and Laura Bird and Joshua B. Tenenbaum and Tamar Makin and Dorothy Cowie} } @article {5078, title = {Moral dynamics: Grounding moral judgment in intuitive physics and intuitive psychology}, journal = {Cognition}, volume = {217}, year = {2021}, month = {05/2021}, pages = {104890}, abstract = {

When holding others morally responsible, we care about what they did, and what they thought. Traditionally, research in moral psychology has relied on vignette studies, in which a protagonist{\textquoteright}s actions and thoughts are explicitly communicated. While this research has revealed what variables are important for moral judgment, such as actions and intentions, it is limited in providing a more detailed understanding of exactly how these variables affect moral judgment. Using dynamic visual stimuli that allow for a more fine-grained experimental control, recent studies have proposed a direct mapping from visual features to moral judgments. We embrace the use of visual stimuli in moral psychology, but question the plausibility of a feature-based theory of moral judgment. We propose that the connection from visual features to moral judgments is mediated by an inference about what the observed action reveals about the agent{\textquoteright}s mental states, and what causal role the agent{\textquoteright}s action played in bringing about the outcome. We present a computational model that formalizes moral judgments of agents in visual scenes as computations over an intuitive theory of physics combined with an intuitive theory of mind. We test the model{\textquoteright}s quantitative predictions in three experiments across a wide variety of dynamic interactions between agent and patient.

}, issn = {00100277}, doi = {10.1016/j.cognition.2021.104890}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027721003139}, author = {Sosa, Felix A. and Ullman, Tomer and Joshua B. Tenenbaum and Samuel J Gershman and Gerstenberg, Tobias} } @article {4969, title = {The neural architecture of language: Integrative modeling converges on predictive processing}, journal = {Proceedings of the National Academy of Sciences}, volume = {118}, year = {2021}, month = {11/2021}, pages = {e2105646118}, abstract = {

Significance

Language is a quintessentially human ability. Research has long probed the functional architecture of language in the mind and brain using diverse neuroimaging, behavioral, and computational modeling approaches. However, adequate neurally-mechanistic accounts of how meaning might be extracted from language are sorely lacking. Here, we report a first step toward addressing this gap by connecting recent artificial neural networks from machine learning to human recordings during language processing. We find that the most powerful models predict neural and behavioral responses across different datasets up to noise levels. Models that perform better at predicting the next word in a sequence also better predict brain measurements{\textemdash}providing computationally explicit evidence that predictive processing fundamentally shapes the language comprehension mechanisms in the brain.

Abstract

The neuroscience of perception has recently been revolutionized with an integrative modeling approach in which computation, brain function, and behavior are linked across many datasets and many computational models. By revealing trends across models, this approach yields novel insights into cognitive and neural mechanisms in the target domain. We here present a systematic study taking this approach to higher-level cognition: human language processing, our species{\textquoteright} signature cognitive skill. We find that the most powerful {\textquotedblleft}transformer{\textquotedblright} models predict nearly 100\% of explainable variance in neural responses to sentences and generalize across different datasets and imaging modalities (functional MRI and electrocorticography). Models{\textquoteright} neural fits ({\textquotedblleft}brain score{\textquotedblright}) and fits to behavioral responses are both strongly correlated with model accuracy on the next-word prediction task (but not other language tasks). Model architecture appears to substantially contribute to neural fit. These results provide computationally explicit evidence that predictive processing fundamentally shapes the language comprehension mechanisms in the human brain.

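A hedged sketch of the kind of cross-validated regression behind a "brain score"; the actual benchmark additionally normalizes by noise ceilings and differs in detail.

```python
# Illustrative sketch of scoring how well model features linearly predict neural responses.
# Regularization choices and noise-ceiling normalization are simplified assumptions here.
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

def brain_score(model_features, neural_responses, n_splits=5):
    """model_features: (n_sentences, n_units); neural_responses: (n_sentences, n_voxels)."""
    scores = []
    for train, test in KFold(n_splits=n_splits, shuffle=True, random_state=0).split(model_features):
        reg = RidgeCV(alphas=np.logspace(-3, 3, 7)).fit(model_features[train], neural_responses[train])
        pred = reg.predict(model_features[test])
        # Pearson correlation per voxel between predicted and measured responses.
        for v in range(neural_responses.shape[1]):
            scores.append(np.corrcoef(pred[:, v], neural_responses[test][:, v])[0, 1])
    return float(np.nanmean(scores))
```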
}, issn = {0027-8424}, doi = {10.1073/pnas.2105646118}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2105646118}, author = {Martin Schrimpf and Blank, Idan Asher and Tuckute, Greta and Kauf, Carina and Hosseini, Eghbal A. and Nancy Kanwisher and Joshua B. Tenenbaum and Fedorenko, Evelina} } @book {5076, title = {The Neural Basis of Mentalizing: Linking Models of Theory of Mind and Measures of Human Brain Activity}, year = {2021}, month = {05/2021}, pages = {209 - 235}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Cham}, isbn = {978-3-030-51889-9}, doi = {10.1007/978-3-030-51890-5_11}, url = {https://link.springer.com/10.1007/978-3-030-51890-5}, author = {Sean Dae Houlihan and Joshua B. Tenenbaum and Rebecca Saxe}, editor = {Gilead, Michael and Ochsner, Kevin N.} } @conference {4830, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {AAAI-21}, year = {2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5058, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, number = {123}, year = {2021}, month = {03/2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feedforward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5062, title = {Plans or Outcomes: How Do We Attribute Intelligence to Others?}, journal = {Cognitive Science}, volume = {45}, year = {2021}, month = {09/2021}, issn = {0364-0213}, doi = {10.1111/cogs.13041}, url = {https://onlinelibrary.wiley.com/toc/15516709/45/9}, author = {Marta Kryven and Ullman, Tomer D. and Cowan, William and Joshua B. Tenenbaum} } @conference {5063, title = {Temporal and Object Quantification Networks}, booktitle = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence (IJCAI-21)}, year = {2021}, month = {06/2021}, address = {Montreal, Canada}, abstract = {

We present Temporal and Object Quantification Networks (TOQ-Nets), a new class of neuro-symbolic networks with a structural bias that enables them to learn to recognize complex relational-temporal events. This is done by including reasoning layers that implement finite-domain quantification over objects and time. The structure allows them to generalize directly to input instances with varying numbers of objects in temporal sequences of varying lengths. We evaluate TOQ-Nets on input domains that require recognizing event-types in terms of complex temporal relational patterns. We demonstrate that TOQ-Nets can generalize from small amounts of data to scenarios containing more objects than were present during training and to temporal warpings of input sequences.

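One way to picture finite-domain quantification over objects and time is as min/max pooling over the object and time axes of a tensor of soft predicate values; the sketch below is an illustrative reading of that structural bias, not the TOQ-Net code.

```python
# Hedged sketch of quantification layers in the spirit of TOQ-Nets: "exists" and "forall"
# over objects or time become max/min pooling over those axes, so the same weights apply
# regardless of how many objects or time steps the input contains.
import torch

def exists_over_objects(x):   # x: (batch, time, objects, features), values in [0, 1]
    return x.max(dim=2).values          # -> (batch, time, features)

def forall_over_objects(x):
    return x.min(dim=2).values

def exists_over_time(x):      # x: (batch, time, features)
    return x.max(dim=1).values          # -> (batch, features)

# Example: a soft score for "at some time, every object satisfies the predicate".
scores = torch.rand(4, 10, 6, 1)        # per-object, per-time soft predicate values
event_score = exists_over_time(forall_over_objects(scores))
print(event_score.shape)                # torch.Size([4, 1])
```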
}, doi = {10.24963/ijcai.2021/386}, url = {https://www.ijcai.org/proceedings/2021}, author = {Mao, Jiayuan and Luo, Zhezheng and Gan, Chuang and Joshua B. Tenenbaum and Wu, Jiajun and Kaelbling, Leslie Pack and Ullman, Tomer D.}, editor = {Zhou, Zhi-Hua} } @conference {4820, title = {Unsupervised Discovery of 3D Physical Objects}, booktitle = {International Conference on Learning Representations}, year = {2021}, month = {07/2020}, abstract = {

We study the problem of unsupervised physical object discovery. Unlike existing frameworks that aim to learn to decompose scenes into 2D segments purely based on each object{\textquoteright}s appearance, we explore how physics, especially object interactions, facilitates learning to disentangle and segment instances from raw videos, and to infer the 3D geometry and position of each object, all without supervision. Drawing inspiration from developmental psychology, our Physical Object Discovery Network (POD-Net) uses both multi-scale pixel cues and physical motion cues to accurately segment observable and partially occluded objects of varying sizes, and infer properties of those objects. Our model reliably segments objects on both synthetic and real scenes. The discovered object properties can also be used to reason about physical events.

}, url = {https://openreview.net/forum?id=lf7st0bJIA5}, author = {Yilun Du and Kevin A Smith and Tomer Ullman and Joshua B. Tenenbaum and Jiajun Wu} } @article {4955, title = {Vector-based pedestrian navigation in cities}, journal = {Nature Computational Science}, volume = {1}, year = {2021}, month = {10/2021}, pages = {678 - 685}, abstract = {

How do pedestrians choose their paths within city street networks? Researchers have tried to shed light on this matter through strictly controlled experiments, but an ultimate answer based on real-world mobility data is still lacking. Here, we analyze salient features of human path planning through a statistical analysis of a massive dataset of GPS traces, which reveals that (1) people increasingly deviate from the shortest path when the distance between origin and destination increases and (2) chosen paths are statistically different when origin and destination are swapped. We posit that direction to goal is a main driver of path planning and develop a vector-based navigation model; the resulting trajectories, which we have termed pointiest paths, are a statistically better predictor of human paths than a model based on minimizing distance with stochastic effects. Our findings generalize across two major US cities with different street networks, hinting at the fact that vector-based navigation might be a universal property of human path planning.

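A deterministic caricature of vector-based navigation, assuming a simple adjacency-list street graph: at each intersection, take the street whose bearing deviates least from the straight-line direction to the destination. The paper's model is statistical; this sketch only illustrates the core rule.

```python
# Illustrative "pointiest path" sketch. The graph interface is an assumption, not the
# paper's code, and the real model includes stochastic components omitted here.
import math

def bearing(p, q):
    return math.atan2(q[1] - p[1], q[0] - p[0])

def pointiest_path(graph, pos, start, goal, max_steps=1000):
    """graph: node -> list of neighbor nodes; pos: node -> (x, y) coordinates."""
    path, current = [start], start
    for _ in range(max_steps):
        if current == goal:
            return path
        target = bearing(pos[current], pos[goal])
        def deviation(n):
            d = bearing(pos[current], pos[n]) - target
            return abs(math.atan2(math.sin(d), math.cos(d)))  # wrap angle to [-pi, pi]
        current = min(graph[current], key=deviation)          # most goal-pointing street
        path.append(current)
    return path
```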
}, doi = {10.1038/s43588-021-00130-y}, url = {https://www.nature.com/articles/s43588-021-00130-y}, author = {Bongiorno, Christian and Zhou, Yulun and Marta Kryven and Theurel, David and Rizzo, Alessandro and Santi, Paolo and Joshua B. Tenenbaum and Ratti, Carlo} } @article {4802, title = {Bayesian Models of Conceptual Development: Learning as Building Models of the World}, journal = {Annual Review of Developmental Psychology}, volume = {2}, year = {2020}, month = {12/2021}, pages = {533 - 558}, issn = {2640-7922}, doi = {10.1146/annurev-devpsych-121318-084833}, url = {https://www.annualreviews.org/doi/10.1146/annurev-devpsych-121318-084833}, author = {Ullman, Tomer D. and Joshua B. Tenenbaum} } @article {4448, title = {Efficient inverse graphics in biological face processing}, journal = {Science Advances}, volume = {6}, year = {2020}, month = {03/2020}, pages = {eaax5979}, abstract = {

Vision not only detects and recognizes objects, but performs rich inferences about the underlying scene structure that causes the patterns of light we see. Inverting generative models, or {\textquotedblleft}analysis-by-synthesis{\textquotedblright}, presents a possible solution, but its mechanistic implementations have typically been too slow for online perception, and their mapping to neural circuits remains unclear. Here we present a neurally plausible efficient inverse graphics model and test it in the domain of face recognition. The model is based on a deep neural network that learns to invert a three-dimensional face graphics program in a single fast feedforward pass. It explains human behavior qualitatively and quantitatively, including the classic {\textquotedblleft}hollow face{\textquotedblright} illusion, and it maps directly onto a specialized face-processing circuit in the primate brain. The model fits both behavioral and neural data better than state-of-the-art computer vision models, and suggests an interpretable reverse-engineering account of how the brain transforms images into percepts.

}, doi = {10.1126/sciadv.aax5979}, url = {https://advances.sciencemag.org/lookup/doi/10.1126/sciadv.aax5979}, author = {Ilker Yildirim and Mario Belledonne and W. A. Freiwald and Joshua B. Tenenbaum} } @conference {4817, title = {The fine structure of surprise in intuitive physics: when, why, and how much?}, booktitle = {Proceedings of the 42th Annual Meeting of the Cognitive Science Society - Developing a Mind: Learning in Humans, Animals, and Machines, CogSci 2020, virtual, July 29 - August 1, 2020}, year = {2020}, url = {https://cogsci.mindmodeling.org/2020/papers/0761/index.html}, author = {Kevin A Smith and Lingjie Mei and Shunyu Yao and Jiajun Wu and Elizabeth S Spelke and Joshua B. Tenenbaum and Tomer D. Ullman}, editor = {Stephanie Denison and Michael Mack and Yang Xu and Blair C. Armstrong} } @conference {4702, title = {Learning abstract structure for drawing by efficient motor program induction}, booktitle = {Advances in Neural Information Processing Systems 33 pre-proceedings (NeurIPS 2020)}, year = {2020}, month = {12/2020}, abstract = {

Humans flexibly solve new problems that differ from those previously practiced. This ability to flexibly generalize is supported by learned concepts that represent useful structure common across different problems. Here we develop a naturalistic drawing task to study how humans rapidly acquire structured prior knowledge. The task requires drawing visual figures that share underlying structure, based on a set of composable geometric rules and simple objects. We show that people spontaneously learn abstract drawing procedures that support generalization, and propose a model of how learners can discover these reusable drawing procedures. Trained in the same setting as humans, and constrained to produce efficient motor actions, this model discovers new drawing program subroutines that generalize to test figures and resemble learned features of human behavior. These results suggest that two principles guiding motor program induction in the model - abstraction (programs can reflect high-level structure that ignores figure-specific details) and compositionality (new programs are discovered by recombining previously learned programs) - are key for explaining how humans learn structured internal representations that guide flexible reasoning and learning.

}, url = {https://papers.nips.cc/paper/2020/hash/1c104b9c0accfca52ef21728eaf01453-Abstract.html}, author = {Lucas Tian and Kevin Ellis and Marta Kryven and Joshua B. Tenenbaum} } @conference {4695, title = {Learning Compositional Rules via Neural Program Synthesis}, booktitle = {Advances in Neural Information Processing Systems 33 pre-proceedings (NeurIPS 2020)}, year = {2020}, month = {12/2020}, abstract = {

Many aspects of human reasoning, including language, require learning rules from very little data. Humans can do this, often learning systematic rules from very few examples, and combining these rules to form compositional rule-based systems. Current neural architectures, on the other hand, often fail to generalize in a compositional manner, especially when evaluated in ways that vary systematically from training. In this work, we present a neuro-symbolic model which learns entire rule systems from a small set of examples. Instead of directly predicting outputs from inputs, we train our model to induce the explicit system of rules governing a set of previously seen examples, drawing upon techniques from the neural program synthesis literature. Our rule-synthesis approach outperforms neural meta-learning techniques in three domains: an artificial instruction-learning domain used to evaluate human learning, the SCAN challenge datasets, and learning rule-based translations of number words into integers for a wide range of human languages.

}, url = {https://proceedings.neurips.cc/paper/2020/hash/7a685d9edd95508471a9d3d6fcace432-Abstract.html}, author = {Maxwell Nye and Armando Solar-Lezama and Joshua B. Tenenbaum and Brenden M Lake} } @article {4655, title = {The logic of universalization guides moral judgment}, journal = {Proceedings of the National Academy of Sciences (PNAS)}, year = {2020}, month = {Feb-10-2020}, pages = {202014505}, abstract = {

To explain why an action is wrong, we sometimes say, {\textquotedblleft}What if everybody did that?{\textquotedblright} In other words, even if a single person{\textquoteright}s behavior is harmless, that behavior may be wrong if it would be harmful once universalized. We formalize the process of universalization in a computational model, test its quantitative predictions in studies of human moral judgment, and distinguish it from alternative models. We show that adults spontaneously make moral judgments consistent with the logic of universalization, and report comparable patterns of judgment in children. We conclude that, alongside other well-characterized mechanisms of moral judgment, such as outcome-based and rule-based thinking, the logic of universalizing holds an important place in our moral minds.

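One hedged way to write the core of the universalization computation (our notation, not necessarily the paper's):

```latex
% A sketch of universalization: an action a is evaluated by imagining the hypothetical
% world in which all N agents interested in doing a actually do it, and comparing the
% utility of that world to the status quo. Notation is ours, for illustration only.
\[
  \mathrm{Acceptability}(a) \;\propto\;
  U\big(\text{world} \mid \text{all } N \text{ interested agents do } a\big)
  \;-\;
  U\big(\text{world} \mid \text{no one does } a\big)
\]
% An individually harmless action is thus judged wrong when its universalized
% consequences carry negative utility.
```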
}, issn = {0027-8424}, doi = {10.1073/pnas.2014505117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2014505117}, author = {Levine, Sydney and Max Kleiman-Weiner and Laura Schulz and Joshua B. Tenenbaum and Fiery A Cushman} } @article {4814, title = {Online Developmental Science to Foster Innovation, Access, and Impact}, journal = {Trends in Cognitive Sciences}, volume = {24}, year = {2020}, month = {09/2020}, pages = {675 - 678}, issn = {13646613}, doi = {10.1016/j.tics.2020.06.004}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661320301455}, author = {Sheskin, Mark and Scott, Kimberly and Mills, Candice M. and Bergelson, Elika and Bonawitz, Elizabeth and Elizabeth S Spelke and Fei-Fei, Li and Keil, Frank C. and Gweon, Hyowon and Joshua B. Tenenbaum and Julian Jara-Ettinger and Adolph, Karen E. and Rhodes, Marjorie and Frank, Michael C. and Mehr, Samuel A. and Laura Schulz} } @conference {4700, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) workshop at NeurIPS 2020}, year = {2020}, month = {12/2020}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, url = {https://openreview.net/forum?id=_bokm801zhx}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {4683, title = {Rapid trial-and-error learning with simulation supports flexible tool use and physical reasoning}, journal = {Proceedings of the National Academy of Sciences}, year = {2020}, month = {11/2021}, pages = {201912341}, abstract = {

Many animals, and an increasing number of artificial agents, display sophisticated capabilities to perceive and manipulate objects. But human beings remain distinctive in their capacity for flexible, creative tool use{\textemdash}using objects in new ways to act on the world, achieve a goal, or solve a problem. To study this type of general physical problem solving, we introduce the Virtual Tools game. In this game, people solve a large range of challenging physical puzzles in just a handful of attempts. We propose that the flexibility of human physical problem solving rests on an ability to imagine the effects of hypothesized actions, while the efficiency of human search arises from rich action priors which are updated via observations of the world. We instantiate these components in the {\textquotedblleft}sample, simulate, update{\textquotedblright} (SSUP) model and show that it captures human performance across 30 levels of the Virtual Tools game. More broadly, this model provides a mechanism for explaining how people condense general physical knowledge into actionable, task-specific plans to achieve flexible and efficient physical problem solving.

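A minimal sketch of a sample-simulate-update loop, with `sample_action`, `simulate`, and `act` as assumed stand-ins for the object-based action prior, the mental simulator, and acting in the world; the real SSUP model's update step is richer than this outline.

```python
# Hedged sketch of "sample, simulate, update" (SSUP)-style problem solving.
# The prior, simulator, and update rule here are illustrative assumptions.

def ssup_solve(sample_action, simulate, act, n_attempts=10, sim_budget=5, threshold=0.8):
    """
    sample_action(): draw a candidate action from an object-based prior
    simulate(action): imagined probability that the action solves the puzzle (0..1)
    act(action): execute in the world; returns True if the puzzle is solved
    """
    for _ in range(n_attempts):
        # Think: imagine a handful of candidate actions before moving.
        candidates = [sample_action() for _ in range(sim_budget)]
        best = max(candidates, key=simulate)
        if simulate(best) < threshold:
            continue                     # nothing looks promising yet; keep thinking
        if act(best):                    # act: try the most promising action for real
            return best
        # A fuller model would update the action prior away from this failure here.
    return None
```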
}, keywords = {intuitive physics, physical problem solving, tool use}, issn = {0027-8424}, doi = {10.1073/pnas.1912341117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1912341117}, author = {Kelsey Allen and Kevin A Smith and Joshua B. Tenenbaum} } @article {4501, title = {A theory of learning to infer.}, journal = {Psychological Review}, volume = {127}, year = {2020}, month = {04/2020}, pages = {412 - 441}, abstract = {

Bayesian theories of cognition assume that people can integrate probabilities rationally. However, several empirical findings contradict this proposition: human probabilistic inferences are prone to systematic deviations from optimality. Puzzlingly, these deviations sometimes go in opposite directions. Whereas some studies suggest that people underreact to prior probabilities (base rate neglect), other studies find that people underreact to the likelihood of the data (conservatism). We argue that these deviations arise because the human brain does not rely solely on a general-purpose mechanism for approximating Bayesian inference that is invariant across queries. Instead, the brain is equipped with a recognition model that maps queries to probability distributions. The parameters of this recognition model are optimized to get the output as close as possible, on average, to the true posterior. Because of our limited computational resources, the recognition model will allocate its resources so as to be more accurate for high probability queries than for low probability queries. By adapting to the query distribution, the recognition model learns to infer. We show that this theory can explain why and when people underreact to the data or the prior, and a new experiment demonstrates that these two forms of underreaction can be systematically controlled by manipulating the query distribution. The theory also explains a range of related phenomena: memory effects, belief bias, and the structure of response variability in probabilistic reasoning. We also discuss how the theory can be integrated with prior sampling-based accounts of approximate inference.

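A toy sketch of the amortization idea, assuming a simple Bernoulli inference task: a small recognition network is trained to map queries (a prior and a likelihood ratio) to posteriors, with training queries drawn from a particular query distribution.

```python
# Hedged sketch of "learning to infer": a recognition network approximates Bayesian
# posteriors on average over the query distribution. Architecture and task are
# illustrative assumptions, not the paper's model.
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 1), nn.Sigmoid())
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

def true_posterior(prior, likelihood_ratio):
    odds = (prior / (1 - prior)) * likelihood_ratio
    return odds / (1 + odds)

for step in range(2000):
    # The query distribution: which (prior, evidence) pairs the learner tends to face.
    prior = torch.rand(64, 1) * 0.98 + 0.01
    lr_ratio = torch.exp(torch.randn(64, 1))
    target = true_posterior(prior, lr_ratio)
    pred = net(torch.cat([prior, lr_ratio], dim=1))
    loss = ((pred - target) ** 2).mean()   # be close to the true posterior on average
    opt.zero_grad()
    loss.backward()
    opt.step()

# Because accuracy is optimized on average over queries, the network ends up better
# calibrated for common queries than rare ones -- the theory's proposed source of
# base-rate neglect and conservatism patterns.
```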
}, issn = {0033-295X}, doi = {10.1037/rev0000178}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/rev0000178}, author = {Ishita Dasgupta and Eric Schulz and Joshua B. Tenenbaum and Samuel J Gershman} } @article {4632, title = {ThreeDWorld: A Platform for Interactive Multi-Modal Physical Simulation}, journal = {arXiv}, year = {2020}, month = {07/2020}, type = {Preprint}, abstract = {

We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) realtime near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments; 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquid, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.

}, url = {https://arxiv.org/abs/2007.04954}, author = {Chuang Gan and Jeremy Schwartz and Seth Alter and Martin Schrimpf and James Traer and Julian De Freitas and Jonas Kubilius and Abhishek Bhandwaldar and Nick Haber and Megumi Sano and Kuno Kim and Elias Wang and Damian Mrowca and Michael Lingelbach and Aidan Curtis and Kevin Feigleis and Daniel Bear and Dan Gutfreund and David Cox and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins} } @article {4633, title = {ThreeDWorld (TDW): A High-Fidelity, Multi-Modal Platform for Interactive Physical Simulation}, year = {2020}, month = {07/2020}, abstract = {

TDW is a 3D virtual world simulation platform, utilizing state-of-the-art video game engine technology.

A TDW simulation consists of two components: a) the Build, a compiled executable running on the Unity3D Engine, which is responsible for image rendering, audio synthesis and physics simulations; and b) the Controller, an external Python interface to communicate with the build.

Researchers write Controllers that send commands to the Build, which executes those commands and returns a broad range of data types representing the state of the virtual world.

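The Controller/Build loop can be pictured with a toy client like the one below; this is an illustrative sketch only, and its class, method, and command names are assumptions rather than the actual TDW API.

```python
# Toy illustration of the Controller/Build pattern described above: a Python controller
# sends a batch of JSON commands to the build process and reads back serialized world
# state. Names like ToyController and send_commands are hypothetical, not the TDW API.
import json
import socket

class ToyController:
    def __init__(self, host="localhost", port=1071):
        self.sock = socket.create_connection((host, port))

    def send_commands(self, commands):
        """Send a batch of commands; the build executes them and replies with output data."""
        self.sock.sendall(json.dumps(commands).encode() + b"\n")
        return json.loads(self.sock.makefile().readline())

# Hypothetical usage, assuming a build process is listening:
# controller = ToyController()
# frame = controller.send_commands([
#     {"type": "add_object", "name": "chair", "position": {"x": 0, "y": 0, "z": 1}},
#     {"type": "get_images", "avatar": "a"},
# ])
```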
TDW provides researchers with:

TDW is being used on a daily basis in multiple labs, supporting research that sits at the nexus of neuroscience, cognitive science and artificial intelligence.

Find out more about ThreeDWorld on the project website using the link below.

}, url = {http://www.threedworld.org/}, author = {Jeremy Schwartz and Seth Alter and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins and Dan Gutfreund and Chuang Gan and James Traer and Jonas Kubilius and Martin Schrimpf and Abhishek Bhandwaldar and Julian De Freitas and Damian Mrowca and Michael Lingelbach and Megumi Sano and Daniel Bear and Kuno Kim and Nick Haber and Chaofei Fan} } @conference {4534, title = {Toward human-like object naming in artificial neural systems }, booktitle = {International Conference on Learning Representations (ICLR 2020), Bridging AI and Cognitive Science workshop}, year = {2020}, month = {04/2020}, address = {Virtual conference (due to Covid-19)}, author = {Tiwalayo Eisape and Roger Levy and Joshua B. Tenenbaum and Noga Zaslavsky} } @article {4514, title = {Choosing a Transformative Experience }, year = {2019}, month = {07/2019}, author = {Marta Kryven and Niemi, L. and Paul, L. and Joshua B. Tenenbaum} } @article {4512, title = { Does intuitive inference of physical stability interruptattention?}, year = {2019}, month = {07/2019}, author = {Marta Kryven and Scholl, B. and Joshua B. Tenenbaum} } @proceedings {4261, title = {Draping an Elephant: Uncovering Children{\textquoteright}s Reasoning About Cloth-Covered Objects}, year = {2019}, month = {07/2019}, address = {Montreal, Canada}, abstract = {

Humans have an intuitive understanding of physics. They can predict how a physical scene will unfold, and reason about how it came to be. Adults may rely on such a physical representation for visual reasoning and recognition, going beyond visual features and capturing objects in terms of their physical properties. Recently, draped objects were used to examine adult object representations in the absence of many common visual features. In this paper we examine young children{\textquoteright}s reasoning about draped objects in order to examine the development of physical object representation. In addition, we argue that a better understanding of the development of the concept of cloth as a physical entity is worthwhile in and of itself, as it may form a basic ontological category in intuitive physical reasoning akin to liquids and solids. We use two experiments to investigate young children{\textquoteright}s (ages 3{\textendash}5) reasoning about cloth-covered objects, and find that they perform significantly above chance (though far from perfectly) indicating a representation of physical objects that can interact dynamically with the world. Children{\textquoteright}s success and failure pattern is similar across the two experiments, and we compare it to adult behavior. We find a small effect, which suggests the specific features that make reasoning about certain objects more difficult may carry into adulthood.

}, keywords = {analysis-by-synthesis, cloth, cognitive development, imagination, intuitive physics, object recognition, occlusion, perception, vision}, url = {https://mindmodeling.org/cogsci2019/papers/0506/index.html}, author = {Tomer D Ullman and Eliza Kosoy and Ilker Yildirim and Amir Arsalan Soltani and Max Siegel and Joshua B. Tenenbaum and Elizabeth S Spelke} } @proceedings {4387, title = {Finding Friend and Foe in Multi-Agent Games}, year = {2019}, month = {05/2019}, address = {Vancouver, Canada}, abstract = {

AI for multi-agent games like Go, Poker, and Dota has made great strides in recent years. Yet none of these games address the real-life challenge of cooperation in the presence of unknown and uncertain teammates. This challenge is a key game mechanism in hidden role games. Here we develop the DeepRole algorithm, a multi-agent reinforcement learning agent that we test on The Resistance: Avalon, the most popular hidden role game. DeepRole combines counterfactual regret minimization (CFR) with deep value networks trained through self-play. Our algorithm integrates deductive reasoning into vector-form CFR to reason about joint beliefs and deduce partially observable actions. We augment deep value networks with constraints that yield interpretable representations of win probabilities. These innovations enable DeepRole to scale to the full Avalon game. Empirical game-theoretic methods show that DeepRole outperforms other hand-crafted and learned agents in five-player Avalon. DeepRole played with and against human players on the web in hybrid human-agent teams. We find that DeepRole outperforms human players as both a cooperator and a competitor.

}, author = {Jack Serrino and Max Kleiman-Weiner and David C. Parkes and Joshua B. Tenenbaum} } @article {4178, title = {An integrative computational architecture for object-driven cortex}, journal = {Current Opinion in Neurobiology}, volume = {55}, year = {2019}, month = {01/2019}, pages = {73 - 81}, abstract = {

Objects in motion activate multiple cortical regions in every lobe of the human brain. Do these regions represent a collection of independent systems, or is there an overarching functional architecture spanning all of object-driven cortex? Inspired by recent work in artificial intelligence (AI), machine learning, and cognitive science, we consider the hypothesis that these regions can be understood as a coherent network implementing an integrative computational system that unifies the functions needed to perceive, predict, reason about, and plan with physical objects{\textemdash}as in the paradigmatic case of using or making tools. Our proposal draws on a modeling framework that combines multiple AI methods, including causal generative models, hybrid symbolic-continuous planning algorithms, and neural recognition networks, with object-centric, physics-based representations. We review evidence relating specific components of our proposal to the specific regions that comprise object-driven cortex, and lay out future research directions with the goal of building a complete functional and mechanistic account of this system.

}, issn = {09594388}, doi = {10.1016/j.conb.2019.01.010}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438818301995}, author = {Ilker Yildirim and Jiajun Wu and Nancy Kanwisher and Joshua B. Tenenbaum} } @article {4554, title = {Invariant representations of mass in the human brain}, journal = {eLife}, volume = {8}, year = {2019}, month = {May-12-2020}, doi = {10.7554/eLife.46619}, url = {https://www.ncbi.nlm.nih.gov/pubmed/31845887}, author = {Schwettmann, Sarah and Joshua B. Tenenbaum and Nancy Kanwisher} } @proceedings {4380, title = {Modeling Expectation Violation in Intuitive Physics with Coarse Probabilistic Object Representations}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

From infancy, humans have expectations about how objects will move and interact. Even young children expect objects not to move through one another, teleport, or disappear. They are surprised by mismatches between physical expectations and perceptual observations, even in unfamiliar scenes with completely novel objects. A model that exhibits human-like understanding of physics should be similarly surprised, and adjust its beliefs accordingly. We propose ADEPT, a model that uses a coarse (approximate geometry) object-centric representation for dynamic 3D scene understanding. Inference integrates deep recognition networks, extended probabilistic physical simulation, and particle filtering for forming predictions and expectations across occlusion. We also present a new test set for measuring violations of physical expectations, using a range of scenarios derived from developmental psychology. We systematically compare ADEPT, baseline models, and human expectations on this test set. ADEPT outperforms standard network architectures in discriminating physically implausible scenes, and often performs this discrimination at the same level as people.

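A minimal sketch of how expectation violation might be scored under such a particle-based model, with `observation_likelihood` as an assumed stand-in for comparing a predicted scene state to what is actually seen:

```python
# Hedged sketch of measuring expectation violation with a particle-based physics model:
# each particle is a hypothesized scene state rolled forward by an (assumed) noisy
# simulator; surprise is the improbability of the actual observation under the particles.
import math

def surprise(particles, weights, observation, observation_likelihood):
    """
    particles: predicted scene states; weights: their probabilities (sum to 1)
    observation_likelihood(obs, state): assumed stand-in for p(obs | predicted state)
    """
    p_obs = sum(w * observation_likelihood(observation, s)
                for s, w in zip(particles, weights))
    return -math.log(max(p_obs, 1e-12))   # high when no particle explains what was seen
```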
}, url = {http://physadept.csail.mit.edu/}, author = {Kevin A Smith and Lingjie Mei and Shunyu Yao and Jiajun Wu and Elizabeth S Spelke and Joshua B. Tenenbaum and Tomer D. Ullman} } @proceedings {4388, title = {ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We collect a large real-world test set, ObjectNet, for object recognition with controls where object backgrounds, rotations, and imaging viewpoints are random. Most scientific experiments have controls, confounds which are removed from the data, to ensure that subjects cannot perform a task by exploiting trivial correlations in the data. Historically, large machine learning and computer vision datasets have lacked such controls. This has resulted in models that must be fine-tuned for new datasets and perform better on datasets than in real-world applications. When tested on ObjectNet, object detectors show a 40-45\% drop in performance, with respect to their performance on other benchmarks, due to the controls for biases. Controls make ObjectNet robust to fine-tuning showing only small performance increases. We develop a highly automated platform that enables gathering datasets with controls by crowdsourcing image capturing and annotation. ObjectNet is the same size as the ImageNet test set (50,000 images), and by design does not come paired with a training set in order to encourage generalization. The dataset is both easier than ImageNet (objects are largely centered and unoccluded) and harder (due to the controls). Although we focus on object recognition here, data with controls can be gathered at scale using automated tools throughout machine learning to generate datasets that exercise models in new ways thus providing valuable feedback to researchers. This work opens up new avenues for research in generalizable, robust, and more human-like computer vision and in creating datasets where results are predictive of real-world performance.

}, author = {Andrei Barbu and David Mayo and Julian Alverio and William Luo and Christopher Wang and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz} } @conference {4523, title = {Query-guided visual search }, booktitle = {41st Annual conference of the Cognitive Science Society}, year = {2019}, month = {07/2019}, address = {Montreal, Qu{\'e}bec, Canada}, author = {Junyi Chu and Jon Gauthier and Roger Levy and Joshua B. Tenenbaum and Laura Schulz} } @article {4118, title = {See, feel, act: Hierarchical learning for complex manipulation skills with multisensory fusion}, journal = {Science Robotics}, volume = {4}, year = {2019}, month = {01/2019}, pages = {eaav3123}, abstract = {

Humans are able to seamlessly integrate tactile and visual stimuli with their intuitions to explore and execute complex manipulation skills. They not only see but also feel their actions. Most current robotic learning methodologies exploit recent progress in computer vision and deep learning to acquire data-hungry pixel-to-action policies. These methodologies do not exploit intuitive latent structure in physics or tactile signatures. Tactile reasoning is omnipresent in the animal kingdom, yet it is underdeveloped in robotic manipulation. Tactile stimuli are only acquired through invasive interaction, and interpretation of the data stream together with visual stimuli is challenging. Here, we propose a methodology to emulate hierarchical reasoning and multisensory fusion in a robot that learns to play Jenga, a complex game that requires physical interaction to be played effectively. The game mechanics were formulated as a generative process using a temporal hierarchical Bayesian model, with representations for both behavioral archetypes and noisy block states. This model captured descriptive latent structures, and the robot learned probabilistic models of these relationships in force and visual domains through a short exploration phase. Once learned, the robot used this representation to infer block behavior patterns and states as it played the game. Using its inferred beliefs, the robot adjusted its behavior with respect to both its current actions and its game strategy, similar to the way humans play the game. We evaluated the performance of the approach against three standard baselines and show its fidelity on a real-world implementation of the game.

}, doi = {10.1126/scirobotics.aav3123}, url = {http://robotics.sciencemag.org/lookup/doi/10.1126/scirobotics.aav3123}, author = {Fazeli, N. and Oller, M. and Wu, J. and Wu, Z. and Joshua B. Tenenbaum and Rodriguez, A.} } @proceedings {4390, title = {Visual Concept-Metaconcept Learning}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

Humans reason with concepts and metaconcepts: we recognize red and blue from visual input; we also understand that they are colors, i.e., red is an instance of color. In this paper, we propose the visual concept-metaconcept learner (VCML) for joint learning of concepts and metaconcepts from images and associated question-answer pairs. The key is to exploit the bidirectional connection between visual concepts and metaconcepts. Visual representations provide grounding cues for predicting relations between unseen pairs of concepts. Knowing that red and blue are instances of color, we generalize to the fact that green is also an instance of color since they all categorize the hue of objects. Meanwhile, knowledge about metaconcepts empowers visual concept learning from limited, noisy, and even biased data. From just a few examples of purple cubes we can understand a new color purple, which resembles the hue of the cubes instead of the shape of them. Evaluation on both synthetic and real-world datasets validates our claims.

}, author = {Chi Han and Jiayuan Mao and Chuang Gan and Joshua B. Tenenbaum and Jiajun Wu} } @proceedings {4384, title = {Write, Execute, Assess: Program Synthesis with a REPL}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We present a neural program synthesis approach integrating components which write, execute, and assess code to navigate the search space of possible programs. We equip the search process with an interpreter or a read-eval-print-loop (REPL), which immediately executes partially written programs, exposing their semantics. The REPL addresses a basic challenge of program synthesis: tiny changes in syntax can lead to huge changes in semantics. We train a pair of models, a policy that proposes the new piece of code to write, and a value function that assesses the prospects of the code written so far. At test time we can combine these models with a Sequential Monte Carlo algorithm. We apply our approach to two domains: synthesizing text editing programs and inferring 2D and 3D graphics programs.

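An illustrative sketch of execution-guided search with a policy and a value function, where a simple beam search stands in for the paper's Sequential Monte Carlo procedure; `spec`, `propose_lines`, `execute`, and `value` are assumed stand-ins.

```python
# Hedged sketch of REPL-guided synthesis: a policy proposes the next line of code, the
# partial program is executed immediately, and a value function scores the result.
# Beam search here replaces the paper's SMC; all callables are hypothetical stand-ins.
import heapq

def synthesize(spec, propose_lines, execute, value, beam_size=10, max_lines=20):
    """
    spec: task specification, assumed to carry the desired output as spec.target
    propose_lines(spec, program): candidate next lines from a learned policy
    execute(program): runs the partial program and returns its current output (the REPL step)
    value(spec, output): learned estimate of how promising this partial program is
    """
    beam = [("", None)]                               # (program text, last output)
    for _ in range(max_lines):
        candidates = []
        for program, _ in beam:
            for line in propose_lines(spec, program):
                new_prog = program + line + "\n"
                output = execute(new_prog)            # expose semantics, not just syntax
                if output == spec.target:
                    return new_prog
                candidates.append((value(spec, output), new_prog, output))
        beam = [(p, o) for _, p, o in heapq.nlargest(beam_size, candidates, key=lambda c: c[0])]
    return None
```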
}, author = {Kevin Ellis and Maxwell Nye and Yewen Pu and Felix Sosa and Joshua B. Tenenbaum and Armando Solar-Lezama} } @proceedings {4179, title = {Differentiable physics and stable modes for tool-use and manipulation planning}, year = {2018}, month = {06/2018}, abstract = {

We consider the problem of sequential manipulation and tool-use planning in domains that include physical interactions such as hitting and throwing. The approach integrates a Task And Motion Planning formulation with primitives that either impose stable kinematic constraints or differentiable dynamical and impulse exchange constraints at the path optimization level. We demonstrate our approach on a variety of physical puzzles that involve tool use and dynamic interactions. We then compare manipulation sequences generated by our approach to human actions on analogous tasks, suggesting future directions and illuminating current limitations.

}, author = {Marc Toussaint and Kelsey Allen and Kevin A Smith and Joshua B. Tenenbaum} } @article {3422, title = {Discovery and usage of joint attention in images}, journal = {arXiv.org}, year = {2018}, month = {04/2018}, abstract = {

Joint visual attention is characterized by two or more individuals looking at a common target at the same time. The ability to identify joint attention in scenes, the people involved, and their common target, is fundamental to the understanding of social interactions, including others{\textquoteright} intentions and goals. In this work we deal with the extraction of joint attention events, and the use of such events for image descriptions. The work makes two novel contributions. First, our extraction algorithm is the first which identifies joint visual attention in single static images. It computes 3D gaze direction, identifies the gaze target by combining gaze direction with a 3D depth map computed for the image, and identifies the common gaze target. Second, we use a human study to demonstrate the sensitivity of humans to joint attention, suggesting that the detection of such a configuration in an image can be useful for understanding the image, including the goals of the agents and their joint activity, and therefore can contribute to image captioning and related tasks.

}, keywords = {compositional approach, computational study, Gaze perception, human study, joint attention}, url = {https://arxiv.org/abs/1804.04604}, author = {Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} } @article {4181, title = {End-to-end differentiable physics for learning and control}, year = {2018}, author = {Filipe de Avila Belbute-Peres and Kevin A Smith and Kelsey Allen and Joshua B. Tenenbaum and Zico Kolter} } @article {3513, title = {Learning physical parameters from dynamic scenes.}, journal = {Cognitive Psychology}, volume = {104}, year = {2018}, month = {8/2018}, pages = {57-82}, abstract = {

Humans acquire their most basic physical concepts early in development, and continue to enrich and expand their intuitive physics throughout life as they are exposed to more and varied dynamical environments. We introduce a hierarchical Bayesian framework to explain how people can learn physical parameters at multiple levels. In contrast to previous Bayesian models of theory acquisition (Tenenbaum et al., 2011), we work with more expressive probabilistic program representations suitable for learning the forces and properties that govern how objects interact in dynamic scenes unfolding over time. We compare our model to human learners on a challenging task of estimating multiple physical parameters in novel microworlds given short movies. This task requires people to reason simultaneously about multiple interacting physical laws and properties. People are generally able to learn in this setting and are consistent in their judgments. Yet they also make systematic errors indicative of the approximations people might make in solving this computationally demanding problem with limited computational resources. We propose two approximations that complement the top-down Bayesian approach. One approximation model relies on a more bottom-up feature-based inference scheme. The second approximation combines the strengths of the bottom-up and top-down approaches, by taking the feature-based inference as its point of departure for a search in physical-parameter space.

}, keywords = {intuitive physics, intuitive theory, learning, physical reasoning, probabilistic inference}, doi = {10.1016/j.cogpsych.2017.05.006}, url = {https://www-sciencedirect-com.libproxy.mit.edu/science/article/pii/S0010028517301822}, author = {Ullman, Tomer D. and Stuhlm{\"u}ller, Andreas and Noah D. Goodman and Joshua B. Tenenbaum} } @article {3514, title = {Lucky or clever? From changed expectations to attributions of responsibility}, journal = {Cognition}, year = {2018}, month = {08/2018}, author = {Tobias Gerstenberg and Ullman, Tomer D. and Nagel, Jonas and Max Kleiman-Weiner and D. A. Lagnado and Joshua B. Tenenbaum} } @article {3622, title = {Rational inference of beliefs and desires from emotional expressions}, journal = {Cognitive Science}, volume = {42}, year = {2018}, month = {04/2018}, chapter = {850-884}, abstract = {

We investigated people{\textquoteright}s ability to infer others{\textquoteright} mental states from their emotional reactions, manipulating whether agents wanted, expected, and caused an outcome. Participants recovered agents{\textquoteright} desires throughout. When the agent observed, but did not cause the outcome, participants{\textquoteright} ability to recover the agent{\textquoteright}s beliefs depended on the evidence they got (i.e., her reaction only to the actual outcome or to both the expected and actual outcomes; Experiments 1 and 2). When the agent caused the event, participants{\textquoteright} judgments also depended on the probability of the action (Experiments 3 and 4); when actions were improbable given the mental states, people failed to recover the agent{\textquoteright}s beliefs even when they saw her react to both the anticipated and actual outcomes. A Bayesian model captured human performance throughout (rs >= .95), consistent with the proposal that people rationally integrate information about others{\textquoteright} actions and emotional reactions to infer their unobservable mental states.

}, author = {Wu, Yang and Chris Baker and Joshua B. Tenenbaum and Laura Schulz} } @article {4182, title = {Relational inductive bias for physical construction in humans and machines}, year = {2018}, month = {06/2018}, abstract = {

While current deep learning systems excel at tasks such as object classification, language processing, and gameplay, few can construct or modify a complex system such as a tower of blocks. We hypothesize that what these systems lack is a "relational inductive bias": a capacity for reasoning about inter-object relations and making choices over a structured description of a scene. To test this hypothesis, we focus on a task that involves gluing pairs of blocks together to stabilize a tower, and quantify how well humans perform. We then introduce a deep reinforcement learning agent which uses object- and relation-centric scene and policy representations and apply it to the task. Our results show that these structured representations allow the agent to outperform both humans and more naive approaches, suggesting that relational inductive bias is an important component in solving structured reasoning problems and for building more intelligent, flexible machines.

}, author = {Jessica B. Hamrick and Kelsey Allen and Victor Bapst and Tina Zhu and Kevin R. McKee and Joshua B. Tenenbaum and Battaglia, Peter} } @article {3441, title = {Building machines that learn and think like people.}, journal = {Behavioral and Brain Sciences}, volume = {40}, year = {2017}, month = {2017 Jan}, pages = {e253}, abstract = {

Recent progress in artificial intelligence has renewed interest in building systems that learn and think like people. Many advances have come from using deep neural networks trained end-to-end in tasks such as object recognition, video games, and board games, achieving performance that equals or even beats that of humans in some respects. Despite their biological inspiration and performance achievements, these systems differ from human intelligence in crucial ways. We review progress in cognitive science suggesting that truly human-like learning and thinking machines will have to reach beyond current engineering trends in both what they learn and how they learn it. Specifically, we argue that these machines should (1) build causal models of the world that support explanation and understanding, rather than merely solving pattern recognition problems; (2) ground learning in intuitive theories of physics and psychology to support and enrich the knowledge that is learned; and (3) harness compositionality and learning-to-learn to rapidly acquire and generalize knowledge to new tasks and situations. We suggest concrete challenges and promising routes toward these goals that can combine the strengths of recent neural network advances with more structured cognitive models.

}, issn = {1469-1825}, doi = {https://doi.org/10.1017/S0140525X16001837}, url = {https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/article/building-machines-that-learn-and-think-like-people/A9535B1D745A0377E16C590E14B94993/core-reader}, author = {Brenden M Lake and Ullman, Tomer D and Joshua B. Tenenbaum and Samuel J Gershman} } @conference {2822, title = {Causal and compositional generative models in online perception}, booktitle = {39th Annual Conference of the Cognitive Science Society}, year = {2017}, address = {London, UK}, abstract = {

From a quick glance or the touch of an object, our brains map sensory signals to scenes composed of rich and detailed shapes and surfaces. Unlike the standard pattern recognition approaches to perception, we argue that this mapping draws on internal causal and compositional models of the outside physical world, and that such internal models underlie the generalization capacity of human perception. Here, we present a generative model of visual and multisensory perception in which the latent variables encode intrinsic properties of objects such as their shapes and surfaces in addition to their extrinsic properties such as pose and occlusion. These latent variables can be composed in novel ways and are inputs to sensory-specific causal models that output sense-specific signals. We present a novel recognition network that performs efficient inference in the generative model, computing at a speed similar to online perception. We show that our model, but not an alternative baseline model or a lesion of our model, can account for human performance in an occluded face matching task and in a cross-modal visual-to-haptic face matching task.

}, author = {Ilker Yildirim and Michael Janner}, editor = {Mario Belledonne and Christian Wallraven and W. A. Freiwald and Joshua B. Tenenbaum} } @proceedings {3539, title = {Causal and compositional generative models in online perception}, year = {2017}, month = {07/2017}, address = {London, UK}, abstract = {

From a quick glance or the touch of an object, our brains map sensory signals to scenes composed of rich and detailed shapes and surfaces. Unlike the standard approaches to perception, we argue that this mapping draws on internal causal and compositional models of the physical world and these internal models underlie the generalization capacity of human perception. Here, we present a generative model of visual and multisensory perception in which the latent variables encode intrinsic (e.g., shape) and extrinsic (e.g., occlusion) object properties. Latent variables are inputs to causal models that output sense-specific signals. We present a recognition network that performs efficient inference in the generative model, computing at a speed similar to online perception. We show that our model, but not alternatives, can account for human performance in an occluded face matching task and in a visual-to-haptic face matching task.

}, url = {https://mindmodeling.org/cogsci2017/papers/0266/index.html}, author = {Ilker Yildirim and Michael Janner and Mario Belledonne and Christian Wallraven and W. A. Freiwald and Joshua B. Tenenbaum} } @article {2736, title = {Changing minds: Children{\textquoteright}s inferences about third party belief revision}, journal = {Developmental Science}, year = {2017}, month = {05/2017}, pages = {e12553}, abstract = {

By the age of five, children explicitly represent that agents can have both true and false beliefs based on epistemic access to information (e.g., Wellman, Cross, \& Watson, 2001). Children also begin to understand that agents can view identical evidence and draw different inferences from it (e.g., Carpenter \& Chandler, 1996). However, much less is known about when, and under what conditions, children expect other agents to change their minds. Here, inspired by formal ideal observer models of learning, we investigate children{\textquoteright}s expectations of the dynamics that underlie third parties{\textquoteright} belief revision. We introduce an agent who has prior beliefs about the location of a population of toys and then observes evidence that, from an ideal observer perspective, either does, or does not justify revising those beliefs. We show that children{\textquoteright}s inferences on behalf of third parties are consistent with the ideal observer perspective, but not with a number of alternative possibilities, including that children expect other agents to be influenced only by their prior beliefs, only by the sampling process, or only by the observed data. Rather, children integrate all three factors in determining how and when agents will update their beliefs from evidence.

}, keywords = {learning, rational action, theory of mind}, doi = {10.1111/desc.12553}, author = {Rachel Magid and Phyllis Yan and Max Siegel and Joshua B. Tenenbaum and Laura Schulz} } @article {3621, title = {Children understand that agents maximize expected utilities.}, journal = {Journal of Experimental Psychology: General}, volume = {146}, year = {2017}, month = {Jan-11-2017}, pages = {1574 - 1585}, abstract = {

A growing set of studies suggests that our ability to infer, and reason about, mental states is supported by the assumption that agents maximize utilities{\textemdash}the rewards they attain minus the costs they incur. This assumption enables observers to work backward from agents{\textquoteright} observed behavior to their underlying beliefs, preferences, and competencies. Intuitively, however, agents may have incomplete, uncertain, or wrong beliefs about what they want. More formally, agents try to maximize their expected utilities. This understanding is crucial when reasoning about others{\textquoteright} behavior: It dictates when actions reveal preferences, and it makes predictions about the stability of behavior over time. In a set of 7 experiments we show that 4- and 5-year-olds understand that agents try to maximize expected utilities, and that these responses cannot be explained by simpler accounts. In particular, these results suggest a modification to the standard belief/desire model of intuitive psychology. Children do not treat beliefs and desires as independent; rather, they recognize that agents have beliefs about their own desires and that this has consequences for the interpretation of agents{\textquoteright} actions.
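
A minimal sketch of the expected-utility idea discussed above, with entirely hypothetical numbers: an agent that is uncertain about which outcome it wants picks the action whose outcome probabilities, weighted by its belief over its own desires, yield the highest utility net of cost. The action names, probabilities, and costs below are invented for illustration and are not taken from the paper.

# Illustrative sketch only: choosing an action by expected utility when the
# agent has uncertain beliefs about which outcome it actually wants.
def expected_utility(outcome_probs, belief_over_desires, cost):
    # outcome_probs: probability of each outcome given the action
    # belief_over_desires: agent's belief that each outcome is the desired one
    reward = sum(p_out * p_want for p_out, p_want in zip(outcome_probs, belief_over_desires))
    return reward - cost

belief = [0.7, 0.3]                      # the agent probably wants outcome A
actions = {
    "reach_left":  ([0.9, 0.1], 0.2),    # (outcome probabilities, action cost)
    "reach_right": ([0.1, 0.9], 0.1),
}
best = max(actions, key=lambda a: expected_utility(actions[a][0], belief, actions[a][1]))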

}, issn = {0096-3445}, doi = {10.1037/xge0000345}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/xge0000345http://psycnet.apa.org/journals/xge/146/11/1574.pdf}, author = {Julian Jara-Ettinger and Floyd, Samantha and Joshua B. Tenenbaum and Laura Schulz} } @article {3440, title = {Compositional inductive biases in function learning.}, journal = {Cogn Psychol}, volume = {99}, year = {2017}, month = {2017 Dec}, pages = {44-79}, abstract = {

How do people recognize and learn about complex functional structure? Taking inspiration from other areas of cognitive science, we propose that this is achieved by harnessing compositionality: complex structure is decomposed into simpler building blocks. We formalize this idea within the framework of Bayesian regression using a grammar over Gaussian process kernels, and compare this approach with other structure learning approaches. Participants consistently chose compositional (over non-compositional) extrapolations and interpolations of functions. Experiments designed to elicit priors over functional patterns revealed an inductive bias for compositional structure. Compositional functions were perceived as subjectively more predictable than non-compositional functions, and exhibited other signatures of predictability, such as enhanced memorability and reduced numerosity. Taken together, these results support the view that the human intuitive theory of functions is inherently compositional.
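
To illustrate the kernel-composition idea, here is a small sketch using scikit-learn's Gaussian-process kernels, which may differ from the paper's own grammar and implementation: a structured function is modeled as a sum and product of simple building blocks (linear, periodic, smooth), and the composed kernel is used to extrapolate. The particular kernel choices, hyperparameters, and toy data are illustrative assumptions.

# Illustrative sketch only: composing a structured kernel from simple building
# blocks, in the spirit of a grammar over Gaussian process kernels.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, ExpSineSquared

X = np.linspace(0, 10, 60).reshape(-1, 1)
y = 0.5 * X.ravel() + np.sin(2 * X.ravel())        # linear trend plus periodic pattern

# A compositional hypothesis: LINEAR + PERIODIC * SMOOTH
kernel = DotProduct() + ExpSineSquared(periodicity=3.0) * RBF(length_scale=2.0)
gp = GaussianProcessRegressor(kernel=kernel, alpha=1e-2).fit(X, y)

X_new = np.linspace(10, 12, 20).reshape(-1, 1)     # extrapolation region
mean, std = gp.predict(X_new, return_std=True)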

}, issn = {1095-5623}, doi = {10.1016/j.cogpsych.2017.11.002}, url = {https://www.sciencedirect.com/science/article/pii/S0010028517301743?via\%3Dihub}, author = {Eric Schulz and Joshua B. Tenenbaum and David Duvenaud and Maarten Speekenbrink and Samuel J Gershman} } @proceedings {2605, title = {Critical Cues in Early Physical Reasoning}, year = {2017}, address = {Austin, TX}, author = {Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {3088, title = {Eye-Tracking Causality}, journal = {Psychological Science}, volume = {73}, year = {2017}, month = {10/2017}, abstract = {

How do people make causal judgments? What role, if any, does counterfactual simulation play? Counterfactual theories of causal judgments predict that people compare what actually happened with what would have happened if the candidate cause had been absent. Process theories predict that people focus only on what actually happened, to assess the mechanism linking candidate cause and outcome. We tracked participants{\textquoteright} eye movements while they judged whether one billiard ball caused another one to go through a gate or prevented it from going through. Both participants{\textquoteright} looking patterns and their judgments demonstrated that counterfactual simulation played a critical role. Participants simulated where the target ball would have gone if the candidate cause had been removed from the scene. The more certain participants were that the outcome would have been different, the stronger the causal judgments. These results provide the first direct evidence for spontaneous counterfactual simulation in an important domain of high-level cognition.
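
A toy sketch of the counterfactual test described above, under assumed dynamics: the judged causal strength is approximated by the fraction of noisy counterfactual simulations (with the candidate cause removed) in which the outcome differs from what actually happened. The gate position, noise level, and one-dimensional dynamics are hypothetical stand-ins, not the paper's billiard-ball stimuli.

# Illustrative sketch only: a counterfactual test of causation, comparing the
# actual outcome with noisy simulations of what would have happened had the
# candidate cause (a deflection) been removed.
import random

def final_x(deflection_present, noise=0.3):
    # Toy model: the target ball drifts toward x = 0; a collision shifts it right.
    x = 1.0 if deflection_present else 0.0
    return x + random.gauss(0, noise)

def goes_through_gate(x, gate=(0.5, 1.5)):
    return gate[0] <= x <= gate[1]

actual = goes_through_gate(final_x(deflection_present=True, noise=0.0))
# Causal strength ~ probability that the outcome would have been different
# had the candidate cause been absent.
samples = [goes_through_gate(final_x(deflection_present=False)) for _ in range(1000)]
causal_strength = sum(outcome != actual for outcome in samples) / len(samples)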

}, keywords = {causality, counterfactuals, eye tracking, intuitive physics, mental simulation, open data, open materials}, issn = {0956-7976}, doi = {10.1177/0956797617713053}, url = {http://journals.sagepub.com/doi/10.1177/0956797617713053}, author = {Tobias Gerstenberg and M.F. Peterson and Noah D. Goodman and D. A. Lagnado and Joshua B. Tenenbaum} } @article {3444, title = {Eye-Tracking Causality}, journal = {Psychological Science}, year = {2017}, abstract = {

How do people make causal judgments? What role, if any, does counterfactual simulation play? Counterfactual theories of causal judgments predict that people compare what actually happened with what would have happened if the candidate cause had been absent. Process theories predict that people focus only on what actually happened, to assess the mechanism linking candidate cause and outcome. We tracked participants{\textquoteright} eye movements while they judged whether one billiard ball caused another one to go through a gate or prevented it from going through. Both participants{\textquoteright} looking patterns and their judgments demonstrated that counterfactual simulation played a critical role. Participants simulated where the target ball would have gone if the candidate cause had been removed from the scene. The more certain participants were that the outcome would have been different, the stronger the causal judgments. These results provide the first direct evidence for spontaneous counterfactual simulation in an important domain of high-level cognition.

}, keywords = {causality, counterfactuals, eye tracking, intuitive physics, mental simulation}, author = {Tobias Gerstenberg and M.F. Peterson and Noah D. Goodman and D. A. Lagnado and Joshua B. Tenenbaum} } @proceedings {2535, title = {Faulty Towers: A counterfactual simulation model of physical support}, year = {2017}, month = {07/2017}, abstract = {

In this paper we extend the counterfactual simulation model (CSM) {\textendash} originally developed to capture causal judgments about dynamic events (Gerstenberg, Goodman, Lagnado, \& Tenenbaum, 2014) {\textendash} to explain judgments of physical support. The CSM predicts that people judge physical support by mentally simulating what would happen if the object of interest were removed. Two experiments test the model by asking participants to evaluate the extent to which one brick in a tower is responsible for the rest of the bricks staying on the table. The results of both experiments show a very close correspondence between counterfactual simulations and responsibility judgments. We compare three versions of the CSM which differ in how they model people{\textquoteright}s uncertainty about what would have happened. Participants{\textquoteright} selections of which bricks would fall are best explained by assuming that counterfactual interventions only affect some aspects while leaving the rest of the scene unchanged.

}, keywords = {causality, counterfactual, intuitive physics, mental simulation, support}, author = {Tobias Gerstenberg and Liang Zhou and Kevin A Smith and Joshua B. Tenenbaum} } @conference {3575, title = {Generative modeling of audible shapes for object perception}, booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

Humans infer rich knowledge of objects from both auditory and visual cues. Building a machine of such competency, however, is very challenging, due to the great difficulty in capturing large-scale, clean data of objects with both their appearance and the sound they make. In this paper, we present a novel, open-source pipeline that generates audio-visual data, purely from 3D object shapes and their physical properties. Through comparison with audio recordings and human behavioral studies, we validate the accuracy of the sounds it generates. Using this generative model, we are able to construct a synthetic audio-visual dataset, namely Sound-20K, for object perception tasks. We demonstrate that auditory and visual information play complementary roles in object perception, and further, that the representation learned on synthetic audio-visual data can transfer to real-world scenarios.

}, url = {http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html}, author = {Zhoutong Zhang and Jiajun Wu and Qiujia Li and Zhengjia Huang and James Traer and Josh H. McDermott and Joshua B. Tenenbaum and William T. Freeman} } @conference {3639, title = {Human Learning in Atari}, booktitle = {AAAI Spring Symposium Series}, year = {2017}, abstract = {

Atari games are an excellent testbed for studying intelligent behavior, as they offer a range of tasks that differ widely in their visual representation, game dynamics, and goals presented to an agent. The last two years have seen a spate of research into artificial agents that use a single algorithm to learn to play these games. The best of these artificial agents perform at better-than-human levels on most games, but require hundreds of hours of game-play experience to produce such behavior. Humans, on the other hand, can learn to perform well on these tasks in a matter of minutes. In this paper we present data on human learning trajectories for several Atari games, and test several hypotheses about the mechanisms that lead to such rapid learning.\ 

}, author = {Pedro Tsividis and Thomas Pouncy and Jacqueline L. Xu and Joshua B. Tenenbaum and Samuel J Gershman} } @proceedings {3240, title = {Learning to See Physics via Visual De-animation}, year = {2017}, month = {12/2017}, pages = {152{\textendash}163}, abstract = {
We introduce a paradigm for understanding physical scenes without human annotations. At the core of our system is a physical world representation that is first recovered by a perception module and then utilized by physics and graphics engines. During training, the perception module and the generative models learn by visual de-animation {\textemdash} interpreting and reconstructing the visual information stream. During testing, the system first recovers the physical world state, and then uses the generative models for reasoning and future prediction. Even more so than forward simulation, inverting a physics or graphics engine is a computationally hard problem; we overcome this challenge by using a convolutional inversion network. Our system quickly recognizes the physical world state from appearance and motion cues, and has the flexibility to incorporate both differentiable and non-differentiable physics and graphics engines. We evaluate our system on both synthetic and real datasets involving multiple physical scenes, and demonstrate that our system performs well on both physical state estimation and reasoning problems. We further show that the knowledge learned on the synthetic dataset generalizes to constrained real images.
}, url = {http://papers.nips.cc/paper/6620-learning-to-see-physics-via-visual-de-animation.pdf}, author = {Jiajun Wu and Lu, Erika and Kohli, Pushmeet and William T. Freeman and Joshua B. Tenenbaum}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @proceedings {3241, title = {MarrNet: 3D Shape Reconstruction via 2.5D Sketches}, year = {2017}, month = {12/2017}, pages = {540{\textendash}550}, publisher = {Curran Associates, Inc.}, address = {Long Beach, CA}, abstract = {

3D object reconstruction from a single image is a highly under-determined problem, requiring strong prior knowledge of plausible 3D shapes. This introduces a challenge for learning-based approaches, as 3D object annotations in real images are scarce. Previous work chose to train on synthetic data with ground truth 3D information, but suffered from the domain adaptation issue when tested on real data. In this work, we propose an end-to-end trainable framework that sequentially estimates 2.5D sketches and 3D object shapes. Our disentangled, two-step formulation has three advantages. First, compared to full 3D shape, 2.5D sketches are much easier to recover from a 2D image and to transfer from synthetic to real data. Second, for 3D reconstruction from the 2.5D sketches, we can easily transfer a model learned on synthetic data to real images, as rendered 2.5D sketches are invariant to object appearance variations in real images, including lighting, texture, etc. This further relieves the domain adaptation problem. Third, we derive differentiable projective functions from 3D shape to 2.5D sketches, making the framework end-to-end trainable on real images and requiring no real-image annotations. Our framework achieves state-of-the-art performance on 3D shape reconstruction.

}, url = {http://papers.nips.cc/paper/6657-marrnet-3d-shape-reconstruction-via-25d-sketches.pdf}, author = {Jiajun Wu and Wang, Yifan and Xue, Tianfan and Sun, Xingyuan and William T. Freeman and Joshua B. Tenenbaum}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @article {3512, title = {Mind Games: Game Engines as an Architecture for Intuitive Physics}, journal = {Trends in Cognitive Science}, volume = {21}, year = {2017}, month = {09/2017}, pages = {649 - 665}, chapter = {649}, issn = {1364-6613}, doi = {10.1016/j.tics.2017.05.012}, url = {https://www.cell.com/trends/cognitive-sciences/fulltext/S1364-6613(17)30113-4}, author = {Ullman, Tomer D. and Elizabeth S Spelke and Battaglia, Peter and Joshua B. Tenenbaum} } @proceedings {2537, title = {Physical problem solving: Joint planning with symbolic, geometric, and dynamic constraints}, year = {2017}, month = {07/2017}, abstract = {

In this paper, we present a new task that investigates how people interact with and make judgments about towers of blocks. In Experiment 1, participants in the lab solved a series of problems in which they had to re-configure three blocks from an initial to a final configuration. We recorded whether they used one hand or two hands to do so. In Experiment 2, we asked participants online to judge whether they think the person in the lab used one or two hands. The results revealed a close correspondence between participants{\textquoteright} actions in the lab, and the mental simulations of participants online. To explain participants{\textquoteright} actions and mental simulations, we develop a model that plans over a symbolic representation of the situation, executes the plan using a geometric solver, and checks the plan{\textquoteright}s feasibility by taking into account the physical constraints of the scene. Our model explains participants{\textquoteright} actions and judgments to a high degree of quantitative accuracy.

}, keywords = {intuitive physics, logic-geometric programming, planning, problem solving, scene understanding}, author = {Ilker Yildirim and Tobias Gerstenberg and Basil Saeed and Marc Toussant and Joshua B. Tenenbaum} } @article {2763, title = {Rational quantitative attribution of beliefs, desires, and percepts in human mentalizing}, journal = {Nature Human Behavior}, volume = {1}, year = {2017}, month = {03/2017}, abstract = {

Social cognition depends on our capacity for {\textquoteleft}mentalizing{\textquoteright}, or explaining an agent{\textquoteright}s behaviour in terms of their mental states. The development and neural substrates of mentalizing are well-studied, but its computational basis is only beginning to be probed. Here we present a model of core mentalizing computations: inferring jointly an actor{\textquoteright}s beliefs, desires and percepts from how they move in the local spatial environment. Our Bayesian theory of mind (BToM) model is based on probabilistically inverting artificial-intelligence approaches to rational planning and state estimation, which extend classical expected-utility agent models to sequential actions in complex, partially observable domains. The model accurately captures the quantitative mental-state judgements of human participants in two experiments, each varying multiple stimulus dimensions across a large number of stimuli. Comparative model fits with both simpler {\textquoteleft}lesioned{\textquoteright} BToM models and a family of simpler non-mentalistic motion features reveal the value contributed by each component of our model.
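
As a rough illustration of inverse planning (a much-reduced stand-in for the full BToM model, which also infers beliefs and percepts under partial observability), the sketch below inverts a noisily rational grid-world agent: each observed action is scored with a softmax likelihood that favors moves reducing distance to a candidate goal, and Bayes' rule yields a posterior over goals. The grid dynamics, softmax temperature, and candidate goals are assumptions for illustration only.

# Illustrative sketch only: inferring an agent's goal by inverting a noisily
# rational planner on a grid.
import math

MOVES = {"up": (0, 1), "down": (0, -1), "left": (-1, 0), "right": (1, 0)}

def action_likelihood(pos, action, goal, beta=2.0):
    # Softmax-rational agent: actions that reduce distance to the goal are likelier.
    def dist_after(a):
        dx, dy = MOVES[a]
        return abs(pos[0] + dx - goal[0]) + abs(pos[1] + dy - goal[1])
    scores = {a: math.exp(-beta * dist_after(a)) for a in MOVES}
    return scores[action] / sum(scores.values())

def goal_posterior(trajectory, goals):
    # trajectory: list of (position, action) pairs; uniform prior over goals.
    post = {g: 1.0 / len(goals) for g in goals}
    for pos, action in trajectory:
        for g in goals:
            post[g] *= action_likelihood(pos, action, g)
    z = sum(post.values())
    return {g: p / z for g, p in post.items()}

trajectory = [((0, 0), "right"), ((1, 0), "right"), ((2, 0), "up")]
print(goal_posterior(trajectory, goals=[(3, 1), (0, 3)]))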

}, keywords = {Human behaviour, Social behaviour}, doi = {doi:10.1038/s41562-017-0064}, url = {http://www.nature.com/articles/s41562-017-0064}, author = {Chris Baker and Julian Jara-Ettinger and Rebecca Saxe and Joshua B. Tenenbaum} } @conference {3596, title = {Self-supervised intrinsic image decomposition.}, booktitle = { Annual Conference on Neural Information Processing Systems (NIPS)}, year = {2017}, month = {12/2017}, address = {Long Beach, CA}, url = {https://papers.nips.cc/paper/7175-self-supervised-intrinsic-image-decomposition}, author = {Michael Janner and Jiajun Wu and Tejas Kulkarni and Ilker Yildirim and Joshua B. Tenenbaum} } @proceedings {3242, title = {Shape and Material from Sound}, year = {2017}, month = {12/2017}, pages = {1278{\textendash}1288}, address = {Long Beach, CA}, abstract = {

What can we infer from hearing an object falling onto the ground? Based on knowledge of the physical world, humans are able to infer rich information from such limited data: rough shape of the object, its material, the height of falling, etc. In this paper, we aim to approximate such competency. We first mimic the human knowledge about the physical world using a fast physics-based generative model. Then, we present an analysis-by-synthesis approach to infer properties of the falling object. We further approximate human past experience by directly mapping audio to object properties using deep learning with self-supervision. We evaluate our method through behavioral studies, where we compare human predictions with ours on inferring object shape, material, and initial height of falling. Results show that our method achieves near-human performance, without any annotations.

}, url = {http://papers.nips.cc/paper/6727-shape-and-material-from-sound.pdf}, author = {zhang, zhoutong and Qiujia Li and Zhengjia Huang and Jiajun Wu and Joshua B. Tenenbaum and William T. Freeman}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @conference {4235, title = {Synthesizing 3D Shapes via Modeling Multi-view Depth Maps and Silhouettes with Deep Generative Networks}, booktitle = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2017}, month = {07/2017}, address = {Honolulu, HI}, abstract = {

We study the problem of learning generative models of 3D shapes. Voxels or 3D parts have been widely used as the underlying representations to build complex 3D shapes; however, voxel-based representations suffer from high memory requirements, and parts-based models require a large collection of cached or richly parametrized parts. We take an alternative approach: learning a generative model over multi-view depth maps or their corresponding silhouettes, and using a deterministic rendering function to produce 3D shapes from these images. A multi-view representation of shapes enables generation of 3D models with fine details, as 2D depth maps and silhouettes can be modeled at a much higher resolution than 3D voxels. Moreover, our approach naturally brings the ability to recover the underlying 3D representation from depth maps of one or a few viewpoints. Experiments show that our framework can generate 3D shapes with variations and details. We also demonstrate that our model has out-of-sample generalization power for real-world tasks with occluded objects.

}, keywords = {2d to 3d, 3D generation, 3D reconstruction, Core object system, depth map, generative, perception, silhouette}, doi = {10.1109/CVPR.2017.269}, url = {http://ieeexplore.ieee.org/document/8099752/http://xplorestaging.ieee.org/ielx7/8097368/8099483/08099752.pdf?arnumber=8099752}, author = {Amir Arsalan Soltani and Haibin Huang and Jiajun Wu and Tejas Kulkarni and Joshua B. Tenenbaum} } @article {3485, title = {Ten-month-old infants infer the value of goals from the costs of actions}, journal = {Science}, volume = {358}, year = {2017}, month = {11/2017}, pages = {1038-1041}, chapter = {1038}, abstract = {

Infants understand that people pursue goals, but how do they learn which goals people prefer? We tested whether infants solve this problem by inverting a mental model of action planning, trading off the costs of acting against the rewards actions bring. After seeing an agent attain two goals equally often at varying costs, infants expected the agent to prefer the goal it attained through costlier actions. These expectations held across three experiments that conveyed cost through different physical path features (height, width, and incline angle), suggesting that an abstract variable{\textemdash}such as {\textquotedblleft}force,{\textquotedblright} {\textquotedblleft}work,{\textquotedblright} or {\textquotedblleft}effort{\textquotedblright}{\textemdash}supported infants{\textquoteright} inferences. We modeled infants{\textquoteright} expectations as Bayesian inferences over utility-theoretic calculations, providing a bridge to recent quantitative accounts of action understanding in older children and adults.

}, author = {Shari Liu and Ullman, Tomer D. and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {2636, title = {Ten-month-old infants infer value from effort}, year = {2017}, author = {Shari Liu and Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {2604, title = {Ten-month-old infants infer value from effort}, year = {2017}, address = {Austin, TX}, author = {Shari Liu and Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {1984, title = {Building machines that learn and think like people}, year = {2016}, month = {04/2016}, abstract = {

Recent progress in artificial intelligence (AI) has renewed interest in building systems that learn and think like people. Many advances have come from using deep neural networks trained end-to-end in tasks such as object recognition, video games, and board games, achieving performance that equals or even beats humans in some respects. Despite their biological inspiration and performance achievements, these systems differ from human intelligence in crucial ways. We review progress in cognitive science suggesting that truly human-like learning and thinking machines will have to reach beyond current engineering trends in both what they learn, and how they learn it. Specifically, we argue that these machines should (a) build causal models of the world that support explanation and understanding, rather than merely solving pattern recognition problems; (b) ground learning in intuitive theories of physics and psychology, to support and enrich the knowledge that is learned; and (c) harness compositionality and learning-to-learn to rapidly acquire and generalize knowledge to new tasks and situations. We suggest concrete challenges and promising routes towards these goals that can combine the strengths of recent neural network advances with more structured cognitive models.

}, author = {Brenden M Lake and Tomer Ullman and Joshua B. Tenenbaum and Samuel J Gershman} } @conference {2590, title = {Coordinate to cooperate or compete: abstract goals and joint intentions in social interaction}, booktitle = {Proceedings of the 38th Annual Conference of the Cognitive Science Society}, year = {2016}, author = {Max Kleiman-Weiner and Ho, Mark K and Austerweil, Joe L and Michael L, Littman and Joshua B. Tenenbaum} } @conference {1962, title = {Effort as a bridging concept across action and action understanding: Weight and Physical Effort in Predictions of Efficiency in Other Agents}, booktitle = {International Conference on Infant Studies (ICIS) }, year = {2016}, month = {05/2016}, address = {New Orleans, Louisiana}, author = {Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {2570, title = {Functional neuroanatomy of intuitive physical inference}, journal = {Proceedings of the National Academy of Sciences}, volume = {113}, year = {2016}, month = {06/2016}, pages = {E5072 - E5081}, abstract = {

To engage with the world{\textemdash}to understand the scene in front of us, plan actions, and predict what will happen next{\textemdash}we must have an intuitive grasp of the world{\textquoteright}s physical structure and dynamics. How do the objects in front of us rest on and support each other, how much force would be required to move them, and how will they behave when they fall, roll, or collide? Despite the centrality of physical inferences in daily life, little is known about the brain mechanisms recruited to interpret the physical structure of a scene and predict how physical events will unfold. Here, in a series of fMRI experiments, we identified a set of cortical regions that are selectively engaged when people watch and predict the unfolding of physical events{\textemdash}a {\textquotedblleft}physics engine{\textquotedblright} in the brain. These brain regions are selective to physical inferences relative to nonphysical but otherwise highly similar scenes and tasks. However, these regions are not exclusively engaged in physical inferences per se or, indeed, even in scene understanding; they overlap with the domain-general {\textquotedblleft}multiple demand{\textquotedblright} system, especially the parts of that system involved in action planning and tool use, pointing to a close relationship between the cognitive and neural mechanisms involved in parsing the physical content of a scene and preparing an appropriate action.

}, issn = {0027-8424}, doi = {10.1073/pnas.1610344113}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1610344113}, author = {Fischer, Jason and Mikhael, John G. and Joshua B. Tenenbaum and Nancy Kanwisher} } @conference {2600, title = {Integrating Identification and Perception: A case study of familiar and unfamiliar face processing}, booktitle = {Proceedings of the Thirty-Eight Annual Conference of the Cognitive Science Society}, year = {2016}, month = {2016}, author = {Kelsey Allen and Ilker Yildirim and Joshua B. Tenenbaum} } @inbook {1722, title = {Intuitive theories}, booktitle = {Oxford Handbook of Causal Reasoning}, year = {2016}, month = {02/2016}, publisher = {Oxford University Press}, organization = {Oxford University Press}, author = {Tobias Gerstenberg and Joshua B. Tenenbaum} } @article {2326, title = {Measuring and modeling the perception of natural and unconstrained gaze in humans and machines}, year = {2016}, month = {11/2016}, abstract = {

Humans are remarkably adept at interpreting the gaze direction of other individuals in their surroundings. This skill is at the core of the ability to engage in joint visual attention, which is essential for establishing social interactions. How accurate are humans in determining the gaze direction of others in lifelike scenes, when they can move their heads and eyes freely, and what are the sources of information for the underlying perceptual processes? These questions pose a challenge from both empirical and computational perspectives, due to the complexity of the visual input in real-life situations. Here we measure empirically human accuracy in perceiving the gaze direction of others in lifelike scenes, and study computationally the sources of information and representations underlying this cognitive capacity. We show that humans perform better in face-to-face conditions compared with recorded conditions, and that this advantage is not due to the availability of input dynamics. We further show that humans still perform well when only the eyes region is visible, rather than the whole face. We develop a computational model that replicates the pattern of human performance, including the finding that the eyes region contains, on its own, the information required for estimating both head orientation and direction of gaze. Consistent with neurophysiological findings on task-specific face regions in the brain, the learned computational representations reproduce perceptual effects such as the Wollaston illusion when trained to estimate direction of gaze, but not when trained to recognize objects or faces.

}, keywords = {computational evaluation, computational modeling, Computer vision, empirical evaluation, estimation of gaze direction, Gaze perception, joint attention, Machine Learning}, author = {Daniel Harari and Tao Gao and Nancy Kanwisher and Joshua B. Tenenbaum and Shimon Ullman} } @proceedings {1865, title = {Modeling Human Ad Hoc Coordination}, year = {2016}, month = {02/2016}, author = {Peter Krafft and Chris Baker and Alex "Sandy" Pentland and Joshua B. Tenenbaum} } @conference {1864, title = {Modeling human understanding of complex intentional action with a Bayesian nonparametric subgoal model}, booktitle = {AAAI}, year = {2016}, month = {02/2016}, author = {Ryo Nakahashi and Chris Baker and Joshua B. Tenenbaum} } @article {2492, title = {The naive utility calculus: computational principles underlying social cognition}, journal = {Trends Cogn Sci.}, year = {2016}, doi = {10.1016/j.tics.2016.05.011}, url = {https://www.ncbi.nlm.nih.gov/pubmed/27388875}, author = {Julian Jara-Ettinger and Hyowon Gweon and Laura Schulz and Joshua B. Tenenbaum} } @proceedings {1723, title = {Natural science: Active learning in dynamic physical microworlds}, year = {2016}, publisher = {38th Annual Meeting of the Cognitive Science Society}, author = {Neil Bramley and Tobias Gerstenberg and Joshua B. Tenenbaum} } @article {2109, title = {Probing the compositionality of intuitive functions}, year = {2016}, month = {05/2016}, abstract = {

How do people learn about complex functional structure? Taking inspiration from other areas of cognitive science, we propose that this is accomplished by harnessing compositionality: complex structure is decomposed into simpler building blocks. We formalize this idea within the framework of Bayesian regression using a grammar over Gaussian process kernels. We show that participants prefer compositional over non-compositional function extrapolations, that samples from the human prior over functions are best described by a compositional model, and that people perceive compositional functions as more predictable than their non-compositional but otherwise similar counterparts. We argue that the compositional nature of intuitive functions is consistent with broad principles of human cognition.

}, author = {Eric Schulz and Joshua B. Tenenbaum and David Duvenaud and Maarten Speekenbrink and Samuel J Gershman} } @article {2531, title = {Rapid Physical Predictions from Convolutional Neural Networks}, year = {2016}, url = {http://phys.csail.mit.edu/papers/9.pdf}, author = {Filipe Peres and Kevin A Smith and Joshua B. Tenenbaum} } @proceedings {1724, title = {Understanding "almost": Empirical and computational studies of near misses}, year = {2016}, publisher = {38th Annual Meeting of the Cognitive Science Society}, author = {Tobias Gerstenberg and Joshua B. Tenenbaum} } @article {1776, title = {The causes and consequences explicit in verbs}, journal = {Language, Cognition and Neuroscience}, volume = {30}, year = {2015}, month = {02/09/2015}, pages = {716-734}, abstract = {
Interpretation of a pronoun in one clause can be systematically affected by the verb in the previous clause. Compare Archibald angered Bartholomew because he ... (he = Archibald) with Archibald criticised Bartholomew because he ... (he = Bartholomew). While it is clear that meaning plays a critical role, it is unclear whether that meaning is directly encoded in the verb or, alternatively, inferred from world knowledge. We report evidence favouring the former account. We elicited pronoun biases for 502 verbs from seven Levin verb classes in two discourse contexts (implicit causality and implicit consequentiality), showing that in both contexts, verb class reliably predicts pronoun bias. These results confirm and extend recent findings about implicit causality and represent the first such study for implicit consequentiality. We discuss these findings in the context of recent work in semantics, and also develop a new, probabilistic generative account of pronoun interpretation.
}, issn = {2327-3798 (Print) 2327-3801 (Online)}, doi = {10.1080/23273798.2015.1008524}, url = {http://dx.doi.org/10.1080/23273798.2015.1008524}, author = {J. K. Hartshorne}, editor = {T. J. O{\textquoteright}Donnell and Joshua B. Tenenbaum} } @article {1194, title = {Children{\textquoteright}s understanding of the costs and rewards underlying rational action}, journal = {Cognition}, volume = {140}, year = {2015}, month = {07/2015}, pages = {14{\textendash}23}, abstract = {

Humans explain and predict other agents{\textquoteright} behavior using mental state concepts, such as beliefs and desires. Computational and developmental evidence suggest that such inferences are enabled by a principle of rational action: the expectation that agents act efficiently, within situational constraints, to achieve their goals. Here we propose that the expectation of rational action is instantiated by a na{\"\i}ve utility calculus sensitive to both agent-constant and agent-specific aspects of costs and rewards associated with actions. In four experiments, we show that, given an agent{\textquoteright}s choices, children (range: 5-6 year olds; N=96) can infer unobservable aspects of costs (differences in agents{\textquoteright} competence) from information about subjective differences in rewards (differences in agents{\textquoteright} preferences) and vice versa. Moreover, children can design informative experiments on both objects and agents to infer unobservable constraints on agents{\textquoteright} actions.

}, doi = {10.1016/j.cognition.2015.03.006}, url = {http://www.sciencedirect.com/science/article/pii/S0010027715000566}, author = {Julian Jara-Ettinger and Hyowon Gweon and Joshua B. Tenenbaum and Laura Schulz} } @article {1397, title = {Computational rationality: A converging paradigm for intelligence in brains, minds, and machines}, journal = {Science}, volume = {349}, year = {2015}, month = {07/17/2015}, pages = {273-278}, type = {Review; Special Section: Artificial Intelligence}, abstract = {
After growing up together, and mostly growing apart in the second half of the 20th century, the fields of artificial intelligence (AI), cognitive science, and neuroscience are reconverging on a shared view of the computational foundations of intelligence that promotes valuable cross-disciplinary exchanges on questions, methods, and results. We chart advances over the past several decades that address challenges of perception and action under uncertainty through the lens of computation. Advances include the development of representations and inferential procedures for large-scale probabilistic inference and machinery for enabling reflection and decisions about tradeoffs in effort, precision, and timeliness of computations. These tools are deployed toward the goal of computational rationality: identifying decisions with highest expected utility, while taking into consideration the costs of computation in complex real-world problems in which most relevant calculations can only be approximated. We highlight key concepts with examples that show the potential for interchange between computer science, cognitive science, and neuroscience.
}, doi = {10.1126/science.aac6076 }, url = {http://www.sciencemag.org/content/349/6245/273.abstract}, author = {Samuel J Gershman and Eric J. Horvitz and Joshua B. Tenenbaum} } @article {800, title = {Discovering hierarchical motion structure}, journal = {Vision Research}, volume = {Available online 26 March 2015}, year = {2015}, month = {03/2015}, abstract = {

Scenes filled with moving objects are often hierarchically organized: the motion of a migrating goose is nested within the flight pattern of its flock, the motion of a car is nested within the traffic pattern of other cars on the road, and the motion of body parts is nested in the motion of the body. Humans perceive hierarchical structure even in stimuli with two or three moving dots. An influential theory of hierarchical motion perception holds that the visual system performs a "vector analysis" of moving objects, decomposing them into common and relative motions. However, this theory does not specify how to resolve ambiguity when a scene admits more than one vector analysis. We describe a Bayesian theory of vector analysis and show that it can account for classic results from dot motion experiments, as well as new experimental data. Our theory takes a step towards understanding how moving scenes are parsed into objects.
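
A bare-bones sketch of the vector-analysis decomposition the abstract builds on: observed dot velocities are split into a shared (common) component and residual (relative) components. The Bayesian model in the paper additionally compares alternative decompositions under uncertainty, which this illustration omits; the velocities below are invented.

# Illustrative sketch only: decomposing dot velocities into common and relative
# motion components (e.g., each dot ~ a goose, the common component ~ its flock).
import numpy as np

velocities = np.array([
    [1.0, 0.1],   # dot 1
    [1.1, -0.1],  # dot 2
    [0.9, 0.0],   # dot 3: all three share a rightward drift
])

common = velocities.mean(axis=0)      # estimated group (common) motion
relative = velocities - common        # each dot's motion relative to the group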

}, doi = {doi:10.1016/j.visres.2015.03.004}, url = {http://www.sciencedirect.com/science/article/pii/S0042698915000814$\#$sthash.vpJfuWmr.dpuf}, author = {Samuel J Gershman and Joshua B. Tenenbaum and Frank Jaekel} } @conference {1045, title = {Efficient and robust analysis-by-synthesis in vision: A computational framework, behavioral tests, and modeling neuronal representations}, booktitle = {Annual Conference of the Cognitive Science Society}, year = {2015}, author = {Ilker Yildirim and Tejas Kulkarni and W. A. Freiwald and Joshua B. Tenenbaum} } @conference {1825, title = {Galileo: Perceiving physical object properties by integrating a physics engine with deep learning.}, booktitle = {NIPS 2015}, year = {2015}, address = { Montr{\'e}al, Canada}, abstract = {
Humans demonstrate remarkable abilities to predict physical events in dynamic scenes, and to infer the physical properties of objects from static images. We propose a generative model for solving these problems of physical scene understanding from real-world videos and images. At the core of our generative model is a 3D physics engine, operating on an object-based representation of physical properties, including mass, position, 3D shape, and friction. We can infer these latent properties using relatively brief runs of MCMC, which drive simulations in the physics engine to fit key features of visual observations. We further explore directly mapping visual inputs to physical properties, inverting a part of the generative process using deep learning. We name our model Galileo, and evaluate it on a video dataset with simple yet physically rich scenarios. Results show that Galileo is able to infer the physical properties of objects and predict the outcome of a variety of physical events, with an accuracy comparable to human subjects. Our study points towards an account of human vision with generative physical knowledge at its core, and various recognition models as helpers leading to efficient inference.
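
To make the MCMC-over-a-physics-engine idea concrete, here is an illustrative Metropolis-Hastings sketch over a single latent property (friction), scored by how well a toy simulator's predicted sliding distance matches an observation. The simulator, proposal width, noise level, and observed value are all assumptions made for illustration, not Galileo's actual pipeline.

# Illustrative sketch only: Metropolis-Hastings over one latent physical
# property, with a toy simulator standing in for the physics engine.
import random, math

def simulate_slide(friction, v0=3.0):
    # Toy physics: stopping distance of an object sliding with friction.
    return v0 ** 2 / (2 * max(friction, 1e-3) * 9.8)

def log_likelihood(friction, observed_distance, noise=0.2):
    return -((simulate_slide(friction) - observed_distance) ** 2) / (2 * noise ** 2)

def mh_infer(observed_distance, steps=2000):
    friction, samples = 0.5, []
    for _ in range(steps):
        proposal = abs(friction + random.gauss(0, 0.05))  # reflect at zero
        log_accept = log_likelihood(proposal, observed_distance) - log_likelihood(friction, observed_distance)
        if log_accept >= 0 or random.random() < math.exp(log_accept):
            friction = proposal
        samples.append(friction)
    return samples

samples = mh_infer(observed_distance=1.5)
estimate = sum(samples[500:]) / len(samples[500:])   # posterior mean after burn-in
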
}, url = {https://papers.nips.cc/paper/5780-galileo-perceiving-physical-object-properties-by-integrating-a-physics-engine-with-deep-learning}, author = {Jiajun Wu and Ilker Yildirim and Joseph J. Lim and William T. Freeman and Joshua B. Tenenbaum} } @proceedings {755, title = {How, whether, why: Causal judgments as counterfactual contrasts}, year = {2015}, month = {07/22/2015}, pages = {782-787}, address = {Pasadena, CA}, issn = {978-0-9911967-2-2}, url = {https://mindmodeling.org/cogsci2015/papers/0142/index.html}, author = {Tobias Gerstenberg and Noah D. Goodman and D. A. Lagnado and Joshua B. Tenenbaum} } @article {1566, title = {Human-level concept learning through probabilistic program induction}, journal = {Science}, volume = {350}, year = {2015}, month = {12/11/2015}, pages = {1332-1338 }, abstract = {

People learning new concepts can often generalize successfully from just a single example, yet machine learning algorithms typically require tens or hundreds of examples to perform with similar accuracy. People can also use learned concepts in richer ways than conventional algorithms{\textemdash}for action, imagination, and explanation. We present a computational model that captures these human learning abilities for a large class of simple visual concepts: handwritten characters from the world{\textquoteright}s alphabets. The model represents concepts as simple programs that best explain observed examples under a Bayesian criterion. On a challenging one-shot classification task, the model achieves human-level performance while outperforming recent deep learning approaches. We also present several {\textquotedblleft}visual Turing tests{\textquotedblright} probing the model{\textquoteright}s creative generalization abilities, which in many cases are indistinguishable from human behavior.

}, keywords = {Machine Learning}, doi = {10.1126/science.aab3050 }, url = {http://www.sciencemag.org/content/350/6266/1332.short}, author = {Brenden M Lake and Salakhutdinov, Ruslan and Joshua B. Tenenbaum} } @proceedings {1205, title = {Hypothesis-Space Constraints in Causal Learning}, year = {2015}, month = {07/2015}, address = {Pasadena, CA}, url = {https://mindmodeling.org/cogsci2015/papers/0418/index.html}, author = {Pedro Tsividis and Joshua B. Tenenbaum and Laura Schulz} } @article {1811, title = {Information Selection in Noisy Environments with Large Action Spaces}, volume = {Columbus, OH}, year = {2015}, author = {Pedro Tsividis and Samuel J Gershman and Joshua B. Tenenbaum and Laura Schulz} } @article {1206, title = {Not So Innocent: Toddlers{\textquoteright} Inferences About Costs and Culpability}, journal = {Psychological Science }, volume = {26}, year = {2015}, month = {05/2015}, pages = {633-40}, abstract = {

Adults{\textquoteright} social evaluations are influenced by their perception of other people{\textquoteright}s competence and motivation: Helping when it is difficult to help is praiseworthy, and not helping when it is easy to help is reprehensible. Here, we look at whether children{\textquoteright}s social evaluations are affected by the costs that agents incur. We found that toddlers can use the time and effort associated with goal-directed actions to distinguish agents, and that children prefer agents who incur fewer costs in completing a goal. When two agents refuse to help, children retain a preference for the more competent agent but infer that the less competent agent is nicer. These results suggest that children value agents who incur fewer costs, but understand that failure to engage in a low-cost action implies a lack of motivation. We propose that a naive utility calculus underlies inferences from the costs and rewards of goal-directed action and thereby supports social cognition.

}, keywords = {cognitive development, open data, open materials, social cognition, theory of mind}, doi = {10.1177/0956797615572806}, url = {http://pss.sagepub.com/content/early/2015/04/09/0956797615572806}, author = {Julian Jara-Ettinger}, editor = {Joshua B. Tenenbaum and Laura Schulz} } @conference {1803, title = {Perceiving Fully Occluded Objects with Physical Simulation}, booktitle = {Cognitive Science Conference (CogSci)}, year = {2015}, month = {07/2015}, address = {Pasadena, CA}, author = {Ilker Yildirim and Max Siegel and Joshua B. Tenenbaum} } @conference {1084, title = {Picture: An Imperative Probabilistic Programming Language for Scene Perception}, booktitle = {Computer Vision and Pattern Recognition}, year = {2015}, author = {Tejas Kulkarni and Pushmeet Kohli and Joshua B. Tenenbaum and Vikash Mansinghka} } @proceedings {924, title = {Responsibility judgments in voting scenarios}, year = {2015}, month = {07/22/2015}, pages = {788-793}, address = {Pasadena, CA}, issn = {978-0-9911967-2-2}, url = {https://mindmodeling.org/cogsci2015/papers/0143/index.html}, author = {Tobias Gerstenberg and Joseph Y Halpern and Joshua B. Tenenbaum} } @article {449, title = {Concepts in a Probabilistic Language of Thought.}, number = {010}, year = {2014}, month = {06/2014}, abstract = {

Knowledge organizes our understanding of the world, determining what we expect given what we have already seen. Our predictive representations have two key properties: they are productive, and they are graded. Productive generalization is possible because our knowledge decomposes into concepts{\textemdash}elements of knowledge that are combined and recombined to describe particular situations. Gradedness is the observable effect of accounting for uncertainty{\textemdash}our knowledge encodes degrees of belief that lead to graded probabilistic predictions. To put this a different way, concepts form a combinatorial system that enables description of many different situations; each such situation specifies a distribution over what we expect to see in the world, given what we have seen. We may think of this system as a probabilistic language of thought (PLoT) in which representations are built from language-like composition of concepts and the content of those representations is a probability distribution on world states. The purpose of this chapter is to formalize these ideas in computational terms, to illustrate key properties of the PLoT approach with a concrete example, and to draw connections with other views of conceptual structure.

Note: The book chapter is reprinted courtesy of The MIT Press, from the forthcoming edited collection {\textquotedblleft}The Conceptual Mind: New Directions in the Study of Concepts{\textquotedblright} edited by Eric Margolis and Stephen Laurence, print date Spring 2015.

}, keywords = {Development of Intelligence}, author = {Noah D. Goodman and Joshua B. Tenenbaum and Tobias Gerstenberg} } @article {1037, title = {Explaining Monkey Face Patch System as Efficient Analysis-by-Synthesis}, year = {2014}, author = {Ilker Yildirim and Tejas Kulkarni and W. A. Freiwald and Joshua B. Tenenbaum} } @article {459, title = {When Computer Vision Gazes at Cognition.}, number = {025}, year = {2014}, month = {12/2014}, abstract = {

Joint attention is a core, early-developing form of social interaction. It is based on our ability to discriminate the third party objects that other people are looking at. While it has been shown that people can accurately determine whether another person is looking directly at them versus away, little is known about the human ability to discriminate a third-person gaze directed towards objects that are further away, especially in unconstrained cases where the looker can move her head and eyes freely. In this paper we address this question by jointly exploring human psychophysics and a cognitively motivated computer vision model, which can detect the 3D direction of gaze from 2D face images. The synthesis of behavioral study and computer vision yields several interesting discoveries. (1) Human accuracy in discriminating targets 8{\deg}-10{\deg} of visual angle apart is around 40\% in a free looking gaze task; (2) the ability to interpret the gaze of different lookers varies dramatically; (3) this variance can be captured by the computational model; (4) humans outperform the current model significantly. These results collectively show that the acuity of human joint attention is indeed highly impressive, given the computational challenge of the natural looking task. Moreover, the gap between human and model performance, as well as the variability of gaze interpretation across different lookers, requires further understanding of the underlying mechanisms utilized by humans for this challenging task.

}, author = {Tao Gao and Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} }