@article {4448, title = {Efficient inverse graphics in biological face processing}, journal = {Science Advances}, volume = {6}, year = {2020}, month = {03/2020}, pages = {eaax5979}, abstract = {

Vision not only detects and recognizes objects, but performs rich inferences about the underlying scene structure that causes the patterns of light we see. Inverting generative models, or {\textquotedblleft}analysis-by-synthesis{\textquotedblright}, presents a possible solution, but its mechanistic implementations have typically been too slow for online perception, and their mapping to neural circuits remains unclear. Here we present a neurally plausible efficient inverse graphics model and test it in the domain of face recognition. The model is based on a deep neural network that learns to invert a three-dimensional face graphics program in a single fast feedforward pass. It explains human behavior qualitatively and quantitatively, including the classic {\textquotedblleft}hollow face{\textquotedblright} illusion, and it maps directly onto a specialized face-processing circuit in the primate brain. The model fits both behavioral and neural data better than state-of-the-art computer vision models, and suggests an interpretable reverse-engineering account of how the brain transforms images into percepts.
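A minimal sketch of the core idea (a toy stand-in, not the authors' model: a fixed random projection replaces the 3D face graphics program, and all dimensions are arbitrary) is to train a feedforward network on (latent, image) pairs sampled from the generative model itself:

import torch
import torch.nn as nn

# Toy stand-in for the graphics program: fixed nonlinear map from 8
# latent scene parameters to a 256-dimensional "image". A real model
# would call a 3D face renderer here.
g = torch.Generator().manual_seed(0)
W1 = torch.randn(8, 64, generator=g)
W2 = torch.randn(64, 256, generator=g)

def render(z):
    return torch.tanh(z @ W1) @ W2

# Feedforward recognition network: image -> latent scene parameters.
recognition = nn.Sequential(nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, 8))
opt = torch.optim.Adam(recognition.parameters(), lr=1e-3)

for step in range(2000):
    z = torch.randn(64, 8)       # sample scene latents from the prior
    img = render(z)              # synthesize the images they cause
    loss = ((recognition(img) - z) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

After training, a single forward pass through the network inverts the renderer, which is what makes this style of analysis-by-synthesis fast enough for online perception.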

}, doi = {10.1126/sciadv.aax5979}, url = {https://advances.sciencemag.org/lookup/doi/10.1126/sciadv.aax5979}, author = {Ilker Yildirim and Mario Belledonne and W. A. Freiwald and Joshua B. Tenenbaum} } @proceedings {4261, title = {Draping an Elephant: Uncovering Children{\textquoteright}s Reasoning About Cloth-Covered Objects}, year = {2019}, month = {07/2019}, address = {Montreal, Canada}, abstract = {

Humans have an intuitive understanding of physics. They can predict how a physical scene will unfold, and reason about how it came to be. Adults may rely on such a physical representation for visual reasoning and recognition, going beyond visual features and capturing objects in terms of their physical properties. Recently, draped objects were used to examine adult object representations in the absence of many common visual features. In this paper we examine young children{\textquoteright}s reasoning about draped objects in order to study the development of physical object representations. In addition, we argue that a better understanding of the development of the concept of cloth as a physical entity is worthwhile in and of itself, as it may form a basic ontological category in intuitive physical reasoning akin to liquids and solids. We use two experiments to investigate young children{\textquoteright}s (ages 3{\textendash}5) reasoning about cloth-covered objects, and find that they perform significantly above chance (though far from perfectly), indicating a representation of physical objects that can interact dynamically with the world. Children{\textquoteright}s pattern of success and failure is similar across the two experiments, and we compare it to adult behavior. We find a small effect, suggesting that the specific features that make reasoning about certain objects more difficult may carry into adulthood.

}, keywords = {analysis-by-synthesis, cloth, cognitive development, imagination, intuitive physics, object recognition, occlusion, perception, vision}, url = {https://mindmodeling.org/cogsci2019/papers/0506/index.html}, author = {Tomer D Ullman and Eliza Kosoy and Ilker Yildirim and Amir Arsalan Soltani and Max Siegel and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {4178, title = {An integrative computational architecture for object-driven cortex}, journal = {Current Opinion in Neurobiology}, volume = {55}, year = {2019}, month = {01/2019}, pages = {73 - 81}, abstract = {


Objects in motion activate multiple cortical regions in every lobe of the human brain. Do these regions represent a collection of independent systems, or is there an overarching functional architecture spanning all of object-driven cortex? Inspired by recent work in artificial intelligence (AI), machine learning, and cognitive science, we consider the hypothesis that these regions can be understood as a coherent network implementing an integrative computational system that unifies the functions needed to perceive, predict, reason about, and plan with physical objects{\textemdash}as in the paradigmatic case of using or making tools. Our proposal draws on a modeling framework that combines multiple AI methods, including causal generative models, hybrid symbolic-continuous planning algorithms, and neural recognition networks, with object-centric, physics-based representations. We review evidence relating specific components of our proposal to the specific regions that comprise object-driven cortex, and lay out future research directions with the goal of building a complete functional and mechanistic account of this system.

}, issn = {09594388}, doi = {10.1016/j.conb.2019.01.010}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438818301995}, author = {Ilker Yildirim and Jiajun Wu and Nancy Kanwisher and Joshua B. Tenenbaum} } @article {3540, title = {Efficient inverse graphics in biological face processing}, year = {2018}, month = {04/2018}, abstract = {

The visual system must not only recognize and localize objects, but perform much richer inferences about the underlying causes in the world that give rise to observed sense data. Analyzing scenes by inverting causal generative models, also known as {\textquotedblleft}analysis-by-synthesis{\textquotedblright}, has a long history in computational vision, and these models have some behavioral support, but they are typically too slow to support online perception and have no known mapping to actual neural circuits. Here we present a neurally plausible model for efficiently inverting generative models of images and test it as a precise account of one aspect of high-level vision, the perception of faces. The model is based on a deep neural network that learns to invert a three-dimensional (3D) face graphics program in a single fast feedforward pass. It successfully explains both human behavioral data and multiple levels of neural processing in non-human primates, as well as a classic illusion, the {\textquotedblleft}hollow face{\textquotedblright} effect. The model also fits the data qualitatively better than state-of-the-art computer vision models, and suggests an interpretable reverse-engineering account of how images are transformed into scene percepts in the primate ventral stream.

}, url = {https://www.biorxiv.org/content/early/2018/04/02/282798}, author = {Ilker Yildirim and W. A. Freiwald and Joshua B. Tenenbaum} } @conference {2822, title = {Causal and compositional generative models in online perception}, booktitle = {39th Annual Conference of the Cognitive Science Society}, year = {2017}, address = {London, UK}, abstract = {

From a quick glance or the touch of an object, our brains map sensory signals to scenes composed of rich and detailed shapes and surfaces. Unlike the standard pattern recognition approaches to perception, we argue that this mapping draws on internal causal and compositional models of the outside physical world, and that such internal models underlie the generalization capacity of human perception. Here, we present a generative model of visual and multisensory perception in which the latent variables encode intrinsic properties of objects such as their shapes and surfaces in addition to their extrinsic properties such as pose and occlusion. These latent variables can be composed in novel ways and are inputs to sensory-specific causal models that output sense-specific signals. We present a novel recognition network that performs efficient inference in the generative model, computing at a speed similar to online perception. We show that our model, but not an alternative baseline model or a lesion of our model, can account for human performance in an occluded face matching task and in a cross-modal visual-to-haptic face matching task.
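A minimal sketch of this latent structure (with toy fixed forward models standing in for the actual causal renderers, and all dimensions arbitrary): intrinsic latents are shared across the senses, while extrinsic latents enter only the modalities they affect.

import numpy as np

rng = np.random.default_rng(0)
VIS = rng.standard_normal((16, 100))      # toy visual forward model
HAP = rng.standard_normal((16, 40))       # toy haptic forward model

def render_visual(shape, pose):
    # Vision depends on intrinsic shape plus extrinsic pose.
    return np.tanh(shape + pose) @ VIS

def render_haptic(shape):
    # Touch here depends on intrinsic shape only.
    return np.tanh(shape) @ HAP

shape = rng.standard_normal(16)           # one face's intrinsic latents
image = render_visual(shape, pose=0.1 * rng.standard_normal(16))  # seen
touch = render_haptic(shape)              # felt

# Cross-modal matching: render candidate shapes haptically and compare
# with the felt signal; the true shape explains it best.
candidates = [shape] + [rng.standard_normal(16) for _ in range(4)]
errors = [np.sum((render_haptic(c) - touch) ** 2) for c in candidates]
print(int(np.argmin(errors)))             # 0: the shape seen is the shape felt

Because the intrinsic latents are shared across the sense-specific causal models, one representation supports both the occluded-face and the visual-to-haptic matching tasks; a recognition network would map each signal back into this shared latent space.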

}, author = {Ilker Yildirim and Michael Janner and Mario Belledonne and Christian Wallraven and W. A. Freiwald and Joshua B. Tenenbaum} } @proceedings {3539, title = {Causal and compositional generative models in online perception}, year = {2017}, month = {07/2017}, address = {London, UK}, abstract = {

From a quick glance or the touch of an object, our brains map sensory signals to scenes composed of rich and detailed shapes and surfaces. Unlike the standard approaches to perception, we argue that this mapping draws on internal causal and compositional models of the physical world, and that these internal models underlie the generalization capacity of human perception. Here, we present a generative model of visual and multisensory perception in which the latent variables encode intrinsic (e.g., shape) and extrinsic (e.g., occlusion) object properties. Latent variables are inputs to causal models that output sense-specific signals. We present a recognition network that performs efficient inference in the generative model, computing at a speed similar to online perception. We show that our model, but not alternatives, can account for human performance in an occluded face matching task and in a visual-to-haptic face matching task.

}, url = {https://mindmodeling.org/cogsci2017/papers/0266/index.html}, author = {Ilker Yildirim and Michael Janner and Mario Belledonne and Christian Wallraven and W. A. Freiwald and Joshua B. Tenenbaum} } @proceedings {2537, title = {Physical problem solving: Joint planning with symbolic, geometric, and dynamic constraints}, year = {2017}, month = {07/2017}, abstract = {

In this paper, we present a new task that investigates how people interact with and make judgments about towers of blocks. In Experiment 1, participants in the lab solved a series of problems in which they had to re-configure three blocks from an initial to a final configuration. We recorded whether they used one hand or two hands to do so. In Experiment 2, we asked participants online to judge whether they think the person in the lab used one or two hands. The results revealed a close correspondence between participants{\textquoteright} actions in the lab and the mental simulations of participants online. To explain participants{\textquoteright} actions and mental simulations, we develop a model that plans over a symbolic representation of the situation, executes the plan using a geometric solver, and checks the plan{\textquoteright}s feasibility by taking into account the physical constraints of the scene. Our model explains participants{\textquoteright} actions and judgments to a high degree of quantitative accuracy.
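A toy sketch of this plan-then-check loop (the geometric solver and physics engine are collapsed here into a hand-written support relation, so this illustrates only the control flow, not the authors' logic-geometric programming implementation):

from itertools import permutations

# Goal: stack C on B on A. A one-handed move is feasible only if the
# block's support is already in place; plans violating this are
# rejected, which is where a two-handed coordinated move would be needed.
SUPPORTS = {"A": None, "B": "A", "C": "B"}   # block -> what it rests on

def feasible(plan):
    placed = set()
    for block in plan:
        support = SUPPORTS[block]
        if support is not None and support not in placed:
            return False                      # support missing: reject plan
        placed.add(block)
    return True

print([p for p in permutations("ABC") if feasible(p)])
# [('A', 'B', 'C')] -- only bottom-up construction passes the check

The real model searches symbolic plans the same way, but scores each candidate with a geometric solver and physical simulation rather than a fixed lookup table.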

}, keywords = {intuitive physics, logic-geometric programming, planning, problem solving, scene understanding}, author = {Ilker Yildirim and Tobias Gerstenberg and Basil Saeed and Marc Toussaint and Joshua B. Tenenbaum} } @conference {3596, title = {Self-supervised intrinsic image decomposition}, booktitle = {Annual Conference on Neural Information Processing Systems (NIPS)}, year = {2017}, month = {12/2017}, address = {Long Beach, CA}, url = {https://papers.nips.cc/paper/7175-self-supervised-intrinsic-image-decomposition}, author = {Michael Janner and Jiajun Wu and Tejas Kulkarni and Ilker Yildirim and Joshua B. Tenenbaum} } @conference {2600, title = {Integrating Identification and Perception: A case study of familiar and unfamiliar face processing}, booktitle = {Proceedings of the Thirty-Eighth Annual Conference of the Cognitive Science Society}, year = {2016}, month = {2016}, author = {Kelsey Allen and Ilker Yildirim and Joshua B. Tenenbaum} } @conference {1045, title = {Efficient and robust analysis-by-synthesis in vision: A computational framework, behavioral tests, and modeling neuronal representations}, booktitle = {Annual Conference of the Cognitive Science Society}, year = {2015}, author = {Ilker Yildirim and Tejas Kulkarni and W. A. Freiwald and Joshua B. Tenenbaum} } @conference {1825, title = {Galileo: Perceiving physical object properties by integrating a physics engine with deep learning.}, booktitle = {NIPS 2015}, year = {2015}, address = {Montr{\'e}al, Canada}, abstract = {
Humans demonstrate remarkable abilities to predict physical events in dynamic scenes, and to infer the physical properties of objects from static images. We propose a generative model for solving these problems of physical scene understanding from real-world videos and images. At the core of our generative model is a 3D physics engine, operating on an object-based representation of physical properties, including mass, position, 3D shape, and friction. We can infer these latent properties using relatively brief runs of MCMC, which drive simulations in the physics engine to fit key features of visual observations. We further explore directly mapping visual inputs to physical properties, inverting a part of the generative process using deep learning. We name our model Galileo, and evaluate it on a video dataset with simple yet physically rich scenarios. Results show that Galileo is able to infer the physical properties of objects and predict the outcome of a variety of physical events, with an accuracy comparable to human subjects. Our study points towards an account of human vision with generative physical knowledge at its core, and various recognition models as helpers leading to efficient inference.
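A hedged sketch of this inference scheme (with a one-dimensional constant-force "physics engine" and made-up noise levels standing in for the real simulator and video features):

import numpy as np

rng = np.random.default_rng(1)

def simulate(mass, force=10.0, dt=0.1, steps=20):
    # Toy physics engine: positions of a block under constant force.
    t = np.arange(1, steps + 1) * dt
    return 0.5 * (force / mass) * t ** 2

observed = simulate(mass=2.0) + rng.normal(0.0, 0.01, 20)  # noisy observation

def log_lik(mass):
    return -np.sum((simulate(mass) - observed) ** 2) / (2 * 0.01 ** 2)

mass, samples = 1.0, []                  # a recognition network could
for _ in range(2000):                    # supply this initial guess
    prop = mass + rng.normal(0.0, 0.1)   # random-walk MCMC proposal
    if prop > 0 and np.log(rng.random()) < log_lik(prop) - log_lik(mass):
        mass = prop                      # accept the proposal
    samples.append(mass)

print(np.mean(samples[500:]))            # close to the true mass, 2.0

Each proposal drives the simulator forward and is scored against the observed trajectory, which is the sense in which brief MCMC runs drive simulations in the physics engine to fit the visual observations.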
}, url = {https://papers.nips.cc/paper/5780-galileo-perceiving-physical-object-properties-by-integrating-a-physics-engine-with-deep-learning}, author = {Jiajun Wu and Ilker Yildirim and Joseph J. Lim and William T. Freeman and Joshua B. Tenenbaum} } @conference {1803, title = {Perceiving Fully Occluded Objects with Physical Simulation}, booktitle = {Cognitive Science Conference (CogSci)}, year = {2015}, month = {07/2015}, address = {Pasadena, CA}, author = {Ilker Yildirim and Max Siegel and Joshua B. Tenenbaum} } @article {1037, title = {Explaining Monkey Face Patch System as Efficient Analysis-by-Synthesis}, year = {2014}, author = {Ilker Yildirim and Tejas Kulkarni and W. A. Freiwald and Joshua B. Tenenbaum} }