@article {4196, title = {A model for discovering {\textquoteleft}containment{\textquoteright} relations}, journal = {Cognition}, volume = {183}, year = {2019}, month = {02/2019}, pages = {67{\textendash}81}, abstract = {

Rapid developments in the fields of learning and object recognition have been obtained by successfully developing and using methods for learning from a large number of labeled image examples. However, such current methods cannot explain infants{\textquoteright} learning of new concepts based on their visual experience, in particular the ability to learn complex concepts without external guidance, as well as the natural order in which related concepts are acquired. A remarkable example of early visual learning is the category of {\textquoteleft}containers{\textquoteright} and the notion of {\textquoteleft}containment{\textquoteright}. Surprisingly, this is one of the earliest spatial relations to be learned, starting already around 3 months of age and preceding other common relations (e.g., {\textquoteleft}support{\textquoteright}, {\textquoteleft}in-between{\textquoteright}). In this work we present a model that explains infants{\textquoteright} capacity to learn {\textquoteleft}containment{\textquoteright} and related concepts by {\textquoteleft}just looking{\textquoteright}, together with their empirical developmental trajectory. Learning in the model is fast and requires no external guidance, relying only on perceptual processes that are present in the first months of life. Instead of labeled training examples, the system provides its own internal supervision to guide the learning process. We show how the detection of so-called {\textquoteleft}paradoxical occlusion{\textquoteright} provides natural internal supervision, which guides the system to gradually acquire a range of useful containment-related concepts. Similar mechanisms of implicit internal supervision can have broad application in other cognitive domains as well as in artificial intelligence systems, because they alleviate the need for supplying extensive external supervision, and because they can guide the learning process to extract concepts that are meaningful to the observer, even if they are not by themselves obvious or salient in the input.

}, keywords = {Computational model; Containment relation; Developmental trajectory; Infants{\textquoteright} perceptual learning; Spatial relations learning; Unsupervised learning}, issn = {00100277}, doi = {10.1016/j.cognition.2018.11.001}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027718302877}, author = {Shimon Ullman and Dorfman, Nimrod and Harari, Daniel} } @article {3422, title = {Discovery and usage of joint attention in images}, journal = {arXiv.org}, year = {2018}, month = {04/2018}, abstract = {

Joint visual attention is characterized by two or more individuals looking at a common target at the same time. The ability to identify joint attention in scenes, the people involved, and their common target is fundamental to understanding social interactions, including others{\textquoteright} intentions and goals. In this work we deal with the extraction of joint attention events, and with the use of such events for image descriptions. The work makes two novel contributions. First, our extraction algorithm is the first to identify joint visual attention in single static images. It computes 3D gaze direction, identifies each person{\textquoteright}s gaze target by combining the gaze direction with a 3D depth map computed for the image, and then identifies the common gaze target. Second, we use a human study to demonstrate the sensitivity of humans to joint attention, suggesting that the detection of such a configuration in an image can be useful for understanding the image, including the goals of the agents and their joint activity, and can therefore contribute to image captioning and related tasks.

}, keywords = {compositional approach, computational study, Gaze perception, human study, joint attention}, url = {https://arxiv.org/abs/1804.04604}, author = {Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} } @article {1760, title = {Atoms of recognition in human and computer vision}, journal = {PNAS }, volume = {113}, year = {2016}, month = {03/2016}, pages = {2744{\textendash}2749}, abstract = {
Discovering the visual features and representations used by the brain to recognize objects is a central problem in the study of vision. Recently, neural network models of visual object recognition, including biological and deep network models, have shown remarkable progress and have begun to rival human performance in some challenging tasks. These models are trained on image examples and learn to extract features and representations and to use them for categorization. It remains unclear, however, whether the representations and learning processes discovered by current models are similar to those used by the human visual system. Here we show, by introducing and using minimal recognizable images, that the human visual system uses features and processes that are not used by current models and that are critical for recognition. We found by psychophysical studies that at the level of minimal recognizable images a minute change in the image can have a drastic effect on recognition, thus identifying features that are critical for the task. Simulations then showed that current models cannot explain this sensitivity to precise feature configurations and, more generally, do not learn to recognize minimal images at a human level. The role of the features shown here is revealed uniquely at the minimal level, where the contribution of each feature is essential. A full understanding of the learning and use of such features will extend our understanding of visual recognition and its cortical mechanisms and will enhance the capacity of computational models to learn from visual experience and to deal with recognition and detailed image interpretation.
}, keywords = {Computer vision, minimal images, object recognition, visual perception, visual representations}, issn = {1091-6490}, doi = {10.1073/pnas.1513198113}, url = {http://www.pnas.org/content/113/10/2744.abstract}, author = {Shimon Ullman and Liav Assif and Eitan Fetaya and Daniel Harari} } @article {2133, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, year = {2016}, month = {09/2016}, abstract = {

Understanding language goes hand in hand with the ability to integrate complex contextual information obtained via perception. In this work, we present a novel task for grounded language understanding: disambiguating a sentence given a visual scene that depicts one of the possible interpretations of that sentence. To this end, we introduce a new multimodal corpus containing ambiguous sentences, representing a wide range of syntactic, semantic and discourse ambiguities, coupled with videos that visualize the different interpretations of each sentence. We address this task by extending a vision model that determines whether a sentence is depicted by a video. We demonstrate how such a model can be adjusted to recognize different interpretations of the same underlying sentence, allowing sentences to be disambiguated in a unified fashion across the different ambiguity types.

}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {1885, title = {Language and Vision Ambiguities (LAVA) Corpus}, year = {2016}, month = {01/2016}, abstract = {

Ambiguity is one of the defining characteristics of human languages, and language understanding crucially relies on the ability to obtain unambiguous representations of linguistic content. While some ambiguities can be resolved using intra-linguistic contextual cues, the disambiguation of many linguistic constructions requires integration of world knowledge and perceptual information obtained from other modalities. In this work, we focus on the problem of grounding language in the visual modality, and introduce a novel task for visual and linguistic understanding which requires resolving linguistic ambiguities by utilizing the visual context of the utterance.

To address this challenge, we release the Language and Vision Ambiguities (LAVA) corpus. LAVA contains ambiguous sentences coupled with visual scenes that depict the different interpretations of each sentence. The sentences in the corpus are annotated with syntactic and semantic parses, and cover a wide range of linguistic ambiguities, including PP and VP attachment, conjunctions, logical forms, anaphora and ellipsis. In addition to the sentence disambiguation challenge, the corpus will support a variety of related tasks which use natural language as a medium for expressing visual understanding.

Reference:
Yevgeni Berzak, Andrei Barbu, Daniel Harari, Boris Katz, and Shimon Ullman (2015). Do You See What I Mean? Visual Resolution of Linguistic Ambiguities. Conference on Empirical Methods in Natural Language Processing (EMNLP), Lisbon, Portugal.


}, url = {http://web.mit.edu/lavacorpus/}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {2326, title = {Measuring and modeling the perception of natural and unconstrained gaze in humans and machines}, year = {2016}, month = {11/2016}, abstract = {

Humans are remarkably adept at interpreting the gaze direction of other individuals in their surroundings. This skill is at the core of the ability to engage in joint visual attention, which is essential for establishing social interactions. How accurate are humans in determining the gaze direction of others in lifelike scenes, when the others can move their heads and eyes freely, and what are the sources of information for the underlying perceptual processes? These questions pose a challenge from both empirical and computational perspectives, due to the complexity of the visual input in real-life situations. Here we measure empirically human accuracy in perceiving the gaze direction of others in lifelike scenes, and study computationally the sources of information and representations underlying this cognitive capacity. We show that humans perform better in face-to-face conditions compared with recorded conditions, and that this advantage is not due to the availability of input dynamics. We further show that humans still perform well when only the eye region is visible, rather than the whole face. We develop a computational model that replicates the pattern of human performance, including the finding that the eye region on its own contains the information required for estimating both head orientation and direction of gaze. Consistent with neurophysiological findings on task-specific face regions in the brain, the learned computational representations reproduce perceptual effects such as the Wollaston illusion when trained to estimate direction of gaze, but not when trained to recognize objects or faces.

}, keywords = {computational evaluation, computational modeling, Computer vision, empirical evaluation, estimation of gaze direction, Gaze perception, joint attention, Machine Learning}, author = {Daniel Harari and Tao Gao and Nancy Kanwisher and Joshua B. Tenenbaum and Shimon Ullman} } @conference {1429, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, booktitle = {Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal. }, year = {2015}, month = {09/2015}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @proceedings {1762, title = {A model for full local image interpretation}, year = {2015}, abstract = {

We describe a computational model of humans{\textquoteright} ability to provide a detailed interpretation of a scene{\textquoteright}s components. Humans can identify meaningful components almost everywhere in an image, and identifying these components is an essential part of the visual process and of understanding the surrounding scene and its potential meaning to the viewer. Detailed interpretation is beyond the scope of current models of visual recognition. Our model suggests that this is a fundamental limitation, related to the fact that existing models rely on feed-forward processing with only limited top-down processing. In our model, a first recognition stage leads to the initial activation of class candidates, which is incomplete and of limited accuracy. This stage then triggers the application of class-specific interpretation and validation processes, which recover a richer and more accurate interpretation of the visible scene. We discuss implications of the model for visual interpretation by humans and by computer vision models.

}, author = {Guy Ben-Yosef and Liav Assif and Daniel Harari and Shimon Ullman} } @article {459, title = {When Computer Vision Gazes at Cognition.}, number = {025}, year = {2014}, month = {12/2014}, abstract = {

Joint attention is a core, early-developing form of social interaction. It is based on our ability to discriminate the third-party objects that other people are looking at. While it has been shown that people can accurately determine whether another person is looking directly at them versus away, little is known about the human ability to discriminate a third person{\textquoteright}s gaze directed towards objects that are further away, especially in unconstrained cases where the looker can move her head and eyes freely. In this paper we address this question by jointly exploring human psychophysics and a cognitively motivated computer vision model, which can detect the 3D direction of gaze from 2D face images. The synthesis of behavioral study and computer vision yields several interesting discoveries. (1) Human accuracy in discriminating targets 8{\deg}{\textendash}10{\deg} of visual angle apart is around 40\% in a free-looking gaze task; (2) the ability to interpret the gaze of different lookers varies dramatically; (3) this variance can be captured by the computational model; (4) humans significantly outperform the current model. These results collectively show that the acuity of human joint attention is indeed highly impressive, given the computational challenge of the natural looking task. Moreover, the gap between human and model performance, as well as the variability of gaze interpretation across different lookers, requires further understanding of the underlying mechanisms utilized by humans for this challenging task.

}, author = {Tao Gao and Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} }