@article {5009, title = {Image interpretation by iterative bottom-up top-down processing}, number = {120}, year = {2021}, month = {11/2021}, abstract = {

Scene understanding requires the extraction and representation of scene components, such as objects and their parts, people, and places, together with their individual properties, as well as relations and interactions between them. We describe a model in which meaningful scene structures are extracted from the image by an iterative process, combining bottom-up (BU) and top-down (TD) networks, interacting through a symmetric bi-directional communication between them ({\textquoteleft}counter-streams{\textquoteright} structure). The BU-TD model extracts and recognizes scene constituents with their selected properties and relations, and uses them to describe and understand the image.

The scene representation is constructed by the iterative use of three components. The first model component is a bottom-up stream that extracts selected scene elements, properties and relations. The second component ({\textquoteleft}cognitive augmentation{\textquoteright}) augments the extracted visual representation based on relevant non-visual stored representations. It also provides input to the third component, the top-down stream, in the form of a TD instruction, instructing the model what task to perform next. The top-down stream then guides the BU visual stream to perform the selected task in the next cycle. During this process, the visual representations extracted from the image can be combined with relevant non-visual representations, so that the final scene representation is based on both visual information extracted from the scene and relevant stored knowledge of the world.
We show how the BU-TD model composes complex visual tasks from sequences of steps, invoked by individual TD instructions. In particular, we describe how a sequence of TD-instructions is used to extract structures of interest from the scene, including an algorithm to automatically select the next TD-instruction in the sequence. The selection of the TD-instruction depends in general on the goal, the image, and on information already extracted from the image in previous steps. The TD-instruction sequence is therefore not a fixed sequence determined at the start, but an evolving program (or {\textquoteleft}visual routine{\textquoteright}) that depends on the goal and the image.

The extraction process is shown to have favourable properties in terms of combinatorial generalization, generalizing well to novel scene structures and new combinations of objects, properties and relations not seen during training. Finally, we compare the model with relevant aspects of human vision, and suggest directions for using the BU-TD scheme for integrating visual and cognitive components in the process of scene understanding.

}, author = {Shimon Ullman and Liav Assif and Alona Strugatski and Ben-Zion Vatashsky and Hila Levi and Aviv Netanyahu and Adam Uri Yaari} } @article {4458, title = {Minimal videos: Trade-off between spatial and temporal information in human and machine vision.}, journal = {Cognition}, year = {2020}, month = {08/2020}, abstract = {

Objects and their parts can be visually recognized from purely spatial or purely temporal information but the mechanisms integrating space and time are poorly understood. Here we show that visual recognition of objects and actions can be achieved by efficiently combining spatial and motion cues in configurations where each source on its own is insufficient for recognition. This analysis is obtained by identifying minimal videos: these are short and tiny video clips in which objects, parts, and actions can be reliably recognized, but any reduction in either space or time makes them unrecognizable. Human recognition in minimal videos is invariably accompanied by full interpretation of the internal components of the video. State-of-the-art deep convolutional networks for dynamic recognition cannot replicate human behavior in these configurations. The gap between human and machine vision demonstrated here is due to critical mechanisms for full spatiotemporal interpretation that are lacking in current computational models.

}, keywords = {Comparing deep neural networks and humans, Integration of spatial and temporal visual information, minimal images, Minimal videos, Visual dynamic recognition}, doi = {10.1016/j.cognition.2020.104263}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0010027720300822}, author = {Guy Ben-Yosef and Gabriel Kreiman and Shimon Ullman} } @conference {4558, title = {What can human minimal videos tell us about dynamic recognition models?}, booktitle = {International Conference on Learning Representations (ICLR 2020)}, year = {2020}, month = {04/2020}, address = {Virtual Conference}, abstract = {

In human vision objects and their parts can be visually recognized from purely spatial or purely temporal information but the mechanisms integrating space and time are poorly understood. Here we show that human visual recognition of objects and actions can be achieved by efficiently combining spatial and motion cues in configurations where each source on its own is insufficient for recognition. This analysis is obtained by identifying minimal videos: these are short and tiny video clips in which objects, parts, and actions can be reliably recognized, but any reduction in either space or time makes them unrecognizable. State-of-the-art deep networks for dynamic visual recognition cannot replicate human behavior in these configurations. This gap between humans and machines points to critical mechanisms in human dynamic vision that are lacking in current models.

Published as a workshop paper at {\textquotedblleft}Bridging AI and Cognitive Science{\textquotedblright} (ICLR 2020)

}, url = {https://baicsworkshop.github.io/pdf/BAICS_1.pdf}, author = {Guy Ben-Yosef and Gabriel Kreiman and Shimon Ullman} } @article {4196, title = {A model for discovering {\textquoteleft}containment{\textquoteright} relations}, journal = {Cognition}, volume = {183}, year = {2019}, month = {02/2019}, pages = {67 - 81}, abstract = {

Rapid developments in the fields of learning and object recognition have been obtained by successfully developing and using methods for learning from a large number of labeled image examples. However, such current methods cannot explain infants{\textquoteright} learning of new concepts based on their visual experience, in particular, the ability to learn complex concepts without external guidance, as well as the natural order in which related concepts are acquired. A remarkable example of early visual learning is the category of {\textquoteleft}containers{\textquoteright} and the notion of {\textquoteleft}containment{\textquoteright}. Surprisingly, this is one of the earliest spatial relations to be learned, starting already around 3 months of age, and preceding other common relations (e.g., {\textquoteleft}support{\textquoteright}, {\textquoteleft}in-between{\textquoteright}). In this work we present a model which explains infants{\textquoteright} capacity to learn {\textquoteleft}containment{\textquoteright} and related concepts by {\textquoteleft}just looking{\textquoteright}, together with their empirical developmental trajectory. In the model, learning occurs fast and without external guidance, relying only on perceptual processes that are present in the first months of life. Instead of labeled training examples, the system provides its own internal supervision to guide the learning process. We show how the detection of so-called {\textquoteleft}paradoxical occlusion{\textquoteright} provides natural internal supervision, which guides the system to gradually acquire a range of useful containment-related concepts. Similar mechanisms of using implicit internal supervision can have broad application in other cognitive domains as well as in artificial intelligence systems, because they alleviate the need for supplying extensive external supervision, and because they can guide the learning process to extract concepts that are meaningful to the observer, even if they are not by themselves obvious or salient in the input.

}, keywords = {Computational model; Containment relation; Developmental trajectory; Infants{\textquoteright} perceptual learning; Spatial relations learning; Unsupervised learning}, issn = {00100277}, doi = {10.1016/j.cognition.2018.11.001}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027718302877}, author = {Shimon Ullman and Dorfman, Nimrod and Harari, Daniel} } @article {4087, title = {Using neuroscience to develop artificial intelligence}, journal = {Science}, volume = {363}, year = {2019}, month = {02/2019}, pages = {692 - 693}, chapter = {692}, abstract = {

When the mathematician Alan Turing posed the question {\textquotedblleft}Can machines think?{\textquotedblright} in the first line of his seminal 1950 paper that ushered in the quest for artificial intelligence (AI) (1), the only known systems carrying out complex computations were biological nervous systems. It is not surprising, therefore, that scientists in the nascent field of AI turned to brain circuits as a source for guidance. One path that was taken since the early attempts to perform intelligent computation by brain-like circuits (2), and which led recently to remarkable successes, can be described as a highly reductionist approach to model cortical circuitry. In its basic current form, known as a {\textquotedblleft}deep network{\textquotedblright} (or deep net) architecture, this brain-inspired model is built from successive layers of neuron-like elements, connected by adjustable weights, called {\textquotedblleft}synapses{\textquotedblright} after their biological counterparts (3). The application of deep nets and related methods to AI systems has been transformative. They proved superior to previously known methods in central areas of AI research, including computer vision, speech recognition and production, and playing complex games. Practical applications are already in broad use, in areas such as computer vision and speech and text translation, and large-scale efforts are under way in many other areas. Here, I discuss how additional aspects of brain circuitry could supply cues for guiding network models toward broader aspects of cognition and general AI.

}, issn = {0036-8075}, doi = {10.1126/science.aau6595}, url = {http://www.sciencemag.org/lookup/doi/10.1126/science.aau6595}, author = {Shimon Ullman} } @article {3422, title = {Discovery and usage of joint attention in images}, journal = {arXiv.org}, year = {2018}, month = {04/2018}, abstract = {

Joint visual attention is characterized by two or more individuals looking at a common target at the same time. The ability to identify joint attention in scenes, the people involved, and their common target, is fundamental to the understanding of social interactions, including others{\textquoteright} intentions and goals. In this work we deal with the extraction of joint attention events, and the use of such events for image descriptions. The work makes two novel contributions. First, our extraction algorithm is the first which identifies joint visual attention in single static images. It computes 3D gaze direction, identifies the gaze target by combining gaze direction with a 3D depth map computed for the image, and identifies the common gaze target. Second, we use a human study to demonstrate the sensitivity of humans to joint attention, suggesting that the detection of such a configuration in an image can be useful for understanding the image, including the goals of the agents and their joint activity, and therefore can contribute to image captioning and related tasks.

}, keywords = {compositional approach, computational study, Gaze perception, human study, joint attention}, url = {https://arxiv.org/abs/1804.04604}, author = {Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} } @article {2548, title = {Full interpretation of minimal images.}, journal = {Cognition}, volume = {171}, year = {2018}, month = {02/2018}, pages = {65-84}, chapter = {65}, abstract = {

The goal in this work is to model the process of {\textquoteleft}full interpretation{\textquoteright} of object images, which is the ability to identify and localize all semantic features and parts that are recognized by human observers. The task is approached by dividing the interpretation of the complete object into the interpretation of multiple reduced but interpretable local regions. In such reduced regions, interpretation is simpler, since the number of semantic components is small, and the variability of possible configurations is low.

We model the interpretation process by identifying primitive components and relations that play a useful role in local interpretation by humans. To identify useful components and relations used in the interpretation process, we consider the interpretation of {\textquoteleft}minimal configurations{\textquoteright}: these are reduced local regions, which are minimal in the sense that further reduction renders them unrecognizable and uninterpretable. We show that such minimal interpretable images have useful properties, which we use to identify informative features and relations used for full interpretation. We describe our interpretation model, and show results of detailed interpretations of minimal configurations, produced automatically by the model. Finally, we discuss implications of full interpretation to difficult visual tasks, such as recognizing human activities or interactions, which are beyond the scope of current models of visual recognition.

}, keywords = {Image interpretation, Minimal images, Parts and relations, Top-down processing}, doi = {10.1016/j.cognition.2017.10.006}, author = {Guy Ben-Yosef and Liav Assif and Shimon Ullman} } @article {4107, title = {Full interpretation of minimal images}, journal = {Cognition}, volume = {171}, year = {2018}, month = {01/2018}, pages = {65 - 84}, abstract = {

The goal in this work is to model the process of {\textquoteleft}full interpretation{\textquoteright} of object images, which is the ability to identify and localize all semantic features and parts that are recognized by human observers. The task is approached by dividing the interpretation of the complete object into the interpretation of multiple reduced but interpretable local regions. In such reduced regions, interpretation is simpler, since the number of semantic components is small, and the variability of possible configurations is low.

We model the interpretation process by identifying primitive components and relations that play a useful role in local interpretation by humans. To identify useful components and relations used in the interpretation process, we consider the interpretation of {\textquoteleft}minimal configurations{\textquoteright}: these are reduced local regions, which are minimal in the sense that further reduction renders them unrecognizable and uninterpretable. We show that such minimal interpretable images have useful properties, which we use to identify informative features and relations used for full interpretation. We describe our interpretation model, and show results of detailed interpretations of minimal configurations, produced automatically by the model. Finally, we discuss possible extensions and implications of full interpretation to difficult visual tasks, such as recognizing social interactions, which are beyond the scope of current models of visual recognition.

}, keywords = {Image interpretation, Minimal images, Parts and relations, Top-down processing}, issn = {00100277}, doi = {10.1016/j.cognition.2017.10.006}, url = {https://linkinghub.elsevier.com/retrieve/pii/S001002771730269X}, author = {Guy Ben-Yosef and Liav Assif and Shimon Ullman} } @article {4106, title = {Image interpretation above and below the object level}, journal = {Interface Focus}, volume = {8}, year = {2018}, month = {06/2018}, pages = {20180020}, abstract = {

Computational models of vision have advanced in recent years at a rapid rate, rivalling in some areas human-level performance. Much of the progress to date has focused on analysing the visual scene at the object level{\textemdash}the recognition and localization of objects in the scene. Human understanding of images reaches a richer and deeper image understanding both {\textquoteleft}below{\textquoteright} the object level, such as identifying and localizing object parts and sub-parts, as well as {\textquoteleft}above{\textquoteright} the object level, such as identifying object relations, and agents with their actions and interactions. In both cases, understanding depends on recovering meaningful structures in the image, and their components, properties and inter-relations, a process referred to here as {\textquoteleft}image interpretation{\textquoteright}. In this paper, we describe recent directions, based on human and computer vision studies, towards human-like image interpretation, beyond the reach of current schemes, both below the object level, as well as some aspects of image interpretation at the level of meaningful configurations beyond the recognition of individual objects, and in particular, interactions between two people in close contact. In both cases the recognition process depends on the detailed interpretation of so-called {\textquoteleft}minimal images{\textquoteright}, and at both levels recognition depends on combining {\textquoteleft}bottom-up{\textquoteright} processing, proceeding from low to higher levels of a processing hierarchy, together with {\textquoteleft}top-down{\textquoteright} processing, proceeding from high to lower levels of visual analysis.

}, issn = {2042-8898}, doi = {10.1098/rsfs.2018.0020}, url = {https://royalsocietypublishing.org/doi/full/10.1098/rsfs.2018.0020$\#$d3e1503}, author = {Guy Ben-Yosef and Shimon Ullman} } @article {3627, title = {Image interpretation above and below the object level}, year = {2018}, month = {05/2018}, abstract = {

Computational models of vision have advanced in recent years at a rapid rate, rivaling in some areas human-level performance. Much of the progress to date has focused on analyzing the visual scene at the object level {\textendash} the recognition and localization of objects in the scene. Human understanding of images reaches a richer and deeper image understanding both {\textquoteleft}below{\textquoteright} the object level, such as identifying and localizing object parts and sub-parts, as well as {\textquoteleft}above{\textquoteright} the object level, such as identifying object relations, and agents with their actions and interactions. In both cases, understanding depends on recovering meaningful structures in the image, their components, properties, and inter-relations, a process referred to here as {\textquoteleft}image interpretation{\textquoteright}.

In this paper we describe recent directions, based on human and computer vision studies, towards human-like image interpretation, beyond the reach of current schemes, both below the object level, as well as some aspects of image interpretation at the level of meaningful configurations beyond the recognition of individual objects, in particular, interactions between two people in close contact. In both cases the recognition process depends on the detailed interpretation of so-called {\textquoteleft}minimal images{\textquoteright}, and at both levels recognition depends on combining {\textquoteleft}bottom-up{\textquoteright} processing, proceeding from low to higher levels of a processing hierarchy, together with {\textquoteleft}top-down{\textquoteright} processing, proceeding from high to lower levels of visual analysis.

}, keywords = {Interaction Recognition, minimal images, Social Interactions, Visual interpretation, visual recognition}, author = {Guy Ben-Yosef and Shimon Ullman} } @article {3620, title = {Image interpretation above and below the object level}, journal = {Proceedings of the Royal Society: Interface Focus}, year = {2018}, month = {06/2018}, abstract = {

Computational models of vision have advanced in recent years at a rapid rate, rivaling in some areas human-level performance. Much of the progress to date has focused on analyzing the visual scene at the object level {\textendash} the recognition and localization of objects in the scene. Human understanding of images reaches a richer and deeper image understanding both {\textquoteleft}below{\textquoteright} the object level, such as identifying and localizing object parts and sub-parts, as well as {\textquoteleft}above{\textquoteright} the object level, such as identifying object relations, and agents with their actions and interactions. In both cases, understanding depends on recovering meaningful structures in the image, their components, properties, and inter-relations, a process referred to here as {\textquoteleft}image interpretation{\textquoteright}.

In this paper we describe recent directions, based on human and computer vision studies, towards human-like image interpretation, beyond the reach of current schemes, both below the object level, as well as some aspects of image interpretation at the level of meaningful configurations beyond the recognition of individual objects, in particular, interactions between two people in close contact. In both cases the recognition process depends on the detailed interpretation of so-called {\textquoteleft}minimal images{\textquoteright}, and at both levels recognition depends on combining {\textquoteleft}bottom-up{\textquoteright} processing, proceeding from low to higher levels of a processing hierarchy, together with {\textquoteleft}top-down{\textquoteright} processing, proceeding from high to lower levels of visual analysis.

}, author = {Guy Ben-Yosef and Shimon Ullman} } @article {4195, title = {Searching for visual features that explain response variance of face neurons in inferior temporal cortex}, journal = {PLOS ONE}, volume = {13}, year = {2018}, month = {09/2018}, pages = {e0201192}, abstract = {

Despite a large body of research on response properties of neurons in the inferior temporal (IT) cortex, studies to date have not yet produced quantitative feature descriptions that can predict responses to arbitrary objects. This deficit in the research prevents a thorough understanding of object representation in the IT cortex. Here we propose a fragment-based approach for finding quantitative feature descriptions of face neurons in the IT cortex. The development of the proposed method was driven by the assumption that it is possible to recover features from a set of natural image fragments if the set is sufficiently large. To find the feature from the set, we compared object responses predicted from each fragment and responses of neurons to these objects, and searched for the fragment that revealed the highest correlation with neural object responses. Prediction of object responses for each fragment was made by normalizing the Euclidean distance between the fragment and each object to the range 0 to 1, such that a smaller distance gives a higher value. The distance was calculated in the space where images were transformed to a local orientation space by a Gabor filter and a local max operation. The method allowed us to find features with a correlation coefficient between predicted and neural responses of 0.68 on average (number of object stimuli, 104) from among 560,000 feature candidates, reliably explaining differential responses among faces as well as a general preference for faces over non-face objects. Furthermore, predicted responses of the resulting features to novel object images were significantly correlated with neural responses to these images. Identification of features comprising specific, moderately complex combinations of local orientations and colors enabled us to predict responses to upright and inverted faces, which provided a possible mechanism of face inversion effects.

}, doi = {10.1371/journal.pone.0201192}, url = {http://dx.plos.org/10.1371/journal.pone.0201192}, author = {Owaki, Takashi and Vidal-Naquet, Michel and Nam, Yunjun and Uchida, Go and Sato, Takayuki and C{\^a}teau, Hideyuki and Shimon Ullman and Tanifuji, Manabu}, editor = {Nishijo, Hisao} } @article {3904, title = {Spatiotemporal interpretation features in the recognition of dynamic images}, year = {2018}, month = {11/2018}, abstract = {

Objects and their parts can be visually recognized and localized from purely spatial information in static images and also from purely temporal information as in the perception of biological motion. Cortical regions have been identified, which appear to specialize in visual recognition based on either static or dynamic cues, but the mechanisms by which spatial and temporal information is integrated are only poorly understood. Here we show that visual recognition of objects and actions can be achieved by efficiently combining spatial and motion cues in configurations where each source on its own is insufficient for recognition. This analysis is obtained by the identification of minimal spatiotemporal configurations: these are short videos in which objects and their parts, along with an action being performed, can be reliably recognized, but any reduction in either space or time makes them unrecognizable. State-of-the-art computational models for recognition from dynamic images based on deep 2D and 3D convolutional networks cannot replicate human recognition in these configurations. Action recognition in minimal spatiotemporal configurations is invariably accompanied by full human interpretation of the internal components of the image and their inter-relations. We hypothesize that this gap is due to mechanisms for a full spatiotemporal interpretation process, which in human vision is an integral part of recognizing dynamic events, but is not sufficiently represented in current DNNs.

}, author = {Guy Ben-Yosef and Gabriel Kreiman and Shimon Ullman} } @article {2408, title = {Full interpretation of minimal images}, year = {2017}, month = {02/2017}, abstract = {

The goal in this work is to model the process of {\textquoteleft}full interpretation{\textquoteright} of object images, which is the ability to identify and localize all semantic features and parts that are recognized by human observers. The task is approached by dividing the interpretation of the complete object into the interpretation of multiple reduced but interpretable local regions. In such reduced regions, interpretation is simpler, since the number of semantic components is small, and the variability of possible configurations is low.

We model the interpretation process by identifying primitive components and relations that play a useful role in local interpretation by humans. To identify useful components and relations used in the interpretation process, we consider the interpretation of {\textquoteleft}minimal configurations{\textquoteright}: these are reduced local regions, which are minimal in the sense that further reduction renders them unrecognizable and uninterpretable. We show that such minimal interpretable images have useful properties, which we use to identify informative features and relations used for full interpretation. We describe our interpretation model, and show results of detailed interpretations of minimal configurations, produced automatically by the model. Finally, we discuss implications of full interpretation to difficult visual tasks, such as recognizing human activities or interactions, which are beyond the scope of current models of visual recognition.

This manuscript has been accepted for publication in Cognition.

}, keywords = {Image interpretation, Parts and relations, Visual object recognition}, author = {Guy Ben-Yosef and Liav Assif and Shimon Ullman} } @conference {2724, title = {A model for interpreting social interactions in local image regions}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, month = {03/2017}, address = {Palo Alto, CA}, abstract = {
Understanding social interactions (such as {\textquoteleft}hug{\textquoteright} or {\textquoteleft}fight{\textquoteright}) is a basic and important capacity of the human visual system, but a challenging and still open problem for modeling. In this work we study visual recognition of social interactions, based on small but recognizable local regions. The approach is based on two novel key components: (i) A given social interaction can be recognized reliably from reduced images (called {\textquoteleft}minimal images{\textquoteright}). (ii) The recognition of a social interaction depends on identifying components and relations within the minimal image (termed {\textquoteleft}interpretation{\textquoteright}). We show psychophysics data for minimal images and modeling results for their interpretation. We discuss the integration of minimal configurations in recognizing social interactions in a detailed, high-resolution image.
}, url = {http://www.aaai.org/ocs/index.php/SSS/SSS17/paper/view/15354}, author = {Guy Ben-Yosef and Alon Yachin and Shimon Ullman} } @article {1760, title = {Atoms of recognition in human and computer vision}, journal = {PNAS }, volume = {113}, year = {2016}, month = {03/2016}, pages = {2744{\textendash}2749}, abstract = {
Discovering the visual features and representations used by the brain to recognize objects is a central problem in the study of vision. Recently, neural network models of visual object recognition, including biological and deep network models, have shown remarkable progress and have begun to rival human performance in some challenging tasks. These models are trained on image examples and learn to extract features and representations and to use them for categorization. It remains unclear, however, whether the representations and learning processes discovered by current models are similar to those used by the human visual system. Here we show, by introducing and using minimal recognizable images, that the human visual system uses features and processes that are not used by current models and that are critical for recognition. We found by psychophysical studies that at the level of minimal recognizable images a minute change in the image can have a drastic effect on recognition, thus identifying features that are critical for the task. Simulations then showed that current models cannot explain this sensitivity to precise feature configurations and, more generally, do not learn to recognize minimal images at a human level. The role of the features shown here is revealed uniquely at the minimal level, where the contribution of each feature is essential. A full understanding of the learning and use of such features will extend our understanding of visual recognition and its cortical mechanisms and will enhance the capacity of computational models to learn from visual experience and to deal with recognition and detailed image interpretation.
}, keywords = {Computer vision, minimal images, object recognition, visual perception, visual representations}, issn = {1091-6490}, doi = {10.1073/pnas.1513198113}, url = {http://www.pnas.org/content/113/10/2744.abstract}, author = {Shimon Ullman and Liav Assif and Eitan Fetaya and Daniel Harari} } @article {2133, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, year = {2016}, month = {09/2016}, abstract = {

Understanding language goes hand in hand with the ability to integrate complex contextual information obtained via perception. In this work, we present a novel task for grounded language understanding: disambiguating a sentence given a visual scene which depicts one of the possible interpretations of that sentence. To this end, we introduce a new multimodal corpus containing ambiguous sentences, representing a wide range of syntactic, semantic and discourse ambiguities, coupled with videos that visualize the different interpretations for each sentence. We address this task by extending a vision model which determines if a sentence is depicted by a video. We demonstrate how such a model can be adjusted to recognize different interpretations of the same underlying sentence, allowing us to disambiguate sentences in a unified fashion across the different ambiguity types.

}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @conference {2585, title = {Human Pose Estimation Using Deep Consensus Voting}, booktitle = {ECCV 2016}, year = {2016}, abstract = {

In this paper we consider the problem of human pose estimation from a single still image. We propose a novel approach where each location in the image votes for the position of each keypoint using a convolutional neural net. The voting scheme allows us to utilize information from the whole image, rather than rely on a sparse set of keypoint locations. Using dense, multi-target votes not only produces good keypoint predictions, but also enables us to compute image-dependent joint keypoint probabilities by looking at consensus voting. This differs from most previous methods, where joint probabilities are learned from relative keypoint locations and are independent of the image. We finally combine the keypoint votes and joint probabilities in order to identify the optimal pose configuration. We show our competitive performance on the MPII Human Pose and Leeds Sports Pose datasets.

}, author = {Ita Lifshitz and Ethan Fetaya and Shimon Ullman} } @article {1885, title = {Language and Vision Ambiguities (LAVA) Corpus}, year = {2016}, month = {01/2016}, abstract = {

Ambiguity is one of the defining characteristics of human languages, and language understanding crucially relies on the ability to obtain unambiguous representations of linguistic content. While some ambiguities can be resolved using intra-linguistic contextual cues, the disambiguation of many linguistic constructions requires integration of world knowledge and perceptual information obtained from other modalities. In this work, we focus on the problem of grounding language in the visual modality, and introduce a novel task for visual and linguistic understanding which requires resolving linguistic ambiguities by utilizing the visual context of the utterance.

To address this challenge, we release the Language and Vision Ambiguities (LAVA) corpus. LAVA contains ambiguous sentences coupled with visual scenes that depict the different interpretations of each sentence. The sentences in the corpus are annotated with syntactic and semantic parses, and cover a wide range of linguistic ambiguities, including PP and VP attachment, conjunctions, logical forms, anaphora and ellipsis. In addition to the sentence disambiguation challenge, the corpus will support a variety of related tasks which use natural language as a medium for expressing visual understanding.

Reference:
Yevgeni Berzak, Andrei Barbu, Daniel Harari, Boris Katz, and Shimon Ullman (2015). Do You See What I Mean? Visual Resolution of Linguistic Ambiguities. Conference on Empirical Methods in Natural Language Processing (EMNLP), Lisbon, Portugal.


}, url = {http://web.mit.edu/lavacorpus/}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {2326, title = {Measuring and modeling the perception of natural and unconstrained gaze in humans and machines}, year = {2016}, month = {11/2016}, abstract = {

Humans are remarkably adept at interpreting the gaze direction of other individuals in their surroundings. This skill is at the core of the ability to engage in joint visual attention, which is essential for establishing social interactions. How accurate are humans in determining the gaze direction of others in lifelike scenes, when they can move their heads and eyes freely, and what are the sources of information for the underlying perceptual processes? These questions pose a challenge from both empirical and computational perspectives, due to the complexity of the visual input in real-life situations. Here we measure empirically human accuracy in perceiving the gaze direction of others in lifelike scenes, and study computationally the sources of information and representations underlying this cognitive capacity. We show that humans perform better in face-to-face conditions compared with recorded conditions, and that this advantage is not due to the availability of input dynamics. We further show that humans still perform well when only the eyes region is visible, rather than the whole face. We develop a computational model, which replicates the pattern of human performance, including the finding that the eyes region contains, on its own, the required information for estimating both head orientation and direction of gaze. Consistent with neurophysiological findings on task-specific face regions in the brain, the learned computational representations reproduce perceptual effects such as the Wollaston illusion, when trained to estimate direction of gaze, but not when trained to recognize objects or faces.

}, keywords = {computational evaluation, computational modeling, Computer vision, empirical evaluation, estimation of gaze direction, Gaze perception, joint attention, Machine Learning}, author = {Daniel Harari and Tao Gao and Nancy Kanwisher and Joshua B. Tenenbaum and Shimon Ullman} } @article {2723, title = {Recognizing and Interpreting Social Interactions in Local Image Regions}, year = {2016}, note = {

(Accepted for oral presentation)

}, month = {11/2016}, abstract = {

Understanding social interactions (such as {\textquoteleft}hug{\textquoteright} or {\textquoteleft}fight{\textquoteright}) is a basic and important capacity of the human visual system, but a challenging and still open problem for modeling. Here we study visual recognition of social interactions, based on small but recognizable local regions. The approach is based on two novel key components: (i) A given social interaction can be recognized reliably from reduced images (called {\textquoteleft}minimal images{\textquoteright}). (ii) The recognition of a social interaction depends on identifying components and relations within the minimal image (termed {\textquoteleft}interpretation{\textquoteright}). We show psychophysics data for minimal images and modeling results for their interpretation.

}, author = {Guy Ben-Yosef and Alon Yachin and Shimon Ullman} } @conference {2547, title = {Visual Concept Recognition and Localization via Iterative Introspection. }, booktitle = {Asian Conference on Computer Vision}, year = {2016}, month = {11/2016}, author = {Amir Rosenfeld and Shimon Ullman} } @conference {1429, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, booktitle = {Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal. }, year = {2015}, month = {09/2015}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @proceedings {793, title = {Graph Approximation and Clustering on a Budget}, volume = {38}, year = {2015}, abstract = {

We consider the problem of learning from a similarity matrix (such as spectral clustering and low-dimensional embedding), when computing pairwise similarities is costly, and only a limited number of entries can be observed. We provide a theoretical analysis using standard notions of graph approximation, significantly generalizing previous results, which focused on spectral clustering with two clusters. We also propose a new algorithmic approach based on adaptive sampling, which experimentally matches or improves on previous methods, while being considerably more general and computationally cheaper.

}, author = {Ethan Fetaya and Ohad Shamir and Shimon Ullman} } @proceedings {1762, title = {A model for full local image interpretation}, year = {2015}, abstract = {

We describe a computational model of humans{\textquoteright} ability to provide a detailed interpretation of a scene{\textquoteright}s components. Humans can identify in an image meaningful components almost everywhere, and identifying these components is an essential part of the visual process, and of understanding the surrounding scene and its potential meaning to the viewer. Detailed interpretation is beyond the scope of current models of visual recognition. Our model suggests that this is a fundamental limitation, related to the fact that existing models rely on feed-forward but limited top-down processing. In our model, a first recognition stage leads to the initial activation of class candidates, which is incomplete and with limited accuracy. This stage then triggers the application of class-specific interpretation and validation processes, which recover richer and more accurate interpretation of the visible scene. We discuss implications of the model for visual interpretation by humans and by computer vision models.

}, author = {Guy Ben-Yosef and Liav Assif and Daniel Harari and Shimon Ullman} } @article {1761, title = {Visual categorization of social interactions}, journal = {Visual Cognition }, volume = {22}, year = {2015}, month = {02/06/2015}, abstract = {

Prominent theories of action recognition suggest that during the recognition of actions the physical pattern of the action is associated with only one action interpretation (e.g., a person waving his arm is recognized as waving). In contrast to this view, studies examining the visual categorization of objects show that objects are recognized in multiple ways (e.g., a VW Beetle can be recognized as a car or a beetle) and that categorization performance is based on the visual and motor movement similarity between objects. Here, we studied whether we find evidence for multiple levels of categorization for social interactions (physical interactions with another person, e.g., handshakes). To do so, we compared visual categorization of objects and social interactions (Experiments 1 and 2) in a grouping task and assessed the usefulness of motor and visual cues (Experiments 3, 4, and 5) for object and social interaction categorization. Additionally, we measured recognition performance associated with recognizing objects and social interactions at different categorization levels (Experiment 6). We found that basic level object categories were associated with a clear recognition advantage compared to subordinate recognition but basic level social interaction categories provided only a little recognition advantage. Moreover, basic level object categories were more strongly associated with similar visual and motor cues than basic level social interaction categories. The results suggest that cognitive categories underlying the recognition of objects and social interactions are associated with different performances. These results are in line with the idea that the same action can be associated with several action interpretations (e.g., a person waving his arm can be recognized as waving or greeting).

}, doi = {10.1080/13506285.2014.991368}, author = {Stephan de la Rosa and Rabia N. Choudhery and Crist{\'o}bal Curio and Shimon Ullman and Liav Assif and Heinrich H. B{\"u}lthoff} } @article {459, title = {When Computer Vision Gazes at Cognition.}, number = {025}, year = {2014}, month = {12/2014}, abstract = {

Joint attention is a core, early-developing form of social interaction. It is based on our ability to discriminate the third party objects that other people are looking at. While it has been shown that people can accurately determine whether another person is looking directly at them versus away, little is known about the human ability to discriminate a third person{\textquoteright}s gaze directed towards objects that are further away, especially in unconstrained cases where the looker can move her head and eyes freely. In this paper we address this question by jointly exploring human psychophysics and a cognitively motivated computer vision model, which can detect the 3D direction of gaze from 2D face images. The synthesis of behavioral study and computer vision yields several interesting discoveries. (1) Human accuracy of discriminating targets 8{\deg}-10{\deg} of visual angle apart is around 40\% in a free-looking gaze task; (2) The ability to interpret the gaze of different lookers varies dramatically; (3) This variance can be captured by the computational model; (4) Humans outperform the current model significantly. These results collectively show that the acuity of human joint attention is indeed highly impressive, given the computational challenge of the natural looking task. Moreover, the gap between human and model performance, as well as the variability of gaze interpretation across different lookers, require further understanding of the underlying mechanisms utilized by humans for this challenging task.

}, author = {Tao Gao and Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} }