@article {5255, title = {BrainBERT: Self-supervised representation learning for Intracranial Electrodes}, year = {2023}, month = {02/2023}, address = {Kigali, Rwanda, Africa}, abstract = {

We create a reusable Transformer, BrainBERT, for intracranial recordings, bringing modern representation learning approaches to neuroscience. Much like in NLP and speech recognition, this Transformer enables classifying complex concepts, i.e., decoding neural data, with higher accuracy and with much less data by being pretrained in an unsupervised manner on a large corpus of unannotated neural recordings. Our approach generalizes to new subjects with electrodes in new positions and to unrelated tasks, showing that the representations robustly disentangle the neural signal. Just as in NLP one can study language by investigating what a language model learns, this approach opens the door to investigating the brain through what a model of the brain learns. As a first step along this path, we demonstrate a new analysis of the intrinsic dimensionality of the computations in different areas of the brain. To construct these representations, we combine a technique for producing super-resolution spectrograms of neural data with an approach designed for generating contextual representations of audio by masking. In the future, far more concepts will be decodable from neural recordings by using representation learning, potentially unlocking the brain like language models unlocked language.
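A minimal sketch of the masked-spectrogram pretraining idea described above, written in PyTorch. This is illustrative only and is not the released BrainBERT architecture: the module names, sizes, masking rate, and the omission of positional encodings and of the super-resolution spectrogram step are all assumptions made for brevity.

```python
import torch
import torch.nn as nn

class MaskedSpectrogramModel(nn.Module):
    """Transformer encoder trained to reconstruct masked time steps of a spectrogram."""
    def __init__(self, n_freq_bins=40, d_model=128, n_layers=4, n_heads=8):
        super().__init__()
        self.embed = nn.Linear(n_freq_bins, d_model)   # one token per time step
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.reconstruct = nn.Linear(d_model, n_freq_bins)

    def forward(self, spec, mask):
        # Zero out the masked time steps, encode, and predict the full spectrogram.
        x = self.embed(spec * (~mask).unsqueeze(-1).float())
        return self.reconstruct(self.encoder(x))

spec = torch.randn(8, 200, 40)        # (batch, time, frequency) spectrograms
mask = torch.rand(8, 200) < 0.15      # hide roughly 15% of time steps
model = MaskedSpectrogramModel()
recon = model(spec, mask)
loss = ((recon - spec) ** 2)[mask].mean()   # reconstruction loss on masked positions only
loss.backward()
```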

}, keywords = {decoding, language models, Neuroscience, self-supervision, transformer}, url = {https://openreview.net/forum?id=xmcYx_reUn6}, author = {Christopher Wang and Vighnesh Subramaniam and Adam Uri Yaari and Gabriel Kreiman and Boris Katz and Ignacio Cases and Andrei Barbu} } @conference {5322, title = {Zero-shot linear combinations of grounded social interactions with Linear Social MDPs}, booktitle = {Proceedings of the 37th AAAI Conference on Artificial Intelligence (AAAI)}, year = {2023}, month = {02/2023}, abstract = {

Humans and animals engage in rich social interactions. It is often theorized that a relatively small number of basic social interactions give rise to the full range of behavior observed. But no computational theory explaining how social interactions combine has been proposed before; we propose one here. We take a model, the Social MDP, which is able to express a range of social interactions, and extend it to represent linear combinations of social interactions. Practically, for robotics applications, such models can now express not just that an agent should help another agent, but goal-centric social interactions. Perhaps an agent is helping someone get dressed, but preventing them from falling, and is happy to exchange stories in the meantime. How an agent responds socially should depend on what it thinks the other agent is doing at that point in time. To encode this notion, we take linear combinations of social interactions as defined in Social MDPs, and compute the weights on those combinations on the fly depending on the estimated goals of other agents. This new model, the Linear Social MDP, enables zero-shot reasoning about complex social interactions, provides a mathematical basis for the long-standing intuition that social interactions should compose, and leads to interesting new behaviors that we validate using human observers. Complex social interactions are part of the future of intelligent agents, and having principled mathematical models built on a foundation like MDPs will make it possible to bring social interactions to every robotic application.
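The following is a schematic sketch of the composition described above: the reward mixes several social terms, with the mixture weights recomputed on the fly from an inferred belief over the other agent's goal. The function, interaction names, term values, and weight matrix are hypothetical illustrations, not the paper's formulation.

```python
import numpy as np

INTERACTIONS = ["cooperation", "conflict", "exchange"]

def linear_social_reward(physical_reward, social_terms, goal_belief, weights_per_goal):
    """physical_reward: scalar reward for the agent's own physical goal.
    social_terms: per-interaction estimates of how much the current action
        helps or hinders the other agent, ordered as in INTERACTIONS.
    goal_belief: probabilities over the other agent's possible goals.
    weights_per_goal: (n_goals, n_interactions) mixture weights."""
    weights = goal_belief @ weights_per_goal      # expected weights under the belief
    return physical_reward + float(weights @ social_terms)

# Example: the other agent is probably carrying groceries (goal 0), maybe resting (goal 1).
belief = np.array([0.8, 0.2])
weights = np.array([[1.0, -0.5, 0.2],   # if carrying: help, avoid conflict, chat a little
                    [0.1,  0.0, 1.0]])  # if resting: mostly just exchange stories
terms = np.array([0.7, 0.1, 0.4])       # cooperation, conflict, exchange estimates
print(linear_social_reward(2.0, terms, belief, weights))
```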

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @conference {5302, title = {The Aligned Multimodal Movie Treebank: An audio, video, dependency-parse treebank}, booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, year = {2022}, abstract = {

Treebanks have traditionally included only text and were derived from written sources such as newspapers or the web. We introduce the Aligned Multimodal Movie Treebank (AMMT), an English-language treebank derived from dialog in Hollywood movies, which includes transcriptions of the audio-visual streams with word-level alignment, as well as part-of-speech tags and dependency parses in the Universal Dependencies formalism. AMMT consists of 31,264 sentences and 218,090 words, making it the third-largest UD English treebank and the only multimodal treebank in UD. To help with the web-based annotation effort, we also introduce the Efficient Audio Alignment Annotator (EAAA), a companion tool that enables annotators to significantly speed up their annotation process.

}, author = {Adam Yaari and Jan DeWitt and Henry Hu and Bennett Stankovits and Sue Felshin and Yevgeni Berzak and Helena Aparicio and Boris Katz and Ignacio Cases and Andrei Barbu} } @article {5050, title = {Incorporating Rich Social Interactions Into MDPs}, year = {2022}, abstract = {

Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other{\textquoteright}s hidden rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions zero-shot in new environments; like humans, it can engage socially in novel ways even without a single example of that social interaction. Moreover, the judgments of these Social MDPs align closely with those of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics, MDPs.

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @article {5299, title = {Quantifying the Emergence of Symbolic Communication}, journal = {CogSci}, year = {2022}, abstract = {

We quantitatively study the emergence of symbolic communication in humans with a communication game that attempts to recapitulate an essential step in the development of human language: the emergence of shared signs. In our experiment, a teacher must communicate a first-order logic formula to a student through a narrow channel deprived of common shared signs: subjects cannot communicate with each other except through car motions in a computer game. Subjects spontaneously develop a shared vocabulary of car motions including indices, icons, and symbols, spanning both task-specific and task-agnostic concepts such as "square" and "understand". We characterize the conditions under which indices, icons, and symbols arise, finding that symbols are harder to establish than icons and indices. We observe that the dominant sign category being developed transitions from indices to icons to symbols, and identify communicating in ambiguous game environments as a pressure for icon and symbol development.

}, url = {https://escholarship.org/uc/item/08n3293v}, author = {Emily Cheng and Yen-Ling Kuo and Josefina Correa and Boris Katz and Ignacio Cases and Andrei Barbu} } @conference {5301, title = {Spontaneous sign emergence in humans and machines through an embodied communication game}, booktitle = {JCoLE Workshop}, year = {2022}, author = {Emily Cheng and Yen-Ling Kuo and Ignacio Cases and Boris Katz and Andrei Barbu} } @article {5051, title = {Trajectory Prediction with Linguistic Representations}, year = {2022}, abstract = {

Language allows humans to build mental models that interpret what is happening around them, resulting in more accurate long-term predictions. We present a novel trajectory prediction model that uses linguistic intermediate representations to forecast trajectories, and is trained using trajectory samples with partially-annotated captions. The model learns the meaning of each of the words without direct per-word supervision. At inference time, it generates a linguistic description of trajectories which captures maneuvers and interactions over an extended time interval. This generated description is used to refine predictions of the trajectories of multiple agents. We train and validate our model on the Argoverse dataset, and demonstrate improved accuracy results in trajectory prediction. In addition, our model is more interpretable: it presents part of its reasoning in plain language as captions, which can aid model development and help build confidence in the model before deploying it.

}, author = {Yen-Ling Kuo and Xin Huang and Andrei Barbu and Stephen G. McGill and Boris Katz and John J. Leonard and Guy Rosman} } @article {5054, title = {Compositional Networks Enable Systematic Generalization for Grounded Language Understanding}, year = {2021}, abstract = {

Humans are remarkably flexible when understanding new sentences that include combinations of concepts they have never encountered before. Recent work has shown that while deep networks can mimic some human language abilities when presented with novel sentences, systematic variation uncovers the limitations in the language-understanding abilities of networks. We demonstrate that these limitations can be overcome by addressing the generalization challenges in the gSCAN dataset, which explicitly measures how well an agent is able to interpret novel linguistic commands grounded in vision, e.g., novel pairings of adjectives and nouns. The key principle we employ is compositionality: that the compositional structure of networks should reflect the compositional structure of the problem domain they address, while allowing other parameters to be learned end-to-end. We build a general-purpose mechanism that enables agents to generalize their language understanding to compositional domains. Crucially, our network has the same state-of-the-art performance as prior work while generalizing its knowledge when prior work does not. Our network also provides a level of interpretability that enables users to inspect what each part of the network learns. Robust grounded language understanding without dramatic failures and without corner cases is critical to building safe and fair robots; we demonstrate the significant role that compositionality can play in achieving that goal.

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5069, title = {Compositional RL Agents That Follow Language Commands in Temporal Logic}, journal = {Frontiers in Robotics and AI}, volume = {8}, year = {2021}, month = {07/2022}, abstract = {

We demonstrate how a reinforcement learning agent can use compositional recurrent neural networks to learn to carry out commands specified in linear temporal logic (LTL). Our approach takes as input an LTL formula, structures a deep network according to the parse of the formula, and determines satisfying actions. This compositional structure of the network enables zero-shot generalization to significantly more complex unseen formulas. We demonstrate this ability in multiple problem domains with both discrete and continuous state-action spaces. In a symbolic domain, the agent finds a sequence of letters that satisfy a specification. In a Minecraft-like environment, the agent finds a sequence of actions that conform to a formula. In the Fetch environment, the robot finds a sequence of arm configurations that move blocks on a table to fulfill the commands. While most prior work can learn to execute one formula reliably, we develop a novel form of multi-task learning for RL agents that allows them to learn from a diverse set of tasks and generalize to a new set of diverse tasks without any additional training. The compositional structures presented here are not specific to LTL, thus opening the path to RL agents that perform zero-shot generalization in other compositional domains.
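A toy sketch of the compositional principle described above: each operator in the formula becomes a small module, and a network is assembled by recursing over the formula's parse tree so that its structure mirrors the formula. The module classes, the tuple encoding of formulas, and the scoring interface are assumptions made for illustration; the paper's agents additionally use recurrent networks and are trained with reinforcement learning.

```python
import torch
import torch.nn as nn

class Predicate(nn.Module):            # leaf: score a proposition in the current state
    def __init__(self, state_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(), nn.Linear(32, 1))
    def forward(self, state):
        return self.net(state)

class BinaryOp(nn.Module):             # e.g. "and", "until": combine two sub-formula scores
    def __init__(self, left, right):
        super().__init__()
        self.left, self.right = left, right
        self.combine = nn.Linear(2, 1)
    def forward(self, state):
        return self.combine(torch.cat([self.left(state), self.right(state)], dim=-1))

def build(formula, state_dim):
    """formula is a nested tuple, e.g. ('until', 'near_door', ('and', 'has_key', 'door_open'))."""
    if isinstance(formula, str):
        return Predicate(state_dim)
    _, lhs, rhs = formula
    return BinaryOp(build(lhs, state_dim), build(rhs, state_dim))

net = build(('until', 'near_door', ('and', 'has_key', 'door_open')), state_dim=16)
print(net(torch.randn(4, 16)).shape)   # one score per state in the batch: torch.Size([4, 1])
```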

}, doi = {10.3389/frobt.2021.689550}, url = {https://www.frontiersin.org/articles/10.3389/frobt.2021.689550/full}, author = {Kuo, Yen-Ling and Katz, Boris and Andrei Barbu} } @article {5056, title = {Compositional RL Agents That Follow Language Commands in Temporal Logic}, year = {2021}, abstract = {

We demonstrate how a reinforcement learning agent can use compositional recurrent neural networks to learn to carry out commands specified in linear temporal logic (LTL). Our approach takes as input an LTL formula, structures a deep network according to the parse of the formula, and determines satisfying actions. This compositional structure of the network enables zero-shot generalization to significantly more complex unseen formulas. We demonstrate this ability in multiple problem domains with both discrete and continuous state-action spaces. In a symbolic domain, the agent finds a sequence of letters that satisfy a specification. In a Minecraft-like environment, the agent finds a sequence of actions that conform to a formula. In the Fetch environment, the robot finds a sequence of arm configurations that move blocks on a table to fulfill the commands. While most prior work can learn to execute one formula reliably, we develop a novel form of multi-task learning for RL agents that allows them to learn from a diverse set of tasks and generalize to a new set of diverse tasks without any additional training. The compositional structures presented here are not specific to LTL, thus opening the path to RL agents that perform zero-shot generalization in other compositional domains.

}, author = {Yen-Ling Kuo and Andrei Barbu and Boris Katz} } @article {4825, title = {Large-scale benchmarking of deep neural network models in mouse visual cortex reveals patterns similar to those observed in macaque visual cortex}, year = {2021}, abstract = {

What is the representational structure of mouse visual cortex and how is it shaped? Mice obviously interact with the world and recognize objects but unlike in primates, a majority of research to date suggests the activity of their visual cortex may not be so well described by deep neural networks trained for object recognition. Using the Allen Brain Observatory{\textquoteright}s 2-photon calcium-imaging dataset of activity in over 30,000 rodent visual cortical neurons recorded in response to natural scenes, we work to resolve this discrepancy and demonstrate that modern neural networks can indeed be used to explain activity in the mouse visual cortex to a more reasonable degree than previously suggested. In so doing, we elucidate at large scale the properties of networks which best match the biological visual system, with both representational similarity analysis and encoding models coming to mostly the same conclusions. Our analysis of 30 object recognition architectures (both pretrained and randomly initialized) from the PyTorch model zoo demonstrates that deeper, thinner residual networks with bypass connections, fewer parameters shared across many convolutions, and higher scores on the ImageNet image-recognition challenge tend to be more predictive of the neural activations in our sample. Additionally, we find a significant degree of overlap between the models that best predict macaque visual cortex (as catalogued by brain-score.org) and those that best predict mouse visual cortex. In concert, these findings help to bolster the mouse brain as a viable source of data for the methods that have been successful thus far in the study of monkey brains, and provide a preliminary set of design targets for building models that can better take advantage of the unparalleled scale, quality, and resolution of data afforded by calcium-imaging in the mouse brain.
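As an illustration of one of the two comparison methods mentioned above, here is a minimal representational similarity analysis sketch: build a representational dissimilarity matrix (RDM) for model features and for neural responses to the same images, then compare the two with Spearman's rank correlation. The array shapes and random data are placeholders, not the Allen Brain Observatory data.

```python
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

def rdm(responses):
    """responses: (n_images, n_units) -> condensed vector of pairwise 1 - r distances."""
    return pdist(responses, metric="correlation")

rng = np.random.default_rng(0)
model_feats = rng.standard_normal((100, 512))      # e.g. features from one network layer
neural_resps = rng.standard_normal((100, 300))     # e.g. calcium responses of 300 neurons

rho, _ = spearmanr(rdm(model_feats), rdm(neural_resps))
print(f"RDM similarity (Spearman rho): {rho:.3f}")
```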

}, author = {Colin Conwell and David Mayo and Michael Buice and Boris Katz and George Alvarez and Andrei Barbu} } @article {4826, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.
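For context, a hedged sketch of the ungrounded WEAT effect size that the paper generalizes; this is not the paper's Grounded-WEAT or Grounded-SEAT code. It measures the differential association of two target concept sets X and Y with two attribute sets A and B using cosine similarity over embedding vectors; with grounded embeddings, those vectors would come from a vision-and-language model conditioned on images rather than from text alone.

```python
import numpy as np

def cos(u, v):
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

def assoc(w, A, B):                     # s(w, A, B) in the WEAT formulation
    return np.mean([cos(w, a) for a in A]) - np.mean([cos(w, b) for b in B])

def weat_effect_size(X, Y, A, B):
    sx = [assoc(x, A, B) for x in X]
    sy = [assoc(y, A, B) for y in Y]
    return (np.mean(sx) - np.mean(sy)) / np.std(sx + sy, ddof=1)

# Placeholder embeddings; in practice X, Y, A, B hold target and attribute word vectors.
rng = np.random.default_rng(1)
X, Y, A, B = (rng.standard_normal((8, 64)) for _ in range(4))
print(weat_effect_size(X, Y, A, B))
```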

}, author = {Candace Ross and Boris Katz and Andrei Barbu} } @article {5057, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.

}, author = {Candace Ross and Andrei Barbu and Boris Katz} } @conference {4827, title = {Multi-resolution modeling of a discrete stochastic process identifies causes of cancer}, booktitle = {International Conference on Learning Representations}, year = {2021}, month = {09/2020}, abstract = {

Detection of cancer-causing mutations within the vast and mostly unexplored human genome is a major challenge. Doing so requires modeling the background mutation rate, a highly non-stationary stochastic process, across regions of interest varying in size from one to millions of positions. Here, we present the split-Poisson-Gamma (SPG) distribution, an extension of the classical Poisson-Gamma formulation, to model a discrete stochastic process at multiple resolutions. We demonstrate that the probability model has a closed-form posterior, enabling efficient and accurate linear-time prediction over any length scale after the parameters of the model have been inferred a single time. We apply our framework to model mutation rates in tumors and show that model parameters can be accurately inferred from high-dimensional epigenetic data using a convolutional neural network, Gaussian process, and maximum-likelihood estimation. Our method is both more accurate and more efficient than existing models over a large range of length scales. We demonstrate the usefulness of multi-resolution modeling by detecting genomic elements that drive tumor emergence and are of vastly differing sizes.
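For orientation, a minimal sketch of the classical Poisson-Gamma building block that the split-Poisson-Gamma distribution extends; it is not the SPG model itself, and the parameter values are placeholders. A Gamma(alpha, beta) prior on a Poisson rate yields a closed-form Gamma posterior, and the posterior predictive over counts is negative binomial.

```python
from scipy.stats import nbinom

def poisson_gamma_posterior(alpha, beta, counts, exposures):
    """Gamma(alpha, beta) prior (rate parameterization) updated on Poisson counts
    observed with the given exposures; the posterior is Gamma again."""
    return alpha + sum(counts), beta + sum(exposures)

def posterior_predictive_pmf(k, alpha, beta, exposure=1.0):
    # Negative binomial with r = alpha and p = beta / (beta + exposure).
    return nbinom.pmf(k, alpha, beta / (beta + exposure))

a, b = poisson_gamma_posterior(alpha=2.0, beta=1.0, counts=[3, 1, 4], exposures=[1.0, 1.0, 1.0])
print(a, b, posterior_predictive_pmf(2, a, b))
```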

}, url = {https://openreview.net/forum?id=KtH8W3S_RE}, author = {Adam Uri Yaari and Maxwell Sherman and Oliver Clarke Priebe and Po-Ru Loh and Boris Katz and Andrei Barbu and Bonnie Berger} } @article {5052, title = {Neural Regression, Representational Similarity, Model Zoology \& Neural Taskonomy at Scale in Rodent Visual Cortex}, year = {2021}, abstract = {

How well do deep neural networks fare as models of mouse visual cortex? A majority of research to date suggests results far more mixed than those produced in the modeling of primate visual cortex. Here, we perform a large-scale benchmarking of dozens of deep neural network models in mouse visual cortex with both representational similarity analysis and neural regression. Using the Allen Brain Observatory{\textquoteright}s 2-photon calcium-imaging dataset of activity in over 6,000 reliable rodent visual cortical neurons recorded in response to natural scenes, we replicate previous findings and resolve previous discrepancies, ultimately demonstrating that modern neural networks can in fact be used to explain activity in the mouse visual cortex to a more reasonable degree than previously suggested. Using our benchmark as an atlas, we offer preliminary answers to overarching questions about levels of analysis, questions about the properties of models that best predict the visual system overall and questions about the mapping between biological and artificial representations. Our results provide a reference point for future ventures in the deep neural network modeling of mouse visual cortex, hinting at novel combinations of mapping method, architecture, and task to more fully characterize the computational motifs of visual representation in a species so central to neuroscience, but with a perceptual physiology and ecology markedly different from the ones we study in primates.

}, author = {Colin Conwell and David Mayo and Michael A. Buice and Boris Katz and George A. Alvarez and Andrei Barbu} } @conference {4830, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {AAAI-21}, year = {2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5058, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, number = {123}, year = {2021}, month = {03/2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5053, title = {Social Interactions as Recursive MDPs}, year = {2021}, abstract = {

While machines and robots must interact with humans, providing them with social skills has been a largely overlooked topic. This is mostly a consequence of the fact that tasks such as navigation, command following, and even game playing are well-defined, while social reasoning still mostly remains a pre-theoretic problem. We demonstrate how social interactions can be effectively incorporated into MDPs (Markov decision processes) by reasoning recursively about the goals of other agents. In essence, our method extends the reward function to include a combination of physical goals (something agents want to accomplish in the configuration space, a traditional MDP) and social goals (something agents want to accomplish relative to the goals of other agents). Our Social MDPs allow specifying reward functions in terms of the estimated reward functions of other agents, modeling interactions such as helping or hindering another agent (by maximizing or minimizing the other agent{\textquoteright}s reward) while balancing this with the actual physical goals of each agent. Our formulation allows for an arbitrary function of another agent{\textquoteright}s estimated reward structure and physical goals, enabling more complex behaviors such as politely hindering another agent or aggressively helping them. Extending Social MDPs in the same manner as the I-POMDP (interactive partially observable Markov decision process) extension would enable interactions such as convincing another agent that something is true. To what extent the Social MDPs presented here and their potential Social POMDP variant account for all possible social interactions is unknown, but having a precise mathematical model to guide questions about social interactions has both practical value (we demonstrate how to make zero-shot social inferences, and one could imagine chatbots and robots guided by Social MDPs) and theoretical value, by bringing the tools of MDPs, which have so successfully organized research around navigation, to shed light on what social interactions really are, given their extreme importance to human well-being and human civilization.

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Boris Katz and Andrei Barbu} } @conference {5081, title = {Spoken ObjectNet: A Bias-Controlled Spoken Caption Dataset}, booktitle = {Interspeech 2021}, year = {2021}, month = {08/2021}, address = {ISCA}, doi = {10.21437/Interspeech.2021}, url = {https://www.isca-speech.org/archive/interspeech_2021}, author = {Palmer, Ian and Rouditchenko, Andrew and Andrei Barbu and Katz, Boris and Glass, James} } @article {5055, title = {Spoken ObjectNet: A Bias-Controlled Spoken Caption Dataset}, year = {2021}, abstract = {

Visually-grounded spoken language datasets can enable models to learn cross-modal correspondences with very weak supervision. However, modern audio-visual datasets contain biases that undermine the real-world performance of models trained on that data. We introduce Spoken ObjectNet, which is designed to remove some of these biases and provide a way to better evaluate how effectively models will perform in real-world scenarios. This dataset expands upon ObjectNet, which is a bias-controlled image dataset that features similar image classes to those present in ImageNet. We detail our data collection pipeline, which features several methods to improve caption quality, including automated language model checks. Lastly, we show baseline results on image retrieval and audio retrieval tasks. These results show that models trained on other datasets and then evaluated on Spoken ObjectNet tend to perform poorly due to biases in other datasets that the models have learned. We also show evidence that the performance decrease is due to the dataset controls, and not the transfer setting.

}, author = {Ian Palmer and Andrew Rouditchenko and Andrei Barbu and Boris Katz and James Glass} } @conference {5107, title = {On the use of Cortical Magnification and Saccades as Biological Proxies for Data Augmentation}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) Workshop at NeurIPS}, year = {2021}, keywords = {Active Perception, Data-Augmentation, Foveation, Self-Supervised Learning}, url = {https://openreview.net/forum?id=Rpazl253IHb}, author = {Binxu Wang and David Mayo and Arturo Deza and Andrei Barbu and Colin Conwell} } @article {5060, title = {Deep compositional robotic planners that follow natural language commands}, year = {2020}, abstract = {

We demonstrate how a sampling-based robotic planner can be augmented to learn to understand a sequence of natural language commands in a continuous configuration space to move and manipulate objects. Our approach combines a deep network structured according to the parse of a complex command that includes objects, verbs, spatial relations, and attributes, with a sampling-based planner, RRT. A recurrent hierarchical deep network controls how the planner explores the environment, determines when a planned path is likely to achieve a goal, and estimates the confidence of each move to trade off exploitation and exploration between the network and the planner. Planners are designed to have near-optimal behavior when information about the task is missing, while networks learn to exploit observations which are available from the environment, making the two naturally complementary. Combining the two enables generalization to new maps, new kinds of obstacles, and more complex sentences that do not occur in the training set. Little data is required to train the model despite it jointly acquiring a CNN that extracts features from the environment as it learns the meanings of words. The model provides a level of interpretability through the use of attention maps allowing users to see its reasoning steps despite being an end-to-end model. This end-to-end model allows robots to learn to follow natural language commands in challenging continuous environments.
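An illustrative sketch of the division of labor described above, not the paper's planner: at each extension step, a sampler either trusts a learned proposal for where to grow the tree or falls back to uniform exploration, depending on the network's own confidence. The network interface and the dummy model are hypothetical.

```python
import random

def propose_sample(network, state, bounds):
    """network(state) is assumed to return (proposed_point, confidence in [0, 1])."""
    proposal, confidence = network(state)
    if random.random() < confidence:
        return proposal                                   # exploit the learned model
    return [random.uniform(lo, hi) for lo, hi in bounds]  # explore like a plain RRT

# Hypothetical stand-in for a trained model: always proposes one point, half-confident.
def dummy_network(state):
    return [0.9, 0.9], 0.5

for _ in range(3):
    print(propose_sample(dummy_network, state=[0.0, 0.0], bounds=[(0, 1), (0, 1)]))
```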

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4519, title = {Deep compositional robotic planners that follow natural language commands }, booktitle = {International Conference on Robotics and Automation (ICRA)}, year = {2020}, month = {05/2020}, address = {Palais des Congr{\`e}s de Paris, Paris, France}, author = {Yen-Ling Kuo and Katz, Boris and Andrei Barbu} } @article {5061, title = {Encoding formulas as deep networks: Reinforcement learning for zero-shot execution of LTL formulas}, year = {2020}, abstract = {

We demonstrate a reinforcement learning agent which uses a compositional recurrent neural network that takes as input an LTL formula and determines satisfying actions. The input LTL formulas have never been seen before, yet the network performs zero-shot generalization to satisfy them. This is a novel form of multi-task learning for RL agents where agents learn from one diverse set of tasks and generalize to a new set of diverse tasks. The formulation of the network enables this capacity to generalize. We demonstrate this ability in two domains. In a symbolic domain, the agent finds a sequence of letters that is accepted. In a Minecraft-like environment, the agent finds a sequence of actions that conform to the formula. While prior work could learn to execute one formula reliably given examples of that formula, we demonstrate how to encode all formulas reliably. This could form the basis of new multi-task agents that discover sub-tasks and execute them without any additional training, as well as the agents which follow more complex linguistic commands. The structures required for this generalization are specific to LTL formulas, which opens up an interesting theoretical question: what structures are required in neural networks for zero-shot generalization to different logics?

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4804, title = {Encoding formulas as deep networks: Reinforcement learning for zero-shot execution of LTL formulas}, booktitle = {2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, year = {2020}, address = {Las Vegas, NV, USA}, abstract = {

We demonstrate a reinforcement learning agent which uses a compositional recurrent neural network that takes as input an LTL formula and determines satisfying actions. The input LTL formulas have never been seen before, yet the network performs zero-shot generalization to satisfy them. This is a novel form of multi-task learning for RL agents where agents learn from one diverse set of tasks and generalize to a new set of diverse tasks. The formulation of the network enables this capacity to generalize. We demonstrate this ability in two domains. In a symbolic domain, the agent finds a sequence of letters that is accepted. In a Minecraft-like environment, the agent finds a sequence of actions that conform to the formula. While prior work could learn to execute one formula reliably given examples of that formula, we demonstrate how to encode all formulas reliably. This could form the basis of new multitask agents that discover sub-tasks and execute them without any additional training, as well as the agents which follow more complex linguistic commands. The structures required for this generalization are specific to LTL formulas, which opens up an interesting theoretical question: what structures are required in neural networks for zero-shot generalization to different logics?

}, doi = {10.1109/IROS45743.2020.9341325}, url = {https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=9340668}, author = {Kuo, Yen-Ling and Katz, Boris and Andrei Barbu} } @article {4811, title = {Learning a Natural-language to LTL Executable Semantic Parser for Grounded Robotics}, year = {2020}, month = {12/2020}, institution = {Proceedings of Conference on Robot Learning (CoRL-2020)}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor -- a pretrained end-to-end LTL planner -- must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL; it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, url = {https://corlconf.github.io/paper_385/}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5059, title = {Learning a natural-language to LTL executable semantic parser for grounded robotics}, year = {2020}, month = {08/2020}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor {\textemdash} a pretrained end-to-end LTL planner {\textemdash} must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL: it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, doi = {https://doi.org/10.48550/arXiv.2008.03277}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4700, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) workshop at NeurIPS 2020}, year = {2020}, month = {12/2020}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, url = {https://openreview.net/forum?id=_bokm801zhx}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {4511, title = {Deep Compositional Robotic Planners that Follow Natural Language Commands.}, year = {2019}, month = {12/2019}, address = {Vancouver Convention Centre, Vancouver, Canada}, url = {https://vigilworkshop.github.io/}, author = {Yen-Ling Kuo and Katz, Boris and Andrei Barbu} } @article {4498, title = {Deep video-to-video transformations for accessibility with an application to photosensitivity}, journal = {Pattern Recognition Letters}, year = {2019}, month = {06/2019}, abstract = {

We demonstrate how to construct a new class of visual assistive technologies that, rather than extract symbolic information, learn to transform the visual environment to make it more accessible. We do so without engineering which transformations are useful, allowing for arbitrary modifications of the visual input. As an instantiation of this idea we tackle a problem that affects and hurts millions worldwide: photosensitivity. Any time an affected person opens a website, video, or some other medium that contains an adverse visual stimulus, either intended or unintended, they might experience a seizure with potentially significant consequences. We show how a deep network can learn a video-to-video transformation rendering such stimuli harmless while otherwise preserving the video. This approach uses a specification of the adverse phenomena, the forward transformation, to learn the inverse transformation. We show how such a network generalizes to real-world videos that have triggered numerous seizures, both by mistake and in politically-motivated attacks. A number of complementary approaches are demonstrated, including using a hand-crafted generator and a GAN using a differentiable perceptual metric. Such technology can be deployed offline to protect videos before they are shown, or online with assistive glasses or real-time post-processing. Other applications of this general technique include helping those with limited vision, attention deficit hyperactivity disorder, and autism.
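A toy sketch of the idea of using a forward specification to learn an inverse transformation, in PyTorch. The flicker proxy (frame-to-frame luminance change), the single 3D convolution standing in for a video-to-video network, and the loss weights are assumptions for illustration, far cruder than the paper's specification and perceptual metrics.

```python
import torch
import torch.nn as nn

transform = nn.Conv3d(3, 3, kernel_size=3, padding=1)     # stand-in video-to-video network
video = torch.rand(1, 3, 16, 32, 32)                      # (batch, channels, frames, H, W)

out = transform(video)
luminance = out.mean(dim=1)                               # rough per-frame luminance
flicker = (luminance[:, 1:] - luminance[:, :-1]).pow(2).mean()   # proxy for the adverse stimulus
fidelity = (out - video).pow(2).mean()                    # stay close to the original video
loss = flicker + 0.1 * fidelity                           # remove flicker, preserve content
loss.backward()
```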

}, issn = {01678655}, doi = {10.1016/j.patrec.2019.01.019}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0167865519300133}, author = {Andrei Barbu and Banda, Dalitso and Katz, Boris} } @conference {4518, title = {Learning Language from Vision.}, booktitle = {Workshop on Visually Grounded Interaction and Language (ViGIL) at the Thirty-third Annual Conference on Neural Information Processing Systems (NeurIPS)}, year = {2019}, month = {12/2019}, address = {Vancouver Convention Center, Vancouver, Canada}, author = {Candace Ross and Yevgeni Berzak and Boris Katz and Andrei Barbu} } @proceedings {4388, title = {ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We collect a large real-world test set, ObjectNet, for object recognition with controls where object backgrounds, rotations, and imaging viewpoints are random. Most scientific experiments have controls, confounds which are removed from the data, to ensure that subjects cannot perform a task by exploiting trivial correlations in the data. Historically, large machine learning and computer vision datasets have lacked such controls. This has resulted in models that must be fine-tuned for new datasets and perform better on datasets than in real-world applications. When tested on ObjectNet, object detectors show a 40-45\% drop in performance, with respect to their performance on other benchmarks, due to the controls for biases. Controls make ObjectNet robust to fine-tuning showing only small performance increases. We develop a highly automated platform that enables gathering datasets with controls by crowdsourcing image capturing and annotation. ObjectNet is the same size as the ImageNet test set (50,000 images), and by design does not come paired with a training set in order to encourage generalization. The dataset is both easier than ImageNet (objects are largely centered and unoccluded) and harder (due to the controls). Although we focus on object recognition here, data with controls can be gathered at scale using automated tools throughout machine learning to generate datasets that exercise models in new ways thus providing valuable feedback to researchers. This work opens up new avenues for research in generalizable, robust, and more human-like computer vision and in creating datasets where results are predictive of real-world performance.

}, author = {Andrei Barbu and David Mayo and Julian Alverio and William Luo and Christopher Wang and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz} } @conference {4112, title = {Deep sequential models for sampling-based planning}, booktitle = {The IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2018)}, year = {2018}, month = {10/2018 }, address = {Madrid, Spain}, abstract = {

We demonstrate how a sequence model and a sampling-based planner can influence each other to produce efficient plans and how such a model can automatically learn to take advantage of observations of the environment. Sampling-based planners such as RRT generally know nothing of their environments even if they have traversed similar spaces many times. A sequence model, such as an HMM or LSTM, guides the search for good paths. The resulting model, called DeRRT*, observes the state of the planner and the local environment to bias the next move and next planner state. The neural-network-based models avoid manual feature engineering by co-training a convolutional network which processes map features and observations from sensors. We incorporate this sequence model in a manner that combines its likelihood with the existing bias for searching large unexplored Voronoi regions. This leads to more efficient trajectories with fewer rejected samples even in difficult domains such as when escaping bug traps. This model can also be used for dimensionality reduction in multi-agent environments with dynamic obstacles. Instead of planning in a high-dimensional space that includes the configurations of the other agents, we plan in a low-dimensional subspace relying on the sequence model to bias samples using the observed behavior of the other agents. The techniques presented here are general, include both graphical models and deep learning approaches, and can be adapted to a range of planners.

}, doi = {10.1109/IROS.2018.8593947}, url = {https://ieeexplore.ieee.org/document/8593947}, author = {Yen-Ling Kuo and Andrei Barbu and Boris Katz} } @conference {4109, title = {Grounding language acquisition by training semantic parsersusing captioned videos}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP 2018), }, year = {2018}, month = {10/2018 }, address = {Brussels, Belgium}, abstract = {

We develop a semantic parser that is trained in a grounded setting using pairs of videos captioned with sentences. This setting is both data-efficient, requiring little annotation, and similar to the experience of children where they observe their environment and listen to speakers. The semantic parser recovers the meaning of English sentences despite not having access to any annotated sentences. It does so despite the ambiguity inherent in vision where a sentence may refer to any combination of objects, object properties, relations or actions taken by any agent in a video. For this task, we collected a new dataset for grounded language acquisition. Learning a grounded semantic parser {\textemdash} turning sentences into logical forms using captioned videos {\textemdash} can significantly expand the range of data that parsers can be trained on, lower the effort of training a semantic parser, and ultimately lead to a better understanding of child language acquisition.

}, isbn = {978-1-948087-84-1}, url = {http://aclweb.org/anthology/D18-1285}, author = {Candace Ross and Andrei Barbu and Yevgeni Berzak and Battushig Myanganbayar and Boris Katz} } @article {3963, title = {Partially Occluded Hands: A challenging new dataset for single-image hand pose estimation}, year = {2018}, month = {12/2018}, abstract = {

Recognizing the pose of hands matters most when hands are interacting with other objects. To understand how well both machines and humans perform on single-image 2D hand-pose reconstruction from RGB images, we collected a challenging dataset of hands interacting with 148 objects. We used a novel methodology that provides the same hand in the same pose both with the object being present and occluding the hand and without the object occluding the hand. Additionally, we collected a wide range of grasps for each object designing the data collection methodology to ensure this diversity. Using this dataset we measured the performance of two state-of-the-art hand-pose recognition methods showing that both are extremely brittle when faced with even light occlusion from an object. This is not evident in previous datasets because they often avoid hand-object occlusions and because they are collected from videos where hands are often between objects and mostly unoccluded. We annotated a subset of the dataset and used that to show that humans are robust with respect to occlusion, and also to characterize human hand perception, the space of grasps that seem to be considered, and the accuracy of reconstructing occluded portions of hands. We expect that such data will be of interest to both the vision community for developing more robust hand-pose algorithms and to the robotic grasp planning community for learning such grasps. The dataset is available at occludedhands.com

}, keywords = {dataset, Partial occlusion, RGB hand-pose reconstruction}, author = {Battushig Myanganbayar and Cristina Mata and Gil Dekel and Katz, Boris and Guy Ben-Yosef and Andrei Barbu} } @conference {3964, title = {Partially Occluded Hands: A challenging new dataset for single-image hand pose estimation}, booktitle = {The 14th Asian Conference on Computer Vision (ACCV 2018)}, year = {2018}, month = {12/2018}, abstract = {

Recognizing the pose of hands matters most when hands are interacting with other objects. To understand how well both machines and humans perform on single-image 2D hand-pose reconstruction from RGB images, we collected a challenging dataset of hands interacting with 148 objects. We used a novel methodology that provides the same hand in the same pose both with the object being present and occluding the hand and without the object occluding the hand. Additionally, we collected a wide range of grasps for each object designing the data collection methodology to ensure this diversity. Using this dataset we measured the performance of two state-of-the-art hand-pose recognition methods showing that both are extremely brittle when faced with even light occlusion from an object. This is not evident in previous datasets because they often avoid hand-object occlusions and because they are collected from videos where hands are often between objects and mostly unoccluded. We annotated a subset of the dataset and used that to show that humans are robust with respect to occlusion, and also to characterize human hand perception, the space of grasps that seem to be considered, and the accuracy of reconstructing occluded portions of hands. We expect that such data will be of interest to both the vision community for developing more robust hand-pose algorithms and to the robotic grasp planning community for learning such grasps. The dataset is available at occludedhands.com

}, keywords = {dataset, Partial occlusion, RGB hand-pose reconstruction}, url = {http://accv2018.net/}, author = {Battushig Myanganbayar and Cristina Mata and Gil Dekel and Boris Katz and Guy Ben-Yosef and Andrei Barbu} } @conference {3492, title = {Temporal Grounding Graphs for Language Understanding with Accrued Visual-Linguistic Context}, booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI 2017)}, year = {2017}, month = {08/2017}, address = {Melbourne, Australia}, abstract = {

A robot{\textquoteright}s ability to understand or ground natural language instructions is fundamentally tied to its knowledge about the surrounding world. We present an approach to grounding natural language utterances in the context of factual information gathered through natural-language interactions and past visual observations. A probabilistic model estimates, from a natural language utterance, the objects, relations, and actions that the utterance refers to, the objectives for future robotic actions it implies, and generates a plan to execute those actions while updating a state representation to include newly acquired knowledge from the visual-linguistic context. Grounding a command necessitates a representation for past observations and interactions; however, maintaining the full context consisting of all possible observed objects, attributes, spatial relations, actions, etc., over time is intractable. Instead, our model, Temporal Grounding Graphs, maintains a learned state representation for a belief over factual groundings, those derived from natural-language interactions, and lazily infers new groundings from visual observations using the context implied by the utterance. This work significantly expands the range of language that a robot can understand by incorporating factual knowledge and observations of its workspace into its inference about the meaning and grounding of natural-language utterances.

}, author = {Rohan Paul and Andrei Barbu and Sue Felshin and Boris Katz and Nicholas Roy} } @article {2214, title = {Anchoring and Agreement in Syntactic Annotations}, year = {2016}, month = {09/2016}, abstract = {

Published in the Proceedings of EMNLP 2016

We present a study on two key characteristics of human syntactic annotations: anchoring and agreement. Anchoring is a well-known cognitive bias in human decision making, where judgments are drawn towards preexisting values. We study the influence of anchoring on a standard approach to creation of syntactic resources where syntactic annotations are obtained via human editing of tagger and parser output. Our experiments demonstrate a clear anchoring effect and reveal unwanted consequences, including overestimation of parsing performance and lower quality of annotations in comparison with human-based annotations. Using sentences from the Penn Treebank WSJ, we also report systematically obtained inter-annotator agreement estimates for English dependency parsing. Our agreement results control for parser bias, and are consequential in that they are on par with state of the art parsing performance for English newswire. We discuss the impact of our findings on strategies for future annotation efforts and parser evaluations.

}, author = {Yevgeni Berzak and Yan Huang and Andrei Barbu and Anna Korhonen and Boris Katz} } @article {2133, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, year = {2016}, month = {09/2016}, abstract = {

Understanding language goes hand in hand with the ability to integrate complex contextual information obtained via perception. In this work, we present a novel task for grounded language understanding: disambiguating a sentence given a visual scene that depicts one of the possible interpretations of that sentence. To this end, we introduce a new multimodal corpus containing ambiguous sentences, representing a wide range of syntactic, semantic, and discourse ambiguities, coupled with videos that visualize the different interpretations of each sentence. We address this task by extending a vision model that determines whether a sentence is depicted by a video. We demonstrate how such a model can be adjusted to recognize different interpretations of the same underlying sentence, allowing us to disambiguate sentences in a unified fashion across the different ambiguity types.
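
For illustration: once a model can score how well an interpretation is depicted by a video, disambiguation reduces to selecting the highest-scoring interpretation. The scoring function in this sketch is a stand-in, not the extended vision model described above.

from typing import Callable, Sequence

def disambiguate(interpretations: Sequence[str],
                 video,
                 score: Callable[[str, object], float]) -> str:
    # Return the interpretation the video supports most strongly.
    return max(interpretations, key=lambda s: score(s, video))

# Toy usage with a stand-in scorer (invented numbers).
toy_scores = {"the person approached the chair with a bag": 0.2,
              "the person approached the chair that had a bag on it": 0.7}
print(disambiguate(list(toy_scores), video=None,
                   score=lambda s, v: toy_scores[s]))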

}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {1885, title = {Language and Vision Ambiguities (LAVA) Corpus}, year = {2016}, month = {01/2016}, abstract = {

Ambiguity is one of the defining characteristics of human languages, and language understanding crucially relies on the ability to obtain unambiguous representations of linguistic content. While some ambiguities can be resolved using intra-linguistic contextual cues, the disambiguation of many linguistic constructions requires integration of world knowledge and perceptual information obtained from other modalities. In this work, we focus on the problem of grounding language in the visual modality, and introduce a novel task for visual and linguistic understanding which requires resolving linguistic ambiguities by utilizing the visual context of the utterance.

To address this challenge, we release the Language and Vision Ambiguities (LAVA) corpus. LAVA contains ambiguous sentences coupled with visual scenes that depict the different interpretations of each sentence. The sentences in the corpus are annotated with syntactic and semantic parses, and cover a wide range of linguistic ambiguities, including PP and VP attachment, conjunctions, logical forms, anaphora and ellipsis. In addition to the sentence disambiguation challenge, the corpus will support a variety of related tasks which use natural language as a medium for expressing visual understanding.

Reference:
Yevgeni Berzak, Andrei Barbu, Daniel Harari, Boris Katz, and Shimon Ullman (2015). Do You See What I Mean? Visual Resolution of Linguistic Ambiguities. Conference on Empirical Methods in Natural Language Processing (EMNLP), Lisbon, Portugal.


}, url = {http://web.mit.edu/lavacorpus/}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {2319, title = {A look back at the June 2016 BMM Workshop in Sestri Levante, Italy}, year = {2016}, month = {11/2016}, abstract = {

"On June 20th 2016, the first of a series of workshops on the science of intelligence kicked off in Sestri Levante, Italy. Organized by the Center for Brains, Minds, and Machines (CBMM), the Italian Institute of Technology (IIT), and the Max Plank Institution for Biological Cybernetics, this three-day workshop brought together an international cast of researchers to discuss human and machine intelligence. Computer scientists, cognitive scientists, and neuroscientists collaborated in a wide-ranging conversation about integrating different approaches to intelligence, both artificial and human, into a coherent science of intelligence..."

View the BMM Workshop in Sestri Levante page and watch the videos.

}, author = {Boris Katz and Andrei Barbu} } @article {1092, title = {A Compositional Framework for Grounding Language Inference, Generation, and Acquisition in Video}, year = {2015}, abstract = {

We present an approach for simultaneously reasoning about a video clip and an entire natural-language sentence. The compositional nature of language is exploited to construct models that represent the meanings of entire sentences, composed out of the meanings of the words in those sentences, mediated by a grammar that encodes the predicate-argument relations. We demonstrate that these models faithfully represent the meanings of sentences and are sensitive to how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions) affect the meaning of a sentence and how it is grounded in video. We exploit this methodology in three ways. In the first, a video clip and a sentence are taken as input, and the participants in the event described by the sentence are highlighted, even when the clip depicts multiple similar simultaneous events. In the second, a video clip alone is taken as input, and a sentence is generated that describes an event in that clip. In the third, a corpus of video clips is paired with sentences that describe some of the events in those clips, and the meanings of the words in those sentences are learned. We learn these meanings without needing to specify which attribute of the video clips each word in a given sentence refers to. The learned meaning representations are shown to be intelligible to humans.
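
The following toy sketch conveys only the compositional scoring idea: each word contributes a score over the object tracks bound to its arguments, and a sentence is scored under a particular binding. The framework in the paper is considerably richer; the functions and numbers below are invented for illustration.

import math

def sentence_score(word_models, binding, tracks):
    # word_models: list of (score_fn, argument_names); binding maps argument
    # names to track ids; tracks maps track ids to per-frame detections.
    total = 0.0
    for score_fn, args in word_models:
        total += math.log(score_fn(*[tracks[binding[a]] for a in args]))
    return total

# Toy example for "person approached chair".
tracks = {"t0": [(10, 50), (20, 50), (30, 50)],   # (x, y) positions over time
          "t1": [(60, 50), (60, 50), (60, 50)]}

def person(track):   return 0.9          # stand-in detector confidence
def chair(track):    return 0.8
def approached(agent, goal):             # distance should shrink over time
    d = [abs(a[0] - g[0]) for a, g in zip(agent, goal)]
    return 0.9 if d[-1] < d[0] else 0.1

word_models = [(person, ["agent"]), (chair, ["goal"]),
               (approached, ["agent", "goal"])]
print(sentence_score(word_models, {"agent": "t0", "goal": "t1"}, tracks))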

}, doi = {10.1613/jair.4556}, url = {https://www.jair.org/media/4556/live-4556-8631-jair.pdf}, author = {Haonan Yu and N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @conference {1429, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, booktitle = {Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal}, year = {2015}, month = {09/2015}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {461, title = {Abstracts of the 2014 Brains, Minds, and Machines Summer Course}, number = {024}, year = {2014}, month = {09/2014}, abstract = {

A compilation of abstracts from the student projects of the 2014 Brains, Minds, and Machines Summer Course, held at the Marine Biological Laboratory in Woods Hole, May 29 - June 12, 2014.

}, author = {Nadav Amir and Tarek R. Besold and Raffaello Camoriano and Goker Erdogan and Thomas Flynn and Grant Gillary and Jesse Gomez and Ariel Herbert-Voss and Gladia Hotan and Jonathan Kadmon and Scott W. Linderman and Tina T. Liu and Andrew Marantan and Joseph Olson and Garrick Orchard and Dipan K. Pal and Giulia Pasquale and Honi Sanders and Carina Silberer and Kevin A Smith and Carlos Stein N. de Briton and Jordan W. Suchow and M. H. Tessler and Guillaume Viejo and Drew Walker and Leila Wehbe and Andrei Barbu and Leyla Isik and Emily Mackevicius and Yasmine Meroz} } @article {452, title = {The Compositional Nature of Event Representations in the Human Brain}, number = {011}, year = {2014}, month = {07/14/2014}, abstract = {

How does the human brain represent simple compositions of constituents: actors, verbs, objects, directions, and locations? Subjects viewed videos during neuroimaging (fMRI) sessions, from which sentential descriptions of those videos were identified by decoding the brain representations based only on their fMRI activation patterns. Constituents (e.g., fold and shirt) were independently decoded from a single presentation. Independent constituent classification was then compared to joint classification of aggregate concepts (e.g., fold-shirt); results were similar as measured by accuracy and correlation. The brain regions used for independent constituent classification are largely disjoint and largely cover those used for joint classification. This allows recovery of sentential descriptions of stimulus videos by composing the results of the independent constituent classifiers. Furthermore, classifiers trained on the words one set of subjects think of when watching a video can recognize sentences a different subject thinks of when watching a different video.
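
To make the composition step concrete, the sketch below trains one classifier per constituent on synthetic activation patterns and assembles a description from their independent predictions; the classifier choice and the data are stand-ins, not the analysis pipeline used in the paper.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(1)
n_trials, n_voxels = 200, 50
X = rng.normal(size=(n_trials, n_voxels))          # stand-in activation patterns
verbs   = rng.choice(["fold", "carry"], size=n_trials)
objects = rng.choice(["shirt", "chair"], size=n_trials)

constituent_clfs = {
    "verb":   LogisticRegression(max_iter=1000).fit(X, verbs),
    "object": LogisticRegression(max_iter=1000).fit(X, objects),
}

def describe(activation):
    # Compose independent constituent predictions into a sentential description.
    preds = {name: clf.predict(activation[None, :])[0]
             for name, clf in constituent_clfs.items()}
    return f"{preds['verb']} the {preds['object']}"

print(describe(X[0]))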

}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @article {1367, title = {Seeing is Worse than Believing: Reading People{\textquoteright}s Minds Better than Computer-Vision Methods Recognize Actions}, number = {012}, year = {2014}, month = {09/2014}, abstract = {

We had human subjects perform a one-out-of-six class action recognition task from video stimuli while undergoing functional magnetic resonance imaging (fMRI). Support-vector machines (SVMs) were trained on the recovered brain scans to classify actions observed during imaging, yielding average classification accuracy of 69.73\% when tested on scans from the same subject and of 34.80\% when tested on scans from different subjects. An apples-to-apples comparison was performed with all publicly available software that implements state-of-the-art action recognition on the same video corpus with the same cross-validation regimen and same partitioning into training and test sets, yielding classification accuracies between 31.25\% and 52.34\%. This indicates that one can read people{\textquoteright}s minds better than state-of-the-art computer-vision methods can perform action recognition.
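
The two evaluation regimes can be illustrated as follows with synthetic data standing in for brain scans; the classifier settings and fold counts here are assumptions of the sketch rather than the exact setup reported above.

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, LeaveOneGroupOut

rng = np.random.default_rng(0)
n_scans, n_voxels, n_subjects = 600, 40, 5
X = rng.normal(size=(n_scans, n_voxels))              # stand-in for brain scans
y = rng.integers(0, 6, size=n_scans)                  # six action classes
subjects = rng.integers(0, n_subjects, size=n_scans)  # which subject produced each scan

clf = SVC(kernel="linear")

# Within-subject regime: cross-validation over the scans of a single subject.
mask = subjects == 0
within = cross_val_score(clf, X[mask], y[mask], cv=3).mean()

# Cross-subject regime: train on the other subjects, test on the held-out one.
cross = cross_val_score(clf, X, y, groups=subjects, cv=LeaveOneGroupOut()).mean()
print(f"within-subject accuracy {within:.2f}, cross-subject accuracy {cross:.2f}")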

}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @inbook {1090, title = {Seeing is worse than believing: Reading people{\textquoteright}s minds better than computer-vision methods recognize actions}, booktitle = {Computer Vision {\textendash} ECCV 2014, Lecture Notes in Computer Science}, series = {13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V}, volume = {8693}, year = {2014}, pages = {612{\textendash}627}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Zurich, Switzerland}, abstract = {

We had human subjects perform a one-out-of-six class action recognition task from video stimuli while undergoing functional magnetic resonance imaging (fMRI). Support-vector machines (SVMs) were trained on the recovered brain scans to classify actions observed during imaging, yielding average classification accuracy of 69.73\% when tested on scans from the same subject and of 34.80\% when tested on scans from different subjects. An apples-to-apples comparison was performed with all publicly available software that implements state-of-the-art action recognition on the same video corpus with the same cross-validation regimen and same partitioning into training and test sets, yielding classification accuracies between 31.25\% and 52.34\%. This indicates that one can read people{\textquoteright}s minds better than state-of-the-art computer-vision methods can perform action recognition.

}, doi = {10.1007/978-3-319-10602-1_40}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @article {442, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition In Video.}, number = {006}, year = {2014}, month = {05/2014}, abstract = {

We present a system that demonstrates how the compositional structure of events, in concert with the compositional structure of language, can interplay with the underlying focusing mechanisms in video action recognition, thereby providing a medium not only for top-down and bottom-up integration, but also for multi-modal integration between vision and language. We show how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions), in the form of whole sentential descriptions mediated by a grammar, guide the activity-recognition process. Further, the utility and expressiveness of our framework are demonstrated by performing three separate tasks in the domain of multi-activity videos: sentence-guided focus of attention, generation of sentential descriptions of video, and query-based video search, simply by leveraging the framework in different manners.

}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @conference {1089, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition In Video}, booktitle = {CVPR}, year = {2014}, month = {07/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Columbus, Ohio}, abstract = {

We present a system that demonstrates how the compositional structure of events, in concert with the compositional structure of language, can interplay with the underlying focusing mechanisms in video action recognition, providing a medium for top-down and bottom-up integration as well as multi-modal integration between vision and language. We show how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions), in the form of whole-sentence descriptions mediated by a grammar, guide the activity-recognition process. Further, the utility and expressiveness of our framework are demonstrated by performing three separate tasks in the domain of multi-activity video: sentence-guided focus of attention, generation of sentential description, and query-based search, simply by leveraging the framework in different manners.

}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @article {1094, title = {Seeing what you{\textquoteright}re told, sentence guided activity recognition in video}, year = {2014}, publisher = {IEEE}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} }