@article {5255, title = {BrainBERT: Self-supervised representation learning for Intracranial Electrodes}, year = {2023}, month = {02/2023}, address = {Kigali, Rwanda, Africa}, abstract = {

We create a reusable Transformer, BrainBERT, for intracranial recordings, bringing modern representation learning approaches to neuroscience. Much like in NLP and speech recognition, this Transformer enables classifying complex concepts, i.e., decoding neural data, with higher accuracy and with much less data by being pretrained in an unsupervised manner on a large corpus of unannotated neural recordings. Our approach generalizes to new subjects with electrodes in new positions and to unrelated tasks, showing that the representations robustly disentangle the neural signal. Just like in NLP, where one can study language by investigating what a language model learns, this approach opens the door to investigating the brain by investigating what a model of the brain learns. As a first step along this path, we demonstrate a new analysis of the intrinsic dimensionality of the computations in different areas of the brain. To construct these representations, we combine a technique for producing super-resolution spectrograms of neural data with an approach designed for generating contextual representations of audio by masking. In the future, far more concepts will be decodable from neural recordings by using representation learning, potentially unlocking the brain like language models unlocked language.
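
To make the masking objective concrete, here is a minimal sketch of BERT-style masked-spectrogram pretraining (not the authors' implementation; the tensor shapes, mask ratio, and module names are assumptions made for illustration):

import torch
import torch.nn as nn

class MaskedSpectrogramModel(nn.Module):
    # Toy BERT-style encoder over neural spectrograms (illustrative only).
    def __init__(self, n_freq=40, d_model=128, n_layers=4, n_heads=8):
        super().__init__()
        self.in_proj = nn.Linear(n_freq, d_model)
        layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.out_proj = nn.Linear(d_model, n_freq)  # reconstruct masked time-frequency bins

    def forward(self, spec, mask):
        # spec: (batch, time, n_freq); mask: (batch, time) booleans, True = masked out
        x = spec.masked_fill(mask.unsqueeze(-1), 0.0)   # hide the masked frames
        h = self.encoder(self.in_proj(x))
        recon = self.out_proj(h)
        return ((recon - spec) ** 2)[mask].mean()       # loss only on masked positions

model = MaskedSpectrogramModel()
spec = torch.randn(2, 100, 40)      # a batch of unannotated spectrograms
mask = torch.rand(2, 100) < 0.15    # randomly mask about 15 percent of time steps
loss = model(spec, mask)
loss.backward()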

}, keywords = {decoding, language models, Neuroscience, self-supervision, transformer}, url = {https://openreview.net/forum?id=xmcYx_reUn6}, author = {Christopher Wang and Vighnesh Subramaniam and Adam Uri Yaari and Gabriel Kreiman and Boris Katz and Ignacio Cases and Andrei Barbu} } @conference {5322, title = {Zero-shot linear combinations of grounded social interactions with Linear Social MDPs}, booktitle = {Proceedings of the 37th AAAI Conference on Artificial Intelligence (AAAI)}, year = {2023}, month = {02/2023}, abstract = {

Humans and animals engage in rich social interactions. It is often theorized that a relatively small number of basic social interactions give rise to the full range of behavior observed. But no computational theory explaining how social interactions combine has been proposed before. We do so here. We take a model, the Social MDP, which is able to express a range of social interactions, and extend it to represent linear combinations of social interactions. Practically, for robotics applications, such models can now express not just that one agent should help another, but goal-centric social interactions. Perhaps an agent is helping someone get dressed, but preventing them from falling, and is happy to exchange stories in the meantime. How an agent responds socially should depend on what it thinks the other agent is doing at that point in time. To encode this notion, we take linear combinations of social interactions as defined in Social MDPs, and compute the weights on those combinations on the fly depending on the estimated goals of other agents. This new model, the Linear Social MDP, enables zero-shot reasoning about complex social interactions, provides a mathematical basis for the long-standing intuition that social interactions should compose, and leads to interesting new behaviors that we validate using human observers. Complex social interactions are part of the future of intelligent agents, and having principled mathematical models built on a foundation like MDPs will make it possible to bring social interactions to every robotic application.
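
A schematic of this reward composition, as a sketch under assumed names and interfaces (the paper's full model, which infers the other agent's goals in order to set the weights, is not reproduced here):

def linear_social_reward(state, action, physical_reward, social_terms, weights):
    # Combine a physical reward with a weighted sum of social-interaction terms.
    # social_terms: dict mapping interaction name -> reward function r_i(state, action)
    # weights:      dict mapping interaction name -> weight, computed on the fly from the
    #               estimated goals of the other agent (assumed given here)
    r = physical_reward(state, action)
    for name, r_i in social_terms.items():
        r += weights.get(name, 0.0) * r_i(state, action)
    return r

# Hypothetical usage: mostly helping, partly exchanging, depending on inferred goals.
social_terms = {
    "help":     lambda s, a: s["other_progress_delta"],
    "hinder":   lambda s, a: -s["other_progress_delta"],
    "exchange": lambda s, a: s["items_traded"],
}
weights = {"help": 0.8, "exchange": 0.2}
state = {"other_progress_delta": 1.0, "items_traded": 0.0}
print(linear_social_reward(state, None, lambda s, a: 0.5, social_terms, weights))  # 1.3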

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @conference {5302, title = {The Aligned Multimodal Movie Treebank: An audio, video, dependency-parse treebank}, booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, year = {2022}, abstract = {

Treebanks have traditionally included only text and were derived from written sources such as newspapers or the web. We introduce the Aligned Multimodal Movie Treebank (AMMT), an English language treebank derived from dialog in Hollywood movies which includes transcriptions of the audio-visual streams with word-level alignment, as well as part of speech tags and dependency parses in the Universal Dependencies formalism. AMMT consists of 31,264 sentences and 218,090 words, making it the third-largest UD English treebank and the only multimodal treebank in UD. To help with the web-based annotation effort, we also introduce the Efficient Audio Alignment Annotator (EAAA), a companion tool that enables annotators to significantly speed up their annotation process.

}, author = {Adam Yaari and Jan DeWitt and Henry Hu and Bennett Stankovits and Sue Felshin and Yevgeni Berzak and Helena Aparicio and Boris Katz and Ignacio Cases and Andrei Barbu} } @article {5050, title = {Incorporating Rich Social Interactions Into MDPs}, year = {2022}, abstract = {

Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other{\textquoteright}s hidden rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions zero-shot in new environments; like humans, it can engage socially in novel ways even without a single example of that social interaction. Moreover, the judgments of these Social MDPs align closely with those of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics, MDPs.

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @article {5299, title = {Quantifying the Emergence of Symbolic Communication}, journal = {CogSci}, year = {2022}, abstract = {

We quantitatively study the emergence of symbolic communication in humans with a communication game that attempts to recapitulate an essential step in the development of human language: the emergence of shared signs. In our experiment, a teacher must communicate a first order logic formula to a student through a narrow channel deprived of common shared signs: subjects cannot communicate with each other with the sole exception of car motions in a computer game. Subjects spontaneously develop a shared vocabulary of car motions including indices, icons, and symbols, spanning both task-specific and task-agnostic concepts such as {\textquotedblleft}square{\textquotedblright} and {\textquotedblleft}understand{\textquotedblright}. We characterize the conditions under which indices, icons, and symbols arise, finding that symbols are harder to establish than icons and indices. We observe that the dominant sign category being developed transitions from indices to icons to symbols, and we identify communicating in ambiguous game environments as a pressure for icon and symbol development.

}, url = {https://escholarship.org/uc/item/08n3293v}, author = {Emily Cheng and Yen-Ling Kuo and Josefina Correa and Boris Katz and Ignacio Cases and Andrei Barbu} } @conference {5301, title = {Spontaneous sign emergence in humans and machines through an embodied communication game}, booktitle = {JCoLE Workshop}, year = {2022}, author = {Emily Cheng and Yen-Ling Kuo and Ignacio Cases and Boris Katz and Andrei Barbu} } @article {5051, title = {Trajectory Prediction with Linguistic Representations}, year = {2022}, abstract = {

Language allows humans to build mental models that interpret what is happening around them, resulting in more accurate long-term predictions. We present a novel trajectory prediction model that uses linguistic intermediate representations to forecast trajectories, and is trained using trajectory samples with partially-annotated captions. The model learns the meaning of each of the words without direct per-word supervision. At inference time, it generates a linguistic description of trajectories which captures maneuvers and interactions over an extended time interval. This generated description is used to refine predictions of the trajectories of multiple agents. We train and validate our model on the Argoverse dataset, and demonstrate improved accuracy results in trajectory prediction. In addition, our model is more interpretable: it presents part of its reasoning in plain language as captions, which can aid model development and help build confidence in the model before deploying it.

}, author = {Yen-Ling Kuo and Xin Huang and Andrei Barbu and Stephen G. McGill and Boris Katz and John J. Leonard and Guy Rosman} } @article {5054, title = {Compositional Networks Enable Systematic Generalization for Grounded Language Understanding}, year = {2021}, abstract = {

Humans are remarkably flexible when understanding new sentences that include combinations of concepts they have never encountered before. Recent work has shown that while deep networks can mimic some human language abilities when presented with novel sentences, systematic variation uncovers the limitations in the language-understanding abilities of networks. We demonstrate that these limitations can be overcome by addressing the generalization challenges in the gSCAN dataset, which explicitly measures how well an agent is able to interpret novel linguistic commands grounded in vision, e.g., novel pairings of adjectives and nouns. The key principle we employ is compositionality: that the compositional structure of networks should reflect the compositional structure of the problem domain they address, while allowing other parameters to be learned end-to-end. We build a general-purpose mechanism that enables agents to generalize their language understanding to compositional domains. Crucially, our network has the same state-of-the-art performance as prior work while generalizing its knowledge when prior work does not. Our network also provides a level of interpretability that enables users to inspect what each part of the network learns. Robust grounded language understanding without dramatic failures and without corner cases is critical to building safe and fair robots; we demonstrate the significant role that compositionality can play in achieving that goal.

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5056, title = {Compositional RL Agents That Follow Language Commands in Temporal Logic}, year = {2021}, abstract = {

We demonstrate how a reinforcement learning agent can use compositional recurrent neural networks to learn to carry out commands specified in linear temporal logic (LTL). Our approach takes as input an LTL formula, structures a deep network according to the parse of the formula, and determines satisfying actions. This compositional structure of the network enables zero-shot generalization to significantly more complex unseen formulas. We demonstrate this ability in multiple problem domains with both discrete and continuous state-action spaces. In a symbolic domain, the agent finds a sequence of letters that satisfy a specification. In a Minecraft-like environment, the agent finds a sequence of actions that conform to a formula. In the Fetch environment, the robot finds a sequence of arm configurations that move blocks on a table to fulfill the commands. While most prior work can learn to execute one formula reliably, we develop a novel form of multi-task learning for RL agents that allows them to learn from a diverse set of tasks and generalize to a new set of diverse tasks without any additional training. The compositional structures presented here are not specific to LTL, thus opening the path to RL agents that perform zero-shot generalization in other compositional domains.
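
To illustrate what structuring a network according to the parse of a formula can look like, here is a toy sketch (the module definitions, the nested-tuple formula encoding, and the single "until" operator are assumptions for exposition; the paper's recurrent modules and RL training are not reproduced):

import torch
import torch.nn as nn

class AtomModule(nn.Module):
    # Embeds a single proposition from an observation (toy).
    def __init__(self, d):
        super().__init__()
        self.f = nn.Linear(d, d)
    def forward(self, obs):
        return torch.relu(self.f(obs))

class BinaryOpModule(nn.Module):
    # Combines the embeddings of two sub-formulas (e.g., for an "until" node).
    def __init__(self, d):
        super().__init__()
        self.f = nn.Linear(2 * d, d)
    def forward(self, left, right):
        return torch.relu(self.f(torch.cat([left, right], dim=-1)))

def build_network(formula, d, atom_modules):
    # Recursively mirror the parse of a formula: one module per parse-tree node.
    # formula is a nested tuple such as ("until", "a", "b").
    if isinstance(formula, str):
        return lambda obs: atom_modules[formula](obs)
    _op, lhs, rhs = formula
    op_module = BinaryOpModule(d)
    left = build_network(lhs, d, atom_modules)
    right = build_network(rhs, d, atom_modules)
    return lambda obs: op_module(left(obs), right(obs))

d = 16
atoms = {"a": AtomModule(d), "b": AtomModule(d)}
formula_embedding = build_network(("until", "a", "b"), d, atoms)
print(formula_embedding(torch.randn(1, d)).shape)  # torch.Size([1, 16])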

}, author = {Yen-Ling Kuo and Andrei Barbu and Boris Katz} } @article {4825, title = {Large-scale benchmarking of deep neural network models in mouse visual cortex reveals patterns similar to those observed in macaque visual cortex}, year = {2021}, abstract = {

What is the representational structure of mouse visual cortex and how is it shaped? Mice obviously interact with the world and recognize objects but unlike in primates, a majority of research to date suggests the activity of their visual cortex may not be so well described by deep neural networks trained for object recognition. Using the Allen Brain Observatory{\textquoteright}s 2-photon calcium-imaging dataset of activity in over 30,000 rodent visual cortical neurons recorded in response to natural scenes, we work to resolve this discrepancy and demonstrate that modern neural networks can indeed be used to explain activity in the mouse visual cortex to a more reasonable degree than previously suggested. In so doing, we elucidate at large scale the properties of networks which best match the biological visual system, with both representational similarity analysis and encoding models coming to mostly the same conclusions. Our analysis of 30 object recognition architectures (both pretrained and randomly initialized) from the PyTorch model zoo demonstrates that deeper, thinner residual networks with bypass connections, fewer parameters shared across many convolutions, and higher scores on the ImageNet image-recognition challenge tend to be more predictive of the neural activations in our sample. Additionally, we find a significant degree of overlap between the models that best predict macaque visual cortex (as catalogued by brain-score.org) and those that best predict mouse visual cortex. In concert, these findings help to bolster the mouse brain as a viable source of data for the methods that have been successful thus far in the study of monkey brains, and provide a preliminary set of design targets for building models that can better take advantage of the unparalleled scale, quality, and resolution of data afforded by calcium-imaging in the mouse brain.
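
As background on the encoding-model side of such benchmarks, here is a generic ridge-regression sketch on synthetic data (not the authors' pipeline; the feature dimensions, neuron counts, and cross-validation scheme are placeholders):

import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(0)
features = rng.normal(size=(118, 512))   # stand-in for DNN layer activations over 118 scenes
responses = rng.normal(size=(118, 50))   # stand-in for trial-averaged responses of 50 neurons

scores = []
for n in range(responses.shape[1]):
    model = RidgeCV(alphas=np.logspace(-3, 3, 13))
    pred = cross_val_predict(model, features, responses[:, n], cv=5)
    scores.append(np.corrcoef(pred, responses[:, n])[0, 1])
print(np.nanmean(scores))  # mean cross-validated predictivity across neurons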

}, author = {Colin Conwell and David Mayo and Michael Buice and Boris Katz and George Alvarez and Andrei Barbu} } @article {4826, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.
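
For readers unfamiliar with the underlying WEAT statistic that Grounded-WEAT extends, here is a minimal sketch of the standard effect size over embedding vectors (the grounded variants additionally condition on images; the data below is random and purely illustrative):

import numpy as np

def cosine(u, v):
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

def weat_effect_size(X, Y, A, B):
    # Standard WEAT effect size: X, Y are target embeddings; A, B are attribute embeddings.
    def s(w):
        return np.mean([cosine(w, a) for a in A]) - np.mean([cosine(w, b) for b in B])
    sx = [s(x) for x in X]
    sy = [s(y) for y in Y]
    return (np.mean(sx) - np.mean(sy)) / np.std(sx + sy, ddof=1)

rng = np.random.default_rng(0)
X, Y, A, B = (rng.normal(size=(8, 300)) for _ in range(4))  # random stand-in embeddings
print(weat_effect_size(X, Y, A, B))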

}, author = {Candace Ross and Boris Katz and Andrei Barbu} } @article {5057, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.

}, author = {Candace Ross and Andrei Barbu and Boris Katz} } @conference {4827, title = {Multi-resolution modeling of a discrete stochastic process identifies causes of cancer}, booktitle = {International Conference on Learning Representations}, year = {2021}, month = {09/2020}, abstract = {

Detection of cancer-causing mutations within the vast and mostly unexplored human genome is a major challenge. Doing so requires modeling the background mutation rate, a highly non-stationary stochastic process, across regions of interest varying in size from one to millions of positions. Here, we present the split-Poisson-Gamma (SPG) distribution, an extension of the classical Poisson-Gamma formulation, to model a discrete stochastic process at multiple resolutions. We demonstrate that the probability model has a closed-form posterior, enabling efficient and accurate linear-time prediction over any length scale after the parameters of the model have been inferred a single time. We apply our framework to model mutation rates in tumors and show that model parameters can be accurately inferred from high-dimensional epigenetic data using a convolutional neural network, Gaussian process, and maximum-likelihood estimation. Our method is both more accurate and more efficient than existing models over a large range of length scales. We demonstrate the usefulness of multi-resolution modeling by detecting genomic elements that drive tumor emergence and are of vastly differing sizes.
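
The split-Poisson-Gamma model itself is not reproduced here; as background, the classical Poisson-Gamma conjugacy that it extends already has a closed-form posterior, sketched below with made-up numbers:

from scipy import stats

# Classical Poisson-Gamma building block (illustrative only):
# counts x_i ~ Poisson(lam), prior lam ~ Gamma(alpha, rate=beta)
# posterior: lam | x ~ Gamma(alpha + sum(x), rate=beta + n), available in closed form.
alpha, beta = 2.0, 1.0
x = [3, 1, 4, 1, 5]          # toy mutation counts in one region
alpha_post = alpha + sum(x)
beta_post = beta + len(x)
posterior = stats.gamma(a=alpha_post, scale=1.0 / beta_post)
print(posterior.mean())      # posterior mean rate for this toy region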

}, url = {https://openreview.net/forum?id=KtH8W3S_RE}, author = {Adam Uri Yaari and Maxwell Sherman and Oliver Clarke Priebe and Po-Ru Loh and Boris Katz and Andrei Barbu and Bonnie Berger} } @article {5052, title = {Neural Regression, Representational Similarity, Model Zoology \& Neural Taskonomy at Scale in Rodent Visual Cortex}, year = {2021}, abstract = {

How well do deep neural networks fare as models of mouse visual cortex? A majority of research to date suggests results far more mixed than those produced in the modeling of primate visual cortex. Here, we perform a large-scale benchmarking of dozens of deep neural network models in mouse visual cortex with both representational similarity analysis and neural regression. Using the Allen Brain Observatory{\textquoteright}s 2-photon calcium-imaging dataset of activity in over 6,000 reliable rodent visual cortical neurons recorded in response to natural scenes, we replicate previous findings and resolve previous discrepancies, ultimately demonstrating that modern neural networks can in fact be used to explain activity in the mouse visual cortex to a more reasonable degree than previously suggested. Using our benchmark as an atlas, we offer preliminary answers to overarching questions about levels of analysis, questions about the properties of models that best predict the visual system overall and questions about the mapping between biological and artificial representations. Our results provide a reference point for future ventures in the deep neural network modeling of mouse visual cortex, hinting at novel combinations of mapping method, architecture, and task to more fully characterize the computational motifs of visual representation in a species so central to neuroscience, but with a perceptual physiology and ecology markedly different from the ones we study in primates.

}, author = {Colin Conwell and David Mayo and Michael A. Buice and Boris Katz and George A. Alvarez and Andrei Barbu} } @conference {4830, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {AAAI-21}, year = {2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5058, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, number = {123}, year = {2021}, month = {03/2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feedforward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5053, title = {Social Interactions as Recursive MDPs}, year = {2021}, abstract = {

While machines and robots must interact with humans, providing them with social skills has been a largely overlooked topic. This is mostly a consequence of the fact that tasks such as navigation, command following, and even game playing are well-defined, while social reasoning still mostly remains a pre-theoretic problem. We demonstrate how social interactions can be effectively incorporated into MDPs (Markov decision processes) by reasoning recursively about the goals of other agents. In essence, our method extends the reward function to include a combination of physical goals (something agents want to accomplish in the configuration space, a traditional MDP) and social goals (something agents want to accomplish relative to the goals of other agents). Our Social MDPs allow specifying reward functions in terms of the estimated reward functions of other agents, modeling interactions such as helping or hindering another agent (by maximizing or minimizing the other agent{\textquoteright}s reward) while balancing this with the actual physical goals of each agent. Our formulation allows for an arbitrary function of another agent{\textquoteright}s estimated reward structure and physical goals, enabling more complex behaviors such as politely hindering another agent or aggressively helping them. Extending Social MDPs in the same manner in which I-POMDPs (interactive partially observable Markov decision processes) extend POMDPs would enable interactions such as convincing another agent that something is true. To what extent the Social MDPs presented here and their potential Social POMDP variant account for all possible social interactions is unknown, but having a precise mathematical model to guide questions about social interactions has both practical value (we demonstrate how to make zero-shot social inferences, and one could imagine chatbots and robots guided by Social MDPs) and theoretical value, bringing the tools of MDPs, which have so successfully organized research around navigation, to shed light on what social interactions really are, given their extreme importance to human well-being and human civilization.
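
A toy illustration of the recursive idea above (reward defined partly in terms of a depth-limited estimate of the other agent's reward); the names, two-agent setup, and fixed social weights are assumptions for exposition:

def social_reward(state, action, agent, other, level, physical, weight):
    # Reward = own physical reward + weight * (other agent's estimated reward),
    # where the other agent's reward is itself estimated recursively (depth-limited).
    r = physical[agent](state, action)
    if level > 0:
        r += weight[agent] * social_reward(state, action, other, agent, level - 1, physical, weight)
    return r

# Hypothetical two-agent example: agent 0 helps (weight +0.5), agent 1 is indifferent (0.0).
physical = {0: lambda s, a: s[0], 1: lambda s, a: s[1]}
weight = {0: 0.5, 1: 0.0}
state = (1.0, 2.0)
print(social_reward(state, None, agent=0, other=1, level=2, physical=physical, weight=weight))
# 1.0 + 0.5 * (2.0 + 0.0 * 1.0) = 2.0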

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Boris Katz and Andrei Barbu} } @article {5055, title = {Spoken ObjectNet: A Bias-Controlled Spoken Caption Dataset}, year = {2021}, abstract = {

Visually-grounded spoken language datasets can enable models to learn cross-modal correspondences with very weak supervision. However, modern audio-visual datasets contain biases that undermine the real-world performance of models trained on that data. We introduce Spoken ObjectNet, which is designed to remove some of these biases and provide a way to better evaluate how effectively models will perform in real-world scenarios. This dataset expands upon ObjectNet, which is a bias-controlled image dataset that features similar image classes to those present in ImageNet. We detail our data collection pipeline, which features several methods to improve caption quality, including automated language model checks. Lastly, we show baseline results on image retrieval and audio retrieval tasks. These results show that models trained on other datasets and then evaluated on Spoken ObjectNet tend to perform poorly due to biases in other datasets that the models have learned. We also show evidence that the performance decrease is due to the dataset controls, and not the transfer setting.

}, author = {Ian Palmer and Andrew Rouditchenko and Andrei Barbu and Boris Katz and James Glass} } @article {5060, title = {Deep compositional robotic planners that follow natural language commands}, year = {2020}, abstract = {

We demonstrate how a sampling-based robotic planner can be augmented to learn to understand a sequence of natural language commands in a continuous configuration space to move and manipulate objects. Our approach combines a deep network structured according to the parse of a complex command that includes objects, verbs, spatial relations, and attributes, with a sampling-based planner, RRT. A recurrent hierarchical deep network controls how the planner explores the environment, determines when a planned path is likely to achieve a goal, and estimates the confidence of each move to trade off exploitation and exploration between the network and the planner. Planners are designed to have near-optimal behavior when information about the task is missing, while networks learn to exploit observations which are available from the environment, making the two naturally complementary. Combining the two enables generalization to new maps, new kinds of obstacles, and more complex sentences that do not occur in the training set. Little data is required to train the model despite it jointly acquiring a CNN that extracts features from the environment as it learns the meanings of words. The model provides a level of interpretability through the use of attention maps allowing users to see its reasoning steps despite being an end-to-end model. This end-to-end model allows robots to learn to follow natural language commands in challenging continuous environments.

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5061, title = {Encoding formulas as deep networks: Reinforcement learning for zero-shot execution of LTL formulas}, year = {2020}, abstract = {

We demonstrate a reinforcement learning agent which uses a compositional recurrent neural network that takes as input an LTL formula and determines satisfying actions. The input LTL formulas have never been seen before, yet the network performs zero-shot generalization to satisfy them. This is a novel form of multi-task learning for RL agents where agents learn from one diverse set of tasks and generalize to a new set of diverse tasks. The formulation of the network enables this capacity to generalize. We demonstrate this ability in two domains. In a symbolic domain, the agent finds a sequence of letters that is accepted. In a Minecraft-like environment, the agent finds a sequence of actions that conform to the formula. While prior work could learn to execute one formula reliably given examples of that formula, we demonstrate how to encode all formulas reliably. This could form the basis of new multi-task agents that discover sub-tasks and execute them without any additional training, as well as agents that follow more complex linguistic commands. The structures required for this generalization are specific to LTL formulas, which opens up an interesting theoretical question: what structures are required in neural networks for zero-shot generalization to different logics?

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {4811, title = {Learning a Natural-language to LTL Executable Semantic Parser for Grounded Robotics}, year = {2020}, month = {12/2020}, institution = {Proceedings of Conference on Robot Learning (CoRL-2020)}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor -- a pretrained end-to-end LTL planner -- must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL; it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, url = {https://corlconf.github.io/paper_385/}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5059, title = {Learning a natural-language to LTL executable semantic parser for grounded robotics}, year = {2020}, month = {08/2020}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor {\textemdash} a pretrained end-to-end LTL planner {\textemdash} must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL: it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, doi = {https://doi.org/10.48550/arXiv.2008.03277}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4700, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) workshop at NeurIPS 2020}, year = {2020}, month = {12/2020}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, url = {https://openreview.net/forum?id=_bokm801zhx}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @conference {4518, title = {Learning Language from Vision.}, booktitle = {Workshop on Visually Grounded Interaction and Language (ViGIL) at the Thirty-third Annual Conference on Neural Information Processing Systems (NeurIPS)}, year = {2019}, month = {12/2019}, address = {Vancouver Convention Center, Vancouver, Canada}, author = {Candace Ross and Yevgeni Berzak and Boris Katz and Andrei Barbu} } @proceedings {4388, title = {ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We collect a large real-world test set, ObjectNet, for object recognition with controls where object backgrounds, rotations, and imaging viewpoints are random. Most scientific experiments have controls, confounds which are removed from the data, to ensure that subjects cannot perform a task by exploiting trivial correlations in the data. Historically, large machine learning and computer vision datasets have lacked such controls. This has resulted in models that must be fine-tuned for new datasets and perform better on datasets than in real-world applications. When tested on ObjectNet, object detectors show a 40-45\% drop in performance, with respect to their performance on other benchmarks, due to the controls for biases. Controls make ObjectNet robust to fine-tuning, showing only small performance increases. We develop a highly automated platform that enables gathering datasets with controls by crowdsourcing image capturing and annotation. ObjectNet is the same size as the ImageNet test set (50,000 images), and by design does not come paired with a training set in order to encourage generalization. The dataset is both easier than ImageNet (objects are largely centered and unoccluded) and harder (due to the controls). Although we focus on object recognition here, data with controls can be gathered at scale using automated tools throughout machine learning to generate datasets that exercise models in new ways, thus providing valuable feedback to researchers. This work opens up new avenues for research in generalizable, robust, and more human-like computer vision and in creating datasets where results are predictive of real-world performance.

}, author = {Andrei Barbu and David Mayo and Julian Alverio and William Luo and Christopher Wang and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz} } @proceedings {3651, title = {Assessing Language Proficiency from Eye Movements in Reading}, year = {2018}, month = {06/2018}, address = {New Orleans}, keywords = {Computation, language}, url = {http://naacl2018.org/}, author = {Yevgeni Berzak and Boris Katz and Roger Levy} } @conference {4112, title = {Deep sequential models for sampling-based planning}, booktitle = {The IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2018)}, year = {2018}, month = {10/2018 }, address = {Madrid, Spain}, abstract = {

We demonstrate how a sequence model and a sampling-based planner can influence each other to produce efficient plans and how such a model can automatically learn to take advantage of observations of the environment. Sampling-based planners such as RRT generally know nothing of their environments even if they have traversed similar spaces many times. A sequence model, such as an HMM or LSTM, guides the search for good paths. The resulting model, called DeRRT*, observes the state of the planner and the local environment to bias the next move and next planner state. The neural-network-based models avoid manual feature engineering by co-training a convolutional network which processes map features and observations from sensors. We incorporate this sequence model in a manner that combines its likelihood with the existing bias for searching large unexplored Voronoi regions. This leads to more efficient trajectories with fewer rejected samples even in difficult domains such as when escaping bug traps. This model can also be used for dimensionality reduction in multi-agent environments with dynamic obstacles. Instead of planning in a high-dimensional space that includes the configurations of the other agents, we plan in a low-dimensional subspace relying on the sequence model to bias samples using the observed behavior of the other agents. The techniques presented here are general, include both graphical models and deep learning approaches, and can be adapted to a range of planners.
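
A schematic of the planner/sequence-model interaction described above (a runnable toy sketch; the goal-directed stand-in proposal, the 2D unit-square workspace, and the omitted collision checking are simplifications, and the paper's DeRRT* integration with the Voronoi bias is not reproduced):

import math
import random

def distance(p, q):
    return math.hypot(p[0] - q[0], p[1] - q[1])

def steer(src, dst, step=0.2):
    d = distance(src, dst)
    if d <= step:
        return dst
    t = step / d
    return (src[0] + t * (dst[0] - src[0]), src[1] + t * (dst[1] - src[1]))

def biased_rrt(start, goal, propose, n_iters=200, mix=0.5):
    # RRT where, with probability mix, the next sample comes from a learned
    # proposal (here a stand-in function) instead of uniform sampling.
    tree = [start]
    for _ in range(n_iters):
        if random.random() < mix:
            target = propose(tree)                       # learned, context-aware proposal
        else:
            target = (random.random(), random.random())  # classic uniform RRT sampling
        nearest = min(tree, key=lambda node: distance(node, target))
        new = steer(nearest, target)
        tree.append(new)                                 # collision checking omitted
        if distance(new, goal) < 0.05:
            break
    return tree

goal = (0.9, 0.9)
tree = biased_rrt((0.1, 0.1), goal, propose=lambda t: goal, mix=0.5)  # stand-in "model": aim at goal
print(len(tree), tree[-1])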

}, doi = {10.1109/IROS.2018.8593947}, url = {https://ieeexplore.ieee.org/document/8593947}, author = {Yen-Ling Kuo and Andrei Barbu and Boris Katz} } @conference {4109, title = {Grounding language acquisition by training semantic parsers using captioned videos}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP 2018)}, year = {2018}, month = {10/2018}, address = {Brussels, Belgium}, abstract = {

We develop a semantic parser that is trained in a grounded setting using pairs of videos captioned with sentences. This setting is both data-efficient, requiring little annotation, and similar to the experience of children where they observe their environment and listen to speakers. The semantic parser recovers the meaning of English sentences despite not having access to any annotated sentences. It does so despite the ambiguity inherent in vision where a sentence may refer to any combination of objects, object properties, relations or actions taken by any agent in a video. For this task, we collected a new dataset for grounded language acquisition. Learning a grounded semantic parser {\textemdash} turning sentences into logical forms using captioned videos {\textemdash} can significantly expand the range of data that parsers can be trained on, lower the effort of training a semantic parser, and ultimately lead to a better understanding of child language acquisition.

}, isbn = {978-1-948087-84-1}, url = {http://aclweb.org/anthology/D18-1285}, author = {Candace Ross and Andrei Barbu and Yevgeni Berzak and Battushig Myanganbayar and Boris Katz} } @inbook {3489, title = {A Natural Language Interface for Mobile Devices}, booktitle = {The Wiley Handbook of Human Computer Interaction}, volume = {2}, year = {2018}, month = {02/2018 }, pages = {539-559}, publisher = {John Wiley \& Sons, }, organization = {John Wiley \& Sons, }, edition = {First}, abstract = {

This chapter discusses some of the primary issues related to the design and construction of natural language interfaces, and in particular, interfaces to mobile devices. It describes two systems in this space: the START information access system and the StartMobile natural language interface to mobile devices. The chapter also discusses recently deployed commercial systems and future directions. The use of natural language annotations, and in particular, parameterized natural language annotations, enables START to respond to user requests in a wide variety of ways. StartMobile uses the START system as a first stage in the processing of user requests. Current commercial systems such as Apple{\textquoteright}s Siri, IBM{\textquoteright}s Watson, Google{\textquoteright}s {\textquotedblleft}Google Now{\textquotedblright}, Microsoft{\textquoteright}s Cortana, and Amazon{\textquoteright}s Alexa employ technology of the sort contained in START and StartMobile in combination with statistical ...

}, doi = {10.1002/9781118976005.ch23}, author = {Boris Katz and Gary Borchardt and Sue Felshin and Federico Mora} } @conference {3964, title = {Partially Occluded Hands: A challenging new dataset for single-image hand pose estimation}, booktitle = {The 14th Asian Conference on Computer Vision (ACCV 2018)}, year = {2018}, month = {12/2018}, abstract = {

Recognizing the pose of hands matters most when hands are interacting with other objects. To understand how well both machines and humans perform on single-image 2D hand-pose reconstruction from RGB images, we collected a challenging dataset of hands interacting with 148 objects. We used a novel methodology that provides the same hand in the same pose both with the object being present and occluding the hand and without the object occluding the hand. Additionally, we collected a wide range of grasps for each object designing the data collection methodology to ensure this diversity. Using this dataset we measured the performance of two state-of-the-art hand-pose recognition methods showing that both are extremely brittle when faced with even light occlusion from an object. This is not evident in previous datasets because they often avoid hand-object occlusions and because they are collected from videos where hands are often between objects and mostly unoccluded. We annotated a subset of the dataset and used that to show that humans are robust with respect to occlusion, and also to characterize human hand perception, the space of grasps that seem to be considered, and the accuracy of reconstructing occluded portions of hands. We expect that such data will be of interest to both the vision community for developing more robust hand-pose algorithms and to the robotic grasp planning community for learning such grasps. The dataset is available at occludedhands.com

}, keywords = {dataset, Partial occlusion, RGB hand-pose reconstruction}, url = {http://accv2018.net/}, author = {Battushig Myanganbayar and Cristina Mata and Gil Dekel and Boris Katz and Guy Ben-Yosef and Andrei Barbu} } @conference {2587, title = {Predicting Native Language from Gaze}, booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL 2017)}, year = {2017}, author = {Yevgeni Berzak and Chie Nakamura and Suzanne Flynn and Boris Katz} } @conference {3492, title = {Temporal Grounding Graphs for Language Understanding with Accrued Visual-Linguistic Context}, booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI 2017)}, year = {2017}, month = {08/2017}, address = {Melbourne, Australia}, abstract = {

A robot{\textquoteright}s ability to understand or ground natural language instructions is fundamentally tied to its knowledge about the surrounding world. We present an approach to grounding natural language utterances in the context of factual information gathered through natural-language interactions and past visual observations. A probabilistic model estimates, from a natural language utterance, the objects, relations, and actions that the utterance refers to, the objectives for future robotic actions it implies, and generates a plan to execute those actions while updating a state representation to include newly acquired knowledge from the visual-linguistic context. Grounding a command necessitates a representation for past observations and interactions; however, maintaining the full context consisting of all possible observed objects, attributes, spatial relations, actions, etc., over time is intractable. Instead, our model, Temporal Grounding Graphs, maintains a learned state representation for a belief over factual groundings, those derived from natural-language interactions, and lazily infers new groundings from visual observations using the context implied by the utterance. This work significantly expands the range of language that a robot can understand by incorporating factual knowledge and observations of its workspace into its inference about the meaning and grounding of natural-language utterances.

}, author = {Rohan Paul and Andrei Barbu and Sue Felshin and Boris Katz and Nicholas Roy} } @article {2214, title = {Anchoring and Agreement in Syntactic Annotations}, year = {2016}, month = {09/2016}, abstract = {

Published in the Proceedings of EMNLP 2016

We present a study on two key characteristics of human syntactic annotations: anchoring and agreement. Anchoring is a well-known cognitive bias in human decision making, where judgments are drawn towards preexisting values. We study the influence of anchoring on a standard approach to creation of syntactic resources where syntactic annotations are obtained via human editing of tagger and parser output. Our experiments demonstrate a clear anchoring effect and reveal unwanted consequences, including overestimation of parsing performance and lower quality of annotations in comparison with human-based annotations. Using sentences from the Penn Treebank WSJ, we also report systematically obtained inter-annotator agreement estimates for English dependency parsing. Our agreement results control for parser bias, and are consequential in that they are on par with state of the art parsing performance for English newswire. We discuss the impact of our findings on strategies for future annotation efforts and parser evaluations.

}, author = {Yevgeni Berzak and Yan Huang and Andrei Barbu and Anna Korhonen and Boris Katz} } @article {2132, title = {Contrastive Analysis with Predictive Power: Typology Driven Estimation of Grammatical Error Distributions in ESL}, year = {2016}, month = {07/2015}, abstract = {

This work examines the impact of cross-linguistic transfer on grammatical errors in English as Second Language (ESL) texts. Using a computational framework that formalizes the theory of Contrastive Analysis (CA), we demonstrate that language specific error distributions in ESL writing can be predicted from the typological properties of the native language and their relation to the typology of English. Our typology driven model enables us to obtain accurate estimates of such distributions without access to any ESL data for the target languages. Furthermore, we present a strategy for adjusting our method to low-resource languages that lack typological documentation using a bootstrapping approach which approximates native language typology from ESL texts. Finally, we show that our framework is instrumental for linguistic inquiry seeking to identify first language factors that contribute to a wide range of difficulties in second language acquisition.

}, author = {Yevgeni Berzak and Roi Reichart and Boris Katz} } @article {2133, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, year = {2016}, month = {09/2016}, abstract = {

Understanding language goes hand in hand with the ability to integrate complex contextual information obtained via perception. In this work, we present a novel task for grounded language understanding: disambiguating a sentence given a visual scene which depicts one of the possible interpretations of that sentence. To this end, we introduce a new multimodal corpus containing ambiguous sentences, representing a wide range of syntactic, semantic and discourse ambiguities, coupled with videos that visualize the different interpretations for each sentence. We address this task by extending a vision model which determines if a sentence is depicted by a video. We demonstrate how such a model can be adjusted to recognize different interpretations of the same underlying sentence, allowing it to disambiguate sentences in a unified fashion across the different ambiguity types.

}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {1885, title = {Language and Vision Ambiguities (LAVA) Corpus}, year = {2016}, month = {01/2016}, abstract = {

Ambiguity is one of the defining characteristics of human languages, and language understanding crucially relies on the ability to obtain unambiguous representations of linguistic content. While some ambiguities can be resolved using intra-linguistic contextual cues, the disambiguation of many linguistic constructions requires integration of world knowledge and perceptual information obtained from other modalities. In this work, we focus on the problem of grounding language in the visual modality, and introduce a novel task for visual and linguistic understanding which requires resolving linguistic ambiguities by utilizing the visual context of the utterance.

To address this challenge, we release the Language and Vision Ambiguities (LAVA) corpus. LAVA contains ambiguous sentences coupled with visual scenes that depict the different interpretations of each sentence. The sentences in the corpus are annotated with syntactic and semantic parses, and cover a wide range of linguistic ambiguities, including PP and VP attachment, conjunctions, logical forms, anaphora and ellipsis. In addition to the sentence disambiguation challenge, the corpus will support a variety of related tasks which use natural language as a medium for expressing visual understanding.

Reference:
Yevgeni Berzak, Andrei Barbu, Daniel Harari, Boris Katz, and Shimon Ullman (2015). Do You See What I Mean? Visual Resolution of Linguistic Ambiguities. Conference on Empirical Methods in Natural Language Processing (EMNLP), Lisbon, Portugal.

}, url = {http://web.mit.edu/lavacorpus/}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @conference {2583, title = {Learning to Answer Questions from Wikipedia Infoboxes}, booktitle = {The 2016 Conference on Empirical Methods on Natural Language Processing (EMNLP 2016)}, year = {2016}, abstract = {

A natural language interface to answers on the Web can help us access information more efficiently. We start with an interesting source of information{\textemdash}infoboxes in Wikipedia that summarize factoid knowledge{\textemdash}and develop a comprehensive approach to answering questions with high precision. We first build a system to access data in infoboxes in a structured manner. We use our system to construct a crowdsourced dataset of over 15,000 high-quality, diverse questions. With these questions, we train a convolutional neural network model that outperforms models that achieve top results in similar answer selection tasks.

}, author = {Alvaro Morales and Varot Premtoon and Cordelia Avery and Sue Felshin and Boris Katz} } @article {2319, title = {A look back at the June 2016 BMM Workshop in Sestri Levante, Italy}, year = {2016}, month = {11/2016}, abstract = {

"On June 20th 2016, the first of a series of workshops on the science of intelligence kicked off in Sestri Levante, Italy. Organized by the Center for Brains, Minds, and Machines (CBMM), the Italian Institute of Technology (IIT), and the Max Plank Institution for Biological Cybernetics, this three-day workshop brought together an international cast of researchers to discuss human and machine intelligence. Computer scientists, cognitive scientists, and neuroscientists collaborated in a wide-ranging conversation about integrating different approaches to intelligence, both artificial and human, into a coherent science of intelligence..."


}, author = {Boris Katz and Andrei Barbu} } @article {2825, title = {Treebank of Learner English (TLE)}, year = {2016}, month = {08/2016}, abstract = {

The majority of the English text available worldwide is generated by non-native speakers. Learner language introduces a variety of challenges and is of paramount importance for the scientific study of language acquisition as well as for Natural Language Processing. Despite the ubiquity of non-native English, there has been no publicly available syntactic treebank for English as a Second Language (ESL). To address this shortcoming, we released the Treebank of Learner English (TLE), a first of its kind resource for non-native English, containing 5,124 sentences manually annotated with Part of Speech (POS) tags and syntactic dependency trees. Full syntactic analyses are provided for both the original and error corrected versions of each sentence. We also introduced annotation guidelines that allow for consistent syntactic treatment of ungrammatical English. We envision the treebank to support a wide range of linguistic and computational research on language learning as well as automatic processing of ungrammatical language.

}, url = {http://esltreebank.org/}, author = {Yevgeni Berzak and Jessica Kenney and Carolyn Spadine and Jing Xian Wang and Lucia Lam and Keiko Sophie Mori and Sebastian Garza and Boris Katz} } @article {2134, title = {Universal Dependencies for Learner English}, year = {2016}, month = {06/2016}, abstract = {

We introduce the Treebank of Learner English (TLE), the first publicly available syntactic treebank for English as a Second Language (ESL). The TLE provides manually annotated POS tags and Universal Dependency (UD) trees for 5,124 sentences from the Cambridge First Certificate in English (FCE) corpus. The UD annotations are tied to a pre-existing error annotation of the FCE, whereby full syntactic analyses are provided for both the original and error corrected versions of each sentence. Further on, we delineate ESL annotation guidelines that allow for consistent syntactic treatment of ungrammatical English. Finally, we benchmark POS tagging and dependency parsing performance on the TLE dataset and measure the effect of grammatical errors on parsing accuracy. We envision the treebank to support a wide range of linguistic and computational research on second language acquisition as well as automatic processing of ungrammatical language.

}, author = {Yevgeni Berzak and Jessica Kenney and Carolyn Spadine and Jing Xian Wang and Lucia Lam and Keiko Sophie Mori and Sebastian Garza and Boris Katz} } @conference {1077, title = {Contrastive Analysis with Predictive Power: Typology Driven Estimation of Grammatical Error Distributions in ESL}, booktitle = {Nineteenth Conference on Computational Natural Language Learning (CoNLL), Beijing, China}, year = {2015}, month = {07/31/2015}, abstract = {

This work examines the impact of cross-linguistic transfer on grammatical errors in English as Second Language (ESL) texts. Using a computational framework that formalizes the theory of Contrastive Analysis (CA), we demonstrate that language specific error distributions in ESL writing can be predicted from the typological properties of the native language and their relation to the typology of English. Our typology driven model enables us to obtain accurate estimates of such distributions without access to any ESL data for the target languages. Furthermore, we present a strategy for adjusting our method to low-resource languages that lack typological documentation using a bootstrapping approach which approximates native language typology from ESL texts. Finally, we show that our framework is instrumental for linguistic inquiry seeking to identify first language factors that contribute to a wide range of difficulties in second language acquisition.

}, author = {Yevgeni Berzak and Roi Reichart and Boris Katz} } @conference {1429, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, booktitle = {Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal. }, year = {2015}, month = {09/2015}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {1164, title = {Towards a Programmer{\textquoteright}s Apprentice (Again)}, number = {030}, year = {2015}, month = {04/2015}, abstract = {

Programmers are loath to interrupt their workflow to document their design rationale, leading to frequent errors when software is modified{\textemdash}often much later and by different programmers. A Programmer{\textquoteright}s Assistant could interact with the programmer to capture and preserve design rationale, in a natural way that would make rationale capture {\textquotedblleft}cost less than it{\textquoteright}s worth{\textquotedblright}, and could also detect common flaws in program design. Such a programmer{\textquoteright}s assistant was not practical when it was first proposed decades ago, but advances over the years make now the time to revisit the concept, as our prototype shows.

}, author = {Howard Shrobe and Boris Katz and Randall Davis} } @article {439, title = {Reconstructing Native Language Typology from Foreign Language Usage.}, number = {007}, year = {2014}, month = {04/2014}, abstract = {

Linguists and psychologists have long been studying cross-linguistic transfer, the influence of native language properties on linguistic performance in a foreign language. In this work we provide empirical evidence for this process in the form of a strong correlation between language similarities derived from structural features in English as Second Language (ESL) texts and equivalent similarities obtained directly from the typological features of the native languages. We leverage this finding to recover native language typological similarity structure directly from ESL text, and perform prediction of typological features in an unsupervised fashion with respect to the target languages. Our method achieves 72.2\% accuracy on the typology prediction task, a result that is highly competitive with equivalent methods that rely on typological resources.

}, keywords = {language, linguistics, Visual Intelligence}, author = {Yevgeni Berzak and Roi Reichart and Boris Katz} }