@inproceedings{5322,
  title     = {Zero-shot linear combinations of grounded social interactions with {Linear Social MDPs}},
  booktitle = {Proceedings of the 37th AAAI Conference on Artificial Intelligence (AAAI)},
  year      = {2023},
  month     = feb,
  abstract  = {Humans and animals engage in rich social interactions. It is often theorized that a relatively small number of basic social interactions give rise to the full range of behavior observed. But no computational theory explaining how social interactions combine together has been proposed before. We do so here. We take a model, the Social MDP, which is able to express a range of social interactions, and extend it to represent linear combinations of social interactions. Practically for robotics applications, such models are now able to not just express that an agent should help another agent, but to express goal-centric social interactions. Perhaps an agent is helping someone get dressed, but preventing them from falling, and is happy to exchange stories in the meantime. How an agent responds socially, should depend on what it thinks the other agent is doing at that point in time. To encode this notion, we take linear combinations of social interactions as defined in Social MDPs, and compute the weights on those combinations on the fly depending on the estimated goals of other agents. This new model, the Linear Social MDP, enables zero-shot reasoning about complex social interactions, provides a mathematical basis for the long-standing intuition that social interactions should compose, and leads to interesting new behaviors that we validate using human observers. Complex social interactions are part of the future of intelligent agents, and having principled mathematical models built on a foundation like MDPs will make it possible to bring social interactions to every robotic application.},
  author    = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu}
}

@article{5050,
  title    = {Incorporating Rich Social Interactions Into {MDPs}},
  year     = {2022},
  abstract = {Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other{\textquoteright}s hidden rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions zero-shot in new environments; like humans it can engage socially in novel ways even without a single example of that social interaction. Moreover, the judgments of these Social MDPs align closely with those of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics, MDPs.},
  author   = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu}
}

@article{5299,
  title    = {Quantifying the Emergence of Symbolic Communication},
  journal  = {CogSci},
  year     = {2022},
  abstract = {We quantitatively study the emergence of symbolic communication in humans with a communication game that attempts to recapitulate an essential step in the development of human language: the emergence of shared signs. In our experiment, a teacher must communicate a first order logic formula to a student through a narrow channel deprived of common shared signs: subjects cannot communicate with each other with the sole exception of car motions in a computer game. Subjects spontaneously develop a shared vocabulary of car motions including indices, icons, and symbols, spanning both task-specific and task-agnostic concepts such as {\textquotedblleft}square{\textquotedblright} and {\textquotedblleft}understand{\textquotedblright}. We characterize the conditions under which indices, icons, and symbols arise, finding that symbols are harder to establish than icons and indices. We observe the dominant sign category being developed transitions from indices to icons to symbols, and identify communicating in ambiguous game environments as a pressure for icon and symbol development.},
  url      = {https://escholarship.org/uc/item/08n3293v},
  author   = {Emily Cheng and Yen-Ling Kuo and Josefina Correa and Boris Katz and Ignacio Cases and Andrei Barbu}
}

@inproceedings{5301,
  title     = {Spontaneous sign emergence in humans and machines through an embodied communication game},
  booktitle = {JCoLE Workshop},
  year      = {2022},
  author    = {Emily Cheng and Yen-Ling Kuo and Ignacio Cases and Boris Katz and Andrei Barbu}
}

@article{5051,
  title    = {Trajectory Prediction with Linguistic Representations},
  year     = {2022},
  abstract = {Language allows humans to build mental models that interpret what is happening around them resulting in more accurate long-term predictions. We present a novel trajectory prediction model that uses linguistic intermediate representations to forecast trajectories, and is trained using trajectory samples with partially-annotated captions. The model learns the meaning of each of the words without direct per-word supervision. At inference time, it generates a linguistic description of trajectories which captures maneuvers and interactions over an extended time interval. This generated description is used to refine predictions of the trajectories of multiple agents. We train and validate our model on the Argoverse dataset, and demonstrate improved accuracy results in trajectory prediction. In addition, our model is more interpretable: it presents part of its reasoning in plain language as captions, which can aid model development and can aid in building confidence in the model before deploying it.},
  author   = {Yen-Ling Kuo and Xin Huang and Andrei Barbu and Stephen G. McGill and Boris Katz and John J. Leonard and Guy Rosman}
}

@article{5054,
  title    = {Compositional Networks Enable Systematic Generalization for Grounded Language Understanding},
  year     = {2021},
  abstract = {Humans are remarkably flexible when understanding new sentences that include combinations of concepts they have never encountered before. Recent work has shown that while deep networks can mimic some human language abilities when presented with novel sentences, systematic variation uncovers the limitations in the language-understanding abilities of networks. We demonstrate that these limitations can be overcome by addressing the generalization challenges in the gSCAN dataset, which explicitly measures how well an agent is able to interpret novel linguistic commands grounded in vision, e.g., novel pairings of adjectives and nouns. The key principle we employ is compositionality: that the compositional structure of networks should reflect the compositional structure of the problem domain they address, while allowing other parameters to be learned end-to-end. We build a general-purpose mechanism that enables agents to generalize their language understanding to compositional domains. Crucially, our network has the same state-of-the-art performance as prior work while generalizing its knowledge when prior work does not. Our network also provides a level of interpretability that enables users to inspect what each part of networks learns. Robust grounded language understanding without dramatic failures and without corner cases is critical to building safe and fair robots; we demonstrate the significant role that compositionality can play in achieving that goal.},
  author   = {Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@article{5056,
  title    = {Compositional {RL} Agents That Follow Language Commands in Temporal Logic},
  year     = {2021},
  abstract = {We demonstrate how a reinforcement learning agent can use compositional recurrent neural networks to learn to carry out commands specified in linear temporal logic (LTL). Our approach takes as input an LTL formula, structures a deep network according to the parse of the formula, and determines satisfying actions. This compositional structure of the network enables zero-shot generalization to significantly more complex unseen formulas. We demonstrate this ability in multiple problem domains with both discrete and continuous state-action spaces. In a symbolic domain, the agent finds a sequence of letters that satisfy a specification. In a Minecraft-like environment, the agent finds a sequence of actions that conform to a formula. In the Fetch environment, the robot finds a sequence of arm configurations that move blocks on a table to fulfill the commands. While most prior work can learn to execute one formula reliably, we develop a novel form of multi-task learning for RL agents that allows them to learn from a diverse set of tasks and generalize to a new set of diverse tasks without any additional training. The compositional structures presented here are not specific to LTL, thus opening the path to RL agents that perform zero-shot generalization in other compositional domains.},
  author   = {Yen-Ling Kuo and Andrei Barbu and Boris Katz}
}

@article{5053,
  title    = {Social Interactions as Recursive {MDPs}},
  year     = {2021},
  abstract = {While machines and robots must interact with humans, providing them with social skills has been a largely overlooked topic. This is mostly a consequence of the fact that tasks such as navigation, command following, and even game playing are well-defined, while social reasoning still mostly remains a pre-theoretic problem. We demonstrate how social interactions can be effectively incorporated into MDPs (Markov decision processes) by reasoning recursively about the goals of other agents. In essence, our method extends the reward function to include a combination of physical goals (something agents want to accomplish in the configuration space, a traditional MDP) and social goals (something agents want to accomplish relative to the goals of other agents). Our Social MDPs allow specifying reward functions in terms of the estimated reward functions of other agents, modeling interactions such as helping or hindering another agent (by maximizing or minimizing the other agent{\textquoteright}s reward) while balancing this with the actual physical goals of each agent. Our formulation allows for an arbitrary function of another agent{\textquoteright}s estimated reward structure and physical goals, enabling more complex behaviors such as politely hindering another agent or aggressively helping them. Extending Social MDPs in the same manner as I-POMDPs (Interactive-partially observed Markov decision processes) extension would enable interactions such as convincing another agent that something is true. To what extent the Social MDPs presented here and their potential Social POMDPs variant account for all possible social interactions is unknown, but having a precise mathematical model to guide questions about social interactions has both practical value (we demonstrate how to make zero-shot social inferences and one could imagine chatbots and robots guided by Social MDPs) and theoretical value by bringing the tools of MDP that have so successfully organized research around navigation to shed light on what social interactions really are given their extreme importance to human well-being and human civilization.},
  author   = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Boris Katz and Andrei Barbu}
}

@article{5060,
  title    = {Deep compositional robotic planners that follow natural language commands},
  year     = {2020},
  abstract = {We demonstrate how a sampling-based robotic planner can be augmented to learn to understand a sequence of natural language commands in a continuous configuration space to move and manipulate objects. Our approach combines a deep network structured according to the parse of a complex command that includes objects, verbs, spatial relations, and attributes, with a sampling-based planner, RRT. A recurrent hierarchical deep network controls how the planner explores the environment, determines when a planned path is likely to achieve a goal, and estimates the confidence of each move to trade off exploitation and exploration between the network and the planner. Planners are designed to have near-optimal behavior when information about the task is missing, while networks learn to exploit observations which are available from the environment, making the two naturally complementary. Combining the two enables generalization to new maps, new kinds of obstacles, and more complex sentences that do not occur in the training set. Little data is required to train the model despite it jointly acquiring a CNN that extracts features from the environment as it learns the meanings of words. The model provides a level of interpretability through the use of attention maps allowing users to see its reasoning steps despite being an end-to-end model. This end-to-end model allows robots to learn to follow natural language commands in challenging continuous environments.},
  author   = {Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@inproceedings{4519,
  title     = {Deep compositional robotic planners that follow natural language commands},
  booktitle = {International Conference on Robotics and Automation (ICRA)},
  year      = {2020},
  month     = may,
  address   = {Palais des Congr{\`e}s de Paris, Paris, France},
  author    = {Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@article{5061,
  title    = {Encoding formulas as deep networks: Reinforcement learning for zero-shot execution of {LTL} formulas},
  year     = {2020},
  abstract = {We demonstrate a reinforcement learning agent which uses a compositional recurrent neural network that takes as input an LTL formula and determines satisfying actions. The input LTL formulas have never been seen before, yet the network performs zero-shot generalization to satisfy them. This is a novel form of multi-task learning for RL agents where agents learn from one diverse set of tasks and generalize to a new set of diverse tasks. The formulation of the network enables this capacity to generalize. We demonstrate this ability in two domains. In a symbolic domain, the agent finds a sequence of letters that is accepted. In a Minecraft-like environment, the agent finds a sequence of actions that conform to the formula. While prior work could learn to execute one formula reliably given examples of that formula, we demonstrate how to encode all formulas reliably. This could form the basis of new multi-task agents that discover sub-tasks and execute them without any additional training, as well as the agents which follow more complex linguistic commands. The structures required for this generalization are specific to LTL formulas, which opens up an interesting theoretical question: what structures are required in neural networks for zero-shot generalization to different logics?},
  author   = {Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@inproceedings{4811,
  title     = {Learning a Natural-language to {LTL} Executable Semantic Parser for Grounded Robotics},
  booktitle = {Proceedings of Conference on Robot Learning (CoRL-2020)},
  year      = {2020},
  month     = dec,
  abstract  = {Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor -- a pretrained end-to-end LTL planner -- must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL; it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.},
  url       = {https://corlconf.github.io/paper_385/},
  author    = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@article{5059,
  title    = {Learning a natural-language to {LTL} executable semantic parser for grounded robotics},
  year     = {2020},
  month    = aug,
  abstract = {Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor {\textemdash} a pretrained end-to-end LTL planner {\textemdash} must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL: it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.},
  doi      = {10.48550/arXiv.2008.03277},
  author   = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@article{4511,
  title   = {Deep Compositional Robotic Planners that Follow Natural Language Commands},
  year    = {2019},
  month   = dec,
  address = {Vancouver Convention Centre, Vancouver, Canada},
  url     = {https://vigilworkshop.github.io/},
  author  = {Yen-Ling Kuo and Boris Katz and Andrei Barbu}
}

@inproceedings{4112,
  title     = {Deep sequential models for sampling-based planning},
  booktitle = {The IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2018)},
  year      = {2018},
  month     = oct,
  address   = {Madrid, Spain},
  abstract  = {We demonstrate how a sequence model and a sampling-based planner can influence each other to produce efficient plans and how such a model can automatically learn to take advantage of observations of the environment. Sampling-based planners such as RRT generally know nothing of their environments even if they have traversed similar spaces many times. A sequence model, such as an HMM or LSTM, guides the search for good paths. The resulting model, called DeRRT*, observes the state of the planner and the local environment to bias the next move and next planner state. The neural-network-based models avoid manual feature engineering by co-training a convolutional network which processes map features and observations from sensors. We incorporate this sequence model in a manner that combines its likelihood with the existing bias for searching large unexplored Voronoi regions. This leads to more efficient trajectories with fewer rejected samples even in difficult domains such as when escaping bug traps. This model can also be used for dimensionality reduction in multi-agent environments with dynamic obstacles. Instead of planning in a high-dimensional space that includes the configurations of the other agents, we plan in a low-dimensional subspace relying on the sequence model to bias samples using the observed behavior of the other agents. The techniques presented here are general, include both graphical models and deep learning approaches, and can be adapted to a range of planners.},
  doi       = {10.1109/IROS.2018.8593947},
  url       = {https://ieeexplore.ieee.org/document/8593947},
  author    = {Yen-Ling Kuo and Andrei Barbu and Boris Katz}
}