@article {4826, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.

}, author = {Candace Ross and Boris Katz and Andrei Barbu} } @article {4811, title = {Learning a Natural-language to LTL Executable Semantic Parser for Grounded Robotics}, year = {2020}, month = {12/2020}, institution = {Proceedings of Conference on Robot Learning (CoRL-2020)}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor -- a pretrained end-to-end LTL planner -- must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL; it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, url = {https://corlconf.github.io/paper_385/}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5059, title = {Learning a natural-language to LTL executable semantic parser for grounded robotics}, year = {2020}, month = {08/2020}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor {\textemdash} a pretrained end-to-end LTL planner {\textemdash} must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL: it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, doi = {https://doi.org/10.48550/arXiv.2008.03277}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4518, title = {Learning Language from Vision.}, booktitle = {Workshop on Visually Grounded Interaction and Language (ViGIL) at the Thirty-third Annual Conference on Neural Information Processing Systems (NeurIPS)}, year = {2019}, month = {12/2019}, address = {Vancouver Convention Center, Vancouver, Canada}, author = {Candace Ross and Yevgeni Berzak and Boris Katz and Andrei Barbu} }