@conference {5302, title = {The Aligned Multimodal Movie Treebank: An audio, video, dependency-parse treebank}, booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, year = {2022}, abstract = {

Treebanks have traditionally included only text and were derived from written sources such as newspapers or the web. We introduce the Aligned Multimodal Movie Treebank (AMMT), an English-language treebank derived from dialog in Hollywood movies, which includes transcriptions of the audio-visual streams with word-level alignment, as well as part-of-speech tags and dependency parses in the Universal Dependencies formalism. AMMT consists of 31,264 sentences and 218,090 words, making it the third-largest UD English treebank and the only multimodal treebank in UD. To help with the web-based annotation effort, we also introduce the Efficient Audio Alignment Annotator (EAAA), a companion tool that enables annotators to significantly speed up their annotation process.

}, author = {Adam Yaari and Jan DeWitt and Henry Hu and Bennett Stankovits and Sue Felshin and Yevgeni Berzak and Helena Aparicio and Boris Katz and Ignacio Cases and Andrei Barbu} } @inbook {3489, title = {A Natural Language Interface for Mobile Devices}, booktitle = {The Wiley Handbook of Human Computer Interaction}, volume = {2}, year = {2018}, month = {02/2018}, pages = {539-559}, publisher = {John Wiley \& Sons}, organization = {John Wiley \& Sons}, edition = {First}, abstract = {

This chapter discusses some of the primary issues related to the design and construction of natural language interfaces, and in particular, interfaces to mobile devices. It describes two systems in this space: the START information access system and the StartMobile natural language interface to mobile devices. The chapter also discusses recently deployed commercial systems and future directions. The use of natural language annotations, and in particular, parameterized natural language annotations, enables START to respond to user requests in a wide variety of ways. StartMobile uses the START system as a first stage in the processing of user requests. Current commercial systems such as Apple{\textquoteright}s Siri, IBM{\textquoteright}s Watson, Google{\textquoteright}s {\textquotedblleft}Google Now{\textquotedblright}, Microsoft{\textquoteright}s Cortana, and Amazon{\textquoteright}s Alexa employ technology of the sort contained in START and StartMobile in combination with statistical ...

}, doi = {10.1002/9781118976005.ch23}, author = {Boris Katz and Gary Borchardt and Sue Felshin and Federico Mora} } @conference {3492, title = {Temporal Grounding Graphs for Language Understanding with Accrued Visual-Linguistic Context}, booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI 2017)}, year = {2017}, month = {08/2017}, address = {Melbourne, Australia}, abstract = {

A robot{\textquoteright}s ability to understand or ground natural language instructions is fundamentally tied to its knowledge about the surrounding world. We present an approach to grounding natural language utterances in the context of factual information gathered through natural-language interactions and past visual observations. A probabilistic model estimates, from a natural language utterance, the objects, relations, and actions that the utterance refers to and the objectives for future robotic actions it implies, and generates a plan to execute those actions while updating a state representation to include newly acquired knowledge from the visual-linguistic context. Grounding a command necessitates a representation for past observations and interactions; however, maintaining the full context consisting of all possible observed objects, attributes, spatial relations, actions, etc., over time is intractable. Instead, our model, Temporal Grounding Graphs, maintains a learned state representation for a belief over factual groundings, those derived from natural-language interactions, and lazily infers new groundings from visual observations using the context implied by the utterance. This work significantly expands the range of language that a robot can understand by incorporating factual knowledge and observations of its workspace into its inference about the meaning and grounding of natural-language utterances.

}, author = {Rohan Paul and Andrei Barbu and Sue Felshin and Boris Katz and Nicholas Roy} } @conference {2583, title = {Learning to Answer Questions from Wikipedia Infoboxes}, booktitle = {The 2016 Conference on Empirical Methods in Natural Language Processing (EMNLP 2016)}, year = {2016}, abstract = {

A natural language interface to answers on the Web can help us access information more efficiently. We start with an interesting source of information{\textemdash}infoboxes in Wikipedia that summarize factoid knowledge{\textemdash}and develop a comprehensive approach to answering questions with high precision. We first build a system to access data in infoboxes in a structured manner. We use our system to construct a crowdsourced dataset of over 15,000 high-quality, diverse questions. With these questions, we train a convolutional neural network model that outperforms models that achieve top results in similar answer selection tasks.

}, author = {Alvaro Morales and Varot Premtoon and Cordelia Avery and Sue Felshin and Boris Katz} }