@article {1092, title = {A Compositional Framework for Grounding Language Inference, Generation, and Acquisition in Video}, year = {2015}, abstract = {

We present an approach to simultaneously reasoning about a video clip and an entire natural-language sentence. The compositional nature of language is exploited to construct models that represent the meanings of entire sentences composed out of the meanings of the words in those sentences, mediated by a grammar that encodes the predicate-argument relations. We demonstrate that these models faithfully represent the meanings of sentences and are sensitive to how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions) affect the meaning of a sentence and how it is grounded in video. We exploit this methodology in three ways. In the first, a video clip and a sentence are taken as input and the participants in the event described by the sentence are highlighted, even when the clip depicts multiple similar simultaneous events. In the second, a video clip is taken as input without a sentence and a sentence is generated that describes an event in that clip. In the third, a corpus of video clips is paired with sentences that describe some of the events in those clips, and the meanings of the words in those sentences are learned. We learn these meanings without needing to specify which attribute of the video clips each word in a given sentence refers to. The learned meaning representations are shown to be intelligible to humans.

}, doi = {10.1613/jair.4556}, url = {https://www.jair.org/media/4556/live-4556-8631-jair.pdf}, author = {Haonan Yu and N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @article {452, title = {The Compositional Nature of Event Representations in the Human Brain}, number = {011}, year = {2014}, month = {07/14/2014}, abstract = {

How does the human brain represent simple compositions of constituents: actors, verbs, objects, directions, and locations? Subjects viewed videos during neuroimaging (fMRI) sessions; sentential descriptions of those videos were then identified by decoding the brain representations based only on the fMRI activation patterns. Constituents (e.g., fold and shirt) were independently decoded from a single presentation. Independent constituent classification was then compared to joint classification of aggregate concepts (e.g., fold-shirt); results were similar as measured by accuracy and correlation. The brain regions used for independent constituent classification are largely disjoint and largely cover those used for joint classification. This allows recovery of sentential descriptions of stimulus videos by composing the results of the independent constituent classifiers. Furthermore, classifiers trained on the words one set of subjects think of when watching a video can recognize sentences a different subject thinks of when watching a different video.

}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @article {1367, title = {Seeing is Worse than Believing: Reading People{\textquoteright}s Minds Better than Computer-Vision Methods Recognize Actions}, number = {012}, year = {2014}, month = {09/2014}, abstract = {

We had human subjects perform a one-out-of-six class action recognition task from video stimuli while undergoing functional magnetic resonance imaging (fMRI). Support-vector machines (SVMs) were trained on the recovered brain scans to classify actions observed during imaging, yielding average classification accuracy of 69.73\% when tested on scans from the same subject and of 34.80\% when tested on scans from different subjects. An apples-to-apples comparison was performed with all publicly available software that implements state-of-the-art action recognition on the same video corpus with the same cross-validation regimen and same partitioning into training and test sets, yielding classification accuracies between 31.25\% and 52.34\%. This indicates that one can read people{\textquoteright}s minds better than state-of-the-art computer-vision methods can perform action recognition.

}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @inbook {1090, title = {Seeing is worse than believing: Reading people{\textquoteright}s minds better than computer-vision methods recognize actions}, booktitle = {Computer Vision {\textendash} ECCV 2014, Lecture Notes in Computer Science}, series = {13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V}, volume = {8693}, year = {2014}, pages = {612{\textendash}627}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Zurich, Switzerland}, abstract = {

We had human subjects perform a one-out-of-six class action recognition task from video stimuli while undergoing functional magnetic resonance imaging (fMRI). Support-vector machines (SVMs) were trained on the recovered brain scans to classify actions observed during imaging, yielding average classification accuracy of 69.73\% when tested on scans from the same subject and of 34.80\% when tested on scans from different subjects. An apples-to-apples comparison was performed with all publicly available software that implements state-of-the-art action recognition on the same video corpus with the same cross-validation regimen and same partitioning into training and test sets, yielding classification accuracies between 31.25\% and 52.34\%. This indicates that one can read people{\textquoteright}s minds better than state-of-the-art computer-vision methods can perform action recognition.

}, doi = {10.1007/978-3-319-10602-1_40}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @article {442, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition In Video}, number = {006}, year = {2014}, month = {05/2014}, abstract = {

We present a system that demonstrates how the compositional structure of events, in concert with the compositional structure of language, can interplay with the underlying focusing mechanisms in video action recognition, thereby providing a medium not only for top-down and bottom-up integration, but also for multi-modal integration between vision and language. We show how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions), in the form of whole sentential descriptions mediated by a grammar, guide the activity-recognition process. Further, the utility and expressiveness of our framework are demonstrated by performing three separate tasks in the domain of multi-activity videos: sentence-guided focus of attention, generation of sentential descriptions of video, and query-based video search, simply by leveraging the framework in different manners.

}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @conference {1089, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition In Video}, booktitle = {CVPR}, year = {2014}, month = {07/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Columbus, Ohio}, abstract = {

We present a system that demonstrates how the compositional structure of events, in concert with the compositional structure of language, can interplay with the underlying focusing mechanisms in video action recognition, providing a medium for top-down and bottom-up integration as well as multi-modal integration between vision and language. We show how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions), in the form of whole-sentence descriptions mediated by a grammar, guide the activity-recognition process. Further, the utility and expressiveness of our framework are demonstrated by performing three separate tasks in the domain of multi-activity video: sentence-guided focus of attention, generation of sentential description, and query-based search, simply by leveraging the framework in different manners.

}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @article {1094, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition in Video}, year = {2014}, publisher = {IEEE}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} }
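
The two "Seeing is Worse than Believing" entries above describe training support-vector machines on fMRI scans of subjects watching one of six actions, and evaluating them both within subject and across subjects under a fixed cross-validation regimen. The sketch below is only a minimal illustration of that evaluation pattern, using scikit-learn on synthetic stand-in features; the array sizes, the linear kernel, and the fold counts are assumptions for illustration and are not the authors' data or pipeline.

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.svm import SVC

# Synthetic stand-in for fMRI scan features: one row per stimulus presentation.
rng = np.random.default_rng(0)
n_scans, n_voxels, n_classes, n_subjects = 600, 400, 6, 4
X = rng.normal(size=(n_scans, n_voxels))               # scan feature vectors
y = rng.integers(0, n_classes, size=n_scans)           # one of six action labels
subjects = rng.integers(0, n_subjects, size=n_scans)   # subject who produced each scan

clf = SVC(kernel="linear")

# Within-subject setting: cross-validate separately over each subject's own scans.
within = np.mean([
    cross_val_score(clf, X[subjects == s], y[subjects == s], cv=5).mean()
    for s in range(n_subjects)
])

# Cross-subject setting: train on all but one subject, test on the held-out subject.
across = cross_val_score(clf, X, y, groups=subjects, cv=LeaveOneGroupOut()).mean()

print(f"within-subject accuracy: {within:.3f}")
print(f"cross-subject accuracy:  {across:.3f}")

In the reported experiments the within-subject setting was markedly stronger (69.73\% versus 34.80\%); on the random features above both numbers simply hover around chance (1/6).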