@article {3452, title = {A fast, invariant representation for human action in the visual system}, journal = {Journal of Neurophysiology}, year = {2018}, abstract = {

Humans can effortlessly recognize others{\textquoteright} actions in the presence of complex transformations, such as changes in viewpoint. Several studies have located the regions in the brain involved in invariant action recognition; however, the underlying neural computations remain poorly understood. We use magnetoencephalography decoding and a data set of well-controlled, naturalistic videos of five actions (run, walk, jump, eat, drink) performed by different actors at different viewpoints to study the computational steps used to recognize actions across complex transformations. In particular, we ask when the brain discriminates between different actions, and when it does so in a manner that is invariant to changes in 3D viewpoint. We measure the latency difference between invariant and noninvariant action decoding when subjects view full videos as well as form-depleted and motion-depleted stimuli. We were unable to detect a difference in decoding latency or temporal profile between invariant and noninvariant action recognition in full videos. However, when either form or motion information is removed from the stimulus set, we observe a decrease and delay in invariant action decoding. Our results suggest that the brain recognizes actions and builds invariance to complex transformations at the same time and that both form and motion information are crucial for fast, invariant action recognition.

Associated Dataset: MEG action recognition data

}, doi = {10.1152/jn.00642.2017}, url = {https://www.physiology.org/doi/10.1152/jn.00642.2017}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {3871, title = {Invariant Recognition Shapes Neural Representations of Visual Input}, journal = {Annual Review of Vision Science}, volume = {4}, year = {2018}, month = {10/2018}, pages = {403-422}, abstract = {

Recognizing the people, objects, and actions in the world around us is a crucial aspect of human perception that allows us to plan and act in our environment. Remarkably, our proficiency in recognizing semantic categories from visual input is unhindered by transformations that substantially alter their appearance (e.g., changes in lighting or position). The ability to generalize across these complex transformations is a hallmark of human visual intelligence, which has been the focus of wide-ranging investigation in systems and computational neuroscience. However, while the neural machinery of human visual perception has been thoroughly described, the computational principles dictating its functioning remain unknown. Here, we review recent results in brain imaging, neurophysiology, and computational neuroscience in support of the hypothesis that the ability to support the invariant recognition of semantic entities in the visual world shapes which neural representations of sensory input are computed by human visual cortex.

}, keywords = {computational neuroscience, Invariance, neural decoding, visual representations}, issn = {2374-4642}, doi = {10.1146/annurev-vision-091517-034103}, url = {https://www.annualreviews.org/doi/10.1146/annurev-vision-091517-034103}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {3685, title = {MEG action recognition data}, year = {2018}, abstract = {

MEG action recognition data from Isik et al., 2018, and Tacchetti et al., 2017, provided in binned format for use with the Neural Decoding Toolbox (2018-02-13).

Associated publications:

L. Isik, A. Tacchetti, and T. Poggio, {\textquotedblleft}A fast, invariant representation for human action in the visual system{\textquotedblright}, Journal of Neurophysiology, 2018.
A. Tacchetti, L. Isik, and T. Poggio, {\textquotedblleft}Invariant recognition drives neural representations of action sequences{\textquotedblright}, PLOS Computational Biology, 2017.
}, doi = {10.7910/DVN/KFYY2M}, url = {https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/KFYY2M}, author = {Leyla Isik and Andrea Tacchetti} } @article {2550, title = {Discriminate-and-Rectify Encoders: Learning from Image Transformation Sets}, year = {2017}, month = {03/2017}, abstract = {

The complexity of a learning task is increased by transformations in the input space that preserve class identity. Visual object recognition, for example, is affected by changes in viewpoint, scale, illumination or planar transformations. While drastically altering the visual appearance, these changes are orthogonal to recognition and should not be reflected in the representation or feature encoding used for learning. We introduce a framework for weakly supervised learning of image embeddings that are robust to transformations and selective to the class distribution, using sets of transforming examples (orbit sets), deep parametrizations and a novel orbit-based loss. The proposed loss combines a discriminative, contrastive part for orbits with a reconstruction error that learns to rectify orbit transformations. The learned embeddings are evaluated in distance metric-based tasks, such as one-shot classification under geometric transformations, as well as face verification and retrieval under more realistic visual variability. Our results suggest that orbit sets, suitably computed or observed, can be used for efficient, weakly-supervised learning of semantically relevant image embeddings.
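A schematic version of this objective, in our own notation (the symbols $e_\theta$, $d_\theta$, $g$, $m$, $\mu$ and $\lambda$ are not taken from the paper), is $\mathcal{L}(\theta) = \sum_{(x, gx)} \|e_\theta(x) - e_\theta(gx)\|^2 + \mu \sum_{(x, y)} \max\big(0, m - \|e_\theta(x) - e_\theta(y)\|\big)^2 + \lambda \sum_{(x, gx)} \|x - d_\theta(e_\theta(gx))\|^2$, where $e_\theta$ and $d_\theta$ denote the encoder and decoder, $(x, gx)$ ranges over pairs drawn from the same orbit set and $(x, y)$ over pairs from different orbits; the first two terms form the discriminative, contrastive part (contracting intra-orbit distances while keeping distinct orbits separated by a margin $m$), and the last term is the reconstruction error that rectifies orbit transformations by mapping a transformed example back to its untransformed counterpart.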

}, author = {Andrea Tacchetti and Stephen Voinea and Georgios Evangelopoulos} } @article {3274, title = {A fast, invariant representation for human action in the visual system}, journal = {Journal of Neurophysiology}, year = {2017}, month = {11/2017}, pages = {jn.00642.2017}, abstract = {

Humans can effortlessly recognize others{\textquoteright} actions in the presence of complex transformations, such as changes in viewpoint. Several studies have located the regions in the brain involved in invariant action recognition; however, the underlying neural computations remain poorly understood. We use magnetoencephalography (MEG) decoding and a dataset of well-controlled, naturalistic videos of five actions (run, walk, jump, eat, drink) performed by different actors at different viewpoints to study the computational steps used to recognize actions across complex transformations. In particular, we ask when the brain discriminates between different actions, and when it does so in a manner that is invariant to changes in 3D viewpoint. We measure the latency difference between invariant and non-invariant action decoding when subjects view full videos as well as form-depleted and motion-depleted stimuli. We were unable to detect a difference in decoding latency or temporal profile between invariant and non-invariant action recognition in full videos. However, when either form or motion information is removed from the stimulus set, we observe a decrease and delay in invariant action decoding. Our results suggest that the brain recognizes actions and builds invariance to complex transformations at the same time, and that both form and motion information are crucial for fast, invariant action recognition.

}, keywords = {action recognition, magnetoencephalography, neural decoding, vision}, issn = {1522-1598}, doi = {10.1152/jn.00642.2017}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {3162, title = {Invariant action recognition dataset}, year = {2017}, month = {11/2017}, abstract = {

To study the effect of changes in view and actor on action recognition, we filmed a dataset of five actors performing five different actions (drink, eat, jump, run and walk) on a treadmill from five different views (0, 45, 90, 135, and 180 degrees from the front of the actor/treadmill; the treadmill rather than the camera was rotated in place to acquire the different viewpoints). The dataset was filmed on a fixed, constant background. To avoid low-level object/action confounds (e.g., the action {\textquotedblleft}drink{\textquotedblright} being classified as the only videos with a water bottle in the scene) and to guarantee that the main sources of variation in visual appearance are due to actions, actors and viewpoint, the actors held the same objects (an apple and a water bottle) in each video, regardless of the action they performed. This controlled design allows us to test hypotheses on the computational mechanisms underlying invariant recognition in the human visual system without having to settle for a synthetic dataset.
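The design above therefore spans up to $5 \times 5 \times 5 = 125$ distinct action, actor and viewpoint combinations (our count from the numbers given here; the description does not state the total number of videos).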

More information and the dataset files can be found here: https://doi.org/10.7910/DVN/DMT0PG

}, url = {https://doi.org/10.7910/DVN/DMT0PG}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {3453, title = {Invariant recognition drives neural representations of action sequences}, journal = {PLOS Computational Biology}, year = {2017}, abstract = {

Recognizing the actions of others from visual stimuli is a crucial aspect of human perception that allows individuals to respond to social cues. Humans are able to discriminate between similar actions despite transformations, like changes in viewpoint or actor, that substantially alter the visual appearance of a scene. This ability to generalize across complex transformations is a hallmark of human visual intelligence. Advances in understanding action recognition at the neural level have not always translated into precise accounts of the computational principles underlying what representations of action sequences are constructed by human visual cortex. Here we test the hypothesis that invariant action discrimination might fill this gap. Recently, the study of artificial systems for static object perception has produced models, Convolutional Neural Networks (CNNs), that achieve human level performance in complex discriminative tasks. Within this class, architectures that better support invariant object recognition also produce image representations that better match those implied by human and primate neural data. However, whether these models produce representations of action sequences that support recognition across complex transformations and closely follow neural representations of actions remains unknown. Here we show that spatiotemporal CNNs accurately categorize video stimuli into action classes, and that deliberate model modifications that improve performance on an invariant action recognition task lead to data representations that better match human neural recordings. Our results support our hypothesis that performance on invariant discrimination dictates the neural representations of actions computed in the brain. These results broaden the scope of the invariant recognition framework for understanding visual intelligence from perception of inanimate objects and faces in static images to the study of human perception of action sequences.

Associated Dataset: MEG action recognition data

}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {3272, title = {Invariant recognition drives neural representations of action sequences}, journal = {PLOS Computational Biology}, volume = {13}, year = {2017}, month = {12/2017}, pages = {e1005859}, abstract = {

Recognizing the actions of others from visual stimuli is a crucial aspect of human perception that allows individuals to respond to social cues. Humans are able to discriminate between similar actions despite transformations, like changes in viewpoint or actor, that substantially alter the visual appearance of a scene. This ability to generalize across complex transformations is a hallmark of human visual intelligence. Advances in understanding action recognition at the neural level have not always translated into precise accounts of the computational principles underlying what representations of action sequences are constructed by human visual cortex. Here we test the hypothesis that invariant action discrimination might fill this gap. Recently, the study of artificial systems for static object perception has produced models, Convolutional Neural Networks (CNNs), that achieve human level performance in complex discriminative tasks. Within this class, architectures that better support invariant object recognition also produce image representations that better match those implied by human and primate neural data. However, whether these models produce representations of action sequences that support recognition across complex transformations and closely follow neural representations of actions remains unknown. Here we show that spatiotemporal CNNs accurately categorize video stimuli into action classes, and that deliberate model modifications that improve performance on an invariant action recognition task lead to data representations that better match human neural recordings. Our results support our hypothesis that performance on invariant discrimination dictates the neural representations of actions computed in the brain. These results broaden the scope of the invariant recognition framework for understanding visual intelligence from perception of inanimate objects and faces in static images to the study of human perception of action sequences.

}, doi = {10.1371/journal.pcbi.1005859}, url = {http://dx.plos.org/10.1371/journal.pcbi.1005859}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio}, editor = {Berniker, Max} } @inbook {2562, title = {Invariant Recognition Predicts Tuning of Neurons in Sensory Cortex}, booktitle = {Computational and Cognitive Neuroscience of Vision}, year = {2017}, pages = {85-104}, publisher = {Springer}, organization = {Springer}, isbn = {978-981-10-0211-3}, author = {Jim Mutch and F. Anselmi and Andrea Tacchetti and Lorenzo Rosasco and JZ. Leibo and Tomaso Poggio} } @conference {2673, title = {Representation Learning from Orbit Sets for One-shot Classification}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, address = {AAAI}, abstract = {

The sample complexity of a learning task is increased by transformations that do not change class identity. Visual object recognition, for example (i.e., the discrimination or categorization of distinct semantic classes), is affected by changes in viewpoint, scale, illumination or planar transformations. We introduce a weakly-supervised framework for learning robust and selective representations from sets of transforming examples (orbit sets). We train deep encoders that explicitly account for the equivalence up to transformations of orbit sets and show that the resulting encodings contract the intra-orbit distance and preserve identity either by preserving reconstruction or by increasing the inter-orbit distance. We explore a loss function that combines a discriminative term and a reconstruction term that uses a decoder-encoder map to learn to rectify transformation-perturbed examples, and demonstrate the validity of the resulting embeddings for one-shot learning. Our results suggest that a suitable definition of orbit sets is a form of weak supervision that can be exploited to learn semantically relevant embeddings.

}, url = {https://www.aaai.org/ocs/index.php/SSS/SSS17/paper/view/15357}, author = {Andrea Tacchetti and Stephen Voinea and Georgios Evangelopoulos and Tomaso Poggio} } @article {1596, title = {Fast, invariant representation for human action in the visual system}, year = {2016}, month = {01/2016}, abstract = {

Isik, L.*, Tacchetti, A.*, and Poggio, T. (*authors contributed equally to this work)

The ability to recognize the actions of others from visual input is essential to humans{\textquoteright} daily lives. The neural computations underlying action recognition, however, are still poorly understood. We use magnetoencephalography (MEG) decoding and a computational model to study action recognition from a novel dataset of well-controlled, naturalistic videos of five actions (run, walk, jump, eat, drink) performed by five actors at five viewpoints. We show for the first time that actor- and view-invariant representations for action arise in the human brain as early as 200 ms. We next extend a class of biologically inspired hierarchical computational models of object recognition to recognize actions from videos and explain the computations underlying our MEG findings. This model achieves 3D viewpoint invariance by the same biologically inspired computational mechanism it uses to build invariance to position and scale. These results suggest that robustness to complex transformations, such as 3D viewpoint invariance, does not require special neural architectures, and further provide a mechanistic explanation of the computations driving invariant action recognition.

}, url = {http://arxiv.org/abs/1601.01358}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {2576, title = {Spatio-temporal convolutional networks explain neural representations of human actions}, year = {2016}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {2559, title = {Invariant representations for action recognition in the visual system}, volume = {15}, year = {2015}, journal = {Journal of Vision}, doi = {10.1167/15.12.558}, url = {http://jov.arvojournals.org/article.aspx?articleid=2433666}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {2560, title = {Invariant representations for action recognition in the visual system}, year = {2015}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {1415, title = {Unsupervised learning of invariant representations}, journal = {Theoretical Computer Science}, year = {2015}, month = {06/25/2015}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity. We consider the case of visual object recognition, though the theory also applies to other domains like speech. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translation, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and selective signature can be computed for each image or image patch: the invariance can be exact in the case of group transformations and approximate under non-group transformations. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such a signature. The theory offers novel unsupervised learning algorithms for {\textquotedblleft}deep{\textquotedblright} architectures for image and speech recognition. We conjecture that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and selective for recognition{\textemdash}and show how this representation may be continuously learned in an unsupervised way during development and visual experience.

}, keywords = {convolutional networks, Cortex, Hierarchy, Invariance}, doi = {10.1016/j.tcs.2015.06.048}, url = {http://www.sciencedirect.com/science/article/pii/S0304397515005587}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @article {226, title = {Unsupervised learning of invariant representations with low sample complexity: the magic of sensory cortex or a new framework for machine learning?}, number = {001}, year = {2014}, month = {03/2014}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity (n). We consider the case of visual object recognition, though the theory applies to other domains. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translations, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and unique (discriminative) signature can be computed for each image patch, I, in terms of empirical distributions of the dot-products between I and a set of templates stored during unsupervised learning. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such estimates. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts. The theory extends existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and discriminative for recognition{\textemdash}and that this representation may be continuously learned in an unsupervised way during development and visual experience.
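In schematic form, and in our notation rather than the memo{\textquoteright}s, each signature component can be written as a pooled nonlinear function of these dot products, $\mu^k_n(I) = \frac{1}{|G|} \sum_{g \in G} \eta_n\big(\langle I, g\,t^k \rangle\big)$, where $t^k$ is the $k$-th stored template, $G$ is a (finite) transformation group and the $\eta_n$ are nonlinearities (e.g. threshold functions) whose averages over $g$ estimate the empirical distribution of $\langle I, g\,t^k \rangle$; pooling over $g$ makes each component invariant, while the collection over templates $k$ and nonlinearities $n$ preserves discriminability.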

}, keywords = {Computer vision, Pattern recognition}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @proceedings {387, title = {Unsupervised Learning of Invariant Representations in Hierarchical Architectures.}, year = {2013}, month = {11/2013}, abstract = {

Representations that are invariant to translation, scale and other transformations can considerably reduce the sample complexity of learning, allowing recognition of new object classes from very few examples {\textendash} a hallmark of human recognition. Empirical estimates of one-dimensional projections of the distribution induced by a group of affine transformations are proven to represent a unique and invariant signature associated with an image. We show how projections yielding invariant signatures for future images can be learned automatically, and updated continuously, during unsupervised visual experience. A module performing filtering and pooling, like simple and complex cells as proposed by Hubel and Wiesel, can compute such estimates. Under this view, a pooling stage estimates a one-dimensional probability distribution. Invariance from observations through a restricted window is equivalent to a sparsity property w.r.t. a transformation, which yields templates that are (a) Gabor for optimal simultaneous invariance to translation and scale or (b) very specific for complex, class-dependent transformations such as rotation in depth of faces. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts, and are invariant to complex transformations that may only be locally affine. The theory applies to several existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects which is invariant to transformations, stable, and discriminative for recognition {\textendash} this representation may be learned in an unsupervised way from natural visual experience.


}, keywords = {convolutional networks, Hierarchy, Invariance, visual cortex}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} }