@inbook {2562, title = {Invariant Recognition Predicts Tuning of Neurons in Sensory Cortex}, booktitle = {Computational and Cognitive Neuroscience of Vision}, year = {2017}, pages = {85-104}, publisher = {Springer}, organization = {Springer}, isbn = {978-981-10-0211-3}, author = {Jim Mutch and F. Anselmi and Andrea Tacchetti and Lorenzo Rosasco and JZ. Leibo and Tomaso Poggio} } @article {2327, title = {View-Tolerant Face Recognition and Hebbian Learning Imply Mirror-Symmetric Neural Tuning to Head Orientation}, journal = {Current Biology}, volume = {27}, year = {2017}, month = {01/2017}, pages = {1-6}, abstract = {

The primate brain contains a hierarchy of visual areas, dubbed the ventral stream, which rapidly computes object representations that are both specific for object identity and robust against identity-preserving transformations, like depth rotations. Current computational models of object recognition, including recent deep-learning networks, generate these properties through a hierarchy of alternating selectivity-increasing filtering and tolerance-increasing pooling operations, similar to simple and complex cell operations. Here, we prove that a class of hierarchical architectures and a broad set of biologically plausible learning rules generate approximate invariance to identity-preserving transformations at the top level of the processing hierarchy. However, all past models tested failed to reproduce the most salient property of an intermediate representation of a three-level face-processing hierarchy in the brain: mirror-symmetric tuning to head orientation. Here, we demonstrate that one specific biologically plausible Hebb-type learning rule generates mirror-symmetric tuning to bilaterally symmetric stimuli, like faces, at intermediate levels of the architecture and show why it does so. Thus, the tuning properties of individual cells inside the visual stream appear to result from group properties of the stimuli they encode and to reflect the learning rules that sculpted the information-processing system within which they reside.
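
A minimal numerical sketch of the symmetry argument (the array sizes, the index-flip mirror operator, and the use of a direct eigendecomposition in place of the Hebbian/Oja dynamics are all illustrative assumptions; Oja's rule converges to these principal components):

    import numpy as np

    rng = np.random.default_rng(0)
    n, K = 64, 5                          # "pixels" per view, frequency components
    thetas = np.linspace(-np.pi, np.pi, 181)

    # Views x(theta) of a bilaterally symmetric object: mirroring the image
    # (an index flip) corresponds to negating the head orientation theta.
    g = rng.standard_normal((n, K))
    g = g + g[::-1]                       # even under the spatial flip
    h = rng.standard_normal((n, K))
    h = h - h[::-1]                       # odd under the spatial flip
    ks = np.arange(1, K + 1)
    X = g.dot(np.cos(np.outer(ks, thetas))) + h.dot(np.sin(np.outer(ks, thetas)))

    # Hebb/Oja plasticity converges to principal components of the input,
    # computed here directly from the covariance.
    C = X.dot(X.T) / X.shape[1]
    w = np.linalg.eigh(C)[1][:, -1]       # top eigenvector = Oja fixed point

    # Reflection symmetry of the stimulus set forces eigenvectors to be even
    # or odd under the mirror, so tuning magnitude is symmetric about theta=0.
    tuning = w.dot(X)
    print(np.abs(np.abs(tuning) - np.abs(tuning[::-1])).max())   # ~0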

}, doi = {10.1016/j.cub.2016.10.015}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and W. A. Freiwald and Tomaso Poggio} } @conference {2629, title = {How Important Is Weight Symmetry in Backpropagation?}, booktitle = {Thirtieth AAAI Conference on Artificial Intelligence (AAAI-16)}, year = {2016}, address = {Phoenix, AZ.}, abstract = {

Gradient backpropagation (BP) requires symmetric feedforward and feedback connections -- the same weights must be used for forward and backward passes. This "weight transport problem" (Grossberg 1987) is thought to be one of the main reasons to doubt BP{\textquoteright}s biological plausibility. Using 15 different classification datasets, we systematically investigate to what extent BP really depends on weight symmetry. In a study that turned out to be surprisingly similar in spirit to Lillicrap et al.{\textquoteright}s demonstration (Lillicrap et al. 2014) but orthogonal in its results, our experiments indicate that: (1) the magnitudes of feedback weights do not matter to performance; (2) the signs of feedback weights do matter -- the more concordant the signs between feedforward connections and their corresponding feedback connections, the better; (3) with feedback weights having random magnitudes and 100\% concordant signs, we were able to achieve the same or even better performance than SGD; and (4) some normalizations/stabilizations are indispensable for such asymmetric BP to work, namely Batch Normalization (BN) (Ioffe and Szegedy 2015) and/or a "Batch Manhattan" (BM) update rule.
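
A self-contained toy sketch of the asymmetric-BP setup described here, assuming an illustrative two-layer network and synthetic data (no Batch Normalization is included; network sizes, data, and learning rate are assumptions): feedback weights share only the signs of the feedforward weights, and the "Batch Manhattan" rule keeps only the sign of each batch gradient.

    import numpy as np

    rng = np.random.default_rng(0)
    # Toy data: two Gaussian blobs with binary labels.
    X = np.vstack([rng.normal(-1, 1, (200, 10)), rng.normal(1, 1, (200, 10))])
    y = np.repeat([0.0, 1.0], 200)[:, None]

    W1 = 0.1 * rng.standard_normal((10, 32))
    W2 = 0.1 * rng.standard_normal((32, 1))
    # Asymmetric feedback: random magnitudes, signs copied from feedforward W2.
    B2 = np.sign(W2) * np.abs(rng.standard_normal(W2.shape))

    lr = 0.01
    for step in range(500):
        hid = np.maximum(X.dot(W1), 0)            # ReLU hidden layer
        p = 1 / (1 + np.exp(-hid.dot(W2)))        # sigmoid output
        d_out = (p - y) / len(X)                  # cross-entropy gradient
        d_hid = d_out.dot(B2.T) * (hid > 0)       # backprop through B2, not W2.T
        g2, g1 = hid.T.dot(d_out), X.T.dot(d_hid)
        W2 -= lr * np.sign(g2)                    # "Batch Manhattan": keep only
        W1 -= lr * np.sign(g1)                    # the sign of the batch gradient
    print("train accuracy:", ((p > 0.5) == (y > 0.5)).mean())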

}, url = {https://cbmm.mit.edu/sites/default/files/publications/liao-leibo-poggio.pdf}, publisher = {Association for the Advancement of Artificial Intelligence}, organization = {Association for the Advancement of Artificial Intelligence}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {2122, title = {View-tolerant face recognition and Hebbian learning imply mirror-symmetric neural tuning to head orientation}, year = {2016}, month = {06/2016}, abstract = {

The primate brain contains a hierarchy of visual areas, dubbed the ventral stream, which rapidly computes object representations that are both specific for object identity and relatively robust against identity-preserving transformations like depth rotations [33, 32, 23, 13]. Current computational models of object recognition, including recent deep learning networks, generate these properties through a hierarchy of alternating selectivity-increasing filtering and tolerance-increasing pooling operations, similar to simple and complex cell operations [46, 8, 44, 29]. While simulations of these models recapitulate the ventral stream{\textquoteright}s progression from early view-specific to late view-tolerant representations, they fail to generate the most salient property of the intermediate representation for faces found in the brain: mirror-symmetric tuning of the neural population to head orientation [16]. Here we prove that a class of hierarchical architectures and a broad set of biologically plausible learning rules can provide approximate invariance at the top level of the network. While most of the learning rules do not yield mirror-symmetry in the mid-level representations, we characterize a specific biologically-plausible Hebb-type learning rule that is guaranteed to generate mirror-symmetric tuning to faces at intermediate levels of the architecture.

}, author = {JZ. Leibo and Qianli Liao and W. A. Freiwald and F. Anselmi and Tomaso Poggio} } @article {1593, title = {How Important is Weight Symmetry in Backpropagation?}, year = {2015}, month = {11/29/2015}, abstract = {

Gradient backpropagation (BP) requires symmetric feedforward and feedback connections{\textemdash}the same weights must be used for forward and backward passes. This {\textquotedblleft}weight transport problem{\textquotedblright} [1] is thought to be one of the main reasons for doubting BP{\textquoteright}s biological plausibility. Using 15 different classification datasets, we systematically study to what extent BP really depends on weight symmetry. In a study that turned out to be surprisingly similar in spirit to Lillicrap et al.{\textquoteright}s demonstration [2] but orthogonal in its results, our experiments indicate that: (1) the magnitudes of feedback weights do not matter to performance; (2) the signs of feedback weights do matter{\textemdash}the more concordant the signs between feedforward connections and their corresponding feedback connections, the better; (3) with feedback weights having random magnitudes and 100\% concordant signs, we were able to achieve the same or even better performance than SGD; and (4) some normalizations/stabilizations are indispensable for such asymmetric BP to work, namely Batch Normalization (BN) [3] and/or a {\textquotedblleft}Batch Manhattan{\textquotedblright} (BM) update rule.

}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {1380, title = {The Invariance Hypothesis Implies Domain-Specific Regions in Visual Cortex}, year = {2015}, month = {07/2015}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and Tomaso Poggio} } @article {1484, title = {The Invariance Hypothesis Implies Domain-Specific Regions in Visual Cortex}, journal = {PLOS Computational Biology}, volume = {11}, year = {2015}, month = {10/23/2015}, pages = {e1004390}, abstract = {

Is visual cortex made up of general-purpose information-processing machinery, or does it consist of a collection of specialized modules? If prior knowledge, acquired from learning a set of objects, is only transferable to new objects that share properties with the old, then the recognition system{\textquoteright}s optimal organization must be one containing specialized modules for different object classes. Our analysis starts from a premise we call the invariance hypothesis: that the computational goal of the ventral stream is to compute an invariant-to-transformations and discriminative signature for recognition. The key condition enabling approximate transfer of invariance without sacrificing discriminability turns out to be that the learned and novel objects transform similarly. This implies that the optimal recognition system must contain subsystems trained only with data from similarly-transforming objects, and it suggests a novel interpretation of domain-specific regions like the fusiform face area (FFA). Furthermore, we can define an index of transformation-compatibility, computable from videos, that can be combined with information about the statistics of natural vision to yield predictions for which object categories ought to have domain-specific regions, in agreement with the available data. The result is a unifying account linking the large literature on view-based recognition with the wealth of experimental evidence concerning domain-specific regions.
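
The transformation-compatibility index itself is defined in the paper; the sketch below is only one hypothetical operationalization, not the authors' definition: compare the view-similarity structure of two objects filmed under the same transformation sequence.

    import numpy as np

    def view_similarity(frames):
        # frames: (T, d) vectorized frames of one object filmed while it
        # undergoes a transformation sequence (e.g., a rotation in depth).
        Z = frames - frames.mean(axis=0)
        Z = Z / (np.linalg.norm(Z, axis=1, keepdims=True) + 1e-12)
        return Z.dot(Z.T)                 # T x T cosine-similarity matrix

    def compatibility(frames_a, frames_b):
        # Near 1 when the two objects' appearances change in lockstep under
        # the shared transformation; low when they transform differently.
        Sa, Sb = view_similarity(frames_a), view_similarity(frames_b)
        iu = np.triu_indices_from(Sa, k=1)
        return np.corrcoef(Sa[iu], Sb[iu])[0, 1]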

}, doi = {10.1371/journal.pcbi.1004390}, url = {http://dx.plos.org/10.1371/journal.pcbi.1004390}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and Tomaso Poggio} } @article {1415, title = {Unsupervised learning of invariant representations}, journal = {Theoretical Computer Science}, year = {2015}, month = {06/25/2015}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity. We consider the case of visual object recognition, though the theory also applies to other domains like speech. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translation, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and selective signature can be computed for each image or image patch: the invariance can be exact in the case of group transformations and approximate under non-group transformations. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such a signature. The theory offers novel unsupervised learning algorithms for {\textquotedblleft}deep{\textquotedblright} architectures for image and speech recognition. We conjecture that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and selective for recognition{\textemdash}and show how this representation may be continuously learned in an unsupervised way during development and visual experience.
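
A minimal sketch of the signature computation this abstract describes, with illustrative shapes and a cosine normalization chosen for convenience: a pooling stage (here, a histogram) over dot products between an image and each template's stored transformations.

    import numpy as np

    def signature(image, templates, n_bins=20):
        # templates: (K, T, d) array; K template objects, each stored under T
        # observed transformations (e.g., frames from a video of that object).
        # Pooling a histogram over dot products with each template's stored
        # transformations yields a component that is invariant whenever the
        # stored transformations cover the group acting on the image.
        u = image / (np.linalg.norm(image) + 1e-12)
        sig = []
        for t in templates:
            tn = t / (np.linalg.norm(t, axis=1, keepdims=True) + 1e-12)
            dots = tn.dot(u)                         # cosines in [-1, 1]
            hist = np.histogram(dots, bins=n_bins, range=(-1, 1))[0]
            sig.append(hist / max(hist.sum(), 1))
        return np.concatenate(sig)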

}, keywords = {convolutional networks, Cortex, Hierarchy, Invariance}, doi = {10.1016/j.tcs.2015.06.048}, url = {http://www.sciencedirect.com/science/article/pii/S0304397515005587}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @article {357, title = {Can a biologically-plausible hierarchy effectively replace face detection, alignment, and recognition pipelines?}, number = {003}, year = {2014}, month = {03/2014}, abstract = {

The standard approach to unconstrained face recognition in natural photographs is via a detection, alignment, recognition pipeline. While that approach has achieved impressive results, there are several reasons to be dissatisfied with it, among them its lack of biological plausibility. A recent theory of invariant recognition by feedforward hierarchical networks, like HMAX, other convolutional networks, or possibly the ventral stream, implies an alternative approach to unconstrained face recognition. This approach accomplishes detection and alignment implicitly by storing transformations of training images (called templates) rather than explicitly detecting and aligning faces at test time. Here we propose a particular voting scheme based on locality-sensitive hashing, which we call {\textquotedblleft}consensus of collisions{\textquotedblright}, and show that it can be used to approximate the full 3-layer hierarchy implied by the theory. The resulting end-to-end system for unconstrained face recognition operates on photographs of faces taken under natural conditions, e.g., Labeled Faces in the Wild (LFW), without aligning or cropping them, as is normally done. It achieves a drastic improvement in the state of the art on this end-to-end task, reaching the same level of performance as the best systems operating on aligned, closely cropped images (no outside training data). It also performs well on two newer datasets, similar to LFW but more difficult: LFW-jittered (new here) and SUFR-W.
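
A generic random-hyperplane LSH voting sketch in the spirit of consensus of collisions; the parameters, function names, and voting details are assumptions, not the paper's implementation.

    import numpy as np

    rng = np.random.default_rng(0)

    def lsh_keys(vectors, planes):
        # Random-hyperplane LSH: one bit per hyperplane, packed into an integer.
        bits = vectors.dot(planes.T) > 0
        return bits.dot(1 << np.arange(planes.shape[0]))

    def consensus_of_collisions(query, gallery, labels, n_tables=16, n_bits=12):
        # Every gallery template that collides with the query in some hash
        # table casts one vote for its identity; the most-voted identity wins.
        votes = np.zeros(labels.max() + 1, dtype=int)
        for _ in range(n_tables):
            planes = rng.standard_normal((n_bits, gallery.shape[1]))
            keys = lsh_keys(gallery, planes)
            qkey = lsh_keys(query[None, :], planes)[0]
            for lab in labels[keys == qkey]:
                votes[lab] += 1
        return votes.argmax()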

}, keywords = {Computer vision, Face recognition, Hierarchy, Invariance}, author = {Qianli Liao and JZ. Leibo and Youssef Mroueh and Tomaso Poggio} } @article {389, title = {The dynamics of invariant object recognition in the human visual system.}, journal = {J Neurophysiol}, volume = {111}, year = {2014}, month = {01/2014}, pages = {91-102}, abstract = {

The human visual system can rapidly recognize objects despite transformations that alter their appearance. The precise timing of when the brain computes neural representations that are invariant to particular transformations, however, has not been mapped in humans. Here we employ magnetoencephalography decoding analysis to measure the dynamics of size- and position-invariant visual information development in the ventral visual stream. With this method we can read out the identity of objects beginning as early as 60 ms. Size- and position-invariant visual information appear around 125 ms and 150 ms, respectively, and both develop in stages, with invariance to smaller transformations arising before invariance to larger transformations. Additionally, the magnetoencephalography sensor activity localizes to neural sources that are in the most posterior occipital regions at the early decoding times and then move temporally as invariant information develops. These results provide previously unknown latencies for key stages of human invariant object recognition, as well as new and compelling evidence for a feed-forward hierarchical model of invariant object recognition, in which invariance increases at each successive visual area along the ventral stream.

Corresponding dataset: {\textquotedblleft}The dynamics of invariant object recognition in the human visual system{\textquotedblright} (http://dx.doi.org/10.7910/DVN/KRUPXZ).
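
A schematic of the time-resolved decoding this abstract refers to, under assumed array shapes; the published analyses used the Neural Decoding Toolbox, and the classifier and window length here are illustrative.

    import numpy as np
    from sklearn.model_selection import cross_val_score
    from sklearn.svm import LinearSVC

    def decode_over_time(meg, labels, win=5):
        # meg: (n_trials, n_sensors, n_times); labels: (n_trials,) identities.
        # The first time point at which accuracy departs from chance estimates
        # the latency at which the decoded information becomes available.
        n_times = meg.shape[2]
        acc = np.zeros(n_times - win)
        for t in range(n_times - win):
            Xt = meg[:, :, t:t + win].reshape(len(meg), -1)
            acc[t] = cross_val_score(LinearSVC(), Xt, labels, cv=5).mean()
        return acc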

}, keywords = {Adolescent, Adult, Evoked Potentials, Visual, Female, Humans, Male, Pattern Recognition, Visual, Reaction Time, visual cortex}, issn = {1522-1598}, doi = {10.1152/jn.00394.2013}, url = {http://jn.physiology.org/content/early/2013/09/27/jn.00394.2013.abstract}, author = {Leyla Isik and Ethan Meyers and JZ. Leibo and Tomaso Poggio} } @article {2288, title = {The dynamics of invariant object recognition in the human visual system.}, year = {2014}, month = {01/2014}, abstract = {

This is the dataset for the corresponding journal article, {\textquotedblleft}The dynamics of invariant object recognition in the human visual system.{\textquotedblright}

The human visual system can rapidly recognize objects despite transformations that alter their appearance. The precise timing of when the brain computes neural representations that are invariant to particular transformations, however, has not been mapped in humans. Here we employ magnetoencephalography decoding analysis to measure the dynamics of size- and position-invariant visual information development in the ventral visual stream. With this method we can read out the identity of objects beginning as early as 60 ms. Size- and position-invariant visual information appear around 125 ms and 150 ms, respectively, and both develop in stages, with invariance to smaller transformations arising before invariance to larger transformations. Additionally, the magnetoencephalography sensor activity localizes to neural sources that are in the most posterior occipital regions at the early decoding times and then move temporally as invariant information develops. These results provide previously unknown latencies for key stages of human invariant object recognition, as well as new and compelling evidence for a feed-forward hierarchical model of invariant object recognition, in which invariance increases at each successive visual area along the ventral stream.

Dataset files can be downloaded here - http://dx.doi.org/10.7910/DVN/KRUPXZ

11 subjects{\textquoteright} MEG data from Isik et al., 2014. The data are available in raw .fif format or in a Matlab raster format compatible with the Neural Decoding Toolbox (readout.info).

For Matlab code to pre-process this MEG data and run the decoding analyses, please visit

https://bitbucket.org/lisik/meg_decoding
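
For the raw files, a rough Python alternative is MNE-Python; the filename below is hypothetical and should be replaced with a file from the Dataverse deposit above.

    import mne

    # "subject01_raw.fif" is a hypothetical filename; substitute a file from
    # the Dataverse deposit linked above.
    raw = mne.io.read_raw_fif("subject01_raw.fif", preload=True)
    events = mne.find_events(raw)
    epochs = mne.Epochs(raw, events, tmin=-0.1, tmax=0.5, preload=True)
    X = epochs.get_data()            # (n_trials, n_sensors, n_times)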

}, doi = {10.7910/DVN/KRUPXZ}, author = {Leyla Isik and Ethan Meyers and JZ. Leibo and Tomaso Poggio} } @article {438, title = {The Invariance Hypothesis Implies Domain-Specific Regions in Visual Cortex}, number = {004}, year = {2014}, month = {04/2014}, abstract = {

Is visual cortex made up of general-purpose information-processing machinery, or does it consist of a collection of specialized modules? If prior knowledge, acquired from learning a set of objects, is only transferable to new objects that share properties with the old, then the recognition system{\textquoteright}s optimal organization must be one containing specialized modules for different object classes. Our analysis starts from a premise we call the invariance hypothesis: that the computational goal of the ventral stream is to compute an invariant-to-transformations and discriminative signature for recognition. The key condition enabling approximate transfer of invariance without sacrificing discriminability turns out to be that the learned and novel objects transform similarly. This implies that the optimal recognition system must contain subsystems trained only with data from similarly-transforming objects, and it suggests a novel interpretation of domain-specific regions like the fusiform face area (FFA). Furthermore, we can define an index of transformation-compatibility, computable from videos, that can be combined with information about the statistics of natural vision to yield predictions for which object categories ought to have domain-specific regions. The result is a unifying account linking the large literature on view-based recognition with the wealth of experimental evidence concerning domain-specific regions.

}, keywords = {Neuroscience, Theories for Intelligence}, doi = {10.1101/004473}, url = {http://biorxiv.org/lookup/doi/10.1101/004473}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and Tomaso Poggio} } @conference {222, title = {Learning invariant representations and applications to face verification}, booktitle = {NIPS 2013}, year = {2014}, month = {02/2014}, publisher = {Advances in Neural Information Processing Systems 26}, organization = {Advances in Neural Information Processing Systems 26}, address = {Lake Tahoe, Nevada}, abstract = {

One approach to computer object recognition and modeling the brain{\textquoteright}s ventral stream involves unsupervised learning of representations that are invariant to common transformations. However, applications of these ideas have usually been limited to 2D affine transformations, e.g., translation and scaling, since they are easiest to solve via convolution. In accord with a recent theory of transformation-invariance [1], we propose a model that, while capturing other common convolutional networks as special cases, can also be used with arbitrary identity-preserving transformations. The model{\textquoteright}s wiring can be learned from videos of transforming objects{\textemdash}or any other grouping of images into sets by their depicted object. Through a series of successively more complex empirical tests, we study the invariance/discriminability properties of this model with respect to different transformations. First, we empirically confirm theoretical predictions (from [1]) for the case of 2D affine transformations. Next, we apply the model to non-affine transformations; as expected, it performs well on face verification tasks requiring invariance to the relatively smooth transformations of 3D rotation-in-depth and changes in illumination direction. Surprisingly, it can also tolerate clutter {\textquotedblleft}transformations{\textquotedblright} which map an image of a face on one background to an image of the same face on a different background. Motivated by these empirical findings, we tested the same model on face verification benchmark tasks from the computer vision literature: Labeled Faces in the Wild, PubFig [2, 3, 4] and a new dataset we gathered{\textemdash}achieving strong performance in these highly unconstrained cases as well.

}, keywords = {Computer vision}, url = {http://nips.cc/Conferences/2013/Program/event.php?ID=4074}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {219, title = {Subtasks of Unconstrained Face Recognition}, year = {2014}, month = {01/2014}, publisher = {9th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications. (VISAPP).}, address = {Lisbon, Portugal}, abstract = {

Unconstrained face recognition remains a challenging computer vision problem, despite recent exceptionally high results (\~{}95\% accuracy) on the current gold-standard evaluation dataset: Labeled Faces in the Wild (LFW) (Huang et al., 2008; Chen et al., 2013). We offer a decomposition of the unconstrained problem into subtasks based on the idea that invariance to identity-preserving transformations is the crux of recognition. Each of the subtasks in the Subtasks of Unconstrained Face Recognition (SUFR) challenge consists of a same-different face-matching problem on a set of 400 individual synthetic faces rendered so as to isolate a specific transformation or set of transformations. We characterized the performance of 9 different models (8 previously published) on each of the subtasks. One notable finding was that the HMAX-C2 feature was not nearly as clutter-resistant as had been suggested by previous publications (Leibo et al., 2010; Pinto et al., 2011). Next we considered LFW and argued that it is too easy a task to continue to be regarded as a measure of progress on unconstrained face recognition. In particular, strong performance on LFW requires almost no invariance, yet it cannot be considered a fair approximation of the outcome of a detection{\textrightarrow}alignment pipeline, since it does not contain the kinds of variability that realistic alignment systems produce when working on non-frontal faces. We offer a new, more difficult, natural-image dataset: SUFR-in-the-Wild (SUFR-W), which we created using a protocol similar to LFW{\textquoteright}s, but with a few differences designed to produce more need for transformation invariance. We present baseline results for eight different face recognition systems on the new dataset and argue that it is time to retire LFW and move on to more difficult evaluations for unconstrained face recognition.


}, keywords = {Face identification, Invariance, Labeled Faces in the Wild, Same-different matching, Synthetic data}, author = {JZ. Leibo and Qianli Liao and Tomaso Poggio} } @article {384, title = {Subtasks of unconstrained face recognition}, year = {2014}, month = {01/2014}, abstract = {

This package contains:

1. SUFR-W, a dataset of {\textquotedblleft}in the wild{\textquotedblright} natural images of faces gathered from the internet. The protocol used to create the dataset is described in Leibo, Liao and Poggio (2014).

2. The full set of SUFR synthetic datasets, called the {\textquotedblleft}Subtasks of Unconstrained Face Recognition Challenge{\textquotedblright} in Leibo, Liao and Poggio (2014).


}, keywords = {Computer vision}, author = {JZ. Leibo and Qianli Liao and Tomaso Poggio} } @article {455, title = {Unsupervised learning of clutter-resistant visual representations from natural videos.}, number = {023}, year = {2014}, month = {09/2014}, abstract = {

Populations of neurons in inferotemporal cortex (IT) maintain an explicit code for object identity that also tolerates transformations of object appearance, e.g., position, scale, and viewing angle [1, 2, 3]. Though the learning rules are not known, recent results [4, 5, 6] suggest the operation of an unsupervised temporal-association-based method, e.g., Foldiak{\textquoteright}s trace rule [7]. Such methods exploit the temporal continuity of the visual world by assuming that visual experience over short timescales will tend to have invariant identity content. Thus, by associating representations of frames from nearby times, a representation that tolerates whatever transformations occurred in the video may be achieved. Many previous studies verified that such rules can work in simple situations without background clutter, but the presence of visual clutter has remained problematic for this approach. Here we show that temporal association based on large class-specific filters (templates) avoids the problem of clutter. Our system learns in an unsupervised way from natural videos gathered from the internet, and is able to perform a difficult unconstrained face recognition task on natural images (Labeled Faces in the Wild [8]).
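
A minimal sketch of a Foldiak-style trace rule of the kind cited here; the learning rate, trace decay, and the norm constraint are illustrative assumptions, not the paper's system.

    import numpy as np

    def trace_rule_templates(frames, n_templates=10, eta=0.05, decay=0.9):
        # frames: (T, d) consecutive video frames. A slowly decaying trace of
        # each unit's activity associates temporally adjacent frames, which by
        # the temporal-continuity assumption depict the same object.
        rng = np.random.default_rng(0)
        W = 0.1 * rng.standard_normal((n_templates, frames.shape[1]))
        trace = np.zeros(n_templates)
        for x in frames:
            y = W.dot(x)
            trace = decay * trace + (1 - decay) * y
            W = W + eta * np.outer(trace, x)        # Foldiak-style trace update
            W = W / np.linalg.norm(W, axis=1, keepdims=True)
        return W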

}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {226, title = {Unsupervised learning of invariant representations with low sample complexity: the magic of sensory cortex or a new framework for machine learning?}, number = {001}, year = {2014}, month = {03/2014}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity (n). We consider the case of visual object recognition, though the theory applies to other domains. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translations, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and unique (discriminative) signature can be computed for each image patch, I, in terms of empirical distributions of the dot-products between I and a set of templates stored during unsupervised learning. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such estimates. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts. The theory extends existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and discriminative for recognition{\textemdash}and that this representation may be continuously learned in an unsupervised way during development and visual experience.

}, keywords = {Computer vision, Pattern recognition}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @proceedings {387, title = {Unsupervised Learning of Invariant Representations in Hierarchical Architectures.}, year = {2013}, month = {11/2013}, abstract = {

Representations that are invariant to translation, scale and other transformations can considerably reduce the sample complexity of learning, allowing recognition of new object classes from very few examples {\textendash} a hallmark of human recognition. Empirical estimates of one-dimensional projections of the distribution induced by a group of affine transformations are proven to represent a unique and invariant signature associated with an image. We show how projections yielding invariant signatures for future images can be learned automatically, and updated continuously, during unsupervised visual experience. A module performing filtering and pooling, like simple and complex cells as proposed by Hubel and Wiesel, can compute such estimates. Under this view, a pooling stage estimates a one-dimensional probability distribution. Invariance from observations through a restricted window is equivalent to a sparsity property w.r.t. a transformation, which yields templates that are a) Gabor for optimal simultaneous invariance to translation and scale or b) very specific for complex, class-dependent transformations such as rotation in depth of faces. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts, and are invariant to complex transformations that may only be locally affine. The theory applies to several existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects which is invariant to transformations, stable, and discriminative for recognition {\textendash} this representation may be learned in an unsupervised way from natural visual experience.


}, keywords = {convolutional networks, Hierarchy, Invariance, visual cortex}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} }