@conference {3999, title = {Trading robust representations for sample complexity through self-supervised visual experience}, booktitle = {Advances in Neural Information Processing Systems 31}, year = {2018}, month = {12/2018}, pages = {9640{\textendash}9650}, publisher = {Curran Associates, Inc.}, organization = {Curran Associates, Inc.}, address = {Montreal, Canada}, abstract = {
Learning in small sample regimes is among the most remarkable features of the human perceptual system. This ability is related to robustness to transformations, which is acquired through visual experience in the form of weak- or self-supervision during development. We explore the idea of allowing artificial systems to learn representations of visual stimuli through weak supervision prior to downstream su- pervised tasks. We introduce a novel loss function for representation learning using unlabeled image sets and video sequences, and experimentally demonstrate that these representations support one-shot learning and reduce the sample complexity of multiple recognition tasks. We establish the existence of a trade-off between the sizes of weakly supervised, automatically obtained from video sequences, and fully supervised data sets. Our results suggest that equivalence sets other than class labels, which are abundant in unlabeled visual experience, can be used for self-supervised learning of semantically relevant image embeddings.
}, url = {http://papers.nips.cc/paper/8170-trading-robust-representations-for-sample-complexity-through-self-supervised-visual-experience.pdf}, author = {Tacchetti, Andrea and Stephen Voinea and Evangelopoulos, Georgios}, editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett} } @article {2550, title = {Discriminate-and-Rectify Encoders: Learning from Image Transformation Sets}, year = {2017}, month = {03/2017}, abstract = {The complexity of a learning task is increased by transformations in the input space that preserve class identity. Visual object recognition for example is affected by changes in viewpoint, scale, illumination or planar transformations. While drastically altering the visual appearance, these changes are orthogonal to recognition and should not be reflected in the representation or feature encoding used for learning. We introduce a framework for weakly supervised learning of image embeddings that are robust to transformations and selective to the class distribution, using sets of transforming examples (orbit sets), deep parametrizations and a novel orbit-based loss. The proposed loss combines a discriminative, contrastive part for orbits with a reconstruction error that learns to rectify orbit transformations. The learned embeddings are evaluated in distance metric-based tasks, such as one-shot classification under geometric transformations, as well as face verification and retrieval under more realistic visual variability. Our results suggest that orbit sets, suitably computed or observed, can be used for efficient, weakly-supervised learning of semantically relevant image embeddings.
}, author = {Andrea Tacchetti and Stephen Voinea and Georgios Evangelopoulos} } @conference {2673, title = {Representation Learning from Orbit Sets for One-shot Classification}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, address = {AAAI}, abstract = {The sample complexity of a learning task is increased by transformations that do not change class identity. Visual object recognition for example, i.e. the discrimination or categorization of distinct semantic classes, is affected by changes in viewpoint, scale, illumination or planar transformations. We introduce a weakly-supervised framework for learning robust and selective representations from sets of transforming examples (orbit sets). We train deep encoders that explicitly account for the equivalence up to transformations of orbit sets and show that the resulting encodings contract the intra-orbit distance and preserve identity either by preserving reconstruction or by increasing the inter-orbit distance. We explore a loss function that combines a discriminative term, and a reconstruction term that uses a decoder-encoder map to learn to rectify transformation-perturbed examples, and demonstrate the validity of the resulting embeddings for one-shot learning. Our results suggest that a suitable definition of orbit sets is a form of weak supervision that can be exploited to learn semantically relevant embeddings.
}, url = {https://www.aaai.org/ocs/index.php/SSS/SSS17/paper/view/15357}, author = {Andrea Tacchetti and Stephen Voinea and Georgios Evangelopoulos and Tomaso Poggio} } @conference {1142, title = {Discriminative Template Learning in Group-Convolutional Networks for Invariant Speech Representations}, booktitle = {INTERSPEECH-2015}, year = {2015}, month = {09/2015}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/i15_3229.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {1559, title = {Learning with Group Invariant Features: A Kernel Perspective}, booktitle = {NIPS 2015}, year = {2015}, abstract = {Representations in the auditory cortex might be based on mechanisms similar to the visual ventral stream; modules for building invariance to transformations and multiple layers for compositionality and selectivity. In this paper we propose the use of such computational modules for extracting invariant and discriminative audio representations. Building on a theory of invariance in hierarchical architectures, we propose a novel, mid-level representation for acoustical signals, using the empirical distributions of projections on a set of templates and their transformations. Under the assumption that, by construction, this dictionary of templates is composed from similar classes, and samples the orbit of variance-inducing signal transformations (such as shift and scale), the resulting signature is theoretically guaranteed to be unique, invariant to transformations and stable to deformations. Modules of projection and pooling can then constitute layers of deep networks, for learning composite representations. We present the main theoretical and computational aspects of a framework for unsupervised learning of invariant audio representations, empirically evaluated on music genre classification.
}, keywords = {Audio Representation, Hierarchy, Invariance, Machine Learning, Theories for Intelligence}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @conference {1141, title = {A Deep Representation for Invariance and Music Classification}, booktitle = {ICASSP 2014 - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2014}, month = {05/04/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Florence, Italy}, keywords = {acoustic signal processing, signal representation, unsupervised learning}, doi = {10.1109/ICASSP.2014.6854954}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6854954}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @article {451, title = {Learning An Invariant Speech Representation}, number = {022}, year = {2014}, month = {06/2014}, abstract = {Recognition of speech, and in particular the ability to generalize and learn from small sets of labelled examples like humans do, depends on an appropriate representation of the acoustic input. We formulate the problem of finding robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain and empirically evaluate its validity for voiced speech sound classification. Our version of the theory requires the memory-based, unsupervised storage of acoustic templates {\textemdash} such as specific phones or words {\textemdash} together with all the transformations of each that normally occur. A quasi-invariant representation for a speech segment can be obtained by projecting it to each template orbit, i.e., the set of transformed signals, and computing the associated one-dimensional empirical probability distributions. The computations can be performed by modules of filtering and pooling, and extended to hierarchical architectures. In this paper, we apply a single-layer, multicomponent representation for phonemes and demonstrate improved accuracy and decreased sample complexity for vowel classification compared to standard spectral, cepstral and perceptual features.
}, keywords = {Theories for Intelligence}, author = {Georgios Evangelopoulos and Stephen Voinea and Chiyuan Zhang and Lorenzo Rosasco and Tomaso Poggio} } @conference {220, title = {Phone Classification by a Hierarchy of Invariant Representation Layers}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {We propose a multi-layer feature extraction framework for speech, capable of providing invariant representations. A set of templates is generated by sampling the result of applying smooth, identity-preserving transformations (such as vocal tract length and tempo variations) to arbitrarily-selected speech signals. Templates are then stored as the weights of {\textquotedblleft}neurons{\textquotedblright}. We use a cascade of such computational modules to factor out different types of transformation variability in a hierarchy, and show that it improves phone classification over baseline features. In addition, we describe empirical comparisons of a) different transformations which may be responsible for the variability in speech signals and of b) different ways of assembling template sets for training. The proposed layered system is an effort towards explaining the performance of recent deep learning networks and the principles by which the human auditory cortex might reduce the sample complexity of learning in speech recognition. Our theory and experiments suggest that invariant representations are crucial in learning from complex, real-world data like natural speech. Our model is built on basic computational primitives of cortical neurons, thus making an argument about how representations might be learned in the human auditory cortex.
}, keywords = {Hierarchy, Invariance, Neural Networks, Speech Representation}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2346.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {1140, title = {Speech Representations based on a Theory for Learning Invariances}, year = {2014}, month = {10/2014}, type = {poster presentation}, address = {SANE 2014 - Speech and Audio in the Northeast}, abstract = {Recognition of sounds and speech from a small number of labelled examples (like humans do), depends on the properties of the representation of the acoustic input. We formulate the problem of extracting robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain, that requires the memory-based, unsupervised storage of acoustic templates -- such as specific phones or words -- together with all the transformations of each that normally occur. A quasi-invariant representation for a speech signal can be obtained by projecting it to a number of template orbits, i.e., each one a set of transformed template signals, and computing the associated one-dimensional empirical probability distributions. The computations are perfomed by modules of filtering and pooling, that can be used for obtaining a mapping in single- or multilayer architectures. We consider several aspects of such representations including different signal scales (word vs. frame), input domains (raw waveforms vs. frequency filterbank responses), structures (shallow vs.\ multilayer/hierarchical), and ways of sampling from template orbit sets given a set of observations (explicit vs. learned). Preliminary empirical evaluations for learning to separate speech phones and words are given on TIMIT and subsets of TI-DIGITS.\
}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {221, title = {Word-level Invariant Representations From Acoustic Waveforms}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {Extracting discriminant, transformation-invariant features from raw audio signals remains a serious challenge for speech recognition. The issue of speaker variability is central to this problem, as changes in accent, dialect, gender, and age alter the sound waveform of speech units at multiple scales (phonemes, words, or phrases). Approaches for dealing with this variability have typically focused on analyzing the spectral properties of speech at the level of frames, on par with frame-level acoustic modeling usually applied to speech recognition systems. In this paper, we propose a framework for representing speech at the whole-word level and extracting features from the acoustic, temporal domain, without the need for spectral encoding or pre-processing. Leveraging recent work on unsupervised learning of invariant sensory representations, we extract a signature for a word by first projecting its raw waveform onto a set of templates and their transformations, and then forming empirical estimates of the resulting one-dimensional distributions via histograms. The representation and relevant parameters are evaluated for word classification on a series of datasets with increasing speaker-mismatch difficulty, and the results are compared to those of an MFCC-based representation.
}, keywords = {Invariance, Speech Representation, Theories for Intelligence}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2385.html}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} }