@article {2780, title = {Musings on Deep Learning: Properties of SGD}, year = {2017}, month = {04/2017}, abstract = {

[formerly titled "Theory of Deep Learning III: Generalization Properties of SGD"]

In Theory III we characterize, with a mix of theory and experiments, the generalization properties of Stochastic Gradient Descent in overparametrized deep convolutional networks. We show that Stochastic Gradient Descent (SGD) selects with high probability solutions that 1) have zero (or small) empirical error, 2) are degenerate, as shown in Theory II, and 3) have maximum generalization.

}, author = {Chiyuan Zhang and Qianli Liao and Alexander Rakhlin and Karthik Sridharan and Brando Miranda and Noah Golowich and Tomaso Poggio} } @article {3261, title = {Theory of Deep Learning IIb: Optimization Properties of SGD}, year = {2017}, month = {12/2017}, abstract = {

In Theory IIb we characterize, with a mix of theory and experiments, the optimization of deep convolutional networks by Stochastic Gradient Descent. The main new result in this paper is theoretical and experimental evidence for the following conjecture about SGD: SGD concentrates in probability {\textendash} like the classical Langevin equation {\textendash} on large-volume, {\textquotedblleft}flat{\textquotedblright} minima, selecting flat minimizers which are, with very high probability, also global minimizers.
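A minimal sketch of the standard Langevin heuristic behind this conjecture (the potential V, temperature T and Brownian motion B below are our notation, not notation from the paper): treating SGD on the training loss V(w) as the stochastic dynamics

\[ dw_t = -\nabla V(w_t)\, dt + \sqrt{2T}\, dB_t, \qquad p_\infty(w) \propto e^{-V(w)/T}, \]

the stationary Gibbs measure assigns a basin U mass proportional to \( \int_U e^{-V(w)/T}\, dw \), so among minima of comparable loss the large-volume ({\textquotedblleft}flat{\textquotedblright}) basins carry almost all of the probability.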

}, author = {Chiyuan Zhang and Qianli Liao and Alexander Rakhlin and Brando Miranda and Noah Golowich and Tomaso Poggio} } @conference {1142, title = {Discriminative Template Learning in Group-Convolutional Networks for Invariant Speech Representations}, booktitle = {INTERSPEECH-2015}, year = {2015}, month = {09/2015}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/i15_3229.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {1574, title = {Learning with a Wasserstein Loss}, booktitle = {Advances in Neural Information Processing Systems (NIPS 2015) 28}, year = {2015}, abstract = {

Learning to predict multi-label outputs is challenging, but in many problems there is a natural metric on the outputs that can be used to improve predictions. In this paper we develop a loss function for multi-label learning, based on the Wasserstein distance. The Wasserstein distance provides a natural notion of dissimilarity for probability measures. Although optimizing with respect to the exact Wasserstein distance is costly, recent work has described a regularized approximation that is efficiently computed. We describe an efficient learning algorithm based on this regularization, as well as a novel extension of the Wasserstein distance from probability measures to unnormalized measures. We also describe a statistical learning bound for the loss. The Wasserstein loss can encourage smoothness of the predictions with respect to a chosen metric on the output space. We demonstrate this property on a real-data tag prediction problem, using the Yahoo Flickr Creative Commons dataset, outperforming a baseline that doesn{\textquoteright}t use the metric.
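As a concrete illustration (a minimal sketch, not the implementation used in the paper), such an entropically regularized Wasserstein loss can be approximated with Sinkhorn iterations; the ground-cost matrix, tag set and hyperparameters below are invented for the example.

import numpy as np

def sinkhorn_wasserstein(p, q, M, reg=0.1, n_iters=200):
    """Approximate the Wasserstein loss between histograms p and q under ground cost M.

    p, q : 1-D nonnegative histograms over the label set, each summing to one.
    M    : (k, k) matrix of pairwise ground costs between labels.
    reg  : entropic regularization strength (larger = smoother and faster, less exact).
    """
    K = np.exp(-M / reg)                     # Gibbs kernel
    u = np.ones_like(p)
    for _ in range(n_iters):                 # alternating Sinkhorn scaling updates
        v = q / (K.T @ u)
        u = p / (K @ v)
    T = np.diag(u) @ K @ np.diag(v)          # approximate optimal transport plan
    return float(np.sum(T * M))              # transport cost = regularized Wasserstein loss

# Toy multi-label example: a predicted tag distribution scored against a one-hot
# ground truth; semantically close tags are cheap to confuse under M.
M = np.array([[0.0, 0.5, 2.0],
              [0.5, 0.0, 2.0],
              [2.0, 2.0, 0.0]])              # illustrative costs between three tags
pred  = np.array([0.6, 0.3, 0.1])
truth = np.array([0.0, 1.0, 0.0])
print(sinkhorn_wasserstein(pred, truth, M))  # small when predicted mass sits on nearby tags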

}, url = {http://arxiv.org/abs/1506.05439}, author = {Charlie Frogner and Chiyuan Zhang and Hossein Mobahi and Mauricio Araya-Polo and Tomaso Poggio} } @article {227, title = {A Deep Representation for Invariance And Music Classification}, number = {002}, year = {2014}, month = {03/2014}, abstract = {

Representations in the auditory cortex might be based on mechanisms similar to the visual ventral stream: modules for building invariance to transformations and multiple layers for compositionality and selectivity. In this paper we propose the use of such computational modules for extracting invariant and discriminative audio representations. Building on a theory of invariance in hierarchical architectures, we propose a novel, mid-level representation for acoustical signals, using the empirical distributions of projections on a set of templates and their transformations. Under the assumption that, by construction, this dictionary of templates is composed of similar classes, and samples the orbit of variance-inducing signal transformations (such as shift and scale), the resulting signature is theoretically guaranteed to be unique, invariant to transformations and stable to deformations. Modules of projection and pooling can then constitute layers of deep networks, for learning composite representations. We present the main theoretical and computational aspects of a framework for unsupervised learning of invariant audio representations, empirically evaluated on music genre classification.

}, keywords = {Audio Representation, Hierarchy, Invariance, Machine Learning, Theories for Intelligence}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @conference {1141, title = {A Deep Representation for Invariance and Music Classification}, booktitle = {ICASSP 2014 - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2014}, month = {05/04/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Florence, Italy}, keywords = {acoustic signal processing, signal representation, unsupervised learning}, doi = {10.1109/ICASSP.2014.6854954}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6854954}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @article {451, title = {Learning An Invariant Speech Representation}, number = {022}, year = {2014}, month = {06/2014}, abstract = {

Recognition of speech, and in particular the ability to generalize and learn from small sets of labelled examples as humans do, depends on an appropriate representation of the acoustic input. We formulate the problem of finding robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain and empirically evaluate its validity for voiced speech sound classification. Our version of the theory requires the memory-based, unsupervised storage of acoustic templates {\textemdash} such as specific phones or words {\textemdash} together with all the transformations of each that normally occur. A quasi-invariant representation for a speech segment can be obtained by projecting it onto each template orbit, i.e., the set of transformed signals, and computing the associated one-dimensional empirical probability distributions. The computations can be performed by modules of filtering and pooling, and extended to hierarchical architectures. In this paper, we apply a single-layer, multicomponent representation for phonemes and demonstrate improved accuracy and decreased sample complexity for vowel classification compared to standard spectral, cepstral and perceptual features.
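For concreteness, a minimal sketch (ours, not code from the paper) of this projection-and-pooling signature; the random templates, the circular-shift orbits and the histogram settings are illustrative assumptions.

import numpy as np

def orbit_signature(x, template_orbits, n_bins=20):
    """Quasi-invariant signature of a signal x.

    template_orbits : list of arrays, each of shape (n_transforms, len(x)),
                      i.e. one stored template together with its transformed versions.
    Returns the concatenated per-orbit histograms of normalized projections.
    """
    x = x / (np.linalg.norm(x) + 1e-12)
    parts = []
    for orbit in template_orbits:
        orbit = orbit / (np.linalg.norm(orbit, axis=1, keepdims=True) + 1e-12)
        proj = orbit @ x                                   # one projection per transformed template
        hist, _ = np.histogram(proj, bins=n_bins, range=(-1.0, 1.0), density=True)
        parts.append(hist)                                 # pooling: 1-D empirical distribution
    return np.concatenate(parts)

# Toy usage: random templates, with circular shifts standing in for the
# transformations (e.g. temporal shifts) that generate each orbit.
rng = np.random.default_rng(0)
templates = [rng.standard_normal(128) for _ in range(5)]
orbits = [np.stack([np.roll(t, s) for s in range(0, 128, 8)]) for t in templates]
x = rng.standard_normal(128)
print(orbit_signature(x, orbits).shape)                    # (5 * 20,) feature vector for a classifier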

}, keywords = {Theories for Intelligence}, author = {Georgios Evangelopoulos and Stephen Voinea and Chiyuan Zhang and Lorenzo Rosasco and Tomaso Poggio} } @conference {216, title = {Machine Learning Based Automated Fault Detection in Seismic Traces}, booktitle = {EAGE Conference and Exhibition 2014}, year = {2014}, month = {06/2014}, address = {The Netherlands}, abstract = {

Introduction:

The initial stages of velocity model building (VMB) start from smooth models that capture geological assumptions about the subsurface region under analysis. Acceptable velocity models result from successive iterations of human intervention (the interpreter) and seismic data processing within complex workflows. The interpreters ensure that any additions or corrections made by seismic processing are compliant with geological and geophysical knowledge. The information that seismic processing adds to the model consists of structural elements; faults are among the most relevant of these, since they can signal reservoir boundaries or hydrocarbon traps. Faults are excluded from the initial models due to their local scale. Bringing faults into the model at early stages can help steer the VMB process.

This work introduces a tool intended to assist interpreters during the initial stages of VMB, when no seismic data has yet been migrated. Our novel method is based on machine learning techniques and can automatically identify and localize faults in unmigrated seismic data. Comprehensive research has targeted the fault localization problem, but most of the results are obtained using processed seismic data or images as input (Admasu and Toennies, 2004; Tingdahl et al., 2001; Cohen et al., 2006; Hale, 2013). Our approach thus offers an additional tool that can be used to speed up the VMB process.

Fully automated VMB has not been achieved because the human factor is difficult to formalize in a way that can be systematically applied. Nonetheless, if our framework is extended to other seismic events or attributes, it might become a powerful tool to alleviate interpreters{\textquoteright} work.

}, url = {http://cbcl.mit.edu/publications/eage14.pdf}, author = {Chiyuan Zhang and Charlie Frogner and Mauricio Araya-Polo and Detlef Hohl} } @conference {220, title = {Phone Classification by a Hierarchy of Invariant Representation Layers}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {

We propose a multi-layer feature extraction framework for speech, capable of providing invariant representations. A set of templates is generated by sampling the result of applying smooth, identity-preserving transformations (such as vocal tract length and tempo variations) to arbitrarily-selected speech signals. Templates are then stored as the weights of {\textquotedblleft}neurons{\textquotedblright}. We use a cascade of such computational modules to factor out different types of transformation variability in a hierarchy, and show that it improves phone classification over baseline features. In addition, we describe empirical comparisons of a) different transformations which may be responsible for the variability in speech signals and of b) different ways of assembling template sets for training. The proposed layered system is an effort towards explaining the performance of recent deep learning networks and the principles by which the human auditory cortex might reduce the sample complexity of learning in speech recognition. Our theory and experiments suggest that invariant representations are crucial in learning from complex, real-world data like natural speech. Our model is built on basic computational primitives of cortical neurons, thus making an argument about how representations might be learned in the human auditory cortex.
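To make the cascading concrete, here is a minimal sketch (our illustration, with invented template orbits) of stacking such projection-and-pooling modules so that each layer pools over one family of transformations and feeds its histogram output to the next layer.

import numpy as np

def invariance_layer(x, orbit, n_bins=16):
    """One projection-and-pooling module; orbit has shape (n_transforms, dim)."""
    orbit = orbit / (np.linalg.norm(orbit, axis=1, keepdims=True) + 1e-12)
    proj = orbit @ (x / (np.linalg.norm(x) + 1e-12))       # projections onto the orbit
    hist, _ = np.histogram(proj, bins=n_bins, range=(-1.0, 1.0), density=True)
    return hist                                            # pooled, quasi-invariant output

def cascade(x, orbits_per_layer):
    """Apply layers in sequence; the templates of layer k live in the output space of layer k-1."""
    out = x
    for orbit in orbits_per_layer:
        out = invariance_layer(out, orbit)
    return out

rng = np.random.default_rng(1)
layer1 = rng.standard_normal((12, 256))   # e.g. tempo-transformed templates over raw frames
layer2 = rng.standard_normal((12, 16))    # e.g. vocal-tract-length templates over layer-1 outputs
print(cascade(rng.standard_normal(256), [layer1, layer2]).shape)   # (16,)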

}, keywords = {Hierarchy, Invariance, Neural Networks, Speech Representation}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2346.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {1140, title = {Speech Representations based on a Theory for Learning Invariances}, year = {2014}, month = {10/2014}, type = {poster presentation}, address = {SANE 2014 - Speech and Audio in the Northeast}, abstract = {

Recognition of sounds and speech from a small number of labelled examples (as humans do) depends on the properties of the representation of the acoustic input. We formulate the problem of extracting robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain, which requires the memory-based, unsupervised storage of acoustic templates -- such as specific phones or words -- together with all the transformations of each that normally occur. A quasi-invariant representation for a speech signal can be obtained by projecting it onto a number of template orbits, i.e., each one a set of transformed template signals, and computing the associated one-dimensional empirical probability distributions. The computations are performed by modules of filtering and pooling that can be used for obtaining a mapping in single- or multi-layer architectures. We consider several aspects of such representations, including different signal scales (word vs. frame), input domains (raw waveforms vs. frequency filterbank responses), structures (shallow vs. multi-layer/hierarchical), and ways of sampling from template orbit sets given a set of observations (explicit vs. learned). Preliminary empirical evaluations for learning to separate speech phones and words are given on TIMIT and subsets of TI-DIGITS.

}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {221, title = {Word-level Invariant Representations From Acoustic Waveforms}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {

Extracting discriminant, transformation-invariant features from raw audio signals remains a serious challenge for speech recognition. The issue of speaker variability is central to this problem, as changes in accent, dialect, gender, and age alter the sound waveform of speech units at multiple scales (phonemes, words, or phrases). Approaches for dealing with this variability have typically focused on analyzing the spectral properties of speech at the level of frames, on par with frame-level acoustic modeling usually applied to speech recognition systems. In this paper, we propose a framework for representing speech at the whole-word level and extracting features from the acoustic, temporal domain, without the need for spectral encoding or pre-processing. Leveraging recent work on unsupervised learning of invariant sensory representations, we extract a signature for a word by first projecting its raw waveform onto a set of templates and their transformations, and then forming empirical estimates of the resulting one-dimensional distributions via histograms. The representation and relevant parameters are evaluated for word classification on a series of datasets with increasing speaker-mismatch difficulty, and the results are compared to those of an MFCC-based representation.

}, keywords = {Invariance, Speech Representation, Theories for Intelligence}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2385.html}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} }