@article {5466, title = {The Janus effects of SGD vs GD: high noise and low rank}, number = {144}, year = {2023}, month = {12/2024}, abstract = {

It was always obvious that SGD with small minibatch size yields much higher asymptotic fluctuations in the updates of the weight matrices of neural networks than GD. It has also often been reported that SGD in deep ReLU networks empirically shows a low-rank bias in the weight matrices. A recent theoretical analysis derived a bound on the rank and linked it to the size of the SGD fluctuations [25]. In this paper, we provide an empirical and theoretical analysis of the convergence of SGD vs GD, first for deep ReLU networks and then for the case of linear regression, where sharper estimates can be obtained and which is of independent interest. In the linear case, we prove that the component $W^\perp$ of the matrix $W$ corresponding to the null space of the data matrix $X$ converges to zero for both SGD and GD, provided the regularization term is non-zero. Because of the larger number of updates required to go through all the training data, the convergence rate {\it per epoch} of these components is much faster for SGD than for GD. In practice, SGD has a much stronger bias than GD towards solutions for weight matrices $W$ with high fluctuations -- even when the choice of minibatches is deterministic -- and low rank, provided the initialization is from a random matrix. Thus SGD with non-zero regularization, unlike GD, shows the coupled phenomena of asymptotic noise and a low-rank bias.
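A minimal numerical sketch of the linear-regression setting discussed above (our illustration, not the memo's experiments): regularized least squares trained with full-batch GD and with deterministic-minibatch SGD, tracking the decay of the null-space component of $W$ and the size of the late-stage fluctuations. Dimensions, step size and the projector construction are all illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, k = 20, 50, 5              # n samples, d features (d > n, so X has a null space), k outputs
X = rng.standard_normal((n, d))
Y = rng.standard_normal((n, k))
lam, lr, epochs, batch = 0.1, 0.03, 2000, 4

# Projector onto the null space of the data matrix X
P_null = np.eye(d) - np.linalg.pinv(X) @ X

def grad(W, Xb, Yb):
    # gradient of 0.5*||Xb W - Yb||_F^2 / |b| + 0.5*lam*||W||_F^2
    return Xb.T @ (Xb @ W - Yb) / len(Xb) + lam * W

def run(use_sgd):
    W = rng.standard_normal((d, k))
    snapshots = []                                    # per-update iterates near convergence
    for ep in range(epochs):
        starts = range(0, n, batch) if use_sgd else [None]
        for i in starts:                              # deterministic minibatch order
            Xb, Yb = (X, Y) if i is None else (X[i:i + batch], Y[i:i + batch])
            W -= lr * grad(W, Xb, Yb)
            if ep >= epochs - 50:
                snapshots.append(W.copy())
    fluct = np.linalg.norm(np.std(snapshots, axis=0))
    return np.linalg.norm(P_null @ W), fluct

for name, use_sgd in [("GD ", False), ("SGD", True)]:
    perp, fluct = run(use_sgd)
    print(f"{name}  ||P_null W|| = {perp:.2e}   asymptotic fluctuation = {fluct:.2e}")
```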

}, author = {Mengjia Xu and Tomer Galanti and Akshay Rangamani and Lorenzo Rosasco and Andrea Pinto and Tomaso Poggio} } @article {5300, title = {Scalable Causal Discovery with Score Matching}, year = {2022}, month = {09/2022}, abstract = {

This paper demonstrates how to discover the whole causal graph from the second derivative of the log-likelihood in observational non-linear additive Gaussian noise models. Leveraging scalable machine learning approaches to approximate the score function, we extend the work of Rolland et al., 2022, which only recovers the topological order from the score and requires an expensive pruning step to discover the edges. Our analysis leads to DAS, a practical algorithm that reduces the complexity of the pruning by a factor proportional to the graph size. In practice, DAS achieves accuracy competitive with the current state of the art while being over an order of magnitude faster. Overall, our approach enables principled and scalable causal discovery, significantly lowering the compute bar.

}, url = {https://openreview.net/forum?id=v56PHv_W2A}, author = {Francesco Montagna and Nicoletta Noceti and Lorenzo Rosasco and Kun Zhang and Francesco Locatello} } @article {4568, title = {For interpolating kernel machines, the minimum norm ERM solution is the most stable}, number = {108}, year = {2020}, month = {06/2020}, abstract = {

We study the average CVloo stability of kernel ridge-less regression and derive corresponding risk bounds. We show that the interpolating solution with minimum norm has the best CVloo stability, which in turn is controlled by the condition number of the empirical kernel matrix. The latter can be characterized in the asymptotic regime where both the dimension and cardinality of the data go to infinity. Under the assumption of random kernel matrices, the corresponding test error follows a double descent curve.
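A small sketch of the quantities studied above (illustrative only, not the memo's code): the minimum-norm interpolating solution of kernel ridgeless regression via the pseudoinverse of the empirical kernel matrix, its condition number, and a naive leave-one-out (CVloo) error computed by refitting. The Gaussian kernel and synthetic data are assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d = 40, 3
X = rng.standard_normal((n, d))
y = np.sin(X[:, 0]) + 0.1 * rng.standard_normal(n)

def gaussian_kernel(A, B, sigma=1.0):
    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-sq / (2 * sigma ** 2))

K = gaussian_kernel(X, X)
# Minimum-norm interpolant: c = K^+ y (no ridge term)
c = np.linalg.pinv(K) @ y
print("condition number of K:", np.linalg.cond(K))
print("training error:", np.mean((K @ c - y) ** 2))   # ~0: the solution interpolates

# Naive leave-one-out (CV_loo) estimate by refitting n times
loo = []
for i in range(n):
    idx = np.delete(np.arange(n), i)
    ci = np.linalg.pinv(K[np.ix_(idx, idx)]) @ y[idx]
    loo.append((K[i, idx] @ ci - y[i]) ** 2)
print("CV_loo error:", np.mean(loo))
```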

}, author = {Akshay Rangamani and Lorenzo Rosasco and Tomaso Poggio} } @proceedings {4385, title = {Beating SGD Saturation with Tail-Averaging and Minibatching}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

While stochastic gradient descent (SGD) is one of the major workhorses in machine learning, the learning properties of many practically used variants are still poorly understood. In this paper, we consider least squares learning in a nonparametric setting and contribute to filling this gap by focusing on the effect and interplay of multiple passes, mini-batching and averaging, in particular tail averaging. Our results show how these different variants of SGD can be combined to achieve optimal learning rates, also providing practical insights. A novel key result is that tail averaging allows faster convergence rates than uniform averaging in the nonparametric setting. Further, we show that a combination of tail-averaging and minibatching allows more aggressive step-size choices than using any one of said components.
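The toy sketch below contrasts uniform averaging of SGD iterates with tail averaging (averaging only the last fraction of iterates) for minibatch SGD on least squares; it only illustrates the variants compared above, with all constants (batch size, step size, tail fraction) chosen arbitrarily.

```python
import numpy as np

rng = np.random.default_rng(1)
n, d = 500, 20
w_star = rng.standard_normal(d)
X = rng.standard_normal((n, d))
y = X @ w_star + 0.5 * rng.standard_normal(n)

def sgd(passes=20, batch=10, lr=0.05, tail_frac=0.5):
    w = np.zeros(d)
    iterates = []
    for _ in range(passes):
        perm = rng.permutation(n)
        for i in range(0, n, batch):
            idx = perm[i:i + batch]
            g = X[idx].T @ (X[idx] @ w - y[idx]) / len(idx)   # minibatch gradient
            w -= lr * g
            iterates.append(w.copy())
    uniform_avg = np.mean(iterates, axis=0)
    tail_avg = np.mean(iterates[int(len(iterates) * (1 - tail_frac)):], axis=0)
    return uniform_avg, tail_avg

uniform_avg, tail_avg = sgd()
for name, w in [("uniform averaging", uniform_avg), ("tail averaging", tail_avg)]:
    print(f"{name:18s} excess risk ~ {np.mean((X @ (w - w_star)) ** 2):.4f}")
```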

}, author = {Nicole Muecke and Gergely Neu and Lorenzo Rosasco} } @conference {4516, title = {Dynamics \& Generalization in Deep Networks - Minimizing the Norm}, booktitle = {NAS Sackler Colloquium on Science of Deep Learning}, year = {2019}, month = {03/2019}, address = {Washington D.C.}, author = {Andrzej Banburski and Qianli Liao and Brando Miranda and Lorenzo Rosasco and Jack Hidary and Tomaso Poggio} } @proceedings {4386, title = {Implicit Regularization of Accelerated Methods in Hilbert Spaces}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We study learning properties of accelerated gradient descent methods for linear least-squares in Hilbert spaces. We analyze the implicit regularization properties of Nesterov acceleration and a variant of heavy-ball in terms of corresponding learning error bounds. Our results show that acceleration can provide faster bias decay than gradient descent, but also suffers from more unstable behavior. As a result, acceleration cannot in general be expected to improve learning accuracy with respect to gradient descent, but rather to achieve the same accuracy with reduced computations. Our theoretical results are validated by numerical simulations. Our analysis is based on studying suitable polynomials induced by the accelerated dynamics and combining spectral techniques with concentration inequalities.
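A sketch of the setting above for a finite-dimensional least-squares problem (our illustration, not the paper's algorithm): Nesterov-accelerated iterations vs. plain gradient descent, with the number of iterations acting as the implicit regularization parameter. Step size and data are arbitrary assumptions.

```python
import numpy as np

rng = np.random.default_rng(2)
n, d = 200, 50
X = rng.standard_normal((n, d))
w_star = rng.standard_normal(d)
y = X @ w_star + rng.standard_normal(n)

lr = n / np.linalg.norm(X, 2) ** 2          # safe step for the (1/2n)||Xw - y||^2 loss

def grad(w):
    return X.T @ (X @ w - y) / n

def gd(T):
    w = np.zeros(d)
    for _ in range(T):
        w -= lr * grad(w)
    return w

def nesterov(T):
    w, v = np.zeros(d), np.zeros(d)
    for t in range(1, T + 1):
        w_new = v - lr * grad(v)
        v = w_new + (t - 1) / (t + 2) * (w_new - w)   # momentum extrapolation
        w = w_new
    return w

for T in (10, 100, 1000):
    for name, algo in (("GD      ", gd), ("Nesterov", nesterov)):
        w = algo(T)
        print(f"T={T:5d} {name} estimation error {np.linalg.norm(w - w_star):.3f}")
```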

}, author = {Nicol{\`o} Pagliana and Lorenzo Rosasco} } @conference {4517, title = {Weight and Batch Normalization implement Classical Generalization Bounds }, booktitle = {ICML}, year = {2019}, month = {06/2019}, address = {Long Beach/California}, author = {Andrzej Banburski and Qianli Liao and Brando Miranda and Lorenzo Rosasco and Jack Hidary and Tomaso Poggio} } @article {3694, title = {Theory III: Dynamics and Generalization in Deep Networks}, year = {2018}, month = {06/2018}, abstract = {

The key to generalization is controlling the complexity of the network. However, there is no obvious control of complexity -- such as an explicit regularization term -- in the training of deep networks for classification. We will show that a classical form of norm control -- but of a hidden kind -- is present in deep networks trained with gradient descent techniques on exponential-type losses. In particular, gradient descent induces a dynamics of the normalized weights which converges for $t \to \infty$ to an equilibrium which corresponds to a minimum norm (or maximum margin) solution. For sufficiently large but finite $\rho$ -- and thus finite $t$ -- the dynamics converges to one of several margin maximizers, with the margin monotonically increasing towards a limit stationary point of the flow. In the usual case of stochastic gradient descent, most of the stationary points are likely to be convex minima corresponding to a regularized, constrained minimizer -- the network with normalized weights -- which is stable and has zero generalization gap asymptotically for $N \to \infty$, where $N$ is the number of training examples. For finite, fixed $N$ the generalization gap may not be zero, but the minimum norm property of the solution can provide, we conjecture, good expected performance for suitable data distributions. Our approach extends some of the results of Srebro from linear networks to deep networks and provides a new perspective on the implicit bias of gradient descent. We believe that the elusive complexity control we describe is responsible for the puzzling empirical finding of good predictive performance by deep networks, despite overparametrization.
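A toy sketch of the dynamics described above, restricted for simplicity to a linear classifier rather than a deep network (so only an analogy to the memo's setting): gradient descent on the exponential loss over separable data, where the weight norm diverges while the margin of the normalized weights increases monotonically. All constants are illustrative.

```python
import numpy as np

rng = np.random.default_rng(3)
n, d = 100, 5
w_true = rng.standard_normal(d)
X = rng.standard_normal((n, d))
y = np.sign(X @ w_true)                       # linearly separable labels

w, lr = 0.01 * rng.standard_normal(d), 0.1
for t in range(1, 100001):
    margins = y * (X @ w)
    g = -(X * (y * np.exp(-margins))[:, None]).sum(0) / n    # exponential-loss gradient
    w -= lr * g
    if t % 25000 == 0:
        w_hat = w / np.linalg.norm(w)                        # normalized weights
        print(f"iter {t:6d}  ||w|| = {np.linalg.norm(w):6.2f}  "
              f"min margin of normalized w = {(y * (X @ w_hat)).min():.4f}")
```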

}, author = {Andrzej Banburski and Qianli Liao and Brando Miranda and Tomaso Poggio and Lorenzo Rosasco and Jack Hidary and Fernanda De La Torre} } @inbook {2562, title = {Invariant Recognition Predicts Tuning of Neurons in Sensory Cortex}, booktitle = {Computational and Cognitive Neuroscience of Vision}, year = {2017}, pages = {85-104}, publisher = {Springer}, organization = {Springer}, issn = {978-981-10-0211-3}, author = {Jim Mutch and F. Anselmi and Andrea Tacchetti and Lorenzo Rosasco and JZ. Leibo and Tomaso Poggio} } @article {2900, title = {Symmetry Regularization}, number = {063}, year = {2017}, month = {05/2017}, abstract = {

The properties of a representation, such as smoothness, adaptability, generality, equivariance/invariance, depend on restrictions imposed during learning. In this paper, we propose using data symmetries, in the sense of equivalences under transformations, as a means for learning symmetry-adapted representations, i.e., representations that are equivariant to transformations in the original space. We provide a sufficient condition to enforce the representation, for example the weights of a neural network layer or the atoms of a dictionary, to have a group structure and specifically the group structure in an unlabeled training set. By reducing the analysis of generic group symmetries to permutation symmetries, we devise an analytic expression for a regularization scheme and a permutation invariant metric on the representation space. Our work provides a proof of concept on why and how to learn equivariant representations, without explicit knowledge of the underlying symmetries in the data.

}, author = {F. Anselmi and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {3266, title = {Theory of Deep Learning III: explaining the non-overfitting puzzle}, year = {2017}, month = {12/2017}, abstract = {

THIS MEMO IS REPLACED BY CBMM MEMO 90

A main puzzle of deep networks revolves around the absence of overfitting despite overparametrization and despite the large capacity demonstrated by zero training error on randomly labeled data. In this note, we show that the dynamical systems associated with gradient descent minimization of nonlinear networks behave near zero stable minima of the empirical error as a gradient system in a quadratic potential with degenerate Hessian. The proposition is supported by theoretical and numerical results, under the assumption of stable minima of the gradient.

Our proposition provides the extension to deep networks of key properties of gradient descent methods for linear networks that, as suggested in (1), can be the key to understanding generalization. Gradient descent enforces a form of implicit regularization controlled by the number of iterations, asymptotically converging to the minimum norm solution. This implies that there is usually an optimum early stopping that avoids overfitting of the loss (this is relevant mainly for regression). For classification, the asymptotic convergence to the minimum norm solution implies convergence to the maximum margin solution, which guarantees good classification error for {\textquotedblleft}low noise{\textquotedblright} datasets.

The implied robustness to overparametrization has suggestive implications for the robustness of deep hierarchically local networks to variations of the architecture with respect to the curse of dimensionality.

}, author = {Tomaso Poggio and Kenji Kawaguchi and Qianli Liao and Brando Miranda and Lorenzo Rosasco and Xavier Boix and Jack Hidary and Hrushikesh Mhaskar} } @article {2557, title = {Why and when can deep-but not shallow-networks avoid the curse of dimensionality: A review}, journal = {International Journal of Automation and Computing}, year = {2017}, month = {03/2017}, pages = {1-17}, abstract = {

The paper reviews and extends an emerging body of theoretical results on deep learning, including the conditions under which it can be exponentially better than shallow learning. A class of deep convolutional networks represents an important special case of these conditions, though weight sharing is not the main reason for their exponential advantage. Implications of a few key theorems are discussed, together with new results, open problems and conjectures.

}, keywords = {convolutional neural networks, deep and shallow networks, deep learning, function approximation, Machine Learning, Neural Networks}, doi = {10.1007/s11633-017-1054-2}, url = {http://link.springer.com/article/10.1007/s11633-017-1054-2?wt_mc=Internal.Event.1.SEM.ArticleAuthorOnlineFirst}, author = {Tomaso Poggio and Hrushikesh Mhaskar and Lorenzo Rosasco and Brando Miranda and Qianli Liao} } @conference {1813, title = {Holographic Embeddings of Knowledge Graphs}, booktitle = {Thirtieth AAAI Conference on Artificial Intelligence (AAAI-16)}, year = {2016}, address = {Phoenix, Arizona, USA}, abstract = {

Learning embeddings of entities and relations is an efficient and versatile method to perform machine learning on relational data such as knowledge graphs. In this work, we propose holographic embeddings (HolE) to learn compositional vector space representations of entire knowledge graphs. The proposed method is related to holographic models of associative memory in that it employs circular correlation to create compositional representations. By using correlation as the compositional operator, HolE can capture rich interactions but simultaneously remains efficient to compute, easy to train, and scalable to very large datasets. In extensive experiments we show that holographic embeddings are able to outperform state-of-the-art methods for link prediction in knowledge graphs and relational learning benchmark datasets.
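As a concrete illustration of the compositional operator mentioned above (a sketch in our own notation, not the authors' implementation): circular correlation of two embeddings can be computed in O(d log d) with the FFT, and a triple (s, p, o) is scored by a sigmoid of the dot product between the relation embedding and the correlated entity pair.

```python
import numpy as np

def circular_correlation(a, b):
    """[a * b]_k = sum_i a_i b_{(k+i) mod d}, computed via FFT in O(d log d)."""
    return np.real(np.fft.ifft(np.conj(np.fft.fft(a)) * np.fft.fft(b)))

def hole_score(e_s, e_o, r_p):
    """Score of a triple (s, p, o): sigmoid of r_p . (e_s * e_o)."""
    return 1.0 / (1.0 + np.exp(-r_p @ circular_correlation(e_s, e_o)))

rng = np.random.default_rng(0)
d = 8
e_s, e_o, r_p = (rng.standard_normal(d) for _ in range(3))
print("score(s, p, o):", hole_score(e_s, e_o, r_p))
# Circular correlation is non-commutative, so subject and object roles differ:
print("score(o, p, s):", hole_score(e_o, e_s, r_p))
```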

}, author = {Maximilian Nickel and Lorenzo Rosasco and Tomaso Poggio} } @article {2098, title = {On invariance and selectivity in representation learning}, journal = {Information and Inference: A Journal of the IMA}, year = {2016}, month = {05/2016}, pages = {iaw009}, abstract = {

We study the problem of learning from data representations that are invariant to transformations, and at the same time selective, in the sense that two points have the same representation only if one is the transformation of the other. The mathematical results here sharpen some of the key claims of i-theory{\textemdash}a recent theory of feedforward processing in sensory cortex (Anselmi et al., 2013, Theor. Comput. Sci. and arXiv:1311.4158; Anselmi et al., 2013, Magic materials: a theory of deep hierarchical architectures for learning sensory representations. CBCL Paper; Anselmi \& Poggio, 2010, Representation learning in sensory cortex: a theory. CBMM Memo No. 26).

}, issn = {2049-8764}, doi = {10.1093/imaiai/iaw009}, url = {http://imaiai.oxfordjournals.org/lookup/doi/10.1093/imaiai/iaw009}, author = {F. Anselmi and Lorenzo Rosasco and Tomaso Poggio} } @article {2321, title = {Theory I: Why and When Can Deep Networks Avoid the Curse of Dimensionality?}, year = {2016}, month = {11/2016}, abstract = {

[formerly titled "Why and When Can Deep - but Not Shallow - Networks Avoid the Curse of Dimensionality: a Review"]

The paper reviews and extends an emerging body of theoretical results on deep learning, including the conditions under which it can be exponentially better than shallow learning. A class of deep convolutional networks represents an important special case of these conditions, though weight sharing is not the main reason for their exponential advantage. Implications of a few key theorems are discussed, together with new results, open problems and conjectures.

}, author = {Tomaso Poggio and Hrushikesh Mhaskar and Lorenzo Rosasco and Brando Miranda and Qianli Liao} } @article {1371, title = {Deep Convolutional Networks are Hierarchical Kernel Machines}, number = {035}, year = {2015}, month = {06/17/2015}, abstract = {

We extend i-theory to incorporate not only pooling but also rectifying nonlinearities in an extended HW module (eHW) designed for supervised learning. The two operations roughly correspond to invariance and selectivity, respectively. Under the assumption of normalized inputs, we show that appropriate linear combinations of rectifying nonlinearities are equivalent to radial kernels. If pooling is present, an equivalent kernel also exists. Thus present-day DCNs (Deep Convolutional Networks) can be exactly equivalent to a hierarchy of kernel machines with pooling and non-pooling layers. Finally, we describe a conjecture for theoretically understanding hierarchies of such modules. A main consequence of the conjecture is that hierarchies of eHW modules minimize memory requirements while computing a selective and invariant representation.

}, author = {F. Anselmi and Lorenzo Rosasco and Cheston Tan and Tomaso Poggio} } @conference {1142, title = {Discriminative Template Learning in Group-Convolutional Networks for Invariant Speech Representations}, booktitle = {INTERSPEECH-2015}, year = {2015}, month = {09/2015}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/i15_3229.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {1508, title = {Holographic Embeddings of Knowledge Graphs}, year = {2015}, month = {11/16/2015}, abstract = {

Learning embeddings of entities and relations is an efficient and versatile method to perform machine learning on relational data such as knowledge graphs. In this work, we propose holographic embeddings (HolE) to learn compositional vector space representations of entire knowledge graphs. The proposed method is related to holographic models of associative memory in that it employs circular correlation to create compositional representations. By using correlation as the compositional operator, HolE can capture rich interactions but simultaneously remains efficient to compute, easy to train, and scalable to very large datasets. In extensive experiments we show that holographic embeddings are able to outperform state-of-the-art methods for link prediction in knowledge graphs and relational learning benchmark datasets.

}, keywords = {Associative Memory, Knowledge Graph, Machine Learning}, author = {Maximilian Nickel and Lorenzo Rosasco and Tomaso Poggio} } @article {695, title = {On Invariance and Selectivity in Representation Learning}, number = {029}, year = {2015}, month = {03/23/2015}, abstract = {

We discuss data representations which can be learned automatically from data, are invariant to transformations, and at the same time selective, in the sense that two points have the same representation only if one is the transformation of the other. The mathematical results here sharpen some of the key claims of i-theory, a recent theory of feedforward processing in sensory cortex.

}, author = {F. Anselmi and Lorenzo Rosasco and Tomaso Poggio} } @article {1588, title = {I-theory on depth vs width: hierarchical function composition}, year = {2015}, month = {12/29/2015}, abstract = {

Deep learning networks with convolution, pooling and subsampling are a special case of hierarchical architectures, which can be represented by trees (such as binary trees). Hierarchical as well as shallow networks can approximate functions of several variables, in particular those that are compositions of low dimensional functions. We show that the power of a deep network architecture with respect to a shallow network is rather independent of the specific nonlinear operations in the network and depends instead on the behavior of the VC-dimension. A shallow network can approximate compositional functions with the same error as a deep network but at the cost of a VC-dimension that is exponential instead of quadratic in the dimensionality of the function. To complete the argument, we argue that there exist visual computations that are intrinsically compositional. In particular, we prove that recognition invariant to translation cannot be computed by shallow networks in the presence of clutter. Finally, a general framework that includes the compositional case is sketched. The key condition that allows tall, thin networks to be nicer than short, fat networks is that the target input-output function must be sparse in a certain technical sense.

}, author = {Tomaso Poggio and F. Anselmi and Lorenzo Rosasco} } @conference {1573, title = {Learning with incremental iterative regularization}, booktitle = {NIPS 2015}, year = {2015}, abstract = {

Within a statistical learning setting, we propose and study an iterative regularization algorithm for least squares defined by an incremental gradient method. In particular, we show that, if all other parameters are fixed a priori, the number of passes over the data (epochs) acts as a regularization parameter, and prove strong universal consistency, i.e. almost sure convergence of the risk, as well as sharp finite sample bounds for the iterates. Our results are a step towards understanding the effect of multiple epochs in stochastic gradient techniques in machine learning and rely on integrating statistical and optimization results.
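A toy illustration of the mechanism above (a sketch under arbitrary assumptions, not the paper's analysis): an incremental (cyclic) gradient pass over the data for least squares with no explicit penalty, where the number of passes acts as the regularization parameter, so validation error typically has an optimal stopping epoch.

```python
import numpy as np

rng = np.random.default_rng(4)
n, d, sigma = 100, 80, 1.0                 # few samples, many features, noisy labels
w_star = rng.standard_normal(d) / np.sqrt(d)
X, Xval = rng.standard_normal((n, d)), rng.standard_normal((1000, d))
y = X @ w_star + sigma * rng.standard_normal(n)
yval = Xval @ w_star

w, lr = np.zeros(d), 0.01
for epoch in range(1, 201):
    for i in range(n):                     # one incremental (cyclic) pass over the data
        w -= lr * (X[i] @ w - y[i]) * X[i]
    if epoch in (1, 5, 20, 50, 100, 200):
        val = np.mean((Xval @ w - yval) ** 2)
        print(f"epoch {epoch:3d}  validation MSE = {val:.3f}")
```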

}, url = {https://papers.nips.cc/paper/6015-learning-with-incremental-iterative-regularization}, author = {Lorenzo Rosasco and Villa, Silvia} } @conference {1572, title = {Less is More: Nystr{\"o}m Computational Regularization}, booktitle = {NIPS 2015}, year = {2015}, abstract = {

We study Nystr{\"o}m type subsampling approaches to large scale kernel methods, and prove learning bounds in the statistical learning setting, where random sampling and high probability estimates are considered. In particular, we prove that these approaches can achieve optimal learning bounds, provided the subsampling level is suitably chosen. These results suggest a simple incremental variant of Nystr{\"o}m Kernel Regularized Least Squares, where the subsampling level implements a form of computational regularization, in the sense that it controls at the same time regularization and computations. Extensive experimental analysis shows that the considered approach achieves state-of-the-art performance on benchmark large scale datasets.
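A compact sketch of plain Nystr{\"o}m kernel regularized least squares (an illustrative implementation, not the incremental variant proposed in the paper): m landmark points are subsampled and the solution is restricted to their span, so the subsampling level m controls computations and, together with the ridge parameter, regularization.

```python
import numpy as np

rng = np.random.default_rng(5)
n, m, lam = 1000, 50, 1e-3                      # n training points, m Nystrom landmarks
X = rng.uniform(-3, 3, size=(n, 1))
y = np.sin(X[:, 0]) + 0.3 * rng.standard_normal(n)

def kernel(A, B, sigma=0.5):
    return np.exp(-((A[:, None, 0] - B[None, :, 0]) ** 2) / (2 * sigma ** 2))

landmarks = X[rng.choice(n, size=m, replace=False)]
Knm = kernel(X, landmarks)                      # n x m
Kmm = kernel(landmarks, landmarks)              # m x m
# Solve (Knm^T Knm + n*lam*Kmm) a = Knm^T y  -- cost O(n m^2) instead of O(n^3)
a = np.linalg.solve(Knm.T @ Knm + n * lam * Kmm + 1e-10 * np.eye(m), Knm.T @ y)

Xtest = np.linspace(-3, 3, 200)[:, None]
pred = kernel(Xtest, landmarks) @ a
print("test MSE vs. noiseless target:", np.mean((pred - np.sin(Xtest[:, 0])) ** 2))
```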

}, url = {https://papers.nips.cc/paper/5936-less-is-more-nystrom-computational-regularization}, author = {Alessandro Rudi and Raffaello Camoriano and Lorenzo Rosasco} } @article {1439, title = {Notes on Hierarchical Splines, DCLNs and i-theory}, year = {2015}, abstract = {

We define an extension of classical additive splines for multivariate function approximation that we call hierarchical splines. We show that the case of hierarchical, additive, piece-wise linear splines includes present-day Deep Convolutional Learning Networks (DCLNs) with linear rectifiers and pooling (sum or max). We discuss how these observations together with i-theory may provide a framework for a general theory of deep networks.

}, author = {Tomaso Poggio and Lorenzo Rosasco and Amnon Shashua and Nadav Cohen and F. Anselmi} } @article {1415, title = {Unsupervised learning of invariant representations}, journal = {Theoretical Computer Science}, year = {2015}, month = {06/25/2015}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity. We consider the case of visual object recognition, though the theory also applies to other domains like speech. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translation, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and selective signature can be computed for each image or image patch: the invariance can be exact in the case of group transformations and approximate under non-group transformations. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such a signature. The theory offers novel unsupervised learning algorithms for {\textquotedblleft}deep{\textquotedblright} architectures for image and speech recognition. We conjecture that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and selective for recognition{\textemdash}and show how this representation may be continuously learned in an unsupervised way during development and visual experience.

}, keywords = {convolutional networks, Cortex, Hierarchy, Invariance}, doi = {10.1016/j.tcs.2015.06.048}, url = {http://www.sciencedirect.com/science/article/pii/S0304397515005587}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @article {227, title = {A Deep Representation for Invariance And Music Classification}, number = {002}, year = {2014}, month = {03/2014}, abstract = {

Representations in the auditory cortex might be based on mechanisms similar to the visual ventral stream; modules for building invariance to transformations and multiple layers for compositionality and selectivity. In this paper we propose the use of such computational modules for extracting invariant and discriminative audio representations. Building on a theory of invariance in hierarchical architectures, we propose a novel, mid-level representation for acoustical signals, using the empirical distributions of projections on a set of templates and their transformations. Under the assumption that, by construction, this dictionary of templates is composed from similar classes, and samples the orbit of variance-inducing signal transformations (such as shift and scale), the resulting signature is theoretically guaranteed to be unique, invariant to transformations and stable to deformations. Modules of projection and pooling can then constitute layers of deep networks, for learning composite representations. We present the main theoretical and computational aspects of a framework for unsupervised learning of invariant audio representations, empirically evaluated on music genre classification.

}, keywords = {Audio Representation, Hierarchy, Invariance, Machine Learning, Theories for Intelligence}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @conference {1141, title = {A Deep Representation for Invariance and Music Classification}, booktitle = {ICASSP 2014 - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2014}, month = {05/04/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Florence, Italy}, keywords = {acoustic signal processing, signal representation, unsupervised learning}, doi = {10.1109/ICASSP.2014.6854954}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6854954}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @article {451, title = {Learning An Invariant Speech Representation}, number = {022}, year = {2014}, month = {06/2014}, abstract = {

Recognition of speech, and in particular the ability to generalize and learn from small sets of labelled examples like humans do, depends on an appropriate representation of the acoustic input. We formulate the problem of finding robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain and empirically evaluate its validity for voiced speech sound classification. Our version of the theory requires the memory-based, unsupervised storage of acoustic templates {\textemdash} such as specific phones or words {\textemdash} together with all the transformations of each that normally occur. A quasi-invariant representation for a speech segment can be obtained by projecting it to each template orbit, i.e., the set of transformed signals, and computing the associated one-dimensional empirical probability distributions. The computations can be performed by modules of filtering and pooling, and extended to hierarchical architectures. In this paper, we apply a single-layer, multicomponent representation for phonemes and demonstrate improved accuracy and decreased sample complexity for vowel classification compared to standard spectral, cepstral and perceptual features.

}, keywords = {Theories for Intelligence}, author = {Georgios Evangelopoulos and Stephen Voinea and Chiyuan Zhang and Lorenzo Rosasco and Tomaso Poggio} } @conference {220, title = {Phone Classification by a Hierarchy of Invariant Representation Layers}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {

We propose a multi-layer feature extraction framework for speech, capable of providing invariant representations. A set of templates is generated by sampling the result of applying smooth, identity-preserving transformations (such as vocal tract length and tempo variations) to arbitrarily-selected speech signals. Templates are then stored as the weights of {\textquotedblleft}neurons{\textquotedblright}. We use a cascade of such computational modules to factor out different types of transformation variability in a hierarchy, and show that it improves phone classification over baseline features. In addition, we describe empirical comparisons of a) different transformations which may be responsible for the variability in speech signals and of b) different ways of assembling template sets for training. The proposed layered system is an effort towards explaining the performance of recent deep learning networks and the principles by which the human auditory cortex might reduce the sample complexity of learning in speech recognition. Our theory and experiments suggest that invariant representations are crucial in learning from complex, real-world data like natural speech. Our model is built on basic computational primitives of cortical neurons, thus making an argument about how representations might be learned in the human auditory cortex.

}, keywords = {Hierarchy, Invariance, Neural Networks, Speech Representation}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2346.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {1140, title = {Speech Representations based on a Theory for Learning Invariances}, year = {2014}, month = {10/2014}, type = {poster presentation}, address = {SANE 2014 - Speech and Audio in the Northeast}, abstract = {

Recognition of sounds and speech from a small number of labelled examples (like humans do) depends on the properties of the representation of the acoustic input. We formulate the problem of extracting robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain, that requires the memory-based, unsupervised storage of acoustic templates -- such as specific phones or words -- together with all the transformations of each that normally occur. A quasi-invariant representation for a speech signal can be obtained by projecting it to a number of template orbits, i.e., each one a set of transformed template signals, and computing the associated one-dimensional empirical probability distributions. The computations are performed by modules of filtering and pooling that can be used for obtaining a mapping in single- or multilayer architectures. We consider several aspects of such representations including different signal scales (word vs. frame), input domains (raw waveforms vs. frequency filterbank responses), structures (shallow vs. multilayer/hierarchical), and ways of sampling from template orbit sets given a set of observations (explicit vs. learned). Preliminary empirical evaluations for learning to separate speech phones and words are given on TIMIT and subsets of TI-DIGITS.

}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {226, title = {Unsupervised learning of invariant representations with low sample complexity: the magic of sensory cortex or a new framework for machine learning?}, number = {001}, year = {2014}, month = {03/2014}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity (n). We consider the case of visual object recognition though the theory applies to other domains. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translations, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and unique (discriminative) signature can be computed for each image patch, I, in terms of empirical distributions of the dot-products between I and a set of templates stored during unsupervised learning. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such estimates. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts. The theory extends existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and discriminative for recognition{\textemdash}and that this representation may be continuously learned in an unsupervised way during development and visual experience.

}, keywords = {Computer vision, Pattern recognition}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @conference {221, title = {Word-level Invariant Representations From Acoustic Waveforms}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {

Extracting discriminant, transformation-invariant features from raw audio signals remains a serious challenge for speech recognition. The issue of speaker variability is central to this problem, as changes in accent, dialect, gender, and age alter the sound waveform of speech units at multiple scales (phonemes, words, or phrases). Approaches for dealing with this variability have typically focused on analyzing the spectral properties of speech at the level of frames, on par with frame-level acoustic modeling usually applied to speech recognition systems. In this paper, we propose a framework for representing speech at the whole-word level and extracting features from the acoustic, temporal domain, without the need for spectral encoding or pre-processing. Leveraging recent work on unsupervised learning of invariant sensory representations, we extract a signature for a word by first projecting its raw waveform onto a set of templates and their transformations, and then forming empirical estimates of the resulting one-dimensional distributions via histograms. The representation and relevant parameters are evaluated for word classification on a series of datasets with increasing speaker-mismatch difficulty, and the results are compared to those of an MFCC-based representation.
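A minimal sketch of the signature construction summarized above, with synthetic signals and circular time shifts standing in for the speaker transformations (our simplification, not the authors' pipeline): the waveform is projected onto each template orbit and the histogram of projections is used as an approximately invariant, selective feature.

```python
import numpy as np

rng = np.random.default_rng(6)
T, n_templates, n_shifts, n_bins = 256, 10, 32, 12

templates = rng.standard_normal((n_templates, T))
shifts = np.linspace(0, T, n_shifts, endpoint=False).astype(int)

def signature(x):
    """Concatenate, over templates, the histogram of <x, shifted template>."""
    feats = []
    for tpl in templates:
        orbit = np.stack([np.roll(tpl, s) for s in shifts])   # template orbit under shifts
        proj = orbit @ x / np.linalg.norm(x)
        hist, _ = np.histogram(proj, bins=n_bins, range=(-3, 3), density=True)
        feats.append(hist)
    return np.concatenate(feats)

word = rng.standard_normal(T)
shifted_word = np.roll(word, 40)            # a "transformed" version of the same word
other_word = rng.standard_normal(T)
s0, s1, s2 = signature(word), signature(shifted_word), signature(other_word)
print("distance to shifted version:", np.linalg.norm(s0 - s1))   # small: ~invariant
print("distance to different word :", np.linalg.norm(s0 - s2))   # larger: selective
```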

}, keywords = {Invariance, Speech Representation, Theories for Intelligence}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2385.html}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @inbook {218, title = {On Learnability, Complexity and Stability}, booktitle = {Empirical Inference}, year = {2013}, pages = {59 - 69}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, chapter = {7}, address = {Berlin, Heidelberg}, abstract = {

Empirical Inference, Chapter 7

Editors: Bernhard Sch{\"o}lkopf, Zhiyuan Luo and Vladimir Vovk

Abstract:

We consider the fundamental question of learnability of a hypothesis class in the supervised learning setting and in the general learning setting introduced by Vladimir Vapnik. We survey classic results characterizing learnability in terms of suitable notions of complexity, as well as more recent results that establish the connection between learnability and stability of a learning algorithm.

}, isbn = {978-3-642-41135-9}, doi = {10.1007/978-3-642-41136-6_7}, url = {http://link.springer.com/10.1007/978-3-642-41136-6}, author = {Villa, Silvia and Lorenzo Rosasco and Tomaso Poggio and Sch{\"o}lkopf, Bernhard and Luo, Zhiyuan and Vovk, Vladimir} } @article {385, title = {Object recognition data sets (iCub/IIT)}, year = {2013}, month = {05/2013}, abstract = {

Data set for object recognition and categorization. 10 categories, 40 objects for the training phase. The acquisition size is 640{\texttimes}480 and subsequently cropped to the bounding box of the object according to the kinematics or motion cue. The bounding box is 160{\texttimes}160 in human mode and 320{\texttimes}320 in robot mode. For each object we provide 200 training samples. Each category is trained with 3 objects (600 examples per category).


Publications

Fanello, S.R.; Ciliberto, C.; Santoro, M.; Natale, L.; Metta, G.; Rosasco, L.; Odone, F., {\textquotedblleft}iCub World: Friendly Robots Help Building Good Vision Data-Sets,{\textquotedblright} In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPR), 2013

Fanello, S. R.; Ciliberto, C.; Natale, L.; Metta, G., {\textquotedblleft}Weakly Supervised Strategies for Natural Object Recognition in Robotics,{\textquotedblright} IEEE International Conference on Robotics and Automation (ICRA). Karlsruhe, Germany, May 6-10, 2013

Fanello, S.R.; Noceti, N.; Metta, G.; Odone, F., {\textquotedblleft}Multi-Class Image Classification: Sparsity Does It Better,{\textquotedblright} International Conference on Computer Vision Theory and Applications (VISAPP), 2013

Ciliberto C.; Smeraldi F.; Natale L.; Metta G., {\textquotedblleft}Online Multiple Instance Learning Applied to Hand Detection in a Humanoid Robot,{\textquotedblright} IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS2011). San Francisco, California, USA, September 25-30, 2011

}, keywords = {Computer vision, object recognition, robotics}, author = {Lorenzo Rosasco} } @proceedings {387, title = {Unsupervised Learning of Invariant Representations in Hierarchical Architectures.}, year = {2013}, month = {11/2013}, abstract = {

Representations that are invariant to translation, scale and other transformations, can considerably reduce the sample complexity of learning, allowing recognition of new object classes from very few examples {\textendash} a hallmark of human recognition. Empirical estimates of one-dimensional projections of the distribution induced by a group of affine transformations are proven to represent a unique and invariant signature associated with an image. We show how projections yielding invariant signatures for future images can be learned automatically, and updated continuously, during unsupervised visual experience. A module performing filtering and pooling, like simple and complex cells as proposed by Hubel and Wiesel, can compute such estimates. Under this view, a pooling stage estimates a one-dimensional probability distribution. Invariance from observations through a restricted window is equivalent to a sparsity property w.r.t. a transformation, which yields templates that are a) Gabor for optimal simultaneous invariance to translation and scale or b) very specific for complex, class-dependent transformations such as rotation in depth of faces. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts, and are invariant to complex transformations that may only be locally affine. The theory applies to several existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects which is invariant to transformations, stable, and discriminative for recognition {\textendash} this representation may be learned in an unsupervised way from natural visual experience.


}, keywords = {convolutional networks, Hierarchy, Invariance, visual cortex}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @conference {4886, title = {Learning manifolds with k-means and k-flats}, booktitle = {Advances in Neural Information Processing Systems 25 (NIPS 2012)}, year = {2012}, month = {12/2012}, abstract = {

We study the problem of estimating a manifold from random samples. In particular, we consider piecewise constant and piecewise linear estimators induced by k-means and k-flats, and analyze their performance. We extend previous results for k-means in two separate directions. First, we provide new results for k-means reconstruction on manifolds and, secondly, we prove reconstruction bounds for higher-order approximation (k-flats), for which no known results were previously available. While the results for k-means are novel, some of the technical tools are well-established in the literature. In the case of k-flats, both the results and the mathematical tools are new.
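A small numerical sketch of the piecewise-constant (k-means) estimator discussed above (illustrative, not from the paper): centers fitted to samples from a circle in the plane, with reconstruction error measured as the mean squared distance of fresh samples on the manifold to the nearest center.

```python
import numpy as np

rng = np.random.default_rng(7)

def sample_circle(n):
    theta = rng.uniform(0, 2 * np.pi, n)
    return np.stack([np.cos(theta), np.sin(theta)], axis=1)

def kmeans(X, k, iters=50):
    centers = X[rng.choice(len(X), k, replace=False)]
    for _ in range(iters):                      # plain Lloyd iterations
        labels = np.argmin(((X[:, None] - centers[None]) ** 2).sum(-1), axis=1)
        for j in range(k):
            if np.any(labels == j):
                centers[j] = X[labels == j].mean(0)
    return centers

train, test = sample_circle(2000), sample_circle(5000)
for k in (4, 16, 64):
    centers = kmeans(train, k)
    # piecewise-constant reconstruction error: squared distance to the nearest center
    err = np.mean(np.min(((test[:, None] - centers[None]) ** 2).sum(-1), axis=1))
    print(f"k = {k:3d}   reconstruction error = {err:.5f}")
```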

}, url = {https://papers.nips.cc/paper/2012/hash/b20bb95ab626d93fd976af958fbc61ba-Abstract.html}, author = {Guillermo D. Canas and Tomaso Poggio and Lorenzo Rosasco} }