@article {3155, title = {Fisher-Rao Metric, Geometry, and Complexity of Neural Networks}, year = {2017}, month = {11/2017}, abstract = {

We study the relationship between geometry and capacity measures for deep neural networks from an invariance viewpoint. We introduce a new notion of capacity {\textemdash} the Fisher-Rao norm {\textemdash} that possesses desirable invariance properties and is motivated by Information Geometry. We discover an analytical characterization of the new capacity measure, through which we establish norm-comparison inequalities and further show that the new measure serves as an umbrella for several existing norm-based complexity measures. We discuss upper bounds on the generalization error induced by the proposed measure. Extensive numerical experiments on CIFAR-10 support our theoretical findings. Our theoretical analysis rests on a key structural lemma about partial derivatives of multi-layer rectifier networks.

}, keywords = {capacity control, deep learning, Fisher-Rao metric, generalization error, information geometry, invariance, natural gradient, ReLU activation, statistical learning theory}, url = {https://arxiv.org/abs/1711.01530}, author = {Tengyuan Liang and Tomaso Poggio and Alexander Rakhlin and James Stokes} } @article {2780, title = {Musings on Deep Learning: Properties of SGD}, year = {2017}, month = {04/2017}, abstract = {

[formerly titled "Theory of Deep Learning III: Generalization Properties of SGD"]

In Theory III we characterize, with a mix of theory and experiments, the generalization properties of Stochastic Gradient Descent (SGD) in overparametrized deep convolutional networks. We show that SGD selects with high probability solutions that 1) have zero (or small) empirical error, 2) are degenerate, as shown in Theory II, and 3) have maximum generalization.

}, author = {Chiyuan Zhang and Qianli Liao and Alexander Rakhlin and Karthik Sridharan and Brando Miranda and Noah Golowich and Tomaso Poggio} } @article {3261, title = {Theory of Deep Learning IIb: Optimization Properties of SGD}, year = {2017}, month = {12/2017}, abstract = {

In Theory IIb we characterize, with a mix of theory and experiments, the optimization of deep convolutional networks by Stochastic Gradient Descent (SGD). The main new result in this paper is theoretical and experimental evidence for the following conjecture about SGD: SGD concentrates in probability {\textendash} like the classical Langevin equation {\textendash} on large-volume, {\textquotedblleft}flat{\textquotedblright} minima, selecting flat minimizers that are, with very high probability, also global minimizers.

}, author = {Chiyuan Zhang and Qianli Liao and Alexander Rakhlin and Brando Miranda and Noah Golowich and Tomaso Poggio} }