@comment{Review note for entry 3694: the entry is typed as article but has no
  journal field, so standard styles will warn about a missing required field.
  The venue is not recoverable from this record -- TODO confirm whether this is
  a journal paper, a technical report/memo, or a preprint, and then either add
  the journal or switch the entry type accordingly.}
@article{3694,
  author   = {Banburski, Andrzej and Liao, Qianli and Miranda, Brando and Poggio, Tomaso and Rosasco, Lorenzo and Hidary, Jack and De La Torre, Fernanda},
  title    = {Theory {III}: Dynamics and Generalization in Deep Networks},
  year     = {2018},
  month    = jun,
  abstract = {The key to generalization is controlling the complexity of
    the network. However, there is no obvious control of
    complexity -- such as an explicit regularization term --
    in the training of deep networks for classification. We
    will show that a classical form of norm control -- but
    kind of hidden -- is present in deep networks trained with
    gradient descent techniques on exponential-type losses. In
    particular, gradient descent induces a dynamics of the
    normalized weights which converge for $t \to \infty$ to an
    equilibrium which corresponds to a minimum norm (or
    maximum margin) solution. For sufficiently large but
    finite $\rho$ -- and thus finite $t$ -- the dynamics
    converges to one of several margin maximizers, with the
    margin monotonically increasing towards a limit stationary
    point of the flow. In the usual case of stochastic
    gradient descent, most of the stationary points are likely
    to be convex minima corresponding to a regularized,
    constrained minimizer -- the network with normalized
    weights -- which is stable and has asymptotic zero
    generalization gap, asymptotically for $N \to \infty$,
    where $N$ is the number of training examples. For finite,
    fixed $N$ the generalization gap may not be zero but the
    minimum norm property of the solution can provide, we
    conjecture, good expected performance for suitable data
    distributions. Our approach extends some of the results of
    Srebro from linear networks to deep networks and provides
    a new perspective on the implicit bias of gradient
    descent. We believe that the elusive complexity control we
    describe is responsible for the puzzling empirical finding
    of good predictive performance by deep networks, despite
    overparametrization.},
}