@article {3694, title = {Theory III: Dynamics and Generalization in Deep Networks}, year = {2018}, month = {06/2018}, abstract = {
The key to generalization is controlling the complexity of
\ \ \ \ \ \ the network. However, there is no obvious control of
\ \ \ \ \ \ complexity -- such as an explicit regularization term --
\ \ \ \ \ \ in the training of deep networks for classification. We
\ \ \ \ \ \ will show that a classical form of norm control -- but
\ \ \ \ \ \ kind of hidden -- is present in deep networks trained with
\ \ \ \ \ \ gradient descent techniques on exponential-type losses. In
\ \ \ \ \ \ particular, gradient descent induces a dynamics of the
\ \ \ \ \ \ normalized weights which converge for $t \to \infty$ to an
\ \ \ \ \ \ equilibrium which corresponds to a minimum norm (or
\ \ \ \ \ \ maximum margin) solution. For sufficiently large but
\ \ \ \ \ \ finite $\rho$ -- and thus finite $t$ -- the dynamics
\ \ \ \ \ \ converges to one of several margin maximizers, with the
\ \ \ \ \ \ margin monotonically increasing towards a limit stationary
\ \ \ \ \ \ point of the flow. In the usual case of stochastic
\ \ \ \ \ \ gradient descent, most of the stationary points are likely
\ \ \ \ \ \ to be convex minima corresponding to a regularized,
\ \ \ \ \ \ constrained minimizer -- the network with normalized
\ \ \ \ \ \ weights -- which is stable and has asymptotic zero
\ \ \ \ \ \ generalization gap, asymptotically for $N \to \infty$,
\ \ \ \ \ \ where $N$ is the number of training examples. For finite,
\ \ \ \ \ \ fixed $N$ the generalization gap may not be zero but the
\ \ \ \ \ \ minimum norm property of the solution can provide, we
\ \ \ \ \ \ conjecture, good expected performance for suitable data
\ \ \ \ \ \ distributions. Our approach extends some of the results of
\ \ \ \ \ \ Srebro from linear networks to deep networks and provides
\ \ \ \ \ \ a new perspective on the implicit bias of gradient
\ \ \ \ \ \ descent. We believe that the elusive complexity control we
\ \ \ \ \ \ describe is responsible for the puzzling empirical finding
\ \ \ \ \ \ of good predictive performance by deep networks, despite
\ \ \ \ \ \ overparametrization.\