We present ISAAC (Input-baSed ApproximAte Curvature), a novel method that conditions the gradient using selected second-order information and has an asymptotically vanishing computational overhead, assuming a batch size smaller than the number of neurons. We show that it is possible to compute a good conditioner based only on the input to the respective layer, without a substantial computational overhead. The proposed method allows effective training even in small-batch stochastic regimes, which makes it competitive with both first-order and second-order methods.
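To make the idea concrete, here is a minimal NumPy sketch of an input-based conditioner for a single linear layer. It assumes the conditioner is the regularized Gram matrix of the layer's input; the function name `isaac_precondition` and the regularization strength `lam` are illustrative choices, not the paper's exact formulation. The Woodbury identity reduces the required inversion to a b x b matrix, which is why the overhead vanishes when the batch size b is smaller than the layer width.

```python
import numpy as np

def isaac_precondition(grad_W, X, lam=1e-2):
    """Sketch: condition a linear layer's gradient using only its input.

    grad_W : (n_out, n_in) gradient of the loss w.r.t. the weights
    X      : (b, n_in) batch of layer inputs, with b < n_in
    lam    : Tikhonov regularization strength (illustrative value)

    Right-multiplies grad_W by (X^T X / b + lam * I)^{-1}. Via the
    Woodbury identity,
        (X^T X / b + lam I)^{-1}
            = (1/lam) * (I - X^T (lam * b * I + X X^T)^{-1} X),
    so only a b x b matrix is inverted instead of an n_in x n_in one.
    """
    b = X.shape[0]
    small = np.linalg.inv(lam * b * np.eye(b) + X @ X.T)  # b x b inverse
    # Apply the inverse from the right without forming the n_in x n_in matrix.
    correction = (grad_W @ X.T) @ small @ X
    return (grad_W - correction) / lam

# Hypothetical usage: condition the gradient of a 64 x 256 weight matrix
# given a batch of 8 inputs.
rng = np.random.default_rng(0)
X = rng.normal(size=(8, 256))
grad_W = rng.normal(size=(64, 256))
update = isaac_precondition(grad_W, X)
```

Because the inverted matrix is only b x b, the extra cost relative to the plain gradient computation shrinks as the layer width grows, matching the small-batch regime the abstract describes.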
@inproceedings{Petersen2023Isaac,
  author    = {F. Petersen and T. Sutter and C. Borgelt and D. Huh and H. Kuehne and Y. Sun and O. Deussen},
  title     = {{ISAAC} {Newton}: Input-based Approximate Curvature for {Newton}'s Method},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2023},
  doi       = {10.48550/arXiv.2305.00604}
}