# Source code for ribs.emitters.opt._adam_opt

"""Provides AdamOpt.

Adapted from:
https://github.com/icaros-usc/dqd/blob/main/ribs/emitters/opt/_adam.py
https://github.com/hardmaru/estool/blob/master/es.py
https://github.com/openai/evolution-strategies-starter/blob/master/es_distributed/optimizers.py
https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
"""
import numpy as np

from ribs.emitters.opt._gradient_opt_base import GradientOptBase


class AdamOpt(GradientOptBase):
    """Adam optimizer.

    Refer to `Kingma and Ba 2014 <https://arxiv.org/pdf/1412.6980.pdf>`_ for
    more information on hyperparameters.

    Args:
        theta0 (array-like): Initial solution. 1D array.
        lr (float): Learning rate for the update.
        beta1 (float): Exponential decay rate for the moment estimates.
        beta2 (float): Another exponential decay rate for the moment estimates.
        epsilon (float): Hyperparameter for numerical stability.
        l2_coeff (float): Coefficient for L2 regularization. Note this is
            **not** the same as "weight decay" -- see `this blog post
            <https://www.fast.ai/posts/2018-07-02-adam-weight-decay.html>`_ and
            `Loshchilov and Hutter 2019 <https://arxiv.org/abs/1711.05101>`_
            for more info.
    """

    def __init__(  # pylint: disable = super-init-not-called
            self,
            theta0,
            lr=0.001,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-8,
            l2_coeff=0.0):
        self._m = None
        self._v = None
        self._t = None
        self._epsilon = epsilon
        self._beta1 = beta1
        self._beta2 = beta2
        self._l2_coeff = l2_coeff
        self._lr = lr
        self._theta = None
        self.reset(theta0)

    @property
    def theta(self):
        """Current solution (1D numpy array)."""
        return self._theta

    def reset(self, theta0):
        """Resets the optimizer state to start from ``theta0``.

        Args:
            theta0 (array-like): New initial solution. 1D array.
        """
        self._theta = np.copy(theta0)
        # Moment estimates start at zero and the timestep at 0, matching
        # Algorithm 1 of Kingma and Ba 2014; the timestep drives the
        # bias-correction terms in :meth:`step`.
        self._m = np.zeros_like(self._theta)
        self._v = np.zeros_like(self._theta)
        self._t = 0

    def step(self, gradient):
        """Takes one ascent step along ``gradient``, updating ``theta``.

        Args:
            gradient (array-like): Gradient of the objective at the current
                ``theta``. Same shape as ``theta``.
        """
        # Invert gradient since we seek to maximize -- see pseudocode here:
        # https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
        gradient = -np.asarray(gradient)

        # L2 regularization (not weight decay).
        gradient += self._l2_coeff * self._theta

        self._t += 1

        # Fold both bias corrections into the step size `a`, as in the
        # efficient formulation at the end of Section 2 of Kingma and Ba 2014.
        a = (self._lr * np.sqrt(1 - self._beta2**self._t) /
             (1 - self._beta1**self._t))
        self._m = self._beta1 * self._m + (1 - self._beta1) * gradient
        self._v = (self._beta2 * self._v +
                   (1 - self._beta2) * (gradient * gradient))
        # The negation here, combined with the inversion above, yields a net
        # step in the ascent direction of the original gradient.
        step = -a * self._m / (np.sqrt(self._v) + self._epsilon)
        self._theta += step