Source code for ribs.emitters.opt._adam_opt
"""Provides AdamOpt.
Adapted from:
https://github.com/icaros-usc/dqd/blob/main/ribs/emitters/opt/_adam.py
https://github.com/hardmaru/estool/blob/master/es.py
https://github.com/openai/evolution-strategies-starter/blob/master/es_distributed/optimizers.py
https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
"""
import numpy as np
from numpy.typing import ArrayLike
from ribs.emitters.opt._gradient_opt_base import GradientOptBase
from ribs.typing import Float
[docs]
class AdamOpt(GradientOptBase):
    """Adam optimizer.

    The optimizer performs gradient *ascent*: :meth:`step` moves the solution in
    the direction of the gradient it is given (minus the L2 regularization
    term).

    Refer to `Kingma and Ba 2014 <https://arxiv.org/pdf/1412.6980.pdf>`_ for more
    information on hyperparameters.

    Args:
        theta0: Initial solution. 1D array. Integer or boolean input is promoted
            to float64 so that the solution can be updated in place.
        lr: Learning rate for the update.
        beta1: Exponential decay rate for the first moment estimate.
        beta2: Exponential decay rate for the second moment estimate.
        epsilon: Hyperparameter for numerical stability.
        l2_coeff: Coefficient for L2 regularization. Note this is **not** the same as
            "weight decay" -- see `this blog post
            <https://www.fast.ai/posts/2018-07-02-adam-weight-decay.html>`_ and
            `Loshchilov and Hutter 2019 <https://arxiv.org/abs/1711.05101>`_ for more
            info.
    """

    def __init__(
        self,
        theta0: ArrayLike,
        lr: Float = 0.001,
        beta1: Float = 0.9,
        beta2: Float = 0.999,
        epsilon: Float = 1e-8,
        l2_coeff: Float = 0.0,
    ):
        # Optimizer state; populated by reset() below.
        self._m = None  # First moment estimate (running mean of gradients).
        self._v = None  # Second moment estimate (running mean of squares).
        self._t = None  # Number of steps taken since the last reset.

        # Hyperparameters.
        self._epsilon = epsilon
        self._beta1 = beta1
        self._beta2 = beta2
        self._l2_coeff = l2_coeff
        self._lr = lr

        self._theta = None
        self.reset(theta0)

    @property
    def theta(self) -> np.ndarray:
        """The current solution.

        This is the optimizer's internal array (not a copy); it is modified in
        place by :meth:`step`.
        """
        return self._theta

    def reset(self, theta0: ArrayLike) -> None:
        """Restarts the optimizer from a new solution.

        The moment estimates and the step counter are cleared.

        Args:
            theta0: New initial solution. It is always copied, so the caller's
                array is never modified. Integer or boolean input is promoted to
                float64.
        """
        # ``np.array(..., copy=True)`` always copies and, unlike the ``copy``
        # keyword of ``np.asarray``, is also accepted by NumPy releases older
        # than 2.0.
        theta = np.array(theta0, copy=True)

        # step() updates the solution in place with floating-point values, which
        # NumPy refuses to cast into an integer array. Promote up front so that
        # e.g. ``theta0=[0, 0]`` works.
        if not np.issubdtype(theta.dtype, np.inexact):
            theta = theta.astype(np.float64)

        self._theta = theta
        self._m = np.zeros_like(self._theta)
        self._v = np.zeros_like(self._theta)
        self._t = 0

    def step(self, gradient: ArrayLike) -> None:
        """Takes one Adam step in the direction of ``gradient``.

        Updates the solution in place, as well as the moment estimates and the
        step counter.

        Args:
            gradient: Gradient of the objective at the current solution. Integer
                or boolean input is promoted to float64.
        """
        grad = np.asarray(gradient)

        # The in-place update below adds a float array to the gradient; NumPy
        # raises a casting error if the gradient is an integer array (e.g. when
        # a plain list of ints is passed), so promote such input first.
        if not np.issubdtype(grad.dtype, np.inexact):
            grad = grad.astype(np.float64)

        # Invert gradient since we seek to maximize -- see pseudocode here:
        # https://pytorch.org/docs/stable/generated/torch.optim.Adam.html
        # Negation allocates a new array, so the caller's gradient is untouched
        # by the in-place addition that follows.
        grad = -grad

        # L2 regularization (not weight decay).
        grad += self._l2_coeff * self._theta

        self._t += 1

        # Learning rate with the bias corrections for both moment estimates
        # folded in.
        step_size = (
            self._lr
            * np.sqrt(1 - self._beta2**self._t)
            / (1 - self._beta1**self._t)
        )

        self._m = self._beta1 * self._m + (1 - self._beta1) * grad
        self._v = self._beta2 * self._v + (1 - self._beta2) * (grad * grad)

        step = -step_size * self._m / (np.sqrt(self._v) + self._epsilon)
        self._theta += step