# Source code for ribs.archives._cvt_archive

"""Contains the CVTArchive class."""
import numpy as np
from numba import jit
from scipy.spatial import cKDTree  # pylint: disable=no-name-in-module
from sklearn.cluster import k_means

from ribs.archives._archive_base import ArchiveBase, require_init


class CVTArchive(ArchiveBase):
    """An archive that divides the entire behavior space into a fixed number
    of bins.

    This archive originates in `Vassiliades 2018
    <https://ieeexplore.ieee.org/document/8000667>`_. It uses Centroidal
    Voronoi Tesselation (CVT) to divide an n-dimensional behavior space into k
    bins. The CVT is created by sampling points uniformly from the
    n-dimensional behavior space and using k-means clustering to identify k
    centroids. When items are inserted into the archive, we identify their bin
    by identifying the closest centroid in behavior space (using Euclidean
    distance). For k-means clustering, we use :func:`sklearn.cluster.k_means`.

    Finding the closest centroid is done in O(bins) time (i.e. brute force) by
    default. If ``use_kd_tree`` is True, it can be done in roughly O(log bins)
    time using :class:`scipy.spatial.cKDTree`. However, using the k-D tree
    lowers performance for small numbers of bins, so **we recommend setting**
    ``use_kd_tree`` **if the** ``CVTArchive`` **has at least 5k bins**. See
    `benchmarks/cvt_add.py
    <https://github.com/icaros-usc/pyribs/tree/master/benchmarks/cvt_add.py>`_
    in the project repo for benchmark details.

    Finally, if running multiple experiments, it may be beneficial to use the
    same centroids across each experiment. Doing so can keep experiments
    consistent and reduce execution time. To do this, either 1) construct
    custom centroids and pass them in via the ``custom_centroids`` argument,
    or 2) access the centroids created in the first archive with
    :attr:`centroids` and pass them into ``custom_centroids`` when
    constructing archives for subsequent experiments.

    Args:
        bins (int): The number of bins to use in the archive, equivalent to
            the number of centroids/areas in the CVT.
        ranges (array-like of (float, float)): Upper and lower bound of each
            dimension of the behavior space, e.g. ``[(-1, 1), (-2, 2)]``
            indicates the first dimension should have bounds :math:`[-1,1]`
            (inclusive), and the second dimension should have bounds
            :math:`[-2,2]` (inclusive).
        seed (int): Value to seed the random number generator as well as
            :func:`~sklearn.cluster.k_means`. Set to None to avoid a fixed
            seed.
        dtype (str or data-type): Data type of the solutions, objective
            values, and behavior values. We only support ``"f"`` /
            :class:`np.float32` and ``"d"`` / :class:`np.float64`.
        samples (int or array-like): If it is an int, this specifies the
            number of samples to generate when creating the CVT. Otherwise,
            this must be a (num_samples, behavior_dim) array where samples[i]
            is a sample to use when creating the CVT. It can be useful to pass
            in custom samples when there are restrictions on what samples in
            the behavior space are (physically) possible.
        custom_centroids (array-like): If passed in, this
            (bins, behavior_dim) array will be used as the centroids of the
            CVT instead of generating new ones. In this case, ``samples`` will
            be ignored, and ``archive.samples`` will be None. This can be
            useful when one wishes to use the same CVT across experiments for
            fair comparison.
        k_means_kwargs (dict): kwargs for :func:`~sklearn.cluster.k_means`.
            By default, we pass in `n_init=1`, `init="random"`,
            `algorithm="full"`, and `random_state=seed`.
        use_kd_tree (bool): If True, use a k-D tree for finding the closest
            centroid when inserting into the archive. This may result in a
            speedup for larger dimensions.
        ckdtree_kwargs (dict): kwargs for :class:`~scipy.spatial.cKDTree`. By
            default, we do not pass in any kwargs.
    Raises:
        ValueError: The ``samples`` array or the ``custom_centroids`` array
            has the wrong shape.
    """

    def __init__(self,
                 bins,
                 ranges,
                 seed=None,
                 dtype=np.float64,
                 samples=100_000,
                 custom_centroids=None,
                 k_means_kwargs=None,
                 use_kd_tree=False,
                 ckdtree_kwargs=None):
        ArchiveBase.__init__(
            self,
            storage_dims=(bins,),
            behavior_dim=len(ranges),
            seed=seed,
            dtype=dtype,
        )

        # Transpose [(lo, hi), ...] into one tuple of lows and one of highs.
        lower, upper = zip(*ranges)
        self._lower_bounds = np.array(lower, dtype=self.dtype)
        self._upper_bounds = np.array(upper, dtype=self.dtype)
        self._bins = bins

        # Apply default args for k-means. Users can easily override these,
        # particularly if they want higher quality clusters.
        self._k_means_kwargs = (dict(k_means_kwargs)
                                if k_means_kwargs is not None else {})
        k_means_defaults = {
            # Only run one iter to be fast.
            "n_init": 1,
            # The default, "k-means++", takes very long to init.
            "init": "random",
            # The default, "auto"/"elkan", allocates a huge array.
            "algorithm": "full",
            "random_state": seed,
        }
        for key, value in k_means_defaults.items():
            self._k_means_kwargs.setdefault(key, value)

        self._use_kd_tree = use_kd_tree
        self._centroid_kd_tree = None
        self._ckdtree_kwargs = (dict(ckdtree_kwargs)
                                if ckdtree_kwargs is not None else {})

        if custom_centroids is not None:
            # Validate shape of `custom_centroids` when they are provided.
            custom_centroids = np.asarray(custom_centroids, dtype=self.dtype)
            if custom_centroids.shape != (bins, self._behavior_dim):
                raise ValueError(
                    f"custom_centroids has shape {custom_centroids.shape} but "
                    f"must be of shape (bins={bins}, len(ranges)="
                    f"{self._behavior_dim})")
            self._centroids = custom_centroids
            self._samples = None
        else:
            if not isinstance(samples, int):
                # Validate shape of custom samples. These are ignored when
                # `custom_centroids` is provided.
                samples = np.asarray(samples, dtype=self.dtype)
                if samples.shape[1] != self._behavior_dim:
                    raise ValueError(
                        f"Samples has shape {samples.shape} but must be of "
                        f"shape (n_samples, len(ranges)={self._behavior_dim})")
            # May still be an int here; `initialize` resolves it into an
            # actual array of uniform samples.
            self._samples = samples
            self._centroids = None

    @property
    def lower_bounds(self):
        """(behavior_dim,) numpy.ndarray: Lower bound of each dimension."""
        return self._lower_bounds

    @property
    def upper_bounds(self):
        """(behavior_dim,) numpy.ndarray: Upper bound of each dimension."""
        return self._upper_bounds

    @property
    @require_init
    def samples(self):
        """(num_samples, behavior_dim) numpy.ndarray: The samples used in
        creating the CVT.

        May be None until :meth:`initialize` is called.
        """
        return self._samples

    @property
    @require_init
    def centroids(self):
        """(n_centroids, behavior_dim) numpy.ndarray: The centroids used in
        the CVT.

        None until :meth:`initialize` is called.
        """
        return self._centroids
[docs] def initialize(self, solution_dim): """Initializes the archive storage space and centroids. This method may take a while to run. In addition to allocating storage space, it runs :func:`~sklearn.cluster.k_means` to create an approximate CVT, and it constructs a :class:`~scipy.spatial.cKDTree` object containing the centroids found by k-means. k-means is not run if ``custom_centroids`` was passed in during construction. Args: solution_dim (int): The dimension of the solution space. Raises: RuntimeError: The archive is already initialized. RuntimeError: The number of centroids returned by k-means clustering was fewer than the number of bins specified during construction. This is most likely caused by having too few samples and too many bins. """ ArchiveBase.initialize(self, solution_dim) if self._centroids is None: self._samples = self._rng.uniform( self._lower_bounds, self._upper_bounds, size=(self._samples, self._behavior_dim), ).astype(self.dtype) if isinstance(self._samples, int) else self._samples self._centroids = k_means(self._samples, self._bins, **self._k_means_kwargs)[0] if self._centroids.shape[0] < self._bins: raise RuntimeError( "While generating the CVT, k-means clustering found " f"{self._centroids.shape[0]} centroids, but this archive " f"needs {self._bins} bins. This most likely happened " "because there are too few samples and/or too many bins.") if self._use_kd_tree: self._centroid_kd_tree = cKDTree(self._centroids, **self._ckdtree_kwargs)
    @staticmethod
    @jit(nopython=True)
    def _brute_force_nn_numba(behavior_values, centroids):
        """Calculates the nearest centroid to the given behavior values.

        Technically, we calculate squared distance, but we only care about
        finding the neighbor and not the distance itself.

        Compiled with numba in nopython mode, so only numba-supported NumPy
        operations may be used here.
        """
        # (1, behavior_dim) minus (n_centroids, behavior_dim) broadcasts to a
        # (n_centroids, behavior_dim) array of differences.
        distances = np.expand_dims(behavior_values, axis=0) - centroids
        # Squared Euclidean distance to each centroid; the sqrt is skipped
        # because it does not change which centroid is closest.
        distances = np.sum(np.square(distances), axis=1)
        # Index of the closest centroid.
        return np.argmin(distances)
[docs] def get_index(self, behavior_values): """Finds the index of the centroid closest to the behavior values. If ``idx`` is the index returned by this method for some behavior values ``beh``, then ``archive.centroids[idx]`` holds the coordinates of the centroid closest to ``beh``. See :attr:`centroids` for more info. The centroid index is located using either the k-D tree or brute force, depending on the value of ``use_kd_tree`` in the constructor. Args: behavior_values (numpy.ndarray): (:attr:`behavior_dim`,) array of coordinates in behavior space. Returns: int: Centroid index. """ # We use int() here since these methods may return a numpy integer. if self._use_kd_tree: return int(self._centroid_kd_tree.query(behavior_values)[1]) return int(self._brute_force_nn_numba(behavior_values, self._centroids))