# Source code for ribs.archives._archive_data_frame
"""Provides ArchiveDataFrame."""
import re
import pandas as pd
# Developer Notes:
# - The documentation for this class is hacked -- to add new methods, manually
# modify the template in docs/_templates/autosummary/class.rst
# - See here for info on extending DataFrame:
# https://pandas.pydata.org/pandas-docs/stable/development/extending.html
class ArchiveDataFrame(pd.DataFrame):
    """A modified :class:`~pandas.DataFrame` for archive data.

    As this class inherits from :class:`~pandas.DataFrame`, it has the same
    methods, attributes, and arguments (even though the arguments shown here are
    ``*args`` and ``**kwargs``). However, this class adds methods that make it
    convenient to work with elites. This documentation only lists these
    additional methods and attributes.

    Example:

        This object is created by :meth:`~ArchiveBase.data` (i.e. users
        typically do not create it on their own)::

            df = archive.data(..., return_type="pandas")

        To iterate through every elite as a dict, use::

            for elite in df.iterelites():
                elite["solution"]  # Shape: (solution_dim,)
                elite["objective"]
                ...

        Arrays corresponding to individual fields can be accessed with
        :meth:`get_field`. For instance, the following is an array where entry
        ``i`` contains the measures of the ``i``'th elite in the DataFrame::

            df.get_field("measures")

    .. warning::

        Calling :meth:`get_field` always creates a copy, so the following will
        copy the measures 3 times::

            df.get_field("measures")[0]
            df.get_field("measures").mean()
            df.get_field("measures").median()

        **Thus, if you need to use the method several times, we recommend
        storing it first, like so**::

            measures = df.get_field("measures")
            measures[0]
            measures.mean()
            measures.median()

    .. note::

        After saving an ArchiveDataFrame to a CSV, loading it with
        :func:`pandas.read_csv` will load a :class:`~pandas.DataFrame`. To load
        a CSV as an ArchiveDataFrame, pass the ``DataFrame`` from ``read_csv``
        to ArchiveDataFrame::

            df = ArchiveDataFrame(pd.read_csv("file.csv"))

    .. note::

        Results of :meth:`get_field` "align" with each other -- e.g.
        ``get_field("measures")[i]`` corresponds to ``get_field("index")[i]``,
        ``get_field("objective")[i]``, and ``get_field("solution")[i]``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def _constructor(self):
        # Tells pandas to preserve this subclass when operations (slicing,
        # copying, etc.) construct a new frame -- see the pandas "Extending
        # pandas" guide.
        return ArchiveDataFrame

    def iterelites(self):
        """Iterator that outputs every elite in the ArchiveDataFrame as a dict.

        Columns named like ``measures_0``, ``measures_1`` are treated as one
        vector field ``measures``, so each elite dict maps the field name to an
        array of the numbered columns' values for that row.
        """
        # Identify fields in the data frame. There are some edge cases here,
        # such as if someone purposely names their field with an underscore and
        # a number at the end like "foobar_0", but it covers most cases.
        fields = {}
        for col in self:
            split = col.split("_")
            if len(split) == 1:
                # Single column, e.g., "objective".
                fields[col] = None
            elif split[-1].isdigit():
                # If the last item in the split is numerical, this should match
                # vector fields like "measures_0".
                # Exclude last val and underscore -- note negative sign.
                field_name = col[:-(len(split[-1]) + 1)]
                fields[field_name] = None
            else:
                # Column with underscores but no trailing number, e.g.,
                # "my_field" -- treated as a scalar field.
                fields[col] = None

        # Retrieve field data. Each array is a copy (see get_field), so the
        # yielded dicts do not alias the DataFrame's storage.
        for name in fields:
            fields[name] = self.get_field(name)

        n_elites = len(self)
        return map(
            lambda i: {
                name: arr[i] for name, arr in fields.items()
            },
            range(n_elites),
        )

    def get_field(self, field):
        """Array holding the data for the given field.

        Returns a copy of the data; None if there is no data for the field.
        """
        # Note: The column names cannot be pre-computed because the DataFrame
        # columns might change in-place, e.g., when a column is deleted.
        if field in self:
            # Scalar field -- e.g., "objective"
            return self[field].to_numpy(copy=True)
        else:
            # Vector field -- e.g., field="measures" and we want columns like
            # "measures_0" and "measures_1".
            # re.escape prevents regex metacharacters in the field name (e.g.,
            # "." in "a.b") from matching unintended column names.
            field_re = f"{re.escape(field)}_\\d+"
            cols = [c for c in self if re.fullmatch(field_re, c)]
            return self[cols].to_numpy(copy=True) if cols else None