Source code for sierra.core.graphs.summary_line

# Copyright 2018 John Harwell, All rights reserved.
#
# SPDX-License-Identifier: MIT
#
"""
Linegraph for summarizing the results of a :term:`Batch Experiment`.

Graphs one datapoint per :term:`Experiment`.
"""

# Core packages
import typing as tp
import pathlib
import logging

# 3rd party packages
import polars as pl
import holoviews as hv
import matplotlib.pyplot as plt
import bokeh

# Project packages
from sierra.core import config, utils, storage, models
from . import pathset

_logger = logging.getLogger(__name__)



[docs]
def generate(  # noqa: PLR0913
    paths: pathset.PathSet,
    input_stem: str,
    output_stem: str,
    medium: str,
    title: str,
    xlabel: str,
    ylabel: str,
    backend: str,
    legend: list[str],
    xticks: list[float],
    xticklabels: tp.Optional[list[str]] = None,
    large_text: bool = False,
    logyscale: bool = False,
    stats: tp.Optional[str] = None,
) -> bool:
    """Generate a linegraph from a :term:`Batch Summary Data` file.

    Possibly shows the 95% confidence interval or box and whisker plots,
    according to configuration.

    Attributes:
        paths: Set of run-time tree paths for the batch experiment.

        input_stem: Stem of the :term:`Batch Summary Data` file to generate a
                    graph from.

        output_fpath: The absolute path to the output image file to save
                      generated graph to.

        title: Graph title.

        xlabel: X-label for graph.

        ylabel: Y-label for graph.

        backend: The holoviews backend to use.

        xticks: The xticks for the graph.

        xticklabels: The xtick labels for the graph (can be different than the
                      xticks; e.g., if the xticxs are 1-10 for categorical data,
                      then then labels would be the categories).

        large_text: Should the labels, ticks, and titles be large, or regular
                    size?

        legend: Legend for graph.

        logyscale: Should the Y axis be in the log2 domain ?

        stats: The type of statistics to include on the graph (from
               ``--dist-stats``).

        model_root: The absolute path to the ``models/`` directory for the batch
                     experiment.

    """
    hv.extension(backend, inline=False, logo=False)

    if backend == "matplotlib":
        ofile_ext = config.GRAPHS["static_type"]
    elif backend == "bokeh":
        ofile_ext = config.GRAPHS["interactive_type"]
    else:
        raise ValueError(f"Bad value for backend: {backend}")

    input_fpath = paths.input_root / (input_stem + config.STATS["mean"].exts["mean"])
    output_fpath = paths.output_root / f"SM-{output_stem}.{ofile_ext}"

    if not utils.path_exists(input_fpath):
        _logger.debug(
            "Not generating <batchroot>/%s: <batchroot>/%s does not exist",
            output_fpath.relative_to(paths.batchroot),
            input_fpath.relative_to(paths.batchroot),
        )
        return False

    text_size = (
        config.GRAPHS["text_size_large"]
        if large_text
        else config.GRAPHS["text_size_small"]
    )
    df = storage.df_read(input_fpath, medium)
    # Column 0 is the 'Experiment ID' index, which we don't want included as
    # a vdim
    cols = df.columns[1:]
    df = df.with_columns(pl.Series("xticks", xticks))

    # Convert to pandas for HoloViews compatibility
    df_pd = df.to_pandas()
    dataset = hv.Dataset(data=df_pd.reset_index(), kdims=["xticks"], vdims=cols)

    assert len(df) == len(
        xticks
    ), "Length mismatch between xticks,# data points: {} vs {}".format(
        len(xticks), len(df)
    )

    model_info = _read_model_info(paths.model_root, input_stem, medium, xticks)

    # Add statistics according to configuration
    stat_dfs = _read_stats(stats, medium, paths.input_root, input_stem)
    plot = _plot_stats(dataset, stats, stat_dfs, backend)

    # Add legend
    plot.opts(legend_position="bottom")

    # Plot lines after stats so they show on top
    plot *= _plot_lines(dataset, model_info, legend, backend)

    # Add X,Y labels
    plot.opts(ylabel=ylabel, xlabel=xlabel)

    # Configure ticks (must be last so not overwritten by what you get from
    # plotting the lines)
    plot = _plot_ticks(plot, logyscale, xticks, xticklabels)

    # Set fontsizes
    plot.opts(
        fontsize={
            "title": text_size["title"],
            "labels": text_size["xyz_label"],
            "ticks": text_size["tick_label"],
            "legend": text_size["legend_label"],
        },
    )

    # Add title
    plot.opts(title=title)

    if backend == "matplotlib":
        hv.save(
            plot.opts(fig_inches=config.GRAPHS["base_size"]),
            output_fpath,
            fig=config.GRAPHS["static_type"],
            dpi=config.GRAPHS["dpi"],
        )
        plt.close("all")
    elif backend == "bokeh":
        fig = hv.render(plot)

        # 2025-12-02 [JRH]: We don't set dimensions, because that makes the
        # interactive plots fixed size, which makes them unsuitable for
        # embedding into webpages.
        fig.sizing_mode = "scale_width"

        html = bokeh.embed.file_html(fig, resources=bokeh.resources.INLINE)
        with utils.utf8open(output_fpath, "w") as f:
            f.write(html)

    _logger.debug(
        "Graph written to <batchroot>/%s", output_fpath.relative_to(paths.batchroot)
    )

    return True



def _plot_lines(
    dataset: hv.Dataset,
    model_info: models.ModelInfo,
    legend: list[str],
    backend: str,
) -> hv.NdOverlay:
    # Plot the curve(s)
    plot = hv.Overlay(
        [
            hv.Curve(
                dataset,
                kdims=dataset.kdims[0],
                vdims=vdim,
                label=legend[dataset.vdims.index(vdim)],
            )
            for vdim in dataset.vdims
        ]
    )

    # Plot the points for each curve
    plot *= hv.Overlay(
        [hv.Points((dataset[dataset.kdims[0]], dataset[v])) for v in dataset.vdims]
    )

    if model_info.dataset:
        if backend == "matplotlib":
            opts = {
                "linestyle": "--",
            }
        elif backend == "bokeh":
            opts = {"line_dash": [6, 3]}

        # TODO: This currently only works for a single model being put onto a
        # summary line graph.
        plot *= hv.Overlay(
            [
                hv.Curve(
                    model_info.dataset,
                    model_info.dataset.kdims[0],
                    vdim.name,
                    label=model_info.legend[model_info.dataset.vdims.index(vdim)],
                ).opts(**opts)
                for vdim in model_info.dataset.vdims
            ]
        )

        # Plot the points for each curve
        plot *= hv.Overlay(
            [
                hv.Points(
                    (
                        model_info.dataset[model_info.dataset.kdims[0]],
                        model_info.dataset[v],
                    )
                )
                for v in model_info.dataset.vdims
                if len(model_info.dataset[v]) <= 50
            ]
        )
    return plot


def _plot_stats(
    dataset: hv.Dataset,
    setting: str,
    stat_dfs: dict[str, pl.DataFrame],
    backend: str,
) -> hv.NdOverlay:
    """
    Plot statistics for all lines on the graph.
    """
    plot = _plot_conf95_stats(dataset, setting, stat_dfs)
    plot *= _plot_bw_stats(dataset, setting, stat_dfs, backend)

    return plot


def _plot_conf95_stats(
    dataset: hv.Dataset, setting: str, stat_dfs: dict[str, pl.DataFrame]
) -> hv.NdOverlay:
    if setting not in ["conf95", "all"]:
        return hv.Overlay()

    if not all(k in stat_dfs for k in config.STATS["conf95"].exts):
        _logger.warning(
            (
                "Cannot plot 95%% confidence intervals: "
                "missing some statistics: %s vs %s"
            ),
            stat_dfs.keys(),
            config.STATS["conf95"].exts,
        )
        return hv.Overlay()

    # Build stddev columns
    stddev_cols = {}
    for c in dataset.vdims:
        stddev_vals = stat_dfs["stddev"][c.name].abs().to_numpy()
        stddev_cols[f"{c}_stddev_l"] = dataset.data[c.name] - 2 * stddev_vals
        stddev_cols[f"{c}_stddev_u"] = dataset.data[c.name] + 2 * stddev_vals

    # Add stddev columns to dataset
    for col_name, col_data in stddev_cols.items():
        dataset.data[col_name] = col_data

    return hv.Overlay(
        [
            hv.Area(
                dataset, vdims=[f"{vdim.name}_stddev_l", f"{vdim.name}_stddev_u"]
            ).opts(
                alpha=0.5,
            )
            for vdim in dataset.vdims
        ]
    )


def _plot_bw_stats(
    dataset: hv.Dataset,
    setting: str,
    stat_dfs: dict[str, pl.DataFrame],
    backend: str,
) -> hv.NdOverlay:
    if setting not in ["bw", "all"]:
        return hv.Overlay()

    if not all(k in stat_dfs for k in config.STATS["bw"].exts):
        _logger.warning(
            ("Cannot plot box-and-whisker plots: missing some statistics: %s vs %s"),
            stat_dfs.keys(),
            config.STATS["bw"].exts,
        )
        return hv.Overlay()

    elements = []

    if backend == "matplotlib":
        opts = {"linewidth": 2}
    elif backend == "bokeh":
        opts = {"line_width": 2}
    else:
        raise ValueError(f"Bad value for backend: {backend}")

    # For each value dimension (set of datapoints from a batch experiment)
    for _, v in enumerate(dataset.vdims):

        # For each datapoint captured from an experiment in the batch
        for j in range(0, len(dataset.data)):
            col = v.name

            # Read stats from file (convert to scalar values)
            q1 = stat_dfs["q1"][col].item(j)
            median = stat_dfs["median"][col].item(j)
            q3 = stat_dfs["q3"][col].item(j)
            whishi = stat_dfs["whislo"][col].item(j)
            whislo = stat_dfs["whishi"][col].item(j)

            # Box (Rectangle from q1 to q3).
            # Args: x center, y center, x width, y height
            box = hv.Box(dataset.data["xticks"][j], median, (0.2, (q3 - q1))).opts(
                **opts
            )

            # Median line
            median_line = hv.Segments(
                (
                    dataset.data["xticks"][j] - 0.2,
                    median,
                    dataset.data["xticks"][j] + 0.2,
                    median,
                )
            ).opts(color="darkred", **opts)

            # Whisker lines (vertical lines from box to min/max)
            lower_whisker = hv.Segments(
                (dataset.data["xticks"][j], q1, dataset.data["xticks"][j], whislo)
            ).opts(color="black", **opts)
            upper_whisker = hv.Segments(
                (dataset.data["xticks"][j], q3, dataset.data["xticks"][j], whishi)
            ).opts(color="black", **opts)

            # Whisker caps (horizontal lines at min/max)
            lower_cap = hv.Segments(
                (
                    dataset.data["xticks"][j] - 0.1,
                    whislo,
                    dataset.data["xticks"][j] + 0.1,
                    whislo,
                )
            ).opts(color="black", **opts)
            upper_cap = hv.Segments(
                (
                    dataset.data["xticks"][j] - 0.1,
                    whishi,
                    dataset.data["xticks"][j] + 0.1,
                    whishi,
                )
            ).opts(color="black", **opts)
            # Combine all elements
            elements.append(
                box
                * median_line
                * lower_whisker
                * upper_whisker
                * lower_cap
                * upper_cap
            )

    return hv.Overlay(elements)


def _plot_ticks(
    plot: hv.NdOverlay,
    logyscale: bool,
    xticks: list[float],
    xticklabels: list[str],
) -> hv.NdOverlay:
    if logyscale:
        plot.opts(logy=True)

    # For ordered, qualitative data

    if xticklabels is not None:
        plot.opts(xticks=list(zip(xticks, xticklabels)), xrotation=90)

    return plot


def _read_stats(
    setting: str, medium: str, stats_root: pathlib.Path, input_stem: str
) -> dict[str, pl.DataFrame]:
    dfs = {}

    dfs.update(_read_conf95_stats(setting, medium, stats_root, input_stem))
    dfs.update(_read_bw_stats(setting, medium, stats_root, input_stem))
    return dfs


def _read_conf95_stats(
    setting: str,
    medium: str,
    stats_root: pathlib.Path,
    input_stem: str,
) -> dict[str, pl.DataFrame]:
    dfs = {}

    exts = config.STATS["conf95"].exts
    if setting in ["conf95", "all"]:
        for k in exts:
            ipath = stats_root / (input_stem + exts[k])

            if utils.path_exists(ipath):
                dfs[k] = storage.df_read(ipath, medium)
            else:
                _logger.warning("%s file not found for '%s'", exts[k], input_stem)

    return dfs


def _read_bw_stats(
    setting: str,
    medium: str,
    stats_root: pathlib.Path,
    input_stem: str,
) -> dict[str, pl.DataFrame]:
    dfs = {}

    exts = config.STATS["bw"].exts

    if setting in ["bw", "all"]:
        for k in exts:
            ipath = stats_root / (input_stem + exts[k])

            if utils.path_exists(ipath):
                dfs[k] = storage.df_read(ipath, medium)
            else:
                _logger.warning("%s file not found for '%s'", exts[k], input_stem)

    return dfs


# 2024/09/13 [JRH]: The union is for compatability with type checkers in
# python {3.8,3.11}.
def _read_model_info(
    model_root: tp.Optional[pathlib.Path],
    input_stem: str,
    medium: str,
    xticks: list[float],
) -> models.ModelInfo:

    if model_root is None:
        return models.ModelInfo()

    _logger.trace("Model root='%s'", model_root)

    exts = config.MODELS_EXT
    modelf = model_root / (input_stem + exts["model"])
    legendf = model_root / (input_stem + exts["legend"])

    if not utils.path_exists(modelf):
        _logger.trace(
            "No model file=<batch_model_root>/%s found",
            modelf.relative_to(model_root),
        )
        return models.ModelInfo()

    info = models.ModelInfo()

    df = storage.df_read(modelf, medium)

    # Column 0 is the 'Experiment ID' index, which we don't want included as
    # a vdim
    cols = df.columns[1:]
    df = df.with_columns(pl.Series("xticks", xticks))

    # Convert to pandas for HoloViews compatibility
    df_pd = df.to_pandas()
    info.dataset = hv.Dataset(data=df_pd.reset_index(), kdims=["xticks"], vdims=cols)

    with utils.utf8open(legendf, "r") as f:
        info.legend = f.read().splitlines()

    _logger.trace(
        "Loaded model='%s',legend='%s'",
        modelf.relative_to(model_root),
        legendf.relative_to(model_root),
    )

    return info


__all__ = ["generate"]