# Copyright 2018 John Harwell, All rights reserved.
#
# SPDX-License-Identifier: MIT
#
"""
Linegraph for summarizing the results of a batch experiment in different ways.
"""
# Core packages
import typing as tp
import logging
import pathlib
# 3rd party packages
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
import pandas as pd
# Project packages
from sierra.core import config, utils, storage
class SummaryLineGraph:
    """Generates a linegraph from a :term:`Summary .csv`.

    Possibly shows the 95% confidence interval or box and whisker plots,
    according to configuration.

    Attributes:

        stats_root: The absolute path to the ``statistics/`` directory for the
                    batch experiment.

        input_stem: Stem of the :term:`Summary .csv` file to generate a graph
                    from.

        output_fpath: The absolute path to the output image file to save
                      generated graph to.

        title: Graph title.

        xlabel: X-label for graph.

        ylabel: Y-label for graph.

        xticks: The xticks for the graph.

        xtick_labels: The xtick labels for the graph (can be different than the
                      xticks; e.g., if the xticxs are 1-10 for categorical data,
                      then then labels would be the categories).

        large_text: Should the labels, ticks, and titles be large, or regular
                    size?

        legend: Legend for graph.

        logyscale: Should the Y axis be in the log2 domain ?

        stats: The type of statistics to include on the graph (from
               ``--dist-stats``).

        model_root: The absolute path to the ``models/`` directory for the batch
                    experiment.
    """

    kLineStyles = ['-', '--', '.-', ':', '-', '--', '.-', ':']
    kMarkStyles = ['o', '^', 's', 'x', 'o', '^', 's', 'x']

    def __init__(self,
                 stats_root: pathlib.Path,
                 input_stem: str,
                 output_fpath: pathlib.Path,
                 title: str,
                 xlabel: str,
                 ylabel: str,
                 xticks: tp.List[float],
                 xtick_labels: tp.Optional[tp.List[str]] = None,
                 large_text: bool = False,
                 legend: tp.Optional[tp.List[str]] = None,
                 logyscale: bool = False,
                 stats: str = 'none',
                 model_root: tp.Optional[pathlib.Path] = None) -> None:
        # Required arguments
        self.stats_root = stats_root
        self.input_stem = input_stem
        self.output_fpath = output_fpath
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.xticks = xticks

        # Optional arguments
        if large_text:
            self.text_size = config.kGraphTextSizeLarge
        else:
            self.text_size = config.kGraphTextSizeSmall

        self.xtick_labels = xtick_labels
        self.model_root = model_root

        # A default of legend=['Empirical Data'] would be a shared mutable
        # default argument; build the default fresh per instance instead.
        self.legend = legend if legend is not None else ['Empirical Data']

        self.logyscale = logyscale
        self.stats = stats
        self.logger = logging.getLogger(__name__)

    def generate(self) -> None:
        """Generate the linegraph and write it to the filesystem.

        If the ``mean`` statistics file for ``input_stem`` does not exist, no
        graph is generated (logged at DEBUG level).
        """
        input_fpath = self.stats_root / (self.input_stem +
                                         config.kStats['mean'].exts['mean'])
        if not utils.path_exists(input_fpath):
            self.logger.debug("Not generating %s: %s does not exist",
                              self.output_fpath,
                              input_fpath)
            return

        self.logger.debug("Generating %s from %s",
                          self.output_fpath,
                          input_fpath)

        data_dfy = storage.DataFrameReader('storage.csv')(input_fpath)
        model = self._read_models()

        fig, ax = plt.subplots()

        # Plot lines
        self._plot_lines(data_dfy, model)

        # Add legend
        self._plot_legend(model)

        # Add statistics according to configuration
        stat_dfs = self._read_stats()
        self._plot_stats(ax, self.xticks, data_dfy, stat_dfs)

        # Add X,Y labels
        plt.ylabel(self.ylabel, fontsize=self.text_size['xyz_label'])
        plt.xlabel(self.xlabel, fontsize=self.text_size['xyz_label'])

        # Add ticks
        self._plot_ticks(ax)

        # Add title
        plt.title(self.title, fontsize=self.text_size['title'])

        # Output figure (fig is already the figure owning ax; no need to
        # re-fetch it via ax.get_figure())
        fig.set_size_inches(config.kGraphBaseSize,
                            config.kGraphBaseSize)
        fig.savefig(self.output_fpath, bbox_inches='tight',
                    dpi=config.kGraphDPI)

        # Prevent memory accumulation (fig.clf() does not close everything)
        plt.close(fig)

    def _plot_lines(self,
                    data_dfy: pd.DataFrame,
                    model: tp.Tuple[tp.Optional[pd.DataFrame],
                                    tp.List[str]]) -> None:
        """Plot one line per row of the data, plus model predictions if any.

        Arguments:
            data_dfy: Dataframe of empirical data; each row is one line.

            model: (dataframe of model predictions or None, list of legend
                   entries for the model lines).
        """
        for i in range(0, len(data_dfy.values)):
            assert len(data_dfy.values[i]) == len(self.xticks),\
                "Length mismatch between xticks,data: {0} vs {1}/{2} vs {3}".format(
                    len(self.xticks),
                    len(data_dfy.values[i]),
                    self.xticks,
                    data_dfy.values[i])

            # Plot data
            plt.plot(self.xticks,
                     data_dfy.values[i],
                     marker=self.kMarkStyles[i],
                     color=f"C{i}")

            # Plot model prediction(s)
            if model[0] is not None:
                # The model might be of different dimensions than the data. If
                # so, truncate it to fit.
                if len(self.xticks) < len(model[0].values[i]):
                    self.logger.warning("Truncating model: model/data lengths disagree: %s vs. %s",
                                        len(model[0].values[i]),
                                        len(self.xticks))
                    model_yvals = model[0].values[i][:len(self.xticks)]
                else:
                    model_yvals = model[0].values[i]

                # Model colors continue where the data colors left off so
                # data/model lines are visually distinct.
                plt.plot(self.xticks,
                         model_yvals,
                         '--',
                         marker=self.kMarkStyles[i],
                         color="C{}".format(i + len(data_dfy.index)))

    def _plot_stats(self,
                    ax,
                    xticks,
                    data_dfy: pd.DataFrame,
                    stat_dfs: tp.Dict[str, pd.DataFrame]) -> None:
        """
        Plot statistics for all lines on the graph.

        Each helper is a no-op unless ``--dist-stats`` selected it.
        """
        self._plot_conf95_stats(xticks, data_dfy, stat_dfs)
        self._plot_bw_stats(ax, xticks, data_dfy, stat_dfs)

    def _plot_conf95_stats(self,
                           xticks,
                           data_dfy: pd.DataFrame,
                           stat_dfs: tp.Dict[str, pd.DataFrame]) -> None:
        """Shade the 95% confidence interval around each plotted line."""
        if self.stats not in ['conf95', 'all']:
            return

        if not all(k in stat_dfs.keys() for k in config.kStats['conf95'].exts):
            self.logger.warning(("Cannot plot 95%% confidence intervals: "
                                 "missing some statistics: %s vs %s"),
                                stat_dfs.keys(),
                                config.kStats['conf95'].exts)
            return

        for i in range(0, len(data_dfy.values)):
            stddev_i = stat_dfs['stddev'].abs().values[i]
            # 95% interval = 2 standard deviations
            plt.fill_between(xticks,
                             data_dfy.values[i] - 2 * stddev_i,
                             data_dfy.values[i] + 2 * stddev_i,
                             alpha=0.25,
                             color="C{}".format(i),
                             interpolate=True)

    def _plot_bw_stats(self,
                       ax,
                       xticks,
                       data_dfy: pd.DataFrame,
                       stat_dfs: tp.Dict[str, pd.DataFrame]) -> None:
        """Draw a box-and-whisker plot at each xtick for each line."""
        if self.stats not in ['bw', 'all']:
            return

        if not all(k in stat_dfs.keys() for k in config.kStats['bw'].exts):
            self.logger.warning(("Cannot plot box-and-whisker plots: "
                                 "missing some statistics: %s vs %s"),
                                stat_dfs.keys(),
                                config.kStats['bw'].exts)
            return

        for i in range(0, len(data_dfy.values)):
            boxes = []
            for j in range(0, len(data_dfy.columns)):
                boxes.append({
                    # Bottom whisker position
                    'whislo': stat_dfs['whislo'].iloc[i, j],

                    # Top whisker position
                    'whishi': stat_dfs['whishi'].iloc[i, j],

                    # First quartile (25th percentile)
                    'q1': stat_dfs['q1'].iloc[i, j],

                    # Median (50th percentile)
                    'med': stat_dfs['median'].iloc[i, j],

                    # Third quartile (75th percentile)
                    'q3': stat_dfs['q3'].iloc[i, j],

                    # Confidence interval lower bound
                    'cilo': stat_dfs['cilo'].iloc[i, j],

                    # Confidence interval upper bound
                    'cihi': stat_dfs['cihi'].iloc[i, j],

                    'fliers': []  # Ignoring outliers
                })

            ax.bxp(boxes,
                   manage_ticks=False,
                   positions=self.xticks,
                   shownotches=True)

    def _plot_ticks(self, ax) -> None:
        """Configure Y axis scale and X/Y tick labels on ``ax``."""
        if self.logyscale:
            ax.set_yscale('symlog', base=2)
            ax.yaxis.set_minor_formatter(mticker.ScalarFormatter())
            # Use scientific or decimal notation--whichever has fewer chars
            # ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.02g"))

        ax.tick_params(labelsize=self.text_size['tick_label'])

        # For ordered, qualitative data
        if self.xtick_labels is not None:
            ax.set_xticks(self.xticks)
            ax.set_xticklabels(self.xtick_labels, rotation='vertical')

    def _plot_legend(self,
                     model: tp.Tuple[tp.Optional[pd.DataFrame],
                                     tp.List[str]]) -> None:
        """Add the legend, interleaving data/model entries if a model exists."""
        legend = self.legend

        if model[1]:
            # Interleave so each data line is immediately followed by its
            # model prediction in the legend.
            legend = [val for pair in zip(self.legend, model[1])
                      for val in pair]

        plt.legend(legend,
                   fontsize=self.text_size['legend_label'],
                   ncol=max(1, int(len(legend) / 3.0)))

    def _read_stats(self) -> tp.Dict[str, pd.DataFrame]:
        """Read all statistics dataframes selected by ``--dist-stats``."""
        dfs = {}

        dfs.update(self._read_conf95_stats())
        dfs.update(self._read_bw_stats())

        return dfs

    def _read_conf95_stats(self) -> tp.Dict[str, pd.DataFrame]:
        """Read confidence-interval statistics, if selected."""
        if self.stats in ['conf95', 'all']:
            return self._read_stats_of_type('conf95')

        return {}

    def _read_bw_stats(self) -> tp.Dict[str, pd.DataFrame]:
        """Read box-and-whisker statistics, if selected."""
        if self.stats in ['bw', 'all']:
            return self._read_stats_of_type('bw')

        return {}

    def _read_stats_of_type(self, stats_key: str) -> tp.Dict[str, pd.DataFrame]:
        """Read all statistics dataframes of a given type from ``stats_root``.

        Arguments:
            stats_key: Key into ``config.kStats`` ('conf95' or 'bw').

        Returns:
            Dict mapping statistic name (e.g., 'stddev') -> dataframe for each
            statistics file which exists; missing files are logged at WARNING
            level and omitted.
        """
        dfs = {}
        reader = storage.DataFrameReader('storage.csv')
        exts = config.kStats[stats_key].exts

        for k in exts:
            ipath = self.stats_root / (self.input_stem + exts[k])

            if utils.path_exists(ipath):
                dfs[k] = reader(ipath)
            else:
                self.logger.warning("%s file not found for '%s'",
                                    exts[k],
                                    self.input_stem)

        return dfs

    def _read_models(self) -> tp.Tuple[tp.Optional[pd.DataFrame], tp.List[str]]:
        """Read model predictions and their legend entries, if they exist.

        Returns:
            (model dataframe, legend entries), or ``(None, [])`` if there is no
            model root or no model file for ``input_stem``. If the model exists
            but its legend file does not, a default legend entry is used.
        """
        if self.model_root is None:
            return (None, [])

        self.logger.trace("Model root='%s'",  # type: ignore
                          self.model_root)

        exts = config.kModelsExt
        modelf = self.model_root / (self.input_stem + exts['model'])
        legendf = self.model_root / (self.input_stem + exts['legend'])

        if not utils.path_exists(modelf):
            self.logger.trace("No model='%s' found in model root",  # type: ignore
                              modelf)
            return (None, [])

        model = storage.DataFrameReader('storage.csv')(modelf)

        if utils.path_exists(legendf):
            with utils.utf8open(legendf, 'r') as f:
                legend = f.read().splitlines()
        else:
            self.logger.warning("No legend file for model '%s' found",
                                modelf)
            legend = ['Model Prediction']

        self.logger.trace("Loaded model='%s',legend='%s'",  # type: ignore
                          modelf.relative_to(self.model_root),
                          legendf.relative_to(self.model_root))

        return (model, legend)
__api__ = [
'SummaryLineGraph'
]