Source code for sierra.core.graphs.summary_line_graph

# Copyright 2018 John Harwell, All rights reserved.
#
# SPDX-License-Identifier: MIT
#
"""
Linegraph for summarizing the results of a batch experiment in different ways.
"""

# Core packages
import typing as tp
import logging
import pathlib

# 3rd party packages
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
import pandas as pd

# Project packages
from sierra.core import config, utils, storage


class SummaryLineGraph:
    """Generates a linegraph from a :term:`Summary .csv`.

    Possibly shows the 95% confidence interval or box and whisker plots,
    according to configuration.

    Attributes:

        stats_root: The absolute path to the ``statistics/`` directory for the
                    batch experiment.

        input_stem: Stem of the :term:`Summary .csv` file to generate a graph
                    from.

        output_fpath: The absolute path to the output image file to save the
                      generated graph to.

        title: Graph title.

        xlabel: X-label for graph.

        ylabel: Y-label for graph.

        xticks: The xticks for the graph.

        xtick_labels: The xtick labels for the graph (can be different than the
                      xticks; e.g., if the xticks are 1-10 for categorical
                      data, then the labels would be the categories).

        large_text: Should the labels, ticks, and titles be large, or regular
                    size?

        legend: Legend for graph.

        logyscale: Should the Y axis be in the log2 domain?

        stats: The type of statistics to include on the graph (from
               ``--dist-stats``).

        model_root: The absolute path to the ``models/`` directory for the
                    batch experiment.

    """
    kLineStyles = ['-', '--', '.-', ':', '-', '--', '.-', ':']
    kMarkStyles = ['o', '^', 's', 'x', 'o', '^', 's', 'x']
    def __init__(self,
                 stats_root: pathlib.Path,
                 input_stem: str,
                 output_fpath: pathlib.Path,
                 title: str,
                 xlabel: str,
                 ylabel: str,
                 xticks: tp.List[float],
                 xtick_labels: tp.Optional[tp.List[str]] = None,
                 large_text: bool = False,
                 legend: tp.List[str] = ['Empirical Data'],
                 logyscale: bool = False,
                 stats: str = 'none',
                 model_root: tp.Optional[pathlib.Path] = None) -> None:

        # Required arguments
        self.stats_root = stats_root
        self.input_stem = input_stem
        self.output_fpath = output_fpath
        self.title = title
        self.xlabel = xlabel
        self.ylabel = ylabel
        self.xticks = xticks

        # Optional arguments
        if large_text:
            self.text_size = config.kGraphTextSizeLarge
        else:
            self.text_size = config.kGraphTextSizeSmall

        self.xtick_labels = xtick_labels
        self.model_root = model_root
        self.legend = legend
        self.logyscale = logyscale
        self.stats = stats

        self.logger = logging.getLogger(__name__)
    def generate(self) -> None:
        input_fpath = self.stats_root / (self.input_stem +
                                         config.kStats['mean'].exts['mean'])
        if not utils.path_exists(input_fpath):
            self.logger.debug("Not generating %s: %s does not exist",
                              self.output_fpath,
                              input_fpath)
            return
        else:
            self.logger.debug("Generating %s from %s",
                              self.output_fpath,
                              input_fpath)

        data_dfy = storage.DataFrameReader('storage.csv')(input_fpath)
        model = self._read_models()

        fig, ax = plt.subplots()

        # Plot lines
        self._plot_lines(data_dfy, model)

        # Add legend
        self._plot_legend(model)

        # Add statistics according to configuration
        stat_dfs = self._read_stats()
        self._plot_stats(ax, self.xticks, data_dfy, stat_dfs)

        # Add X,Y labels
        plt.ylabel(self.ylabel, fontsize=self.text_size['xyz_label'])
        plt.xlabel(self.xlabel, fontsize=self.text_size['xyz_label'])

        # Add ticks
        self._plot_ticks(ax)

        # Add title
        plt.title(self.title, fontsize=self.text_size['title'])

        # Output figure
        fig = ax.get_figure()
        fig.set_size_inches(config.kGraphBaseSize, config.kGraphBaseSize)
        fig.savefig(self.output_fpath,
                    bbox_inches='tight',
                    dpi=config.kGraphDPI)

        # Prevent memory accumulation (fig.clf() does not close everything)
        plt.close(fig)
    def _plot_lines(self,
                    data_dfy: pd.DataFrame,
                    model: tp.Tuple[pd.DataFrame, tp.List[str]]) -> None:
        for i in range(0, len(data_dfy.values)):
            assert len(data_dfy.values[i]) == len(self.xticks), \
                "Length mismatch between xticks,data: {0} vs {1}/{2} vs {3}".format(
                    len(self.xticks),
                    len(data_dfy.values[i]),
                    self.xticks,
                    data_dfy.values[i])

            # Plot data
            plt.plot(self.xticks,
                     data_dfy.values[i],
                     marker=self.kMarkStyles[i],
                     color=f"C{i}")

            # Plot model prediction(s)
            if model[0] is not None:
                # The model might be of different dimensions than the data. If
                # so, truncate it to fit.
                if len(self.xticks) < len(model[0].values[i]):
                    self.logger.warning("Truncating model: model/data lengths disagree: %s vs. %s",
                                        len(model[0].values[i]),
                                        len(self.xticks))
                    xvals = model[0].values[i][:len(self.xticks)]
                else:
                    xvals = model[0].values[i]

                plt.plot(self.xticks,
                         xvals,
                         '--',
                         marker=self.kMarkStyles[i],
                         color="C{}".format(i + len(data_dfy.index)))
    def _plot_stats(self,
                    ax,
                    xticks,
                    data_dfy: pd.DataFrame,
                    stat_dfs: tp.Dict[str, pd.DataFrame]) -> None:
        """Plot statistics for all lines on the graph."""
        self._plot_conf95_stats(xticks, data_dfy, stat_dfs)
        self._plot_bw_stats(ax, xticks, data_dfy, stat_dfs)
    def _plot_conf95_stats(self,
                           xticks,
                           data_dfy: pd.DataFrame,
                           stat_dfs: tp.Dict[str, pd.DataFrame]) -> None:
        if self.stats not in ['conf95', 'all']:
            return

        if not all(k in stat_dfs.keys() for k in config.kStats['conf95'].exts):
            self.logger.warning(("Cannot plot 95%% confidence intervals: "
                                 "missing some statistics: %s vs %s"),
                                stat_dfs.keys(),
                                config.kStats['conf95'].exts)
            return

        for i in range(0, len(data_dfy.values)):
            stddev_i = stat_dfs['stddev'].abs().values[i]

            # 95% confidence interval ~= 2 standard deviations
            plt.fill_between(xticks,
                             data_dfy.values[i] - 2 * stddev_i,
                             data_dfy.values[i] + 2 * stddev_i,
                             alpha=0.25,
                             color="C{}".format(i),
                             interpolate=True)
    def _plot_bw_stats(self,
                       ax,
                       xticks,
                       data_dfy: pd.DataFrame,
                       stat_dfs: tp.Dict[str, pd.DataFrame]) -> None:
        if self.stats not in ['bw', 'all']:
            return

        if not all(k in stat_dfs.keys() for k in config.kStats['bw'].exts):
            self.logger.warning(("Cannot plot box-and-whisker plots: "
                                 "missing some statistics: %s vs %s"),
                                stat_dfs.keys(),
                                config.kStats['bw'].exts)
            return

        for i in range(0, len(data_dfy.values)):
            boxes = []

            for j in range(0, len(data_dfy.columns)):
                boxes.append({
                    # Bottom whisker position
                    'whislo': stat_dfs['whislo'].iloc[i, j],

                    # Top whisker position
                    'whishi': stat_dfs['whishi'].iloc[i, j],

                    # First quartile (25th percentile)
                    'q1': stat_dfs['q1'].iloc[i, j],

                    # Median (50th percentile)
                    'med': stat_dfs['median'].iloc[i, j],

                    # Third quartile (75th percentile)
                    'q3': stat_dfs['q3'].iloc[i, j],

                    # Confidence interval lower bound
                    'cilo': stat_dfs['cilo'].iloc[i, j],

                    # Confidence interval upper bound
                    'cihi': stat_dfs['cihi'].iloc[i, j],

                    # Ignoring outliers
                    'fliers': []
                })

            ax.bxp(boxes,
                   manage_ticks=False,
                   positions=self.xticks,
                   shownotches=True)
    def _plot_ticks(self, ax) -> None:
        if self.logyscale:
            ax.set_yscale('symlog', base=2)
            ax.yaxis.set_minor_formatter(mticker.ScalarFormatter())

            # Use scientific or decimal notation--whichever has fewer chars
            # ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.02g"))

        ax.tick_params(labelsize=self.text_size['tick_label'])

        # For ordered, qualitative data
        if self.xtick_labels is not None:
            ax.set_xticks(self.xticks)
            ax.set_xticklabels(self.xtick_labels, rotation='vertical')
    def _plot_legend(self, model: tp.Tuple[pd.DataFrame, tp.List[str]]) -> None:
        legend = self.legend

        if model[1]:
            legend = [val for pair in zip(self.legend, model[1]) for val in pair]

        plt.legend(legend,
                   fontsize=self.text_size['legend_label'],
                   ncol=max(1, int(len(legend) / 3.0)))
    def _read_stats(self) -> tp.Dict[str, pd.DataFrame]:
        dfs = {}

        dfs.update(self._read_conf95_stats())
        dfs.update(self._read_bw_stats())

        return dfs
    def _read_conf95_stats(self) -> tp.Dict[str, pd.DataFrame]:
        dfs = {}
        reader = storage.DataFrameReader('storage.csv')
        exts = config.kStats['conf95'].exts

        if self.stats in ['conf95', 'all']:
            for k in exts:
                ipath = self.stats_root / (self.input_stem + exts[k])

                if utils.path_exists(ipath):
                    dfs[k] = reader(ipath)
                else:
                    self.logger.warning("%s file not found for '%s'",
                                        exts[k],
                                        self.input_stem)

        return dfs
    def _read_bw_stats(self) -> tp.Dict[str, pd.DataFrame]:
        dfs = {}
        reader = storage.DataFrameReader('storage.csv')
        exts = config.kStats['bw'].exts

        if self.stats in ['bw', 'all']:
            for k in exts:
                ipath = self.stats_root / (self.input_stem + exts[k])

                if utils.path_exists(ipath):
                    dfs[k] = reader(ipath)
                else:
                    self.logger.warning("%s file not found for '%s'",
                                        exts[k],
                                        self.input_stem)

        return dfs
    def _read_models(self) -> tp.Tuple[tp.Optional[pd.DataFrame], tp.List[str]]:
        if self.model_root is None:
            return (None, [])

        self.logger.trace("Model root='%s'",  # type: ignore
                          self.model_root)

        exts = config.kModelsExt
        modelf = self.model_root / (self.input_stem + exts['model'])
        legendf = self.model_root / (self.input_stem + exts['legend'])

        if not utils.path_exists(modelf):
            self.logger.trace("No model='%s' found in model root",  # type: ignore
                              modelf)
            return (None, [])

        model = storage.DataFrameReader('storage.csv')(modelf)

        if utils.path_exists(legendf):
            with utils.utf8open(legendf, 'r') as f:
                legend = f.read().splitlines()
        else:
            self.logger.warning("No legend file for model '%s' found", modelf)
            legend = ['Model Prediction']

        self.logger.trace("Loaded model='%s',legend='%s'",  # type: ignore
                          modelf.relative_to(self.model_root),
                          legendf.relative_to(self.model_root))

        return (model, legend)
__api__ = [
    'SummaryLineGraph'
]
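

# Illustrative usage sketch (not part of the original module): the paths, stem,
# labels, and xticks below are hypothetical. In SIERRA this class is normally
# driven by the batch-experiment graph generation pipeline rather than
# instantiated by hand. If the summary .csv does not exist, generate() simply
# logs a debug message and returns without producing an image.
if __name__ == '__main__':
    graph = SummaryLineGraph(stats_root=pathlib.Path('exp-root/statistics'),
                             input_stem='blocks-transported',
                             output_fpath=pathlib.Path('exp-root/graphs/blocks-transported.png'),
                             title='Blocks Transported',
                             xlabel='Batch Criteria Value',
                             ylabel='# Blocks',
                             xticks=[1.0, 2.0, 3.0, 4.0, 5.0],
                             stats='conf95')
    graph.generate()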