Adding custom 'summary' charts to MLflow: simple bar chart and 'grouped' bar chart

MLflow is a convenient open-source (OSS) platform for tracking and recording the results of machine learning evaluations.

Out of the box, the MLflow user interface lets you add basic charts of metrics, view results, and compare them across runs.

However if there are many metrics, then it can be useful to create a custom 'summary' chart.

The approach taken here is to use matplotlib to generate custom charts, and then attach them as artifacts using the MLflow API. This follows the MLflow documentation.


note: To allow for grouping of metrics by category, the following code assumes a convention where the MLflow metrics are named like '<category>__<metric>'.

So the data being logged to MLflow looks like this:

[('category1__response_time_ms', 11000.0), ('category2__response_time_ms', 20000.0)]

Simple Bar Chart

The following code shows how to generate a summary bar chart, with one bar for each metric:

import logging
import typing
from matplotlib.colors import Colormap
from matplotlib.figure import Figure
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

logger = logging.getLogger(__name__)


# A (title, figure) pair — the form in which a chart is attached to MLflow as an artifact.
MLflowFigure = tuple[str, Figure]  # title and figure
# A (metric name, measured value) pair; the value may be None when no measurement was recorded.
MetricAndMeasure = tuple[str, float | None]

METRIC_NAME_DIVIDER = "__"  # MLflow attributes have been setup to have a category, by using a prefix like: metric_name = <category>__<metric>


def _build_color_map_by_value(values: list[float]) -> Colormap:
    """Map each value onto the 'viridis' colormap, min-max scaled so that the
    smallest value gets the darkest hue and the largest the lightest.

    Note: despite the annotation (kept for interface compatibility), calling the
    colormap returns an array of RGBA rows — which is exactly what matplotlib's
    ``color=`` argument expects.
    """
    data = np.asarray(values, dtype=float)
    spread = np.max(data) - np.min(data)
    if spread == 0.0:
        # All values equal: the naive rescale would divide by zero and yield NaN.
        # Use the midpoint of the colormap for every bar instead.
        rescaled = np.full(data.shape, 0.5)
    else:
        rescaled = (data - np.min(data)) / spread
    viridis = plt.get_cmap("viridis")

    return typing.cast(Colormap, viridis(rescaled))


def _create_bar_chart(
    title: str, yaxis_title: str, measurements_by_category: dict[str, float], unit_short: str, is_debug: bool
) -> MLflowFigure:
    """Create a simple bar chart with one value-colored bar per category.

    Args:
        title: chart title; also returned as the first element of the result.
        yaxis_title: label for the y-axis (also used as the legend title).
        measurements_by_category: category name -> measured value (one bar each).
        unit_short: unit suffix appended to each bar's value label (e.g. "s").
        is_debug: when True, print the input data and show the chart interactively.

    Returns:
        The (title, figure) pair to attach to MLflow.
    """
    if is_debug:
        print(f"measurements_by_category = {measurements_by_category}")

    categories = list(measurements_by_category.keys())
    values = list(measurements_by_category.values())

    fig, ax = plt.subplots(layout="constrained", figsize=(21, 7))

    bar_width = 0.1  # single assignment (the original assigned this twice)
    ax.bar(
        x=categories,
        height=values,
        width=bar_width,
        align="edge",
        label=categories,
        color=_build_color_map_by_value(values=values),
    )

    x = np.arange(len(categories))  # the label locations
    ax.set_xticks(x + bar_width, categories)

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(yaxis_title)
    ax.set_title(title)
    ax.legend(title=yaxis_title, ncols=len(categories))

    # label each bar with its value (bars were added in the same order as `values`)
    for value, rect in zip(values, ax.patches):
        label = f"{value}{unit_short}"
        ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height(), label, ha="center", va="bottom")

    # scale the y-axis, leaving 20% headroom above the tallest bar (0 when there is no data)
    y_max = max(measurements_by_category.values(), default=0.0)
    ax.set_ylim(0, y_max * 1.2)

    if is_debug:
        plt.show()

    return (title, fig)


def _create_summary_figure(mlflow_metrics: list[MetricAndMeasure], is_debug: bool) -> MLflowFigure:
    """Project '<category>__<metric>' response-time metrics (in milliseconds)
    into a per-category bar chart measured in seconds.

    Raises:
        RuntimeError: if more than one distinct time metric is present — this
            chart assumes a single metric plotted across categories.
    """
    # example input: [('category1__response_time_ms', 11000.0), ('category2__response_time_ms', 20000.0)]
    metrics_by_category: dict[str, float] = {}
    metric_names: set[str] = set()  # was a dict-of-True, which is a set in disguise
    for category_and_metric, measurement in mlflow_metrics:
        # split only on the FIRST divider, so metric names may themselves contain '__'
        category, metric = category_and_metric.split(METRIC_NAME_DIVIDER, 1)
        metric_names.add(metric)
        # MetricAndMeasure allows None (no measurement recorded): plot it as zero.
        millis = measurement if measurement is not None else 0.0
        # Inline ms -> s conversion (the original called an undefined helper here).
        metrics_by_category[category] = millis / 1000.0

    # assumption: there is just 1 time metric:
    if len(metric_names) > 1:
        raise RuntimeError(f"Expected 1 time metric, but have: {metric_names}")

    return _create_bar_chart(
        title="Response time by Prompt Category",
        yaxis_title="Response time (seconds)",
        measurements_by_category=metrics_by_category,
        unit_short="s",
        is_debug=is_debug,
    )


def create_figures(
    mlflow_metrics: list[MetricAndMeasure], is_debug: bool
) -> list[MLflowFigure]:
    """Build every summary figure for this run (currently just the response-time chart)."""
    summary = _create_summary_figure(mlflow_metrics=mlflow_metrics, is_debug=is_debug)
    return [summary]

This produces a bar chart like this, showing all the metrics for that run, side-by-side.

- the bars are colored according to their value (higher value = lighter hues)


Grouped Bar Chart

A more advanced technique is to take the categorized metrics (where the metric name is prefixed by a category) and then plot a grouped bar chart, with one group for each category. This allows for comparing categories across multiple metrics (or attributes).

import logging
from matplotlib.figure import Figure
from matplotlib import pyplot as plt
import numpy as np

logger = logging.getLogger(__name__)


MLflowFigure = tuple[str, Figure]  # title and figure — the pair attached to MLflow as an artifact
MetricAndMeasure = tuple[str, float | None]  # (metric name, measured value); None when no measurement was recorded


def _create_grouped_bar_chart(
    title: str,
    yaxis_title: str,
    categories: list[str],
    grouped_metrics: dict[str, list[float]],
    is_debug: bool,
    decimal_places: int = 2,
) -> MLflowFigure:
    """Create a grouped bar chart: one cluster per category, one bar per metric.

    Args:
        title: chart title; also returned as the first element of the result.
        yaxis_title: label for the y-axis.
        categories: x-axis group labels, in display order.
        grouped_metrics: metric name -> one value per category (must align with `categories`).
        is_debug: when True, show the chart interactively.
        decimal_places: rounding applied to the plotted and labelled values.

    Returns:
        The (title, figure) pair to attach to MLflow.
    """
    x = np.arange(len(categories))  # the label locations
    bar_width = 0.1
    gap_between_bars = 0.02

    fig, ax = plt.subplots(layout="constrained", figsize=(21, 7))

    # enumerate replaces the hand-maintained `multiplier` counter
    for multiplier, (attribute, measurements) in enumerate(grouped_metrics.items()):
        offset = (bar_width + gap_between_bars) * multiplier
        rounded = [round(y, decimal_places) for y in measurements]

        rects = ax.bar(x + offset, rounded, bar_width, label=attribute)
        ax.bar_label(rects, padding=3)

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(yaxis_title)
    ax.set_title(title)
    ax.set_xticks(x + bar_width, categories)
    ax.legend(loc="upper left", ncols=len(grouped_metrics))

    # scale the y-axis: 20% headroom above the tallest bar (0 when there is no data)
    y_max = max((value for values in grouped_metrics.values() for value in values), default=0.0)
    ax.set_ylim(0, y_max * 1.2)

    if is_debug:
        plt.show()

    return (title, fig)


# Convention: MLflow metric names are '<category>__<metric>'; names without this divider are uncategorized.
METRIC_NAME_DIVIDER = "__"


def _create_accuracy_summary_figure(accuracy_metrics: list[MetricAndMeasure], is_debug: bool) -> MLflowFigure:
    """Project categorized accuracy metrics into a grouped bar chart.

    Metrics named '<category>__<metric>' are grouped by category; metrics
    without the divider (like 'overall_accuracy') become their own
    single-metric group. Missing measurements are filled in as zero so every
    group has the same shape.

    Raises:
        RuntimeError: if the categories do not all end up with the same set of
            metrics (misaligned figure data).
    """
    # example input: [('category1__entity_count_accuracy', 1.0), ('category1__attribute_count_accuracy', 1.0)]

    grouped_and_named_metrics: dict[str, dict[str, float | None]] = {}

    all_metrics: set[str] = set()  # a set: the fill-in loop below only needs each name once
    for name, measurement in accuracy_metrics:
        if METRIC_NAME_DIVIDER in name:  # like 'category1__attribute_count_accuracy'
            # use the constant (the original hard-coded "__" here) and split only on
            # the FIRST divider, so metric names may themselves contain '__'
            category, metric = name.split(METRIC_NAME_DIVIDER, 1)
        else:  # like 'overall_accuracy'
            category = metric = name
        grouped_and_named_metrics.setdefault(category, {})[metric] = measurement
        all_metrics.add(metric)

    # To have same shape of data for plotting, fill in any missing metrics as zero:
    for metric in all_metrics:
        for named_metrics in grouped_and_named_metrics.values():
            named_metrics.setdefault(metric, 0.0)

    if is_debug:
        print(f"grouped_and_named_metrics = {grouped_and_named_metrics}")

    # project to the structure required by plot:
    shared_metrics: list[str] = []
    grouped_metrics: dict[str, list[float]] = {}
    categories: list[str] = []

    for category, named_metrics in grouped_and_named_metrics.items():
        categories.append(category)
        sorted_metrics = sorted(named_metrics)  # sort the metrics to ensure they are consistent across categories
        if not shared_metrics:
            shared_metrics = sorted_metrics
        elif shared_metrics != sorted_metrics:
            raise RuntimeError(
                f"Figure data is misaligned: shared_metrics={shared_metrics}, new metrics={sorted_metrics}"
            )
        for metric in sorted_metrics:
            value = named_metrics[metric]
            # None means "not measured": plot it as zero
            grouped_metrics.setdefault(metric, []).append(value if value is not None else 0.0)

    if is_debug:
        print(f"categories: {categories}")
        print(f"metrics: {shared_metrics}")
        print(f"grouped_metrics: {grouped_metrics}")

    return _create_grouped_bar_chart(
        title="Accuracy by Prompt Category",
        yaxis_title="Accuracy",
        categories=categories,
        grouped_metrics=grouped_metrics,
        is_debug=is_debug,
    )


def create_figures(
    mlflow_metrics: list[MetricAndMeasure], is_debug: bool
) -> list[MLflowFigure]:
    """Build every summary figure for this run (currently just the accuracy chart)."""
    accuracy_figure = _create_accuracy_summary_figure(accuracy_metrics=mlflow_metrics, is_debug=is_debug)
    return [accuracy_figure]


This produces a grouped bar chart, similar to this:


note: since the code involves some projection of data, it can be useful to cover this with unit tests.

Summary

Creating custom figures for MLflow is a powerful technique to gain more insights into your ML evaluation results, and better communicate the results to colleagues.

For more information, please see the MLflow documentation.

Comments