# Source code for alphadia.fdr.plotting

"""Plotting functionality for FDR."""

import logging
from datetime import datetime
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import sklearn
from matplotlib.figure import Figure

# Relative difference between train and test AUC, in percent, above which
# `plot_fdr` logs a warning about possible overfitting.
auc_difference_percent_warning_threshold = 5

# Only q-values strictly below this cutoff are drawn in the q-value panel
# of `plot_fdr`.
qval_threshold = 0.05

# No logger name is passed, so this is the root logger.
logger = logging.getLogger()



# [docs]
def plot_fdr(  # noqa: PLR0913 # Too many arguments
    y_train: np.ndarray,
    y_test: np.ndarray,
    y_train_proba: np.ndarray,
    y_test_proba: np.ndarray,
    qval: np.ndarray,
    figure_path: str | None = None,
) -> None:
    """Plot statistics on the FDR-corrected PSMs.

    Three panels are drawn side by side:

    1. ROC curves for the train and test split, with their AUC in the legend.
    2. Histograms of the predicted scores (train and test pooled), split into
       label ``0`` ("target") and label ``1`` ("decoy").
    3. Running entry count against q-value, restricted to q-values below the
       module-level ``qval_threshold``.

    The train AUC, the test AUC and their relative difference are logged. A
    warning is logged when the difference exceeds
    ``auc_difference_percent_warning_threshold`` percent.

    The figure is always closed before returning, also when drawing or saving
    raises, so repeated calls do not accumulate open pyplot figures.

    Parameters
    ----------
    y_train : np.ndarray
        The training labels.

    y_test : np.ndarray
        The test labels.

    y_train_proba : np.ndarray
        The predicted probabilities for the training data.

    y_test_proba : np.ndarray
        The predicted probabilities for the test data.

    qval : np.ndarray
        The q-values of the PSMs.

    figure_path : str | None
        The path to the folder to save the figure to. The file is written as
        ``fdr_<i>.pdf``, using the first index ``i`` (counting from 0) for
        which no file exists yet. If ``None``, the figure is shown with
        ``plt.show()`` instead of being saved.

    """
    # The third return value (the decision thresholds) is not needed here.
    fpr_test, tpr_test, _ = sklearn.metrics.roc_curve(y_test, y_test_proba)
    fpr_train, tpr_train, _ = sklearn.metrics.roc_curve(y_train, y_train_proba)

    auc_test = sklearn.metrics.auc(fpr_test, tpr_test)
    auc_train = sklearn.metrics.auc(fpr_train, tpr_train)

    logger.info(f"Test AUC: {auc_test:.3f}")
    logger.info(f"Train AUC: {auc_train:.3f}")

    auc_difference_percent = np.abs((auc_test - auc_train) / auc_train * 100)
    logger.info(f"AUC difference: {auc_difference_percent:.2f}%")
    if auc_difference_percent > auc_difference_percent_warning_threshold:
        logger.warning(
            f"AUC difference > {auc_difference_percent_warning_threshold}%. This may indicate overfitting."
        )

    fig, ax = plt.subplots(1, 3, figsize=(12, 4))
    try:
        # Panel 1: ROC curves.
        ax[0].plot(fpr_test, tpr_test, label=f"Test AUC: {auc_test:.3f}")
        ax[0].plot(fpr_train, tpr_train, label=f"Train AUC: {auc_train:.3f}")
        ax[0].set_xlabel("false positive rate")
        ax[0].set_ylabel("true positive rate")
        ax[0].legend()

        # Panel 2: score distributions, train and test pooled per label.
        ax[1].set_xlim(0, 1)
        ax[1].hist(
            np.concatenate([y_test_proba[y_test == 0], y_train_proba[y_train == 0]]),
            bins=50,
            alpha=0.5,
            label="target",
        )
        ax[1].hist(
            np.concatenate([y_test_proba[y_test == 1], y_train_proba[y_train == 1]]),
            bins=50,
            alpha=0.5,
            label="decoy",
        )
        ax[1].set_xlabel("decoy score")
        ax[1].set_ylabel("precursor count")
        ax[1].legend()

        # Panel 3: running count over the q-values below the cutoff.
        qval_plot = qval[qval < qval_threshold]
        ids = np.arange(0, len(qval_plot), 1)
        ax[2].plot(qval_plot, ids)
        # Tie the upper x-limit to the same constant used for filtering so the
        # axis range cannot drift away from the plotted data.
        ax[2].set_xlim(-0.001, qval_threshold)
        ax[2].set_xlabel("q-value")
        ax[2].set_ylabel("number of precursors")

        for axs in ax:
            # remove top and right spines
            axs.spines["top"].set_visible(False)
            axs.spines["right"].set_visible(False)
            # y tick labels as integers with thousands separators
            axs.get_yaxis().set_major_formatter(
                mpl.ticker.FuncFormatter(lambda x, _p: format(int(x), ","))
            )

        fig.tight_layout()

        if figure_path is None:
            plt.show()
        else:
            figure_path_ = Path(figure_path)
            # Pick the first fdr_<i>.pdf that does not exist yet, so earlier
            # plots in the same folder are never overwritten.
            i = 0
            file_path = figure_path_ / f"fdr_{i}.pdf"
            while file_path.exists():
                i += 1
                file_path = figure_path_ / f"fdr_{i}.pdf"

            _add_metadata_to_figure(fig, qval, y_test, y_train, file_path)

            fig.savefig(file_path, bbox_inches="tight")
    finally:
        # pyplot keeps every figure it created alive until it is closed.
        plt.close(fig)



def _add_metadata_to_figure(
    fig: Figure,
    qval: np.ndarray,
    y_test: np.ndarray,
    y_train: np.ndarray,
    file_path: Path,
) -> None:
    """Write two footer lines of provenance text onto the figure.

    The first line holds the current local timestamp, the size of the train
    and test sets (total, label ``0`` counted as targets, label ``1`` counted
    as decoys) and the number of q-values that are ``<= 0.01``. The second
    line holds the absolute form of ``file_path``.

    Parameters
    ----------
    fig : Figure
        The figure to annotate; it is modified in place.

    qval : np.ndarray
        The q-values used for the "Entries at 1% FDR" count.

    y_test : np.ndarray
        The test labels.

    y_train : np.ndarray
        The training labels.

    file_path : Path
        The path printed in the second footer line.

    """
    # Both footer lines share the same horizontal anchor and text styling.
    footer_x = 0.5
    footer_style = {"ha": "center", "fontsize": 8, "style": "italic"}

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # noqa: DTZ005

    train_total, test_total = len(y_train), len(y_test)
    train_targets, train_decoys = (y_train == 0).sum(), (y_train == 1).sum()
    test_targets, test_decoys = (y_test == 0).sum(), (y_test == 1).sum()
    entries_at_1perc_fdr = (qval <= 0.01).sum()  # noqa: PLR2004

    summary_line = (
        f"{timestamp} | "
        f"Train: {train_total:,} ({train_targets:,} targets, {train_decoys:,} decoys) | "
        f"Test: {test_total:,} ({test_targets:,} targets, {test_decoys:,} decoys) | "
        f"Entries at 1% FDR: {entries_at_1perc_fdr:,}"
    )
    location_line = str(Path(file_path).absolute())

    # Negative y positions: the summary goes first, the file path beneath it.
    fig.text(footer_x, -0.05, summary_line, **footer_style)
    fig.text(footer_x, -0.08, location_line, **footer_style)