Source code for alphadia.fdr._fdrx.stats

# make it an abc
import numpy as np
import pandas as pd



[docs]
def get_pep(
    psm_df: pd.DataFrame,
    score_column: str = "decoy_proba",
    decoy_column: str = "decoy",
    score_std: float = 0.01,
    pep_granularity: int = 1000,
    kernel_size: int = 20,
):
    """Implementation of a very simple nonparametric PEP estimation using a gaussian kernel.

    Parameters
    ----------
    psm_df : pd.DataFrame
        The dataframe containing the PSMs.

    score_column : str, default='decoy_proba'
        The name of the column containing the score to use for the selection.

    decoy_column : str, default='decoy'
        The name of the column containing the decoy information.

    score_std : float, default=0.01
        The standard deviation of the gaussian kernel.

    pep_granularity : int, default=1000
        The number of bins to use for the score histogram.

    kernel_size : int, default=20
        The size of the kernel to use for the convolution.

    Returns
    -------
    np.ndarray
        The PEP values with same shape and order as the input dataframe.

    """
    score_bins = np.linspace(0, 1, pep_granularity)
    target_decoy = psm_df[decoy_column].values
    score = psm_df[score_column].values

    target_hist, _ = np.histogram(score[target_decoy == 0], bins=score_bins)
    decoy_hist, _ = np.histogram(score[target_decoy == 1], bins=score_bins)

    std_norm = score_std / (score_bins[1] - score_bins[0])
    # create a gaussian kernel of 0.01 width with 5 elements
    kernel_gaussian = np.exp(
        -(np.arange(-kernel_size, kernel_size + 1) ** 2) / (2 * std_norm**2)
    )

    # convolve the target and decoy histograms with the kernel
    target_hist = np.convolve(target_hist, kernel_gaussian, mode="same")
    decoy_hist = np.convolve(decoy_hist, kernel_gaussian, mode="same")

    # numerical stability
    EPSILON = 1e-6
    pep = decoy_hist / (target_hist + decoy_hist + EPSILON)

    return pep[np.digitize(score, score_bins) - 1]




[docs]
def add_q_values(
    df: pd.DataFrame,
    decoy_proba_column: str = "decoy_proba",
    decoy_column: str = "decoy",
    qval_column: str = "qval",
    r_target_decoy: float = 1.0,
):
    """Calculates q-values for a dataframe containing PSMs.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing the PSMs.

    decoy_proba_column : str, default='proba'
        The name of the column containing the probability of being a decoy.
        Value should be between 0 and 1 with 1 being a decoy.

    decoy_column : str, default='_decoy'
        The name of the column containing the decoy information.
        Decoys are expected to be 1 and targets 0.

    qval_column : str, default='qval'
        The name of the column to store the q-values in.

    Returns
    -------
    pd.DataFrame
        The dataframe containing the q-values in column qval.

    """
    EPSILON = 1e-6
    df = df.sort_values(
        [decoy_proba_column, decoy_column, "precursor_idx"], ascending=True
    )  # last sort to break ties

    # translate the decoy probabilities to target probabilities
    target_values = 1 - df[decoy_column].values
    decoy_cumsum = np.cumsum(df[decoy_column].values)
    target_cumsum = np.cumsum(target_values)
    fdr_values = decoy_cumsum / (target_cumsum + EPSILON)
    df[qval_column] = fdr_to_q_values(fdr_values) * r_target_decoy
    return df




[docs]
def fdr_to_q_values(fdr_values: np.ndarray):
    """Converts FDR values to q-values.
    Takes a ascending sorted array of FDR values and converts them to q-values.
    for every element the lowest FDR where it would be accepted is used as q-value.

    Parameters
    ----------
    fdr_values : np.ndarray
        The FDR values to convert.

    Returns
    -------
    np.ndarray
        The q-values.

    """
    fdr_values_flipped = np.flip(fdr_values)
    q_values_flipped = np.minimum.accumulate(fdr_values_flipped)
    q_vals = np.flip(q_values_flipped)
    return q_vals




[docs]
def keep_best(
    df: pd.DataFrame,
    score_column: str = "decoy_proba",
    group_columns: list[str] | None = None,
):
    """Keep the best PSM for each group of PSMs with the same precursor_idx.
    This function is used to select the best candidate PSM for each precursor.
    if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing the PSMs.

    score_column : str
        The name of the column containing the score to use for the selection.

    group_columns : list[str], default=['channel', 'precursor_idx']
        The columns to use for the grouping.

    Returns
    -------
    pd.DataFrame
        The dataframe containing the best PSM for each group.

    """
    if group_columns is None:
        group_columns = ["channel", "mod_seq_charge_hash"]
    df = df.reset_index(drop=True)
    df = df.sort_values(
        [score_column, *group_columns, "precursor_idx"], ascending=True
    )  # last sort to break ties
    df = df.groupby(group_columns).head(1)
    df = df.sort_index().reset_index(drop=True)
    return df