Source code for alphadia.fdr._fdrx.stats
# make it an abc
import numpy as np
import pandas as pd
[docs]
def get_pep(
psm_df: pd.DataFrame,
score_column: str = "decoy_proba",
decoy_column: str = "decoy",
score_std: float = 0.01,
pep_granularity: int = 1000,
kernel_size: int = 20,
):
"""Implementation of a very simple nonparametric PEP estimation using a gaussian kernel.
Parameters
----------
psm_df : pd.DataFrame
The dataframe containing the PSMs.
score_column : str, default='decoy_proba'
The name of the column containing the score to use for the selection.
decoy_column : str, default='decoy'
The name of the column containing the decoy information.
score_std : float, default=0.01
The standard deviation of the gaussian kernel.
pep_granularity : int, default=1000
The number of bins to use for the score histogram.
kernel_size : int, default=20
The size of the kernel to use for the convolution.
Returns
-------
np.ndarray
The PEP values with same shape and order as the input dataframe.
"""
score_bins = np.linspace(0, 1, pep_granularity)
target_decoy = psm_df[decoy_column].values
score = psm_df[score_column].values
target_hist, _ = np.histogram(score[target_decoy == 0], bins=score_bins)
decoy_hist, _ = np.histogram(score[target_decoy == 1], bins=score_bins)
std_norm = score_std / (score_bins[1] - score_bins[0])
# create a gaussian kernel of 0.01 width with 5 elements
kernel_gaussian = np.exp(
-(np.arange(-kernel_size, kernel_size + 1) ** 2) / (2 * std_norm**2)
)
# convolve the target and decoy histograms with the kernel
target_hist = np.convolve(target_hist, kernel_gaussian, mode="same")
decoy_hist = np.convolve(decoy_hist, kernel_gaussian, mode="same")
# numerical stability
EPSILON = 1e-6
pep = decoy_hist / (target_hist + decoy_hist + EPSILON)
return pep[np.digitize(score, score_bins) - 1]
[docs]
def add_q_values(
df: pd.DataFrame,
decoy_proba_column: str = "decoy_proba",
decoy_column: str = "decoy",
qval_column: str = "qval",
r_target_decoy: float = 1.0,
):
"""Calculates q-values for a dataframe containing PSMs.
Parameters
----------
df : pd.DataFrame
The dataframe containing the PSMs.
decoy_proba_column : str, default='proba'
The name of the column containing the probability of being a decoy.
Value should be between 0 and 1 with 1 being a decoy.
decoy_column : str, default='_decoy'
The name of the column containing the decoy information.
Decoys are expected to be 1 and targets 0.
qval_column : str, default='qval'
The name of the column to store the q-values in.
Returns
-------
pd.DataFrame
The dataframe containing the q-values in column qval.
"""
EPSILON = 1e-6
df = df.sort_values(
[decoy_proba_column, decoy_column, "precursor_idx"], ascending=True
) # last sort to break ties
# translate the decoy probabilities to target probabilities
target_values = 1 - df[decoy_column].values
decoy_cumsum = np.cumsum(df[decoy_column].values)
target_cumsum = np.cumsum(target_values)
fdr_values = decoy_cumsum / (target_cumsum + EPSILON)
df[qval_column] = fdr_to_q_values(fdr_values) * r_target_decoy
return df
[docs]
def fdr_to_q_values(fdr_values: np.ndarray):
"""Converts FDR values to q-values.
Takes a ascending sorted array of FDR values and converts them to q-values.
for every element the lowest FDR where it would be accepted is used as q-value.
Parameters
----------
fdr_values : np.ndarray
The FDR values to convert.
Returns
-------
np.ndarray
The q-values.
"""
fdr_values_flipped = np.flip(fdr_values)
q_values_flipped = np.minimum.accumulate(fdr_values_flipped)
q_vals = np.flip(q_values_flipped)
return q_vals
[docs]
def keep_best(
df: pd.DataFrame,
score_column: str = "decoy_proba",
group_columns: list[str] | None = None,
):
"""Keep the best PSM for each group of PSMs with the same precursor_idx.
This function is used to select the best candidate PSM for each precursor.
if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition.
Parameters
----------
df : pd.DataFrame
The dataframe containing the PSMs.
score_column : str
The name of the column containing the score to use for the selection.
group_columns : list[str], default=['channel', 'precursor_idx']
The columns to use for the grouping.
Returns
-------
pd.DataFrame
The dataframe containing the best PSM for each group.
"""
if group_columns is None:
group_columns = ["channel", "mod_seq_charge_hash"]
df = df.reset_index(drop=True)
df = df.sort_values(
[score_column, *group_columns, "precursor_idx"], ascending=True
) # last sort to break ties
df = df.groupby(group_columns).head(1)
df = df.sort_index().reset_index(drop=True)
return df