# Source code for alphadia.fdr._fdrx.base

"""This module implements a base class for semisupervised FDR estimation using targets and decoys.
It is flexible with regard to the features, the type of classifier, and the type of identifications (precursors, peptides, proteins).
"""

import logging

import numpy as np
import pandas as pd
import sklearn.base

from alphadia.fdr._fdrx.plotting import (
    _plot_fdr_curve,
    _plot_roc_curve,
    _plot_score_distribution,
)
from alphadia.fdr._fdrx.stats import add_q_values, get_pep, keep_best
from alphadia.fdr.utils import train_test_split_
from alphadia.fragcomp.fragcomp import FragmentCompetition

# Calling getLogger() without a name returns the root logger, so messages from
# this module are handled by whatever handlers are configured on the root logger.
logger = logging.getLogger()


[docs] class TargetDecoyFDR:
[docs] def __init__( self, classifier: sklearn.base.BaseEstimator, feature_columns: list, decoy_column: str = "decoy", competition_columns: list | None = None, ): """Target Decoy FDR estimation using a classifier. This class supports target decoy competition as well as fragment competition. Parameters ---------- classifier : sklearn.base.BaseEstimator The classifier to use for target decoy estimation. feature_columns : list The columns to use as features for the classifier. decoy_column : str, default='decoy' The column to use as decoy information. competition_columns : list, default=[] Perform target decoy competition on these columns. Only the best PSM for each group will be kept. """ self._classifier = classifier self._feature_columns = feature_columns self._decoy_column = decoy_column self._competition_columns = competition_columns or []
[docs] def fit_classifier(self, psm_df: pd.DataFrame): """Fit the classifier on the PSMs. Parameters ---------- psm_df : pd.DataFrame The dataframe containing the PSMs. """ is_na_row = psm_df[self._feature_columns].isna().any(axis=1) logger.info(f"Removing {is_na_row.sum()} rows with missing values") X = psm_df.loc[~is_na_row, self._feature_columns].values y = psm_df.loc[~is_na_row, self._decoy_column].values X_train, X_test, y_train, y_test, *_ = train_test_split_( X, y, test_size=0.2 ) # TODO add random_state for reproducibility! self._classifier.fit(X_train, y_train) # evaluate classifier y_test_proba = self._classifier.predict_proba(X_test)[:, 1] y_train_proba = self._classifier.predict_proba(X_train)[:, 1] _plot_score_distribution(y_train, y_train_proba, y_test, y_test_proba) _plot_roc_curve(y_train, y_train_proba, y_test, y_test_proba)
[docs] def predict_classifier(self, psm_df: pd.DataFrame): """Predict the decoy probability for the PSMs. Parameters ---------- psm_df : pd.DataFrame The dataframe containing the PSMs. Returns ------- np.ndarray The decoy probabilities for the PSMs with same shape and order as the input dataframe. """ is_na_row = psm_df[self._feature_columns].isna().any(axis=1) X = psm_df.loc[~is_na_row, self._feature_columns].values # Prediction should have the same shape of input, even for NaN rows # We are therefore assigning a decoy probability of 1 to all rows with NaN values y_proba_full = np.ones(len(psm_df)) y_proba = self._classifier.predict_proba(X)[:, 1] y_proba_full[~is_na_row] = y_proba return y_proba_full
[docs] def predict_qval( self, psm_df: pd.DataFrame, fragments_df: pd.DataFrame | None = None, dia_cycle: np.ndarray | None = None, competition_heuristic: float = 0.10, ) -> pd.DataFrame: """Calculate q-values for scored identifications. Parameters ---------- psm_df : pd.DataFrame The dataframe containing the PSMs. fragments_df : pd.DataFrame, default=None The dataframe containing the fragments. dia_cycle : np.ndarray, default=None The DIA cycle for the fragments. competition_heuristic : float, default=0.10 The q-value threshold for fragment competition. Only precursors with q-values below this threshold will be considered for fragment competition. Returns ------- pd.DataFrame The input dataframe with q-values and PEPs added. """ psm_df = psm_df.copy() psm_df["decoy_proba"] = self.predict_classifier(psm_df) # normalize to a 1:1 target decoy proportion r_target_decoy = (psm_df[self._decoy_column] == 0).sum() / ( psm_df[self._decoy_column] == 1 ).sum() # normalize q-values based on proportion before competition if dia_cycle is not None and fragments_df is not None: psm_df = add_q_values( psm_df, decoy_proba_column="decoy_proba", decoy_column=self._decoy_column, r_target_decoy=r_target_decoy, ) fragment_competition = FragmentCompetition() psm_df = fragment_competition( psm_df[psm_df["qval"] < competition_heuristic], fragments_df, dia_cycle ) psm_df = keep_best(psm_df, group_columns=self._competition_columns) psm_df = add_q_values( psm_df, "decoy_proba", self._decoy_column, r_target_decoy=r_target_decoy ) # calulate PEP psm_df["pep"] = get_pep( psm_df, score_column="decoy_proba", decoy_column=self._decoy_column ) _plot_fdr_curve(psm_df["qval"]) return psm_df
[docs] def fit_predict_qval( self, psm_df: pd.DataFrame, fragments_df: pd.DataFrame | None = None, cycle: np.ndarray | None = None, ): """Fit the classifier, predict the decoy probabilities and calculate q-values. Parameters ---------- psm_df : pd.DataFrame The dataframe containing the PSMs. fragments_df : pd.DataFrame, default=None The dataframe containing the fragments. cycle : np.ndarray, default=None The DIA cycle for the fragments. Returns ------- pd.DataFrame The input dataframe with q-values and PEPs added. """ self.fit_classifier(psm_df) return self.predict_qval(psm_df, fragments_df, cycle)