Source code for alphadia.outputtransform.protein_fdr

import logging

import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

from alphadia.exceptions import TooFewProteinsError
from alphadia.fdr import fdr
from alphadia.fdr.plotting import plot_fdr
from alphadia.fdr.utils import train_test_split_

logger = logging.getLogger()


[docs] def perform_protein_fdr(psm_df: pd.DataFrame, figure_path: str) -> pd.DataFrame: """Perform protein FDR on PSM dataframe""" protein_features = [] for _, group in psm_df.groupby(["pg", "decoy"]): protein_features.append( { "pg": group["pg"].iloc[0], "genes": group["genes"].iloc[0], "proteins": group["proteins"].iloc[0], "decoy": group["decoy"].iloc[0], "count": len(group), "n_precursor": len(group["precursor_idx"].unique()), "n_peptides": len(group["sequence"].unique()), "n_runs": len(group["run"].unique()), "mean_score": group["proba"].mean(), "best_score": group["proba"].min(), "worst_score": group["proba"].max(), } ) feature_columns = [ "count", "mean_score", "n_peptides", "n_precursor", "n_runs", "best_score", "worst_score", ] protein_features = pd.DataFrame(protein_features) X = protein_features[feature_columns].values y = protein_features["decoy"].values X_train, X_test, y_train, y_test, idxs_train, idxs_test = train_test_split_( X, y, test_size=0.2, random_state=42, # we do this only once so a fixed random state is fine exception=TooFewProteinsError, ) scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_scaled = scaler.transform(X) classifier = MLPClassifier( random_state=0 # we do this only once so a fixed random state is fine ).fit(X_train_scaled, y_train) predicted_proba = classifier.predict_proba(X_scaled)[:, 1] protein_features["proba"] = predicted_proba protein_features = pd.DataFrame(protein_features) protein_features = fdr.get_q_values( protein_features, score_column="proba", decoy_column="decoy", qval_column="pg_qval", extra_sort_columns=["pg"], ) n_targets = (protein_features["decoy"] == 0).sum() n_decoys = (protein_features["decoy"] == 1).sum() logger.info( f"Normalizing q-values using {n_targets:,} targets and {n_decoys:,} decoys" ) protein_features["pg_qval"] = protein_features["pg_qval"] * n_targets / n_decoys if figure_path is not None: plot_fdr( y_train, y_test, predicted_proba[idxs_train], predicted_proba[idxs_test], protein_features["pg_qval"], figure_path, ) return pd.concat( [ psm_df[psm_df["decoy"] == 0].merge( protein_features[protein_features["decoy"] == 0][["pg", "pg_qval"]], on="pg", how="left", ), psm_df[psm_df["decoy"] == 1].merge( protein_features[protein_features["decoy"] == 1][["pg", "pg_qval"]], on="pg", how="left", ), ] )