import logging
import os
import pandas as pd
from alphabase.peptide import fragment
from alphabase.spectral_library import base
from alphabase.spectral_library.base import SpecLibBase
from alphadia import utils
from alphadia.constants.keys import (
ConfigKeys,
)
from alphadia.constants.settings import FIGURES_FOLDER_NAME
from alphadia.exceptions import NoPSMFilesFoundError, NoPSMFoundError
from alphadia.libtransform.mbr import MbrLibraryBuilder
from alphadia.outputtransform.df_builders import (
build_run_internal_df,
build_run_stat_df,
log_stat_df,
transfer_library_stat_df,
)
from alphadia.outputtransform.outputaccumulator import (
AccumulationBroadcaster,
TransferLearningAccumulator,
)
from alphadia.outputtransform.protein_fdr import perform_protein_fdr
from alphadia.outputtransform.quantification import QuantOutputBuilder
from alphadia.outputtransform.utils import (
apply_output_column_names,
apply_protein_inference,
get_channels_from_config,
load_psm_files_from_folders,
log_protein_fdr_summary,
prepare_psm_dataframe,
read_df,
write_df,
)
from alphadia.transferlearning.train import FinetuneManager
from alphadia.workflow.config import Config
logger = logging.getLogger()
# (stray "[docs]" source-link marker left over from the HTML documentation export)
class SearchPlanOutput:
    """Combine the outputs of individual searches into one set of result files.

    In alphaDIA the search plan orchestrates the library building preparation,
    schedules the individual searches and combines the individual outputs into
    a single output. This class is responsible for the last step, which includes:

    - combining the individual precursor tables
    - building the output stat table
    - performing protein grouping
    - performing protein FDR
    - performing label-free quantification
    - building the spectral library
    """

    # Base names of the artifacts handled by this class. File extensions are
    # either appended explicitly (".hdf") or left to `read_df` / `write_df`
    # via their `file_format` argument.
    PSM_INPUT = "psm"  # passed to `load_psm_files_from_folders`
    PRECURSOR_OUTPUT = "precursors"
    STAT_OUTPUT = "stat"
    INTERNAL_OUTPUT = "internal"
    PG_OUTPUT = "protein_groups"  # not referenced by the methods of this class
    LIBRARY_OUTPUT = "speclib.mbr"
    TRANSFER_OUTPUT = "speclib.transfer"
    TRANSFER_MODEL = "peptdeep.transfer"
    TRANSFER_STATS_OUTPUT = "stats.transfer"

    def __init__(self, config: Config, output_folder: str):
        """Store the configuration and prepare the output location.

        If figure saving is enabled in the configuration, the figures folder
        inside `output_folder` is created when it does not exist yet.

        Parameters
        ----------
        config: Config
            Configuration object
        output_folder: str
            Output folder
        """
        self.config = config
        self.output_folder = output_folder

        save_figures = self.config[ConfigKeys.GENERAL][
            ConfigKeys.GENERAL.SAVE_FIGURES
        ]
        # `None` signals to downstream code that no figures should be written.
        self._figure_path = (
            os.path.join(self.output_folder, FIGURES_FOLDER_NAME)
            if save_figures
            else None
        )
        if self._figure_path:
            # exist_ok avoids the race between an existence check and creation.
            os.makedirs(self._figure_path, exist_ok=True)

    def build(
        self, folder_list: list[str], base_spec_lib: base.SpecLibBase | None
    ) -> None:
        """Build output from a list of search outputs.

        The following outputs are written to the output folder (named by their
        base name, see the class constants):

        - the precursor table (`PRECURSOR_OUTPUT`)
        - the stat table (`STAT_OUTPUT`)
        - the internal table (`INTERNAL_OUTPUT`)
        - the quantification results produced by `QuantOutputBuilder`
        - the MBR library (`LIBRARY_OUTPUT`.hdf), if `general.save_mbr_library` is set
        - the transfer library (`TRANSFER_OUTPUT`.hdf), if `transfer_library.enabled` is set
        - the finetuned models and their stats, if `transfer_learning.enabled` is set

        Parameters
        ----------
        folder_list: List[str]
            List of folders containing the search outputs
        base_spec_lib: base.SpecLibBase, optional
            Base spectral library, required when the MBR library is built

        Raises
        ------
        ValueError
            If MBR library building is enabled but `base_spec_lib` is None.
        """
        logger.progress("Processing search outputs")

        # The precursor table is not saved here: `_build_lfq_tables` writes the
        # precursor output once the quantification results have been added.
        psm_df = self._build_precursor_table(folder_list, save=False)
        self._build_stat_df(folder_list, psm_df=psm_df, save=True)
        self._build_internal_df(folder_list, save=True)
        self._build_lfq_tables(folder_list, psm_df=psm_df, save=True)

        if self.config["general"]["save_mbr_library"]:
            if base_spec_lib is None:
                raise ValueError(
                    "Passing base spectral library is required for MBR library building."
                )
            self._build_mbr_library(base_spec_lib, psm_df=psm_df, save=True)

        if self.config["transfer_library"]["enabled"]:
            self._build_transfer_library(folder_list, save=True)

        if self.config["transfer_learning"]["enabled"]:
            self._build_transfer_model(save=True)

    def _get_transfer_charged_frag_types(self):
        """Return the charged fragment types configured for the transfer library."""
        transfer_library_config = self.config["transfer_library"]
        return fragment.get_charged_frag_types(
            transfer_library_config["fragment_types"],
            transfer_library_config["max_charge"],
        )

    def _build_transfer_model(self, save=True):
        """Finetune PeptDeep models using the transfer library.

        The transfer library is loaded from `TRANSFER_OUTPUT`.hdf in the output
        folder, the RT, charge and MS2 models are finetuned on it, and the
        resulting models are always saved under `TRANSFER_MODEL`.

        Parameters
        ----------
        save : bool, optional
            Whether to save the statistics of the transfer learning on disk, by default True

        Raises
        ------
        ValueError
            If the transfer library file does not exist in the output folder.
        """
        logger.progress("Train PeptDeep Models")

        transfer_lib_path = os.path.join(
            self.output_folder, f"{self.TRANSFER_OUTPUT}.hdf"
        )
        if not os.path.exists(transfer_lib_path):
            raise ValueError(
                f"Transfer library not found at {transfer_lib_path}, did you enable library generation?"
            )

        transfer_lib = SpecLibBase()
        transfer_lib.load_hdf(
            transfer_lib_path,
            load_mod_seq=True,
        )

        transfer_learning_config = self.config["transfer_learning"]
        device = utils.get_torch_device(self.config["general"]["use_gpu"])

        tune_mgr = FinetuneManager(
            device=device,
            lr_patience=transfer_learning_config["lr_patience"],
            test_interval=transfer_learning_config["test_interval"],
            train_fraction=transfer_learning_config["train_fraction"],
            validation_fraction=transfer_learning_config["validation_fraction"],
            test_fraction=transfer_learning_config["test_fraction"],
            epochs=transfer_learning_config["epochs"],
            warmup_epochs=transfer_learning_config["warmup_epochs"],
            batch_size=transfer_learning_config["batch_size"],
            max_lr=transfer_learning_config["max_lr"],
            nce=transfer_learning_config["nce"],
            instrument=transfer_learning_config["instrument"],
            charged_frag_types=self._get_transfer_charged_frag_types(),
        )

        rt_stats = tune_mgr.finetune_rt(transfer_lib.precursor_df)
        charge_stats = tune_mgr.finetune_charge(transfer_lib.precursor_df)
        # Unlike the RT and charge steps, the MS2 step receives copies of the
        # library tables.
        ms2_stats = tune_mgr.finetune_ms2(
            transfer_lib.precursor_df.copy(), transfer_lib.fragment_intensity_df.copy()
        )

        tune_mgr.save_models(os.path.join(self.output_folder, self.TRANSFER_MODEL))

        combined_stats = pd.concat([rt_stats, charge_stats, ms2_stats])

        if save:
            logger.info("Writing transfer learning stats output to disk")
            write_df(
                combined_stats,
                os.path.join(self.output_folder, self.TRANSFER_STATS_OUTPUT),
                file_format="tsv",
            )

    def _build_transfer_library(
        self,
        folder_list: list[str],
        keep_top: int = 3,
        number_of_processes: int = 4,
        save: bool = True,
    ) -> base.SpecLibBase:
        """Build the transfer library from the individual search outputs.

        Parameters
        ----------
        folder_list : List[str]
            The list of output folders.
        keep_top : int
            The number of top runs to keep per each precursor, based on the proba.
            (smaller the proba, better the run)
            NOTE: this argument is currently not used; the value passed to the
            accumulator is read from `config["transfer_library"]["top_k_samples"]`.
        number_of_processes : int, optional
            The number of processes to use, by default 4
        save : bool, optional
            Whether to save the transfer library to disk, by default True

        Returns
        -------
        base.SpecLibBase
            The transfer learning library
        """
        logger.progress("======== Building transfer library ========")

        transfer_library_config = self.config["transfer_library"]

        transfer_accumulator = TransferLearningAccumulator(
            keep_top=transfer_library_config["top_k_samples"],
            norm_delta_max=transfer_library_config["norm_delta_max"],
            precursor_correlation_cutoff=transfer_library_config[
                "precursor_correlation_cutoff"
            ],
            fragment_correlation_ratio=transfer_library_config[
                "fragment_correlation_ratio"
            ],
        )

        accumulation_broadcaster = AccumulationBroadcaster(
            folder_list=folder_list,
            number_of_processes=number_of_processes,
            processing_kwargs={
                "charged_frag_types": self._get_transfer_charged_frag_types()
            },
        )
        accumulation_broadcaster.subscribe(transfer_accumulator)
        accumulation_broadcaster.run()

        logger.info(
            f"Built transfer library using {len(folder_list)} folders and {number_of_processes} processes"
        )
        log_stat_df(transfer_library_stat_df(transfer_accumulator.consensus_speclibase))

        if save:
            # Use the module logger like every other method of this class.
            logger.info("Writing transfer library to disk")
            transfer_accumulator.consensus_speclibase.save_hdf(
                os.path.join(self.output_folder, f"{self.TRANSFER_OUTPUT}.hdf")
            )

        return transfer_accumulator.consensus_speclibase

    def _load_precursor_table(self):
        """Load precursor table from output folder.

        Helper function used by other builders.

        Returns
        -------
        psm_df: pd.DataFrame
            Precursor table
        """
        return read_df(
            os.path.join(self.output_folder, self.PRECURSOR_OUTPUT),
            file_format=self.config["search_output"]["file_format"],
        )

    def _build_precursor_table(
        self,
        folder_list: list[str],
        save: bool = True,
    ) -> pd.DataFrame:
        """Build precursor table from a list of search outputs.

        The per-run PSM tables are concatenated, protein inference and protein
        FDR are applied, and the result is filtered by `pg_qval` using
        `config["fdr"]["fdr"]`. Decoys are removed unless
        `config["fdr"]["keep_decoys"]` is set.

        Parameters
        ----------
        folder_list: List[str]
            List of folders containing the search outputs
        save: bool
            Save the precursor table to disk

        Returns
        -------
        psm_df: pd.DataFrame
            Precursor table

        Raises
        ------
        NoPSMFilesFoundError
            If no PSM tables could be loaded from `folder_list`.
        NoPSMFoundError
            If the combined PSM table contains no rows.
        """
        logger.progress("=== Performing protein grouping and FDR ===")

        psm_df_list = load_psm_files_from_folders(folder_list, self.PSM_INPUT)
        if not psm_df_list:
            raise NoPSMFilesFoundError()

        psm_df = pd.concat(psm_df_list)
        if len(psm_df) == 0:
            raise NoPSMFoundError()

        logger.info("Performing protein inference")
        fdr_config = self.config["fdr"]
        psm_df = prepare_psm_dataframe(psm_df)
        psm_df = apply_protein_inference(
            psm_df,
            fdr_config["inference_strategy"],
            fdr_config["group_level"],
        )

        logger.info("Performing protein FDR")
        psm_df = perform_protein_fdr(psm_df, self._figure_path)
        psm_df = psm_df[psm_df["pg_qval"] <= fdr_config["fdr"]]

        # The summary is logged before decoys are (optionally) removed.
        log_protein_fdr_summary(psm_df)

        if not fdr_config["keep_decoys"]:
            psm_df = psm_df[psm_df["decoy"] == 0]

        if save:
            logger.info("Writing precursor output to disk")
            write_df(
                psm_df,
                os.path.join(self.output_folder, self.PRECURSOR_OUTPUT),
                file_format=self.config["search_output"]["file_format"],
            )

        return psm_df

    def _build_stat_df(
        self,
        folder_list: list[str],
        psm_df: pd.DataFrame,
        save: bool = True,
    ) -> pd.DataFrame:
        """Build stat table from a list of search outputs.

        Parameters
        ----------
        folder_list: List[str]
            List of folders containing the search outputs
        psm_df: pd.DataFrame
            Combined precursor table
        save: bool
            Save the stat table to disk

        Returns
        -------
        stat_df: pd.DataFrame
            Stat table, the concatenation of the per-run stat tables
        """
        logger.progress("Building search statistics")

        all_channels = get_channels_from_config(self.config)

        # Statistics are computed on target (non-decoy) precursors only.
        target_df = psm_df[psm_df["decoy"] == 0]

        stat_df_list = []
        for folder in folder_list:
            # The run name is the last path component of the folder. Normalizing
            # first keeps this working for paths with a trailing separator, for
            # which a plain basename() would return an empty string.
            raw_name = os.path.basename(os.path.normpath(folder))
            stat_df_list.append(
                build_run_stat_df(
                    folder,
                    raw_name,
                    target_df[target_df["run"] == raw_name],
                    all_channels,
                )
            )

        stat_df = pd.concat(stat_df_list)

        if save:
            logger.info("Writing stat output to disk")
            write_df(
                stat_df,
                os.path.join(self.output_folder, self.STAT_OUTPUT),
                file_format="tsv",
            )

        return stat_df

    def _build_internal_df(
        self,
        folder_list: list[str],
        save: bool = True,
    ) -> pd.DataFrame:
        """Build internal data table from a list of search outputs.

        Parameters
        ----------
        folder_list: List[str]
            List of folders containing the search outputs
        save: bool
            Save the internal table to disk

        Returns
        -------
        internal_df: pd.DataFrame
            Internal table, the concatenation of the per-run internal tables
        """
        logger.progress("Building internal statistics")

        internal_df = pd.concat(
            [build_run_internal_df(folder) for folder in folder_list]
        )

        if save:
            logger.info("Writing internal output to disk")
            write_df(
                internal_df,
                os.path.join(self.output_folder, self.INTERNAL_OUTPUT),
                file_format="tsv",
            )

        return internal_df

    def _build_lfq_tables(
        self,
        folder_list: list[str],
        psm_df: pd.DataFrame,
        save: bool = True,
    ):
        """Accumulate fragment information and perform label-free protein quantification.

        Parameters
        ----------
        folder_list: List[str]
            List of folders containing the search outputs
        psm_df: pd.DataFrame
            Combined precursor table
        save: bool
            Save the precursor table (with quantification, using the output
            column names) and the quantification results to disk

        Returns
        -------
        lfq_results
            The quantification results as returned by `QuantOutputBuilder.build`
        """
        quant_output_builder = QuantOutputBuilder(psm_df, self.config)
        lfq_results, psm_df_with_quant = quant_output_builder.build(folder_list)

        if save:
            output_format = self.config["search_output"]["file_format"]

            logger.info("Writing psm output to disk")
            psm_df_output = apply_output_column_names(psm_df_with_quant)
            write_df(
                psm_df_output,
                os.path.join(self.output_folder, self.PRECURSOR_OUTPUT),
                file_format=output_format,
            )

            # Only write quantification results when there are any.
            if lfq_results:
                quant_output_builder.save_results(
                    lfq_results,
                    self.output_folder,
                    file_format=output_format,
                )

        return lfq_results

    def _build_mbr_library(
        self,
        base_spec_lib: base.SpecLibBase,
        psm_df: pd.DataFrame,
        save: bool = True,
    ) -> SpecLibBase | None:
        """Build MBR spectral library.

        Parameters
        ----------
        base_spec_lib: base.SpecLibBase
            Base spectral library
        psm_df: pd.DataFrame
            Combined precursor table
        save: bool
            Save the MBR spectral library to disk

        Returns
        -------
        SpecLibBase | None
            The MBR spectral library, or None if `psm_df` contains no rows
        """
        logger.progress("Building MBR spectral library")

        if len(psm_df) == 0:
            logger.warning("No precursors found, skipping MBR library building")
            return None

        # NOTE(review): the FDR passed to the builder is fixed at 0.01 and is
        # independent of config["fdr"]["fdr"] - confirm this is intended.
        libbuilder = MbrLibraryBuilder(
            fdr=0.01,
            keep_decoys=self.config["fdr"]["keep_decoys_in_mbr_library"],
        )
        mbr_spec_lib = libbuilder(psm_df, base_spec_lib)

        precursor_number = len(mbr_spec_lib.precursor_df)
        protein_number = mbr_spec_lib.precursor_df["proteins"].nunique()
        logger.info(
            f"MBR spectral library contains {precursor_number:,} precursors, {protein_number:,} proteins"
        )

        if save:
            logger.info("Writing MBR spectral library to disk")
            mbr_spec_lib.save_hdf(
                os.path.join(self.output_folder, f"{self.LIBRARY_OUTPUT}.hdf")
            )

        return mbr_spec_lib