Source code for alphadia.libtransform.harmonize

import logging
import os

import numpy as np
from alphabase.protein import fasta
from alphabase.spectral_library.base import SpecLibBase

from alphadia.libtransform.base import ProcessingStep
from alphadia.utils import get_isotope_columns

logger = logging.getLogger()


def _has_decoys(spec_lib: SpecLibBase) -> bool:
    """Check if the spectral library contains decoy precursors."""
    return (
        "decoy" in spec_lib.precursor_df.columns
        and (spec_lib.precursor_df["decoy"] == 1).any()
    )


[docs] class PrecursorInitializer(ProcessingStep):
[docs] def __init__(self, drop_decoys: bool = False) -> None: """Initialize alphabase spectral library with precursor information. Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. This step is required for all spectral libraries and will add the `precursor_idx`, `decoy`, `channel` and `elution_group_idx` columns to the precursor dataframe. Parameters ---------- drop_decoys : bool, optional Drop decoys from the library during initialization. Default is False. Set to True to allow FASTA annotation of libraries that already contain decoys. """ super().__init__() self.drop_decoys = drop_decoys
[docs] def validate(self, input: SpecLibBase) -> bool: """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" valid = isinstance(input, SpecLibBase) if len(input.precursor_df) == 0: logger.error("Input library has no precursor information") valid = False if len(input.fragment_intensity_df) == 0: logger.error("Input library has no fragment intensity information") valid = False if len(input.fragment_mz_df) == 0: logger.error("Input library has no fragment mz information") valid = False return valid
[docs] def forward(self, input: SpecLibBase) -> SpecLibBase: """Initialize the precursor dataframe with the `precursor_idx`, `decoy`, `channel` and `elution_group_idx` columns.""" if "decoy" not in input.precursor_df.columns: input.precursor_df["decoy"] = 0 has_decoys = _has_decoys(input) if self.drop_decoys and has_decoys: logger.info( "Removing decoys from input library, decoys will be recalculated" ) input._precursor_df = input._precursor_df[ input._precursor_df["decoy"] == 0 ].copy() input.remove_unused_fragments() has_decoys = False elif has_decoys: logger.info("Decoy column already present, skipping initialization") if "channel" not in input.precursor_df.columns: input.precursor_df["channel"] = 0 else: logger.info("Channel column already present, skipping initialization") if "elution_group_idx" not in input.precursor_df.columns: if has_decoys: logger.warning( "Library contains decoys but no elution_group_idx column. " "Elution groups link targets and decoys via integer indices where " "the highest scoring match per group is retained. This can affect search performance." ) input.precursor_df["elution_group_idx"] = np.arange(len(input.precursor_df)) else: logger.info( "Elution group indices already present, skipping initialization" ) if "precursor_idx" not in input.precursor_df.columns: input.precursor_df["precursor_idx"] = np.arange(len(input.precursor_df)) else: logger.info("Precursor indices already present, skipping initialization") return input
[docs] class AnnotateFasta(ProcessingStep):
[docs] def __init__( self, fasta_path_list: list[str], drop_unannotated: bool = True, ) -> None: """Annotate the precursor dataframe with protein information from a FASTA file. Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. Parameters ---------- fasta_path_list : List[str] List of paths to FASTA files. Multiple files can be provided and will be merged into a single protein dataframe. drop_unannotated : bool, optional Drop all precursors which could not be annotated by the FASTA file. Default is True. """ super().__init__() self.fasta_path_list = fasta_path_list self.drop_unannotated = drop_unannotated
[docs] def validate(self, input: SpecLibBase) -> bool: """Validate the input object. It is expected that the input is a `SpecLibBase` object and that all FASTA files exist.""" valid = isinstance(input, SpecLibBase) for path in self.fasta_path_list: if not os.path.exists(path): logger.error( f"Annotation by FASTA failed, input path {path} does not exist" ) valid = False return valid
[docs] def forward(self, input: SpecLibBase) -> SpecLibBase: """Annotate the precursor dataframe with protein information from a FASTA file.""" if _has_decoys(input): logger.warning( "Skipping FASTA annotation: library contains decoys which cannot be annotated. " "Set library_loading.drop_decoys=true to drop decoys and enable annotation." ) return input protein_df = fasta.load_fasta_list_as_protein_df(self.fasta_path_list) input._precursor_df = fasta.annotate_precursor_df( input.precursor_df, protein_df ) if self.drop_unannotated and "cardinality" in input._precursor_df.columns: input._precursor_df = input._precursor_df[ input._precursor_df["cardinality"] > 0 ] return input
[docs] class IsotopeGenerator(ProcessingStep):
[docs] def __init__(self, n_isotopes: int = 4, mp_process_num: int = 8) -> None: """Generate isotope information for the spectral library. Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. Parameters ---------- n_isotopes : int, optional Number of isotopes to generate. Default is 4. """ super().__init__() self.n_isotopes = n_isotopes self.mp_process_num = mp_process_num
[docs] def validate(self, input: SpecLibBase) -> bool: """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" return isinstance(input, SpecLibBase)
[docs] def forward(self, input: SpecLibBase) -> SpecLibBase: """Generate isotope information for the spectral library.""" existing_isotopes = get_isotope_columns(input.precursor_df.columns) if len(existing_isotopes) > 0: logger.info( "Isotope information already present, skipping isotope generation" ) return input input.calc_precursor_isotope_intensity( max_isotope=self.n_isotopes, mp_process_num=self.mp_process_num, ) return input
[docs] class RTNormalization(ProcessingStep):
[docs] def __init__(self) -> None: """Normalize the retention time of the spectral library. Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object. """ super().__init__()
[docs] def validate(self, input: SpecLibBase) -> bool: """Validate the input object. It is expected that the input is a `SpecLibBase` object.""" valid = isinstance(input, SpecLibBase) if not any( [ col in input.precursor_df.columns for col in ["rt", "rt_norm", "rt_norm_pred"] ] ): logger.error( "Input library has no RT information. Please enable RT prediction or provide RT information." ) valid = False return valid
[docs] def forward(self, input: SpecLibBase) -> SpecLibBase: """Normalize the retention time of the spectral library.""" if len(input.precursor_df) == 0: logger.warning( "Input library has no precursor information. Skipping RT normalization" ) return input if "rt" not in input.precursor_df.columns and ( "rt_norm" in input.precursor_df.columns or "rt_norm_pred" in input.precursor_df.columns ): logger.warning( "Input library already contains normalized RT information. Skipping RT normalization" ) return input percentiles: np.ndarray = np.percentile(input.precursor_df["rt"], [0.1, 99.9]) # type: ignore[assignment] input._precursor_df["rt"] = np.clip( input._precursor_df["rt"], percentiles[0], percentiles[1] ) return input