Source code for alphadia.libtransform.flatten

import logging

from alphabase.peptide import fragment
from alphabase.spectral_library.base import SpecLibBase
from alphabase.spectral_library.flat import SpecLibFlat

from alphadia import utils
from alphadia.constants.keys import CalibCols
from alphadia.libtransform.base import ProcessingStep
from alphadia.validation.schemas import fragments_flat_schema, precursors_flat_schema

# Module-wide logger. `logging.getLogger()` called without a name returns the
# root logger, so every record emitted by this module goes through the root
# logger rather than a module-specific one.
logger = logging.getLogger()


class FlattenLibrary(ProcessingStep):
    """Processing step turning a `SpecLibBase` into a `SpecLibFlat`."""

    def __init__(
        self, top_k_fragments: int = 12, min_fragment_intensity: float = 0.01
    ) -> None:
        """Convert a `SpecLibBase` object into a `SpecLibFlat` object.

        Parameters
        ----------
        top_k_fragments : int, optional
            Number of top fragments to keep. Default is 12.

        min_fragment_intensity : float, optional
            Minimum intensity threshold for fragments. Default is 0.01.

        """
        self.top_k_fragments = top_k_fragments
        self.min_fragment_intensity = min_fragment_intensity
        super().__init__()

    def validate(self, input: SpecLibBase) -> bool:
        """Return True only if `input` is a `SpecLibBase` instance."""
        return isinstance(input, SpecLibBase)

    def forward(self, input: SpecLibBase) -> SpecLibFlat:
        """Build a flat library from `input`, carrying fragment cardinality along.

        The fragment cardinality is computed from the precursor table and the
        fragment m/z table of `input`. It is stored on `input` as
        `_fragment_cardinality_df` (so `input` is modified) and handed to the
        flat library as the custom `"cardinality"` dataframe.

        Parameters
        ----------
        input : SpecLibBase
            Library to flatten.

        Returns
        -------
        SpecLibFlat
            New flat library configured with this step's `top_k_fragments`
            and `min_fragment_intensity`.

        """
        cardinality_df = fragment.calc_fragment_cardinality(
            input.precursor_df, input._fragment_mz_df
        )
        # The attribute is attached dynamically; the base class does not declare it.
        input._fragment_cardinality_df = cardinality_df  # type: ignore[attr-defined]

        flat_library = SpecLibFlat(
            keep_top_k_fragments=self.top_k_fragments,
            min_fragment_intensity=self.min_fragment_intensity,
        )
        flat_library.parse_base_library(
            input,
            custom_df={"cardinality": cardinality_df},
        )
        return flat_library
class InitFlatColumns(ProcessingStep):
    """Processing step that renames library columns to the calibration column names."""

    def __init__(self) -> None:
        """Initialize the columns of a `SpecLibFlat` object for alphadia search.

        Each calibratable column (`CalibCols.MZ_LIBRARY`, `CalibCols.RT_LIBRARY`
        and `CalibCols.MOBILITY_LIBRARY`) is initialized from the first matching
        candidate column found in the input dataframe.

        """
        super().__init__()

    def validate(self, input: SpecLibFlat) -> bool:
        """Return True only if `input` is a `SpecLibFlat` instance."""
        return isinstance(input, SpecLibFlat)

    def forward(self, input: SpecLibFlat) -> SpecLibFlat:
        """Rename candidate columns in place and validate the resulting tables.

        For the precursor and the fragment dataframe, every target column is
        filled by renaming the first candidate (in list order) that exists in
        the dataframe. If the precursor table still lacks the mobility column
        afterwards, it is added with the value 0 and a warning is logged.
        Finally both tables are passed to their flat-library schemas.

        Parameters
        ----------
        input : SpecLibFlat
            Library whose dataframes are modified in place.

        Returns
        -------
        SpecLibFlat
            The same `input` object.

        """
        # Target column -> accepted source names, in order of preference.
        precursor_aliases = {
            CalibCols.MZ_LIBRARY: ["mz_library", "mz", "precursor_mz"],
            CalibCols.RT_LIBRARY: [
                "rt_library",
                "rt",
                "rt_norm",
                "rt_pred",
                "rt_norm_pred",
                "irt",
            ],
            CalibCols.MOBILITY_LIBRARY: [
                "mobility_library",
                "mobility",
                "mobility_pred",
            ],
        }
        fragment_aliases = {
            CalibCols.MZ_LIBRARY: ["mz_library", "mz", "predicted_mz"],
        }

        for frame, aliases in (
            (input.precursor_df, precursor_aliases),
            (input.fragment_df, fragment_aliases),
        ):
            for target_name, candidates in aliases.items():
                # Only the first candidate present in the frame is used.
                first_match = next(
                    (name for name in candidates if name in frame.columns), None
                )
                if first_match is not None:
                    frame.rename(columns={first_match: target_name}, inplace=True)

        if CalibCols.MOBILITY_LIBRARY not in input.precursor_df.columns:
            input.precursor_df[CalibCols.MOBILITY_LIBRARY] = 0
            logger.warning("Library contains no ion mobility annotations")

        # NOTE(review): presumably these raise when a table does not match its
        # schema - confirm against alphadia.validation.schemas.
        precursors_flat_schema.validate(input.precursor_df)
        fragments_flat_schema.validate(input.fragment_df)

        return input
class LogFlatLibraryStats(ProcessingStep):
    """Processing step that logs summary statistics of a flat library."""

    def __init__(self) -> None:
        """Log basic statistics of a `SpecLibFlat` object."""
        super().__init__()

    def validate(self, input: SpecLibFlat) -> bool:
        """Validate the input object. It is expected that the input is a `SpecLibFlat` object."""
        return isinstance(input, SpecLibFlat)

    def forward(self, input: SpecLibFlat) -> SpecLibFlat:
        """Log basic statistics of the precursor table and return the library.

        Reports the number of precursors and, when the respective columns
        exist, the target/decoy counts (`decoy`), the number and average size
        of elution groups (`elution_group_idx`), the number of distinct
        `proteins` entries and the distinct `channel` values. A warning is
        logged for each of these columns that is missing. The number of
        isotope columns is logged if any are found.

        Parameters
        ----------
        input : SpecLibFlat
            Library to describe. It is only read, not modified.

        Returns
        -------
        SpecLibFlat
            The same `input` object.

        """
        precursor_df = input.precursor_df
        n_precursors = len(precursor_df)

        logger.info("============ Library Stats ============")
        logger.info(f"Number of precursors: {n_precursors:,}")

        if "decoy" in precursor_df.columns:
            n_targets = len(precursor_df.query("decoy == False"))
            n_decoys = len(precursor_df.query("decoy == True"))
            logger.info(f"\tthereof targets:{n_targets:,}")
            logger.info(f"\tthereof decoys: {n_decoys:,}")
        else:
            logger.warning("no decoy column was found")

        if "elution_group_idx" in precursor_df.columns:
            n_elution_groups = len(precursor_df["elution_group_idx"].unique())
            # An empty precursor table has zero groups; report an average of 0
            # instead of raising ZeroDivisionError from a logging-only step.
            average_precursors_per_group = (
                n_precursors / n_elution_groups if n_elution_groups else 0.0
            )
            logger.info(f"Number of elution groups: {n_elution_groups:,}")
            logger.info(f"\taverage size: {average_precursors_per_group:.2f}")
        else:
            logger.warning("no elution_group_idx column was found")

        if "proteins" in precursor_df.columns:
            n_proteins = len(precursor_df["proteins"].unique())
            logger.info(f"Number of proteins: {n_proteins:,}")
        else:
            logger.warning("no proteins column was found")

        if "channel" in precursor_df.columns:
            channels = precursor_df["channel"].unique()
            n_channels = len(channels)
            logger.info(f"Number of channels: {n_channels:,} ({channels})")
        else:
            logger.warning("no channel column was found, will assume only one channel")

        isotopes = utils.get_isotope_columns(precursor_df.columns)
        if len(isotopes) > 0:
            logger.info(f"Isotopes Distribution for {len(isotopes)} isotopes")

        logger.info("=======================================")

        return input