Source code for alphadia.libtransform.harmonize
import logging
import os
import numpy as np
from alphabase.protein import fasta
from alphabase.spectral_library.base import SpecLibBase
from alphadia.libtransform.base import ProcessingStep
from alphadia.utils import get_isotope_columns
logger = logging.getLogger()
def _has_decoys(spec_lib: SpecLibBase) -> bool:
"""Check if the spectral library contains decoy precursors."""
return (
"decoy" in spec_lib.precursor_df.columns
and (spec_lib.precursor_df["decoy"] == 1).any()
)
[docs]
class PrecursorInitializer(ProcessingStep):
[docs]
def __init__(self, drop_decoys: bool = False) -> None:
"""Initialize alphabase spectral library with precursor information.
Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
This step is required for all spectral libraries and will add the `precursor_idx`,
`decoy`, `channel` and `elution_group_idx` columns to the precursor dataframe.
Parameters
----------
drop_decoys : bool, optional
Drop decoys from the library during initialization. Default is False.
Set to True to allow FASTA annotation of libraries that already contain decoys.
"""
super().__init__()
self.drop_decoys = drop_decoys
[docs]
def validate(self, input: SpecLibBase) -> bool:
"""Validate the input object. It is expected that the input is a `SpecLibBase` object."""
valid = isinstance(input, SpecLibBase)
if len(input.precursor_df) == 0:
logger.error("Input library has no precursor information")
valid = False
if len(input.fragment_intensity_df) == 0:
logger.error("Input library has no fragment intensity information")
valid = False
if len(input.fragment_mz_df) == 0:
logger.error("Input library has no fragment mz information")
valid = False
return valid
[docs]
def forward(self, input: SpecLibBase) -> SpecLibBase:
"""Initialize the precursor dataframe with the `precursor_idx`, `decoy`, `channel` and `elution_group_idx` columns."""
if "decoy" not in input.precursor_df.columns:
input.precursor_df["decoy"] = 0
has_decoys = _has_decoys(input)
if self.drop_decoys and has_decoys:
logger.info(
"Removing decoys from input library, decoys will be recalculated"
)
input._precursor_df = input._precursor_df[
input._precursor_df["decoy"] == 0
].copy()
input.remove_unused_fragments()
has_decoys = False
elif has_decoys:
logger.info("Decoy column already present, skipping initialization")
if "channel" not in input.precursor_df.columns:
input.precursor_df["channel"] = 0
else:
logger.info("Channel column already present, skipping initialization")
if "elution_group_idx" not in input.precursor_df.columns:
if has_decoys:
logger.warning(
"Library contains decoys but no elution_group_idx column. "
"Elution groups link targets and decoys via integer indices where "
"the highest scoring match per group is retained. This can affect search performance."
)
input.precursor_df["elution_group_idx"] = np.arange(len(input.precursor_df))
else:
logger.info(
"Elution group indices already present, skipping initialization"
)
if "precursor_idx" not in input.precursor_df.columns:
input.precursor_df["precursor_idx"] = np.arange(len(input.precursor_df))
else:
logger.info("Precursor indices already present, skipping initialization")
return input
[docs]
class AnnotateFasta(ProcessingStep):
[docs]
def __init__(
self,
fasta_path_list: list[str],
drop_unannotated: bool = True,
) -> None:
"""Annotate the precursor dataframe with protein information from a FASTA file.
Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
Parameters
----------
fasta_path_list : List[str]
List of paths to FASTA files. Multiple files can be provided and will be merged into a single protein dataframe.
drop_unannotated : bool, optional
Drop all precursors which could not be annotated by the FASTA file. Default is True.
"""
super().__init__()
self.fasta_path_list = fasta_path_list
self.drop_unannotated = drop_unannotated
[docs]
def validate(self, input: SpecLibBase) -> bool:
"""Validate the input object. It is expected that the input is a `SpecLibBase` object and that all FASTA files exist."""
valid = isinstance(input, SpecLibBase)
for path in self.fasta_path_list:
if not os.path.exists(path):
logger.error(
f"Annotation by FASTA failed, input path {path} does not exist"
)
valid = False
return valid
[docs]
def forward(self, input: SpecLibBase) -> SpecLibBase:
"""Annotate the precursor dataframe with protein information from a FASTA file."""
if _has_decoys(input):
logger.warning(
"Skipping FASTA annotation: library contains decoys which cannot be annotated. "
"Set library_loading.drop_decoys=true to drop decoys and enable annotation."
)
return input
protein_df = fasta.load_fasta_list_as_protein_df(self.fasta_path_list)
input._precursor_df = fasta.annotate_precursor_df(
input.precursor_df, protein_df
)
if self.drop_unannotated and "cardinality" in input._precursor_df.columns:
input._precursor_df = input._precursor_df[
input._precursor_df["cardinality"] > 0
]
return input
[docs]
class IsotopeGenerator(ProcessingStep):
[docs]
def __init__(self, n_isotopes: int = 4, mp_process_num: int = 8) -> None:
"""Generate isotope information for the spectral library.
Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
Parameters
----------
n_isotopes : int, optional
Number of isotopes to generate. Default is 4.
"""
super().__init__()
self.n_isotopes = n_isotopes
self.mp_process_num = mp_process_num
[docs]
def validate(self, input: SpecLibBase) -> bool:
"""Validate the input object. It is expected that the input is a `SpecLibBase` object."""
return isinstance(input, SpecLibBase)
[docs]
def forward(self, input: SpecLibBase) -> SpecLibBase:
"""Generate isotope information for the spectral library."""
existing_isotopes = get_isotope_columns(input.precursor_df.columns)
if len(existing_isotopes) > 0:
logger.info(
"Isotope information already present, skipping isotope generation"
)
return input
input.calc_precursor_isotope_intensity(
max_isotope=self.n_isotopes,
mp_process_num=self.mp_process_num,
)
return input
[docs]
class RTNormalization(ProcessingStep):
[docs]
def __init__(self) -> None:
"""Normalize the retention time of the spectral library.
Expects a `SpecLibBase` object as input and will return a `SpecLibBase` object.
"""
super().__init__()
[docs]
def validate(self, input: SpecLibBase) -> bool:
"""Validate the input object. It is expected that the input is a `SpecLibBase` object."""
valid = isinstance(input, SpecLibBase)
if not any(
[
col in input.precursor_df.columns
for col in ["rt", "rt_norm", "rt_norm_pred"]
]
):
logger.error(
"Input library has no RT information. Please enable RT prediction or provide RT information."
)
valid = False
return valid
[docs]
def forward(self, input: SpecLibBase) -> SpecLibBase:
"""Normalize the retention time of the spectral library."""
if len(input.precursor_df) == 0:
logger.warning(
"Input library has no precursor information. Skipping RT normalization"
)
return input
if "rt" not in input.precursor_df.columns and (
"rt_norm" in input.precursor_df.columns
or "rt_norm_pred" in input.precursor_df.columns
):
logger.warning(
"Input library already contains normalized RT information. Skipping RT normalization"
)
return input
percentiles: np.ndarray = np.percentile(input.precursor_df["rt"], [0.1, 99.9]) # type: ignore[assignment]
input._precursor_df["rt"] = np.clip(
input._precursor_df["rt"], percentiles[0], percentiles[1]
)
return input