# Source code for alphadia.libtransform.fasta_digest

import logging

import numpy as np
from alphabase.peptide.fragment import get_charged_frag_types
from alphabase.protein.fasta import SpecLibFasta, protease_dict
from alphabase.spectral_library.base import SpecLibBase

from alphadia.exceptions import GenericUserError
from alphadia.libtransform.base import ProcessingStep

logger = logging.getLogger()


class FastaDigest(ProcessingStep):
    """Processing step that digests FASTA input into a spectral library.

    The digestion settings are captured at construction time; `forward`
    builds a `SpecLibFasta` from them, digests the given FASTA list and
    returns the resulting library after filtering its precursor table.
    """

    # Regex character class of the residue codes whose precursors are
    # dropped at the end of `forward`.
    _NON_CANONICAL_AA_PATTERN = "[BJXZ]"

    def __init__(
        self,
        enzyme: str = "trypsin",
        fixed_modifications: list[str] | None = None,
        variable_modifications: list[str] | None = None,
        missed_cleavages: int = 1,
        precursor_len: list[int] | None = None,
        precursor_charge: list[int] | None = None,
        precursor_mz: list[int] | None = None,
        max_var_mod_num: int = 1,
    ) -> None:
        """Digest a FASTA file into a spectral library.

        Expects a `List[str]` object as input and will return a `SpecLibBase` object.

        Parameters
        ----------
        enzyme : str
            Key into `protease_dict` selecting the protease.
        fixed_modifications : list[str] | None
            Fixed modifications; `None` selects `["Carbamidomethyl@C"]`.
        variable_modifications : list[str] | None
            Variable modifications; `None` selects
            `["Oxidation@M", "Acetyl@Prot N-term"]`.
        missed_cleavages : int
            Forwarded to `SpecLibFasta` as `max_missed_cleavages`.
        precursor_len : list[int] | None
            `[min, max]` peptide length; `None` selects `[7, 35]`.
        precursor_charge : list[int] | None
            `[min, max]` precursor charge; `None` selects `[2, 4]`.
        precursor_mz : list[int] | None
            `[min, max]` precursor m/z; `None` selects `[400, 1200]`.
        max_var_mod_num : int
            Forwarded to `SpecLibFasta` as `max_var_mod_num`.

        Raises
        ------
        GenericUserError
            If the enzyme is "non-specific" (compared case-insensitively) and
            `missed_cleavages` is smaller than `precursor_len[1] - 1`.
        """
        # Resolve the `None` sentinels first: the non-specific check below
        # needs the effective maximum peptide length.
        if precursor_mz is None:
            precursor_mz = [400, 1200]
        if precursor_charge is None:
            precursor_charge = [2, 4]
        if precursor_len is None:
            precursor_len = [7, 35]
        if variable_modifications is None:
            variable_modifications = ["Oxidation@M", "Acetyl@Prot N-term"]
        if fixed_modifications is None:
            fixed_modifications = ["Carbamidomethyl@C"]

        if enzyme.lower() == "non-specific" and missed_cleavages < (
            required_missed_cleavages := (precursor_len[1] - 1)
        ):
            raise GenericUserError(
                f"Non-specific enzyme requires missed_cleavages >= {required_missed_cleavages} to generate peptides up to length {precursor_len[1]}.",
                f"Current value: missed_cleavages={missed_cleavages}\nRequired value: missed_cleavages={required_missed_cleavages}\n\n"
                f"Please update your configuration to set:\n"
                f" library_prediction.missed_cleavages: {required_missed_cleavages}",
            )

        super().__init__()
        self.enzyme = enzyme
        self.fixed_modifications = fixed_modifications
        self.variable_modifications = variable_modifications
        self.missed_cleavages = missed_cleavages
        self.precursor_len = precursor_len
        self.precursor_charge = precursor_charge
        self.precursor_mz = precursor_mz
        self.max_var_mod_num = max_var_mod_num

    def validate(self, input: list[str]) -> bool:
        """Return True if `input` is a non-empty list, logging an error otherwise."""
        if not isinstance(input, list):
            logger.error("Input fasta list is not a list")
            return False
        if not input:
            logger.error("Input fasta list is empty")
            return False
        return True

    def _lookup_protease(self):
        """Return the `protease_dict` entry for the configured enzyme.

        The exact key is preferred. Because `__init__` compares the enzyme
        name case-insensitively, the lower-cased key is used as a fallback
        when only that one exists. Unknown enzymes still raise `KeyError`
        for the name as configured.
        """
        key = self.enzyme
        if key not in protease_dict and key.lower() in protease_dict:
            key = key.lower()
        return protease_dict[key]

    def forward(self, input: list[str]) -> SpecLibBase:
        """Digest the FASTA list `input` and return the resulting library.

        Steps, in order: build a `SpecLibFasta` for singly and doubly charged
        b/y fragment types, digest the FASTA list, add modifications, derive
        the `proteins` and `genes` columns from `protein_idxes`, add charge
        states, hash the precursor table and compute precursor m/z. The
        precursor table is then restricted to m/z values strictly inside
        `self.precursor_mz` and to sequences without B, J, X or Z.

        Parameters
        ----------
        input : list[str]
            Passed to `SpecLibFasta.get_peptides_from_fasta_list`.

        Returns
        -------
        SpecLibBase
            The `SpecLibFasta` instance holding the filtered precursor table.
        """
        frag_types = get_charged_frag_types(["b", "y"], 2)

        fasta_lib = SpecLibFasta(
            frag_types,
            protease=self._lookup_protease(),
            var_mods=self.variable_modifications,
            fix_mods=self.fixed_modifications,
            max_missed_cleavages=self.missed_cleavages,
            max_var_mod_num=self.max_var_mod_num,
            peptide_length_max=self.precursor_len[1],
            peptide_length_min=self.precursor_len[0],
            precursor_charge_min=self.precursor_charge[0],
            precursor_charge_max=self.precursor_charge[1],
            precursor_mz_min=self.precursor_mz[0],
            precursor_mz_max=self.precursor_mz[1],
            decoy=None,  # type: ignore[arg-type]
        )
        logger.info("Digesting fasta file")
        fasta_lib.get_peptides_from_fasta_list(input)
        logger.info("Adding modifications")
        fasta_lib.add_modifications()

        # Fetch the lookup arrays once instead of once per index per row.
        protein_ids = fasta_lib.protein_df["protein_id"].values
        gene_orgs = fasta_lib.protein_df["gene_org"].values

        # `protein_idxes` holds ";"-separated positions into `protein_df`.
        fasta_lib.precursor_df["proteins"] = fasta_lib.precursor_df[
            "protein_idxes"
        ].apply(
            lambda idxes: ";".join([protein_ids[int(i)] for i in idxes.split(";")])
        )
        fasta_lib.precursor_df["genes"] = fasta_lib.precursor_df[
            "protein_idxes"
        ].apply(lambda idxes: ";".join([gene_orgs[int(i)] for i in idxes.split(";")]))

        fasta_lib.add_charge()
        fasta_lib.hash_precursor_df()
        fasta_lib.calc_precursor_mz()

        # Both m/z bounds are exclusive.
        mz_lower, mz_upper = self.precursor_mz[0], self.precursor_mz[1]
        fasta_lib.precursor_df = fasta_lib.precursor_df[
            (fasta_lib.precursor_df["precursor_mz"] > mz_lower)
            & (fasta_lib.precursor_df["precursor_mz"] < mz_upper)
        ]

        logger.info("Removing non-canonical amino acids")
        # One regex pass over the sequences; a plain boolean array keeps the
        # row selection positional.
        has_forbidden = (
            fasta_lib.precursor_df["sequence"]
            .str.contains(self._NON_CANONICAL_AA_PATTERN, regex=True)
            .to_numpy(dtype=bool)
        )
        fasta_lib.precursor_df = fasta_lib.precursor_df[~has_forbidden]

        logger.info(
            f"Fasta library contains {len(fasta_lib.precursor_df):,} precursors"
        )

        return fasta_lib