Source code for alphadia.libtransform.fasta_digest
import logging
import numpy as np
from alphabase.peptide.fragment import get_charged_frag_types
from alphabase.protein.fasta import SpecLibFasta, protease_dict
from alphabase.spectral_library.base import SpecLibBase
from alphadia.exceptions import GenericUserError
from alphadia.libtransform.base import ProcessingStep
# Module-level logger. Calling getLogger() without a name returns the root
# logger, so messages emitted in this module are logged on the root logger.
logger = logging.getLogger()
[docs]
class FastaDigest(ProcessingStep):
    """Processing step that digests FASTA files into a spectral library."""

    def __init__(
        self,
        enzyme: str = "trypsin",
        fixed_modifications: list[str] | None = None,
        variable_modifications: list[str] | None = None,
        missed_cleavages: int = 1,
        precursor_len: list[int] | None = None,
        precursor_charge: list[int] | None = None,
        precursor_mz: list[int] | None = None,
        max_var_mod_num: int = 1,
    ) -> None:
        """Digest a FASTA file into a spectral library.

        Expects a `List[str]` object as input and will return a `SpecLibBase` object.

        Parameters
        ----------
        enzyme : str
            Name of the protease; resolved through alphabase's ``protease_dict``
            when the step is run.
        fixed_modifications : list[str] | None
            Fixed modifications. Defaults to ``["Carbamidomethyl@C"]``.
        variable_modifications : list[str] | None
            Variable modifications. Defaults to
            ``["Oxidation@M", "Acetyl@Prot N-term"]``.
        missed_cleavages : int
            Maximum number of missed cleavages handed to the digest.
        precursor_len : list[int] | None
            ``[min, max]`` peptide length. Defaults to ``[7, 35]``.
        precursor_charge : list[int] | None
            ``[min, max]`` precursor charge. Defaults to ``[2, 4]``.
        precursor_mz : list[int] | None
            ``[min, max]`` precursor m/z. Defaults to ``[400, 1200]``.
        max_var_mod_num : int
            Forwarded to ``SpecLibFasta`` as ``max_var_mod_num``.

        Raises
        ------
        GenericUserError
            If ``enzyme`` is ``"non-specific"`` (case-insensitive) and
            ``missed_cleavages`` is smaller than ``precursor_len[1] - 1``.
        """
        # Defaults are created per call so instances never share list objects.
        if precursor_mz is None:
            precursor_mz = [400, 1200]
        if precursor_charge is None:
            precursor_charge = [2, 4]
        if precursor_len is None:
            precursor_len = [7, 35]
        if variable_modifications is None:
            variable_modifications = ["Oxidation@M", "Acetyl@Prot N-term"]
        if fixed_modifications is None:
            fixed_modifications = ["Carbamidomethyl@C"]

        # The upper length bound is only inspected for the non-specific enzyme,
        # exactly as before, so other enzymes are not affected by this check.
        if enzyme.lower() == "non-specific":
            required_missed_cleavages = precursor_len[1] - 1
            if missed_cleavages < required_missed_cleavages:
                raise GenericUserError(
                    f"Non-specific enzyme requires missed_cleavages >= {required_missed_cleavages} to generate peptides up to length {precursor_len[1]}.",
                    f"Current value: missed_cleavages={missed_cleavages}\nRequired value: missed_cleavages={required_missed_cleavages}\n\n"
                    f"Please update your configuration to set:\n"
                    f" library_prediction.missed_cleavages: {required_missed_cleavages}",
                )

        super().__init__()
        self.enzyme = enzyme
        self.fixed_modifications = fixed_modifications
        self.variable_modifications = variable_modifications
        self.missed_cleavages = missed_cleavages
        self.precursor_len = precursor_len
        self.precursor_charge = precursor_charge
        self.precursor_mz = precursor_mz
        self.max_var_mod_num = max_var_mod_num

    def validate(self, input: list[str]) -> bool:
        """Return True if ``input`` is a non-empty list.

        Otherwise an error is logged and False is returned.
        """
        if not isinstance(input, list):
            logger.error("Input fasta list is not a list")
            return False
        if not input:
            logger.error("Input fasta list is empty")
            return False
        return True

    def forward(self, input: list[str]) -> SpecLibBase:
        """Digest the FASTA entries in ``input`` and return the resulting library.

        The digest is configured from the attributes set in ``__init__``. After
        digestion and modification, each precursor gets ``proteins`` and ``genes``
        columns derived from its ``protein_idxes``; precursors are then charged,
        hashed, restricted to the open interval given by ``self.precursor_mz`` and
        stripped of sequences containing B, J, X or Z.

        Raises
        ------
        KeyError
            If ``self.enzyme`` is not a key of ``protease_dict``.
        """
        frag_types = get_charged_frag_types(["b", "y"], 2)

        fasta_lib = SpecLibFasta(
            frag_types,
            protease=self._resolve_protease(),
            var_mods=self.variable_modifications,
            fix_mods=self.fixed_modifications,
            max_missed_cleavages=self.missed_cleavages,
            max_var_mod_num=self.max_var_mod_num,
            peptide_length_max=self.precursor_len[1],
            peptide_length_min=self.precursor_len[0],
            precursor_charge_min=self.precursor_charge[0],
            precursor_charge_max=self.precursor_charge[1],
            precursor_mz_min=self.precursor_mz[0],
            precursor_mz_max=self.precursor_mz[1],
            decoy=None,  # type: ignore[arg-type]
        )

        logger.info("Digesting fasta file")
        fasta_lib.get_peptides_from_fasta_list(input)

        logger.info("Adding modifications")
        fasta_lib.add_modifications()

        self._add_protein_annotation(fasta_lib, "proteins", "protein_id")
        self._add_protein_annotation(fasta_lib, "genes", "gene_org")

        fasta_lib.add_charge()
        fasta_lib.hash_precursor_df()
        fasta_lib.calc_precursor_mz()

        # Both bounds are exclusive, as in the original filter.
        mz_values = fasta_lib.precursor_df["precursor_mz"]
        fasta_lib.precursor_df = fasta_lib.precursor_df[
            (mz_values > self.precursor_mz[0]) & (mz_values < self.precursor_mz[1])
        ]

        logger.info("Removing non-canonical amino acids")
        # One regex pass over the sequences instead of one pass per residue.
        # The result is coerced to a boolean ndarray so it is always used as a
        # row mask when indexing the frame.
        has_forbidden = (
            fasta_lib.precursor_df["sequence"]
            .str.contains("[BJXZ]", regex=True)
            .to_numpy(dtype=bool)
        )
        fasta_lib.precursor_df = fasta_lib.precursor_df[~has_forbidden]

        logger.info(
            f"Fasta library contains {len(fasta_lib.precursor_df):,} precursors"
        )

        return fasta_lib

    def _resolve_protease(self) -> str:
        """Look up ``self.enzyme`` in ``protease_dict``.

        ``__init__`` compares the enzyme name case-insensitively, so the lookup
        falls back to the lower-cased name when the exact key is missing. A
        ``KeyError`` is still raised if neither spelling is a known key.
        """
        try:
            return protease_dict[self.enzyme]
        except KeyError:
            return protease_dict[self.enzyme.lower()]

    @staticmethod
    def _add_protein_annotation(
        fasta_lib: SpecLibFasta, target_column: str, source_column: str
    ) -> None:
        """Fill ``precursor_df[target_column]`` from ``protein_df[source_column]``.

        Each ``protein_idxes`` entry is a ``;``-separated string of positional
        indices into ``protein_df``; the looked-up values are joined with ``;``.
        """
        # Extract the lookup array once rather than once per index per precursor.
        lookup = fasta_lib.protein_df[source_column].values
        fasta_lib.precursor_df[target_column] = fasta_lib.precursor_df[
            "protein_idxes"
        ].apply(
            lambda idxes: ";".join([lookup[int(i)] for i in idxes.split(";")])
        )