Source code for alphadia.outputtransform.df_builders

import logging
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from alphabase.spectral_library.base import SpecLibBase

from alphadia.constants.keys import (
    OutputRawCols,
    StatCalibrationCols,
    StatOutputCols,
    StatSearchCols,
)
from alphadia.workflow.managers.calibration_manager import (
    CalibrationEstimators,
    CalibrationGroups,
    CalibrationManager,
)
from alphadia.workflow.managers.optimization_manager import OptimizationManager
from alphadia.workflow.managers.raw_file_manager import RawFileManager
from alphadia.workflow.managers.timing_manager import TimingManager
from alphadia.workflow.peptidecentric.peptidecentric import PeptideCentricWorkflow

# Module-level logger shared by all builders in this file; calling
# logging.getLogger() without a name returns the root logger.
logger = logging.getLogger()


def build_run_stat_df(
    folder: str,
    raw_name: str,
    run_df: pd.DataFrame,
    channels: list[int] | None = None,
) -> pd.DataFrame:
    """Build stat dataframe for a single run.

    One row is produced per requested channel. Each row combines precursor
    level statistics derived from `run_df` with values read from the
    optimization, calibration and raw file managers stored in `folder`.
    Values whose manager file is missing are reported as ``np.nan`` and a
    warning is logged.

    Parameters
    ----------

    folder: str
        Directory containing the raw file and the managers

    raw_name: str
        Name of the raw file

    run_df: pd.DataFrame
        Dataframe containing the precursor data. Must provide the columns
        ``channel`` and ``pg``; ``cycle_fwhm`` and ``mobility_fwhm`` are used
        when present.

    channels: List[int], optional
        List of channels to include in the output, default=[0]

    Returns
    -------
    pd.DataFrame
        Dataframe containing the statistics

    """
    if channels is None:
        channels = [0]

    all_stats = []

    for channel in channels:
        channel_df = run_df[run_df["channel"] == channel]

        stats = {
            OutputRawCols.NAME: raw_name,
            StatSearchCols.CHANNEL: channel,
            StatSearchCols.PRECURSORS: len(channel_df),
            StatSearchCols.PROTEINS: channel_df["pg"].nunique(),
        }

        # peak widths are optional columns, fall back to NaN if absent
        stats[StatSearchCols.FWHM_RT] = np.nan
        if "cycle_fwhm" in channel_df.columns:
            stats[StatSearchCols.FWHM_RT] = np.mean(channel_df["cycle_fwhm"])

        stats[StatSearchCols.FWHM_MOBILITY] = np.nan
        if "mobility_fwhm" in channel_df.columns:
            stats[StatSearchCols.FWHM_MOBILITY] = np.mean(channel_df["mobility_fwhm"])

        # collect optimization stats
        optimization_stats = defaultdict(lambda: np.nan)
        if os.path.exists(
            optimization_manager_path := os.path.join(
                folder,
                PeptideCentricWorkflow.OPTIMIZATION_MANAGER_PKL_NAME,
            )
        ):
            optimization_manager = OptimizationManager(path=optimization_manager_path)
            optimization_stats[StatOutputCols.MS2_ERROR] = (
                optimization_manager.ms2_error
            )
            optimization_stats[StatOutputCols.MS1_ERROR] = (
                optimization_manager.ms1_error
            )
            optimization_stats[StatOutputCols.RT_ERROR] = optimization_manager.rt_error
            optimization_stats[StatOutputCols.MOBILITY_ERROR] = (
                optimization_manager.mobility_error
            )
        else:
            logger.warning(f"Error reading optimization manager for {raw_name}")

        for key in [
            StatOutputCols.MS2_ERROR,
            StatOutputCols.MS1_ERROR,
            StatOutputCols.RT_ERROR,
            StatOutputCols.MOBILITY_ERROR,
        ]:
            stats[f"{StatOutputCols.OPTIMIZATION_PREFIX}{key}"] = optimization_stats[
                key
            ]

        # collect calibration stats
        calibration_stats = defaultdict(lambda: np.nan)
        if os.path.exists(
            calibration_manager_path := os.path.join(
                folder,
                PeptideCentricWorkflow.CALIBRATION_MANAGER_PKL_NAME,
            )
        ):
            calibration_manager = CalibrationManager(path=calibration_manager_path)

            # an estimator may be missing or may not carry metrics, in which
            # case the corresponding values stay NaN
            if (
                fragment_mz_estimator := calibration_manager.get_estimator(
                    CalibrationGroups.FRAGMENT, CalibrationEstimators.MZ
                )
            ) and (fragment_mz_metrics := fragment_mz_estimator.metrics):
                # TODO: rename internal metric key "median_accuracy" to "median_bias"
                calibration_stats[StatCalibrationCols.MS2_BIAS] = fragment_mz_metrics[
                    "median_accuracy"
                ]
                # TODO: rename internal metric key "median_precision" to "median_variance"
                calibration_stats[StatCalibrationCols.MS2_ERROR] = fragment_mz_metrics[
                    "median_precision"
                ]

            if (
                precursor_mz_estimator := calibration_manager.get_estimator(
                    CalibrationGroups.PRECURSOR, CalibrationEstimators.MZ
                )
            ) and (precursor_mz_metrics := precursor_mz_estimator.metrics):
                # TODO: rename internal metric key "median_accuracy" to "median_bias"
                calibration_stats[StatCalibrationCols.MS1_BIAS] = precursor_mz_metrics[
                    "median_accuracy"
                ]
                # TODO: rename internal metric key "median_precision" to "median_variance"
                calibration_stats[StatCalibrationCols.MS1_ERROR] = precursor_mz_metrics[
                    "median_precision"
                ]
        else:
            logger.warning(f"Error reading calibration manager for {raw_name}")

        for key in [
            StatCalibrationCols.MS2_BIAS,
            StatCalibrationCols.MS2_ERROR,
            StatCalibrationCols.MS1_BIAS,
            StatCalibrationCols.MS1_ERROR,
        ]:
            # index the defaultdict (instead of .get) so that missing metrics
            # yield a float NaN like all other stats, not the string "NaN"
            stats[key] = calibration_stats[key]

        # collect raw stats
        raw_stats = defaultdict(lambda: np.nan)
        if os.path.exists(
            raw_file_manager_path := os.path.join(
                folder, PeptideCentricWorkflow.RAW_FILE_MANAGER_PKL_NAME
            )
        ):
            raw_stats = RawFileManager(
                path=raw_file_manager_path, load_from_file=True
            ).stats
        else:
            logger.warning(f"Error reading raw file manager for {raw_name}")

        # deliberately mapping explicitly to avoid coupling raw_stats to the output too tightly
        prefix = "raw."

        stats[f"{prefix}gradient_length"] = (
            raw_stats["rt_limit_max"] - raw_stats["rt_limit_min"]
        )
        for key in [
            "cycle_length",
            "cycle_duration",
            "cycle_number",
            "ms2_range_min",
            "ms2_range_max",
        ]:
            stats[f"{prefix}{key}"] = raw_stats[key]

        all_stats.append(stats)

    return pd.DataFrame(all_stats)
def build_run_internal_df(
    folder_path: str,
):
    """Build the internal (timing) dataframe for a single run.

    The resulting dataframe has a single row. It always contains the column
    ``run`` holding the basename of `folder_path`. If a timing manager file is
    found in `folder_path`, one ``duration_<key>`` column is added for every
    entry of its timings; otherwise a warning is logged.

    Parameters
    ----------

    folder_path: str
        Path (from the base directory of the output_folder attribute of the
        SearchStep class) to the directory containing the raw file and the
        managers

    Returns
    -------
    pd.DataFrame
        Dataframe containing the run name and the recorded durations

    """
    manager_file = os.path.join(
        folder_path, PeptideCentricWorkflow.TIMING_MANAGER_PKL_NAME
    )
    raw_name = os.path.basename(folder_path)

    internal_dict = {"run": [raw_name]}

    # without a timing manager only the run name can be reported
    if not os.path.exists(manager_file):
        logger.warning(f"Error reading timing manager for {raw_name}")
        return pd.DataFrame(internal_dict)

    timing_manager = TimingManager(path=manager_file)
    internal_dict.update(
        {
            f"duration_{step}": [timing_manager.timings[step]["duration"]]
            for step in timing_manager.timings
        }
    )

    return pd.DataFrame(internal_dict)
def _precursor_stat_row(modification: str, precursor_df: pd.DataFrame) -> dict:
    """Count total / unique precursors of `precursor_df`, overall and for MS2 use."""
    ms2_df = precursor_df[precursor_df["use_for_ms2"]]
    return {
        "modification": modification,
        "num_precursors": len(precursor_df),
        "num_unique_precursor": len(precursor_df["mod_seq_charge_hash"].unique()),
        "num_ms2_precursors": len(ms2_df),
        "num_unique_ms2_precursor": len(ms2_df["mod_seq_charge_hash"].unique()),
    }


def transfer_library_stat_df(transfer_library: SpecLibBase) -> pd.DataFrame:
    """create statistics dataframe for transfer library

    The dataframe contains one row per modification found in the ``mods``
    column of the precursor dataframe (``;`` separated), one row for
    unmodified precursors (``modification == ""``) and a final ``"Total"`` row.

    A precursor is counted for a modification only if that modification is
    one of its ``;`` separated entries. Matching is done on the exact entry,
    so modification names containing regular expression characters
    (e.g. parentheses) are handled correctly and a modification is not
    counted for precursors that merely carry a longer name containing it.

    Parameters
    ----------

    transfer_library : SpecLibBase
        transfer library, its ``precursor_df`` must provide the columns
        ``mods``, ``mod_seq_charge_hash`` and ``use_for_ms2``

    Returns
    -------

    pd.DataFrame
        statistics dataframe with the columns ``modification``,
        ``num_precursors``, ``num_unique_precursor``, ``num_ms2_precursors``
        and ``num_unique_ms2_precursor``

    """
    precursor_df = transfer_library.precursor_df

    # get unique modifications
    mods_per_precursor = precursor_df["mods"].str.split(";")
    modifications = [mod for mod in mods_per_precursor.explode().unique() if mod != ""]

    statistics_df = []
    for mod in modifications:
        # positional boolean mask, independent of the dataframe index
        has_mod = np.array(
            [mod in precursor_mods for precursor_mods in mods_per_precursor],
            dtype=bool,
        )
        statistics_df.append(_precursor_stat_row(mod, precursor_df[has_mod]))

    # add unmodified
    statistics_df.append(
        _precursor_stat_row("", precursor_df[precursor_df["mods"] == ""])
    )

    # add total
    statistics_df.append(_precursor_stat_row("Total", precursor_df))

    return pd.DataFrame(statistics_df)
def log_stat_df(stat_df: pd.DataFrame):
    """log statistics dataframe to console

    Writes a fixed width table to the module logger: a header line, one line
    per modification, a separator line and finally the ``"Total"`` row.

    Parameters
    ----------

    stat_df : pd.DataFrame
        statistics dataframe, must provide the columns ``modification``,
        ``num_precursors``, ``num_unique_precursor``, ``num_ms2_precursors``
        and ``num_unique_ms2_precursor`` and contain a row whose
        ``modification`` is ``"Total"``

    """
    # column widths used for space padding
    name_width = 25
    space = 12

    count_columns = (
        "num_precursors",
        "num_unique_precursor",
        "num_ms2_precursors",
        "num_unique_ms2_precursor",
    )

    def _format_line(label, record):
        # left aligned label followed by right aligned, comma grouped counts
        padded_label = label.ljust(name_width)
        counts = "".join(
            f"{record[column]:,}".rjust(space) for column in count_columns
        )
        return padded_label + counts

    header_titles = ("Total", "Unique", "Total MS2", "Unique MS2")
    logger.info(
        "Modification".ljust(name_width)
        + "".join(title.rjust(space) for title in header_titles)
    )

    # one line per modification, the total is logged last
    for _, row in stat_df.iterrows():
        if row["modification"] != "Total":
            logger.info(_format_line(row["modification"], row))

    # log line
    logger.info("-" * name_width + " " + "-" * space * 4)

    # log total
    total = stat_df[stat_df["modification"] == "Total"].iloc[0]
    logger.info(_format_line("Total", total))