Source code for alphadia.calibration.estimator

"""Calibration estimator module."""

import logging
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from alphadia.calibration.models import (
    CalibrationModel,
    LOESSRegression,
    construct_polynomial_regression,
)
from alphadia.calibration.plot import plot_calibration


class CalibrationEstimator:
    """A single estimator for a property."""

    def __init__(  # noqa: PLR0913 # Too many arguments
        self,
        name: str,
        model: CalibrationModel,
        input_columns: list[str],
        target_columns: list[str],
        output_columns: list[str],
        transform_deviation: None | str | float = None,
    ):
        """A single estimator for a property (mz, rt, etc.).

        Calibration is performed by modeling the deviation of input values
        (e.g. mz_library) from an observed property (e.g. mz_observed) using a
        function (e.g. LinearRegression). Once calibrated, calibrated values
        (e.g. mz_calibrated) can be predicted from input values
        (e.g. mz_library). Additional explaining variables can be added to the
        input values (e.g. rt_library) to improve the calibration.

        Parameters
        ----------
        name : str
            Name of the estimator for logging and plotting e.g. 'mz'

        model : CalibrationModel
            The estimator object instance which must have a fit and predict
            method. This will usually be a sklearn estimator or a custom
            estimator.

        input_columns : list[str]
            The columns of the dataframe that are used as input for the
            estimator e.g. ['mz_library']. The first column is the property
            which should be calibrated, additional columns can be used as
            explaining variables e.g. ['mz_library', 'rt_library'].

        target_columns : list[str]
            The columns of the dataframe that are used as target for the
            estimator e.g. ['mz_observed']. At the moment only one target
            column is supported.

        output_columns : list[str]
            The columns of the dataframe that are used as output for the
            estimator e.g. ['mz_calibrated']. At the moment only one output
            column is supported.

        transform_deviation : None | str | float, default=None
            If set to a value convertible to float, the deviation is expressed
            as a fraction of the input value scaled by that factor, e.g. 1e6
            for ppm. If set to None, the deviation is expressed in absolute
            units.

        Raises
        ------
        ValueError
            If `transform_deviation` cannot be converted to float, or if not
            exactly one output column and one target column are given.

        """
        self.name = name
        self._model = model
        self.input_columns = input_columns
        self._target_columns = target_columns
        self._output_columns = output_columns
        self.transform_deviation = (
            float(transform_deviation) if transform_deviation is not None else None
        )
        self.is_fitted = False
        self.metrics = None

        if len(output_columns) != 1 or len(target_columns) != 1:
            raise ValueError(
                f"{self.name} calibration: only one output and target column is supported, got {len(output_columns)=} {len(target_columns)=}"
            )

    def __repr__(self) -> str:
        """Return a string representation of the Calibration object."""
        return f"<Calibration {self.name}, is_fitted: {self.is_fitted}>"

    def save(self, file_name: str) -> None:
        """Save the estimator to pickle file.

        Parameters
        ----------
        file_name : str
            Path to the pickle file

        """
        with Path(file_name).open("wb") as f:
            pickle.dump(self, f)

    @classmethod
    def from_file(cls, file_name: str) -> "CalibrationEstimator":
        """Load the estimator from pickle file.

        Parameters
        ----------
        file_name : str
            Path to the pickle file

        Returns
        -------
        CalibrationEstimator
            A freshly constructed estimator carrying the state of the pickled one.

        """
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load calibration files that come from a trusted source.
        with Path(file_name).open("rb") as f:
            loaded_calibration: CalibrationEstimator = pickle.load(f)  # noqa: S301

        # Build a new instance through __init__ and then copy the complete
        # pickled state over it.
        new_calibration = CalibrationEstimator(
            name=loaded_calibration.name,
            model=loaded_calibration._model,  # noqa: SLF001
            input_columns=loaded_calibration.input_columns,
            target_columns=loaded_calibration._target_columns,  # noqa: SLF001
            output_columns=loaded_calibration._output_columns,  # noqa: SLF001
            transform_deviation=loaded_calibration.transform_deviation,
        )
        new_calibration.__dict__.update(loaded_calibration.__dict__)
        return new_calibration

    def _validate_columns(self, df: pd.DataFrame, required_columns: list[str]) -> bool:
        """Validate that the required columns are present in the dataframe.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to check

        required_columns : list[str]
            List of required columns to check in the dataframe

        Returns
        -------
        bool
            True if df is valid, False otherwise (a warning is logged)

        """
        required_columns_set = set(required_columns)
        if not required_columns_set.issubset(df.columns):
            logging.warning(
                f"{self.name}, at least one column {required_columns_set} not found in dataframe"
            )
            return False
        return True

    def fit(
        self,
        df: pd.DataFrame,
        *,
        plot: bool = True,
        figure_path: str | None = None,
    ) -> None:
        """Fit the estimator based on the input and target columns of the dataframe.

        On success `is_fitted` is set to True and `metrics` is updated. If the
        underlying model raises while fitting, a warning is logged and the
        estimator state is left unchanged.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe containing the input and target columns

        plot : bool, default=True
            If True, a plot of the calibration is generated.

        figure_path : str, default=None
            Passed on to the plotting function when `plot` is True.

        Raises
        ------
        ValueError
            If an input or target column is missing from `df`.

        """
        if not self._validate_columns(df, self.input_columns + self._target_columns):
            raise ValueError(
                f"{self.name} calibration fitting: failed input validation"
            )

        input_values = df[self.input_columns].to_numpy()
        target_value = df[self._target_columns].to_numpy()

        try:
            self._model.fit(input_values, target_value)
        except Exception as e:  # noqa: BLE001
            # best effort: a failing model must not abort the calling workflow
            logging.warning(f"Could not fit estimator {self.name}: {e}")
            return

        self.is_fitted = True
        self.metrics = self._get_metrics(df)

        if plot:
            plot_calibration(self, df, figure_path=figure_path)

    def predict(self, df: pd.DataFrame, *, inplace: bool = True) -> np.ndarray | None:
        """Perform a prediction based on the input columns of the dataframe.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe containing the input columns

        inplace : bool, default=True
            If True, the prediction is written to the output column of `df`
            and None is returned.

        Returns
        -------
        np.ndarray | None
            The model prediction if `inplace` is False. None if `inplace` is
            True or if the estimator has not been fitted yet.

        Raises
        ------
        ValueError
            If an input column is missing from `df`.

        """
        if not self.is_fitted:
            logging.warning(
                f"{self.name} prediction was skipped as it has not been fitted yet"
            )
            return None

        if not self._validate_columns(df, self.input_columns):
            raise ValueError(
                f"{self.name} calibration prediction: failed input validation"
            )

        input_values = df[self.input_columns].to_numpy()
        predicted_values = self._model.predict(input_values)

        if not inplace:
            return predicted_values

        df[self._output_columns[0]] = predicted_values
        return None

    def calc_deviation(self, df: pd.DataFrame) -> np.ndarray:
        """Calculate the deviations between the input, target and calibrated values.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe containing the input and target columns

        Returns
        -------
        np.ndarray
            Array of shape (n_samples, 3 + n_input_columns). The second
            dimension contains the observed deviation, calibrated deviation,
            residual deviation and the input values.

        """
        # the first column is the uncalibrated input property,
        # all other columns are explaining variables
        input_values = df[self.input_columns].to_numpy()
        uncalibrated_values = input_values[:, [0]]

        # only one target column is supported
        target_values = df[self._target_columns].to_numpy()[:, [0]]

        calibrated_values = self.predict(df, inplace=False)
        assert calibrated_values is not None  # type checker
        if calibrated_values.ndim == 1:
            calibrated_values = calibrated_values[:, np.newaxis]

        # only one output column is supported
        calibrated_dim = calibrated_values[:, [0]]

        # convert the scale factor once; None means absolute units
        scale = (
            float(self.transform_deviation)
            if self.transform_deviation is not None
            else None
        )

        # deviation between the (observed) target value and the uncalibrated input value
        observed_deviation = target_values - uncalibrated_values
        # explained deviation between the calibrated value and the uncalibrated input value
        calibrated_deviation = calibrated_dim - uncalibrated_values

        if scale is not None:
            # express both deviations relative to the input value (e.g. ppm)
            observed_deviation = observed_deviation / uncalibrated_values * scale
            calibrated_deviation = calibrated_deviation / uncalibrated_values * scale

        # unexplained deviation between the observed and the calibrated value
        residual_deviation = observed_deviation - calibrated_deviation

        return np.concatenate(
            [
                observed_deviation,
                calibrated_deviation,
                residual_deviation,
                input_values,
            ],
            axis=1,
        )

    def _get_metrics(self, df: pd.DataFrame) -> dict[str, float]:
        """Calculate the metrics for the calibration.

        Returns the median absolute calibrated deviation as "median_accuracy"
        and the median absolute residual deviation as "median_precision".
        """
        deviation = self.calc_deviation(df)
        return {
            "median_accuracy": float(np.median(np.abs(deviation[:, 1]))),
            "median_precision": float(np.median(np.abs(deviation[:, 2]))),
        }

    def ci(self, df: pd.DataFrame, ci: float = 0.95) -> float:
        """Calculate the residual deviation at the given confidence interval.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe containing the input and target columns

        ci : float, default=0.95
            confidence interval, must lie strictly between 0 and 1

        Returns
        -------
        float
            The mean of the absolute lower and upper percentile of the residual
            deviation after calibration, or 0.0 if the estimator is not fitted.

        Raises
        ------
        ValueError
            If `ci` is not strictly between 0 and 1.

        """
        if not 0 < ci < 1:
            raise ValueError("Confidence interval must be between 0 and 1")

        if not self.is_fitted:
            # keep the declared float return type also for the unfitted case
            return 0.0

        # symmetric lower and upper percentile around the median
        ci_percentile = [100 * (1 - ci) / 2, 100 * (1 + ci) / 2]
        residual_deviation = self.calc_deviation(df)[:, 2]
        return float(np.mean(np.abs(np.percentile(residual_deviation, ci_percentile))))

class CalibrationModelProvider:
    """Registry that maps model names to calibration model templates."""

    def __init__(self):
        """Create an empty registry of scikit-learn compatible calibration models."""
        self.model_dict = {}

    def __repr__(self) -> str:
        """Return a multi-line listing of all registered model names and templates."""
        entries = "".join(
            f" \t {key}: {value}\n" for key, value in self.model_dict.items()
        )
        return "<CalibrationModelProvider, \n[\n" + entries + "]>"

    def register_model(
        self, model_name: str, model_template: type[CalibrationModel]
    ) -> None:
        """Store a model template under the given name.

        An existing entry with the same name is overwritten.

        Parameters
        ----------
        model_name : str
            Name under which the template can be retrieved later

        model_template : type[CalibrationModel]
            The model template which must have a fit and predict method.

        """
        self.model_dict[model_name] = model_template

    def get_model(self, model_name: str) -> type[CalibrationModel]:
        """Look up a previously registered model template.

        Parameters
        ----------
        model_name : str
            Name of the model

        Returns
        -------
        type[CalibrationModel]
            The model template registered under `model_name`.

        Raises
        ------
        ValueError
            If no template has been registered under `model_name`.

        """
        if model_name in self.model_dict:
            return self.model_dict[model_name]
        raise ValueError(f"Unknown model {model_name}")

def _build_default_provider() -> CalibrationModelProvider:
    """Create the provider and register the models shipped with this module."""
    provider = CalibrationModelProvider()
    provider.register_model("LinearRegression", LinearRegression)
    provider.register_model("LOESSRegression", LOESSRegression)
    provider.register_model("PolynomialRegression", construct_polynomial_regression)
    return provider


# module-level provider instance pre-populated with the default models
calibration_model_provider = _build_default_provider()