Source code for alphadia.validation.base

import logging

import numpy as np
import pandas as pd

# Logger used for the warnings emitted by this module. Calling getLogger()
# without a name returns the root logger.
logger = logging.getLogger()


class Property:
    """Base class describing a single dataframe column by name and type."""

    def __init__(self, name, type):
        """Store the column name and the type associated with it.

        Parameters
        ----------
        name: str
            Name of the property
        type: type
            Type of the property
        """
        # `type` shadows the builtin inside this method only; the parameter
        # name is part of the public signature and is therefore kept.
        self.name, self.type = name, type
class Optional(Property):
    """Optional property: the column may be missing from the dataframe."""

    def __init__(self, name, type):
        """Create an optional property.

        Parameters
        ----------
        name: str
            Name of the property
        type: type
            Type of the property
        """
        # Delegate to the base class instead of duplicating its assignments.
        super().__init__(name, type)

    def __call__(self, df, logging=True):
        """Cast the column to the declared type if it exists in the dataframe.

        The dataframe is modified in place when a cast is necessary.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to validate
        logging: bool
            Accepted for a uniform call signature across properties; this
            method does not read it and emits no log output.

        Returns
        -------
        bool
            Always True, because a missing optional column is not an error.
        """
        if self.name not in df.columns:
            return True

        column = df[self.name]
        if column.dtype != self.type:
            df[self.name] = column.astype(self.type)
        return True
class Required(Property):
    """Required property: the column must be present in the dataframe."""

    def __init__(self, name, type):
        """Create a required property.

        Parameters
        ----------
        name: str
            Name of the property
        type: type
            Type of the property
        """
        # Delegate to the base class instead of duplicating its assignments.
        super().__init__(name, type)

    def __call__(self, df, logging=True):
        """Check that the column exists and cast it to the declared type.

        The dataframe is modified in place when a cast is necessary.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to validate
        logging: bool
            Accepted for a uniform call signature across properties; this
            method does not read it and emits no log output.

        Returns
        -------
        bool
            True if the column is present (after casting it where needed),
            False if the column is missing.
        """
        if self.name not in df.columns:
            return False

        column = df[self.name]
        if column.dtype != self.type:
            df[self.name] = column.astype(self.type)
        return True
class Schema:
    """Named collection of column properties used to validate dataframes."""

    def __init__(self, name, properties):
        """Schema for validating dataframes

        Parameters
        ----------
        name: str
            Name of the schema
        properties: list
            List of Property objects

        Raises
        ------
        ValueError
            If any element of ``properties`` is not a Property instance.
        """
        for prop in properties:
            if not isinstance(prop, Property):
                raise ValueError("Schema must contain only Property objects")

        self.name = name
        self.schema = properties

    def validate(
        self,
        df: pd.DataFrame,
        logging: bool = True,
        warn_on_critical_values: bool = False,
    ) -> None:
        """Validates the dataframe.

        Each property of the schema is called on ``df``; properties may cast
        their column in place.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe to validate
        logging: bool
            Forwarded to every property call. Defaults to True.
        warn_on_critical_values: bool
            If True, warn on critical values like NaN and Inf in the
            dataframe. Defaults to False.

        Raises
        ------
        ValueError
            If validation fails.
        """
        if warn_on_critical_values:
            self._warn_on_critical_values(df)

        for prop in self.schema:
            # A property signals failure (missing column) by returning a
            # falsy value.
            if not prop(df, logging=logging):
                raise ValueError(
                    f"Validation of {self.name} failed: Column {prop.name} is not present in the dataframe"
                )

    def docstring(self) -> str:
        """Automatically generate a docstring for the schema.

        The result is a reStructuredText ``list-table`` with one row per
        property, listing its name, its kind (Required kinds are emphasised
        in bold) and the name of its type.

        Returns
        -------
        str
            Docstring for the schema
        """
        # Built line by line so the whitespace that the list-table directive
        # depends on is explicit rather than hidden in a multi-line literal.
        lines = [
            "",
            "Schema",
            "------",
            ".. list-table::",
            "    :widths: 1 1 1",
            "    :header-rows: 1",
            "",
            "    * - Name",
            "      - Required",
            "      - Type",
        ]

        for prop in self.schema:
            emphasis = "**" if isinstance(prop, Required) else ""
            lines.extend(
                [
                    "",
                    f"    * - {prop.name}",
                    f"      - {emphasis}{prop.__class__.__name__}{emphasis}",
                    f"      - {prop.type.__name__}",
                ]
            )

        return "\n".join(lines) + "\n"

    def _warn_on_critical_values(self, input_df: pd.DataFrame) -> None:
        """Warns about critical values in the dataframe, such as NaN and Inf.

        Only columns with a NumPy floating dtype are inspected. Columns with
        pandas extension dtypes (categorical, nullable, string, ...) are
        skipped, because ``np.issubdtype`` cannot interpret those dtypes and
        would raise a TypeError.

        Parameters
        ----------
        input_df: pd.DataFrame
            Dataframe to inspect; it is not modified.
        """
        n_rows = len(input_df)

        for col in input_df.columns:
            column = input_df[col]
            dtype = column.dtype
            if not (isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating)):
                continue

            # NaNs are reported before Infs; both share one message format.
            critical_counts = (
                ("NaNs", column.isna().sum()),
                ("Infs", np.isinf(column).sum()),
            )
            for label, count in critical_counts:
                if count > 0:
                    percentage = count / n_rows * 100
                    logger.warning(
                        f"{col} has {count} {label} ( {percentage:.2f} % out of {n_rows})"
                    )