Source code for neer_match_utilities.training

from __future__ import annotations
from .base import SuperClass

from neer_match.similarity_map import SimilarityMap
from neer_match.matching_model import DLMatchingModel
from neer_match_utilities.baseline_io import ModelBaseline
from neer_match_utilities.model import Model, EpochEndSaver

import pandas as pd
from datetime import datetime
from pathlib import Path
import shutil
import dill
import os
import numpy as np
import tensorflow.keras.backend as K
import tensorflow as tf
import math
import pickle
from typing import Callable



class Training(SuperClass):
    """
    A class for managing and evaluating training processes, including
    reordering matches, evaluating performance metrics, and exporting models.

    Inherits
    --------
    SuperClass : Base class providing shared attributes and methods.
    """
    def matches_reorder(
        self,
        matches: pd.DataFrame,
        matches_id_left: str,
        matches_id_right: str,
    ):
        """
        Reorders a matches DataFrame to include indices from the left and
        right DataFrames instead of their original IDs.

        Parameters
        ----------
        matches : pd.DataFrame
            DataFrame containing matching pairs.
        matches_id_left : str
            Column name in the `matches` DataFrame corresponding to the left IDs.
        matches_id_right : str
            Column name in the `matches` DataFrame corresponding to the right IDs.

        Returns
        -------
        pd.DataFrame
            A DataFrame with columns `left` and `right`, representing the
            indices of matching pairs in the left and right DataFrames.
        """
        # Create local copies of the original dataframes
        df_left = self.df_left.copy()
        df_right = self.df_right.copy()

        # Add custom indices
        df_left['index_left'] = self.df_left.index
        df_right['index_right'] = self.df_right.index

        # Combine the datasets into one
        df = pd.merge(
            df_left,
            matches,
            left_on=self.id_left,
            right_on=matches_id_left,
            how='right',
            validate='1:m',
            suffixes=('_l', '_r'),
        )
        df = pd.merge(
            df,
            df_right,
            left_on=matches_id_right,
            right_on=self.id_right,
            how='left',
            validate='m:1',
            suffixes=('_l', '_r'),
        )

        # Extract and rename index columns
        matches = df[['index_left', 'index_right']].rename(
            columns={'index_left': 'left', 'index_right': 'right'}
        ).reset_index(drop=True)
        matches = matches.sort_values(by='left', ascending=True).reset_index(drop=True)

        return matches
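    # --- Example (illustrative sketch; the toy data, ID columns, and the
    # similarity-map format are hypothetical, not part of the library) ------
    @staticmethod
    def _demo_matches_reorder():
        df_left = pd.DataFrame({"id": ["a", "b"], "name": ["Acme", "Bolt"]})
        df_right = pd.DataFrame({"id": [10, 11], "name": ["ACME Inc", "Bolt Co"]})
        matches = pd.DataFrame({"left_id": ["a", "b"], "right_id": [10, 11]})
        training = Training(
            similarity_map={"name": ["levenshtein"]},  # assumed map format
            df_left=df_left,
            df_right=df_right,
            id_left="id",
            id_right="id",
        )
        # -> DataFrame with positional indices: left=[0, 1], right=[0, 1]
        return training.matches_reorder(matches, "left_id", "right_id")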
    def evaluate_dataframe(self, evaluation_test: dict, evaluation_train: dict):
        """
        Combines and evaluates test and training performance metrics.

        Parameters
        ----------
        evaluation_test : dict
            Dictionary containing performance metrics for the test dataset.
        evaluation_train : dict
            Dictionary containing performance metrics for the training dataset.

        Returns
        -------
        pd.DataFrame
            A DataFrame with accuracy, precision, recall, F-score, and a
            timestamp for both test and training datasets.
        """
        # Create DataFrames for test and training metrics
        df_test = pd.DataFrame([evaluation_test])
        df_test.insert(0, 'data', ['test'])

        df_train = pd.DataFrame([evaluation_train])
        df_train.insert(0, 'data', ['train'])

        # Concatenate and add a timestamp
        df = pd.concat([df_test, df_train], axis=0, ignore_index=True)
        df['timestamp'] = datetime.now()

        return df
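    # --- Example (illustrative sketch; the metric dicts and key names are
    # hypothetical stand-ins for model.evaluate output) ----------------------
    def _demo_evaluate_dataframe(self):
        perf_test = {"accuracy": 0.97, "precision": 0.91, "recall": 0.88, "f1": 0.89}
        perf_train = {"accuracy": 0.99, "precision": 0.96, "recall": 0.95, "f1": 0.95}
        # -> two rows tagged 'test'/'train', metric columns, and a `timestamp`
        return self.evaluate_dataframe(perf_test, perf_train)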
    def performance_statistics_export(
        self,
        model,
        model_name: str,
        target_directory: Path,
        evaluation_train: dict | None = None,
        evaluation_test: dict | None = None,
        export_model: bool = False,
    ):
        """
        Exports performance metrics and the similarity map (as before).

        Optionally also saves the model:
          - DL/NS models via ``Model.save``
          - baseline models via ``ModelBaseline.save``

        Default behavior remains unchanged because ``export_model`` defaults
        to False.
        """
        # Construct the full path for the model directory
        model_dir = Path(target_directory) / model_name
        model_dir.mkdir(parents=True, exist_ok=True)

        # -----------------------------
        # 1) Save performance metrics
        # -----------------------------
        if evaluation_test is not None and evaluation_train is not None:
            df_evaluate = self.evaluate_dataframe(evaluation_test, evaluation_train)
            df_evaluate.to_csv(model_dir / "performance.csv", index=False)
            print(f"Performance metrics saved to {model_dir / 'performance.csv'}")

        # -----------------------------
        # 2) Save similarity map (same behavior as before)
        # -----------------------------
        # Store exactly what was used to build SimilarityMap
        # (usually a dict mapping field -> [metrics]).
        with open(model_dir / "similarity_map.dill", "wb") as f:
            dill.dump(self.similarity_map, f)
        print(f"Similarity map saved to {model_dir / 'similarity_map.dill'}")

        # -----------------------------
        # 3) Optionally export model
        # -----------------------------
        if not export_model:
            return

        # Statsmodels baselines (Logit/Probit) expose `.result`; the sklearn
        # baseline (GradientBoostingModel) is saved via ModelBaseline as well.
        is_baseline = (
            hasattr(model, "result")
            or model.__class__.__name__ in {"GradientBoostingModel"}
        )
        if is_baseline:
            ModelBaseline.save(model=model, target_directory=target_directory, name=model_name)
            print(f"Baseline model saved to {Path(target_directory) / model_name / 'model'}")
            return

        # Fall back to the existing DL/NS saver.
        # (This preserves old behavior unless export_model=True is set.)
        Model.save(model=model, target_directory=target_directory, name=model_name)
        print(f"DL/NS model saved to {Path(target_directory) / model_name / 'model'}")
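    # --- Example (illustrative sketch; the model object, metric dicts, and
    # target path are hypothetical) ------------------------------------------
    def _demo_performance_export(self, model, perf_train, perf_test):
        # Writes performance.csv and similarity_map.dill under
        # exports/demo_model/; the model export itself stays opt-in.
        self.performance_statistics_export(
            model=model,
            model_name="demo_model",
            target_directory=Path("exports"),
            evaluation_train=perf_train,
            evaluation_test=perf_test,
            export_model=False,
        )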
def focal_loss(alpha=0.99, gamma=1.5):
    """
    Focal Loss function for binary classification tasks.

    Focal Loss is designed to address class imbalance by assigning higher
    weights to the minority class and focusing the model's learning on
    hard-to-classify examples. It reduces the loss contribution from
    well-classified examples, making it particularly effective for
    imbalanced datasets.

    Parameters
    ----------
    alpha : float, optional, default=0.99
        Weighting factor for the positive class (minority class).
        - Must be in the range [0, 1].
        - A higher value increases the loss contribution from the positive
          class (underrepresented class) relative to the negative class
          (overrepresented class).
    gamma : float, optional, default=1.5
        Focusing parameter that reduces the loss contribution from easy
        examples.
        - ``gamma = 0``: No focusing; reduces to alpha-weighted binary
          cross-entropy (and to plain scaled cross-entropy when
          ``alpha = 0.5``).
        - ``gamma > 0``: Focuses more on hard-to-classify examples.
        - Larger values emphasize harder examples more strongly.

    Returns
    -------
    loss : callable
        A loss function that computes the focal loss given the true labels
        (`y_true`) and predicted probabilities (`y_pred`).

    Raises
    ------
    ValueError
        If `alpha` is not in the range [0, 1].

    Notes
    -----
    - The positive class (minority or underrepresented class) is weighted
      by `alpha`.
    - The negative class (majority or overrepresented class) is
      automatically weighted by ``1 - alpha``.
    - Ensure `alpha` is set appropriately to reflect the level of imbalance
      in the dataset.

    References
    ----------
    Lin, T.-Y., Goyal, P., Girshick, R., He, K., & Dollár, P. (2017).
    Focal Loss for Dense Object Detection. In *ICCV*.

    Explanation of Key Terms
    ------------------------
    - **Positive Class (Underrepresented):**
      The class with fewer examples in the dataset. Typically weighted by
      `alpha`, which should be greater than 0.5 in highly imbalanced
      datasets.
    - **Negative Class (Overrepresented):**
      The class with more examples in the dataset. Its weight is
      automatically ``1 - alpha``.
    """
    if not (0 <= alpha <= 1):
        raise ValueError("Parameter `alpha` must be in the range [0, 1].")

    def loss(y_true, y_pred):
        # Numerical safety
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1.0 - eps)

        # Per-example alpha: alpha for positives, (1 - alpha) for negatives
        alpha_t = y_true * alpha + (1.0 - y_true) * (1.0 - alpha)

        # p_t is the probability of the true class
        p_t = y_true * y_pred + (1.0 - y_true) * (1.0 - y_pred)

        # BCE equals -log(p_t) per example
        bce = -K.log(p_t)

        # Focal modulation and weighting
        fl = alpha_t * K.pow(1.0 - p_t, gamma) * bce
        return K.mean(fl)

    return loss
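# --- Example (illustrative sketch): a toy sanity check of focal_loss on
# hand-made tensors; the alpha/gamma values are arbitrary. -------------------
def _demo_focal_loss():
    loss_fn = focal_loss(alpha=0.9, gamma=2.0)
    y_true = tf.constant([[1.0, 0.0, 1.0]])
    y_pred = tf.constant([[0.9, 0.2, 0.6]])
    # Well-classified examples (p_t near 1) contribute almost nothing;
    # the hardest example here (p_t = 0.6) dominates the mean.
    return float(loss_fn(y_true, y_pred))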
def soft_f1_loss(epsilon: float = 1e-7):
    """
    Soft F1 Loss for imbalanced binary classification tasks.

    Soft F1 Loss provides a differentiable approximation of the F1 score,
    combining precision and recall into a single metric. By optimizing this
    loss, models are encouraged to balance false positives and false
    negatives, which is especially useful when classes are imbalanced.

    Parameters
    ----------
    epsilon : float, optional, default=1e-7
        Small constant added to numerator and denominator to avoid division
        by zero and stabilize training. Must be > 0.

    Returns
    -------
    loss : callable
        A loss function that takes true labels (`y_true`) and predicted
        probabilities (`y_pred`) and returns ``1 - soft_f1``, so that
        minimizing this loss maximizes the soft F1 score.

    Raises
    ------
    ValueError
        If `epsilon` is not strictly positive.

    Notes
    -----
    - True positives (TP), false positives (FP), and false negatives (FN)
      are computed in a "soft" (differentiable) manner by summing over
      probabilities rather than thresholded predictions.
    - Soft F1 = (2·TP + ε) / (2·TP + FP + FN + ε).
    - Loss = 1 − Soft F1, which ranges from 0 (perfect) to 1 (worst).

    References
    ----------
    - Bénédict, G., Koops, V., Odijk, D., & de Rijke, M. (2021). SigmoidF1:
      A Smooth F1 Score Surrogate Loss for Multilabel Classification.
      *arXiv:2108.10566*.

    Explanation of Key Terms
    ------------------------
    - **True Positives (TP):** Sum of predicted probabilities for actual
      positive examples.
    - **False Positives (FP):** Sum of predicted probabilities assigned to
      negative examples.
    - **False Negatives (FN):** Sum of (1 − predicted probability) for
      positive examples.
    - **ε (epsilon):** Stabilizer to prevent division by zero when TP, FP,
      and FN are all zero.

    Examples
    --------
    .. code-block:: python

        loss_fn = soft_f1_loss(epsilon=1e-6)
        y_true = tf.constant([[1, 0, 1]], dtype=tf.float32)
        y_pred = tf.constant([[0.9, 0.2, 0.7]], dtype=tf.float32)
        loss_value = loss_fn(y_true, y_pred)
        print(loss_value.numpy())  # e.g. 0.1…
    """
    if epsilon <= 0:
        raise ValueError("`epsilon` must be strictly positive.")

    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.clip_by_value(tf.cast(y_pred, tf.float32), epsilon, 1.0 - epsilon)

        # Soft counts
        tp = tf.reduce_sum(y_pred * y_true)
        fp = tf.reduce_sum(y_pred * (1 - y_true))
        fn = tf.reduce_sum((1 - y_pred) * y_true)

        # Denominator
        denom = 2 * tp + fp + fn + epsilon

        # Avoid NaNs from 0/0
        soft_f1 = tf.where(denom > 0, (2 * tp + epsilon) / denom, tf.constant(0.0))
        loss_val = 1.0 - soft_f1
        return tf.where(tf.math.is_finite(loss_val), loss_val, tf.constant(1.0))

    return loss
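# --- Example (illustrative sketch): the soft counts worked out by hand for
# y_true = [1, 0, 1], y_pred = [0.9, 0.2, 0.7]:
#   TP = 0.9 + 0.7 = 1.6,  FP = 0.2,  FN = (1 - 0.9) + (1 - 0.7) = 0.4
#   soft F1 ≈ (2 * 1.6) / (2 * 1.6 + 0.2 + 0.4) ≈ 0.842  ->  loss ≈ 0.158
def _demo_soft_f1_loss():
    loss_fn = soft_f1_loss()
    y_true = tf.constant([[1.0, 0.0, 1.0]])
    y_pred = tf.constant([[0.9, 0.2, 0.7]])
    return float(loss_fn(y_true, y_pred))  # ≈ 0.158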
def combined_loss(
    weight_f1: float = 0.5,
    epsilon: float = 1e-7,
    alpha: float = 0.99,
    gamma: float = 1.5,
):
    """
    Combined loss: weighted sum of Soft F1 loss and Focal Loss for
    imbalanced binary classification.

    This loss blends the advantages of a differentiable F1-based objective
    (which balances precision and recall) with the sample-focusing property
    of Focal Loss (which down-weights easy examples). By tuning
    ``weight_f1``, you can interpolate between solely optimizing for F1
    score (when ``weight_f1 = 1.0``) and solely focusing on hard examples
    via focal loss (when ``weight_f1 = 0.0``).

    Parameters
    ----------
    weight_f1 : float, default=0.5
        Mixing coefficient in ``[0, 1]``.
        - ``weight_f1 = 1.0``: optimize only Soft F1 loss.
        - ``weight_f1 = 0.0``: optimize only Focal Loss.
        - Intermediate values blend the two objectives proportionally.
    epsilon : float, default=1e-7
        Small stabilizer for the Soft F1 calculation. Must be ``> 0``.
    alpha : float, default=0.99
        Balancing factor for Focal Loss, weighting the positive (minority)
        class. Must lie in ``[0, 1]``.
    gamma : float, default=1.5
        Focusing parameter for Focal Loss.
        - ``gamma = 0`` reduces to weighted BCE.
        - Larger ``gamma`` emphasizes harder (misclassified) examples.

    Returns
    -------
    callable
        A function ``loss(y_true, y_pred)`` that computes

        .. math::

            \\text{CombinedLoss} =
                \\text{weight\\_f1} \\cdot \\text{SoftF1}(y, \\hat{y};\\,\\varepsilon)
                + (1 - \\text{weight\\_f1}) \\cdot
                  \\text{FocalLoss}(y, \\hat{y};\\,\\alpha, \\gamma).

        Minimizing this combined loss encourages both a high F1 score and
        focus on hard-to-classify samples.

    Raises
    ------
    ValueError
        If ``weight_f1`` is not in ``[0, 1]``, if ``epsilon <= 0``, if
        ``alpha`` is not in ``[0, 1]``, or if ``gamma < 0``.

    Notes
    -----
    - **Soft F1 loss**: ``1 - SoftF1``, where

      .. math::

          \\text{SoftF1} =
              \\frac{2\\,TP + \\varepsilon}{2\\,TP + FP + FN + \\varepsilon}.

      Here ``TP``, ``FP``, and ``FN`` are *soft* counts computed from
      probabilities.
    - **Focal Loss** down-weights well-classified examples to focus
      learning on difficult cases.

    References
    ----------
    - Lin, T.-Y., Goyal, P., Girshick, R., He, K., & Dollár, P. (2017).
      Focal Loss for Dense Object Detection. *ICCV*.
    - Bénédict, G., Koops, V., Odijk, D., & de Rijke, M. (2021). SigmoidF1:
      A Smooth F1 Score Surrogate Loss for Multilabel Classification.
      *arXiv:2108.10566*.

    Examples
    --------
    .. code-block:: python

        import tensorflow as tf

        loss_fn = combined_loss(weight_f1=0.5, epsilon=1e-6, alpha=0.25, gamma=2.0)
        y_true = tf.constant([[1, 0, 1]], dtype=tf.float32)
        y_pred = tf.constant([[0.9, 0.2, 0.7]], dtype=tf.float32)
        value = loss_fn(y_true, y_pred)
        print("Combined loss:", float(value.numpy()))
    """
    # Validate hyper-parameters
    if not (0.0 <= weight_f1 <= 1.0):
        raise ValueError("`weight_f1` must be in [0, 1].")
    if epsilon <= 0:
        raise ValueError("`epsilon` must be strictly positive.")
    if not (0.0 <= alpha <= 1.0):
        raise ValueError("`alpha` must be in [0, 1].")
    if gamma < 0:
        raise ValueError("`gamma` must be non-negative.")

    # Instantiate the individual losses
    f1_fn = soft_f1_loss(epsilon)
    focal_fn = focal_loss(alpha=alpha, gamma=gamma)

    def loss(y_true, y_pred):
        # Weighted combination
        return (weight_f1 * f1_fn(y_true, y_pred)
                + (1.0 - weight_f1) * focal_fn(y_true, y_pred))

    return loss
def alpha_balanced(
    left,
    right,
    matches,
    mismatch_share: float = 1.0,
    max_alpha: float = 0.95,
) -> float:
    """
    Compute the focal-loss weight α such that α·N_pos = (1-α)·N_neg, where
    N_neg is the number of *sampled* negative pairs (``mismatch_share``
    times all possible non-matching pairs).

    Parameters
    ----------
    left, right : pandas.DataFrame
        The two entity tables; the number of candidate pairs is
        ``len(left) * len(right)``.
    matches : pandas.DataFrame
        The known matching pairs (positives).
    mismatch_share : float, default=1.0
        Fraction of all possible negative pairs that is sampled.
    max_alpha : float, default=0.95
        Upper bound on α to avoid degenerate weighting.

    Returns
    -------
    float
        α in [0, 1] for focal loss (positive-class weight).
    """
    N_total = len(left) * len(right)
    if N_total <= 0:
        raise ValueError("Total number of pairs is zero.")

    N_pos = len(matches)
    N_neg = N_total - N_pos

    alpha = (mismatch_share * N_neg) / (mismatch_share * N_neg + N_pos)
    return min(alpha, max_alpha)
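# --- Example (illustrative sketch; the table sizes are hypothetical) --------
def _demo_alpha_balanced():
    left = pd.DataFrame(index=range(100))
    right = pd.DataFrame(index=range(200))
    matches = pd.DataFrame(index=range(150))
    # 100 x 200 = 20,000 pairs, 150 positives, so 19,850 negatives.
    # With mismatch_share=0.5: 9,925 sampled negatives, hence
    # raw alpha = 9,925 / (9,925 + 150) ≈ 0.985 -> capped at max_alpha = 0.95.
    return alpha_balanced(left, right, matches, mismatch_share=0.5, max_alpha=0.95)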
class TrainingPipe:
    """
    Orchestrates the full training and evaluation process of a deep learning
    record-linkage model using a user-supplied similarity map and
    preprocessed data.

    The class handles both training phases (soft-F1 pretraining and
    focal-loss fine-tuning), dynamic learning-rate scheduling, and automatic
    weight-decay adaptation. It also exports checkpoints, final models, and
    evaluation statistics for reproducibility.

    Parameters
    ----------
    model_name : str
        Name assigned to the trained model. A corresponding subdirectory is
        created under the project directory to store checkpoints and exports.
    training_data : tuple or dict
        Preprocessed training data in one of the following formats:
        - Tuple: ``(left_train, right_train, matches_train)``
        - Dict: ``{"left": left_train, "right": right_train, "matches": matches_train}``
    testing_data : tuple or dict
        Preprocessed testing data in one of the following formats:
        - Tuple: ``(left_test, right_test, matches_test)``
        - Dict: ``{"left": left_test, "right": right_test, "matches": matches_test}``
    similarity_map : dict
        User-defined similarity configuration mapping variable names to
        similarity measures. Must follow the format accepted by
        `SimilarityMap`.
    id_left_col : str, optional
        Name of the unique identifier column in the left DataFrames
        (`left_train` and `left_test`). The ID column is used internally to
        index entities and to align training labels. Defaults to
        ``"id_unique"``.
    id_right_col : str, optional
        Name of the unique identifier column in the right DataFrames
        (`right_train` and `right_test`). The ID column is used internally
        to index entities and to align training labels. Defaults to
        ``"id_unique"``.
    no_tm_pbatch : int, optional
        Target number of positive (matching) pairs per batch. Used to adapt
        the batch size dynamically via the `required_batch_size` heuristic.
        Required whenever a training stage is enabled.
    save_architecture : bool, optional
        Whether to save an image of the model architecture alongside the
        weights when exporting. Requires the Graphviz binaries to be
        installed; otherwise the export fails. Defaults to ``False``.
    stage_1 : bool, optional
        Whether to run the first training stage (soft-F1 pretraining).
        If False, the arguments `epochs_1` and `mismatch_share_1` are not
        required and will be ignored.
    stage_2 : bool, optional
        Whether to run the second training stage (focal-loss fine-tuning).
        If False, the arguments `epochs_2`, `mismatch_share_2`, `gamma`, and
        `max_alpha` are not required and will be ignored.
    epochs_1 : int, optional
        Number of training epochs during the first phase (soft-F1
        pretraining). Required only when `stage_1=True`.
    mismatch_share_1 : float, optional
        Fraction of all possible negative (non-matching) pairs used during
        Round 1. Required only when `stage_1=True`.
    stage1_loss : str or callable, optional
        Loss function used during Stage 1 (pretraining). By default, this is
        ``soft_f1_loss()``, reproducing the original NeerMatch behavior.
        The argument accepts either:

        - A **string** specifying a built-in or predefined loss:
          * ``"soft_f1"`` — use the standard soft-F1 loss (default)
          * ``"binary_crossentropy"`` — use wrapped binary crossentropy
            (internally adapted for NeerMatch's evaluation loop)
        - A **callable loss function**, allowing full customization:
          * ``soft_f1_loss()`` — explicit soft-F1 loss
          * ``focal_loss(alpha=0.25, gamma=2.0)`` — focal loss with parameters
          * Any user-defined loss function with signature ``loss(y_true, y_pred)``
    epochs_2 : int, optional
        Number of training epochs during the second phase (focal-loss
        fine-tuning). Required only when `stage_2=True`.
    mismatch_share_2 : float, optional
        Fraction of sampled negative pairs used during Round 2. Required
        only when `stage_2=True`.
    gamma : float, optional
        Focusing parameter of the focal loss (Round 2). Required only when
        `stage_2=True`.
    max_alpha : float, optional
        Maximum weighting factor of the positive class for the focal loss
        (Round 2). Required only when `stage_2=True`.

    Returns
    -------
    None

    Notes
    -----
    - The pipeline assumes that the data have already been preprocessed,
      formatted, and tokenized.
    - Round 1 (soft-F1 phase) initializes the model and emphasizes balanced
      learning across classes.
    - Round 2 (focal-loss phase) refines the model to focus on
      hard-to-classify examples.
    - Dynamic heuristics are used to automatically infer:
        * Batch size (via expected positive density)
        * Peak learning rate (scaled with batch size, positives per batch,
          and parameter count)
        * Weight decay (adjusted based on model size and learning rate)
    - Model checkpoints, histories, and evaluation reports are stored in
      subdirectories named after the provided `model_name`.
    - The final model, similarity map, and performance metrics are exported
      to disk using the `Training.performance_statistics_export` method for
      reproducibility.
    - Each training stage can be enabled or disabled independently through
      the `stage_1` and `stage_2` flags.
    - If a stage is disabled, its hyperparameters are not required and will
      be ignored.
    - When only one stage is active, the warm-up pass automatically adapts
      to the active stage's mismatch sampling configuration.
    """

    # ---------- Built-in helpers ----------
    @staticmethod
    def required_batch_size(num_matches: int,
                            num_left: int,
                            num_right: int,
                            mismatch_share: float,
                            desired_pos_per_batch: int = 8,
                            eps: float = 1e-12) -> tuple[int, float]:
        total_mismatches = max(num_left * num_right - num_matches, 0)
        sampled_negatives = mismatch_share * total_mismatches
        denom = num_matches + sampled_negatives
        if denom <= eps or num_matches == 0:
            return max(desired_pos_per_batch, 1), 0.0
        # q: expected share of positives among sampled pairs
        q = num_matches / denom
        q = max(min(q, 1.0), eps)
        batch_size = math.ceil(desired_pos_per_batch / q)
        return batch_size, batch_size * q

    @staticmethod
    def count_trainable_params(keras_model) -> int:
        return int(sum(v.shape.num_elements() for v in keras_model.trainable_variables))

    @staticmethod
    def suggest_peak_lr_adamw(batch_size: int,
                              positives_per_batch: float | None = None,
                              param_count: int | None = None,
                              base_batch: int = 256,
                              base_lr: float = 1e-3,
                              min_lr: float = 3e-4,
                              max_lr: float = 3e-3) -> float:
        # Linear scaling with batch size, then optional shrink factors
        lr = base_lr * (batch_size / float(base_batch))
        if positives_per_batch is not None:
            x = max(min(positives_per_batch, 16.0), 1.0)
            shrink = 0.6 + 0.4 * ((x - 1.0) / 15.0)  # in [0.6, 1.0]
            lr *= shrink
        if param_count is not None:
            scale = math.log10(max(param_count, 1)) - math.log10(10_000_000)
            lr *= (0.8 ** max(scale, 0))
        return float(max(min(lr, max_lr), min_lr))

    @staticmethod
    def _suggest_weight_decay_adamw(batch_size: int,
                                    param_count: int,
                                    learning_rate: float,
                                    base_batch: int = 256,
                                    base_params: int = 10_000_000,
                                    base_lr: float = 1e-3,
                                    base_wd: float = 5e-4,
                                    wd_min: float = 1e-5,
                                    wd_max: float = 5e-3) -> float:
        batch_scale = base_batch / float(batch_size)
        batch_scale = min(max(batch_scale, 0.5), 2.0)
        param_scale = (base_params / float(param_count)) ** 0.2
        param_scale = min(max(param_scale, 0.5), 2.0)
        lr_scale = (learning_rate / base_lr) ** 0.5
        lr_scale = min(max(lr_scale, 0.5), 2.0)
        wd = base_wd * batch_scale * param_scale * lr_scale
        return float(max(min(wd, wd_max), wd_min))
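    # --- Example (illustrative sketch; the sizes and parameter count are
    # hypothetical) ----------------------------------------------------------
    @staticmethod
    def _demo_batch_heuristics():
        # 150 matches among 100 x 200 candidate pairs, half of the
        # mismatches sampled: q = 150 / (150 + 9,925) ≈ 0.0149, so
        # batch_size = ceil(16 / q) = 1075 with ~16 expected positives.
        bsz, expected_pos = TrainingPipe.required_batch_size(
            num_matches=150, num_left=100, num_right=200,
            mismatch_share=0.5, desired_pos_per_batch=16,
        )
        # Linear LR scaling: 1e-3 * 1075 / 256 ≈ 4.2e-3, clamped to 3e-3.
        lr = TrainingPipe.suggest_peak_lr_adamw(bsz, expected_pos, param_count=2_000_000)
        return bsz, expected_pos, lr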
    # ---------- Inner: WarmupCosine schedule ----------
    @tf.keras.utils.register_keras_serializable(package="custom")
    class WarmupCosine(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, peak_lr, warmup_steps, total_steps, min_lr_ratio=0.1, name=None):
            super().__init__()
            self.peak_lr = float(peak_lr)
            self.warmup_steps = int(warmup_steps)
            self.total_steps = int(total_steps)
            self.min_lr_ratio = float(min_lr_ratio)
            self.name = name
        def __call__(self, step):
            step = tf.cast(step, tf.float32)
            peak = tf.cast(self.peak_lr, tf.float32)
            warm_steps = tf.cast(self.warmup_steps, tf.float32)
            tot_steps = tf.cast(self.total_steps, tf.float32)
            min_ratio = tf.cast(self.min_lr_ratio, tf.float32)

            # Linear warmup to peak, then cosine decay to min_ratio * peak
            warm = peak * (step / tf.maximum(warm_steps, 1.0))
            cos = peak * (
                min_ratio
                + 0.5 * (1.0 - min_ratio)
                * (1.0 + tf.cos(math.pi * (step - warm_steps)
                                / tf.maximum(tot_steps - warm_steps, 1.0)))
            )
            return tf.where(step < warm_steps, warm, cos)

        def get_config(self):
            return {
                "peak_lr": self.peak_lr,
                "warmup_steps": self.warmup_steps,
                "total_steps": self.total_steps,
                "min_lr_ratio": self.min_lr_ratio,
                "name": self.name,
            }
        @classmethod
        def from_config(cls, config):
            return cls(**config)
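    # --- Example (illustrative sketch; the schedule parameters are
    # arbitrary) --------------------------------------------------------------
    @staticmethod
    def _demo_warmup_cosine():
        # Linear warmup to peak_lr over the first 10% of steps, then cosine
        # decay down to min_lr_ratio * peak_lr.
        sched = TrainingPipe.WarmupCosine(
            peak_lr=1e-3, warmup_steps=100, total_steps=1000, min_lr_ratio=0.1
        )
        return [float(sched(s)) for s in (0, 50, 100, 550, 1000)]
        # ≈ [0.0, 5e-4, 1e-3, 5.5e-4, 1e-4]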
    # ---------- Pipeline ----------
    def __init__(
        self,
        model_name: str,
        training_data,   # (left_train, right_train, matches_train) OR dict with keys
        testing_data,    # (left_test, right_test, matches_test) OR dict with keys
        similarity_map: dict,
        *,
        initial_feature_width_scales: int = 10,
        feature_depths: int = 2,
        initial_record_width_scale: int = 10,
        record_depth: int = 4,
        id_left_col: str = "id_unique",
        id_right_col: str = "id_unique",
        no_tm_pbatch: int | None = None,
        save_architecture: bool = False,
        stage_1: bool = True,
        stage_2: bool = True,
        epochs_1: int | None = None,
        mismatch_share_1: float | None = None,
        stage1_loss: str | Callable = soft_f1_loss(),
        epochs_2: int | None = None,
        mismatch_share_2: float | None = None,
        gamma: float | None = None,
        max_alpha: float | None = None,
    ):
        if similarity_map is None:
            raise ValueError("similarity_map is required and must not be None.")
        self.similarity_map = similarity_map

        self.model_name = model_name
        self.initial_feature_width_scales = initial_feature_width_scales
        self.feature_depths = feature_depths
        self.initial_record_width_scale = initial_record_width_scale
        self.record_depth = record_depth

        self.stage_1 = bool(stage_1)
        self.stage_2 = bool(stage_2)
        if not self.stage_1 and not self.stage_2:
            raise ValueError("At least one of stage_1 or stage_2 must be True.")

        # ---- Stage 1 hyperparameters ----
        if self.stage_1:
            missing_stage1 = {
                name: val
                for name, val in [
                    ("epochs_1", epochs_1),
                    ("mismatch_share_1", mismatch_share_1),
                    ("no_tm_pbatch", no_tm_pbatch),
                ]
                if val is None
            }
            if missing_stage1:
                raise ValueError(
                    "The following arguments must be provided when stage_1=True: "
                    + ", ".join(missing_stage1.keys())
                )
            self.epochs_1 = int(epochs_1)
            self.mismatch_share_1 = float(mismatch_share_1)
            self.no_tm_pbatch = int(no_tm_pbatch)

            # ---- Stage 1 loss configuration ----
            if not (isinstance(stage1_loss, str) or callable(stage1_loss)):
                raise ValueError("stage1_loss must be a loss name (str) or a callable.")
            self.stage1_loss = stage1_loss

        # ---- Stage 2 hyperparameters ----
        if self.stage_2:
            missing = {
                name: val
                for name, val in [
                    ("epochs_2", epochs_2),
                    ("mismatch_share_2", mismatch_share_2),
                    ("no_tm_pbatch", no_tm_pbatch),
                    ("gamma", gamma),
                    ("max_alpha", max_alpha),
                ]
                if val is None
            }
            if missing:
                missing_keys = ", ".join(missing.keys())
                raise ValueError(
                    f"The following arguments must be provided when stage_2=True: {missing_keys}."
                )
            self.epochs_2 = int(epochs_2)
            self.mismatch_share_2 = float(mismatch_share_2)
            self.no_tm_pbatch = int(no_tm_pbatch)
            self.gamma = float(gamma)
            self.max_alpha = float(max_alpha)
        else:
            # Dummy values; not used when stage_2=False. Note that
            # no_tm_pbatch is deliberately not reset here: when only
            # stage_1 is active it has already been set above.
            self.epochs_2 = 0
            self.mismatch_share_2 = 0.0
            self.gamma = 0.0
            self.max_alpha = 0.0

        self.id_left_col = id_left_col
        self.id_right_col = id_right_col
        self.save_architecture = save_architecture

        # Unpack user-supplied data
        self.left_train, self.right_train, self.matches_train = self._unpack_split(training_data)
        self.left_test, self.right_test, self.matches_test = self._unpack_split(testing_data)

        # Basic sanity checks
        for name, df, col in [
            ("left_train", self.left_train, self.id_left_col),
            ("right_train", self.right_train, self.id_right_col),
            ("left_test", self.left_test, self.id_left_col),
            ("right_test", self.right_test, self.id_right_col),
        ]:
            if col not in df.columns:
                raise ValueError(f"{name} must include column '{col}'.")

        self.base_dir = Path.cwd()
        self.model: DLMatchingModel | None = None

        # Minimal Training helper for exporting stats at the end
        self.training_util = Training(
            similarity_map=self.similarity_map,
            df_left=self.left_train,
            df_right=self.right_train,
            id_left=self.id_left_col,
            id_right=self.id_right_col,
        )
    @staticmethod
    def _unpack_split(obj):
        if isinstance(obj, dict):
            return obj["left"], obj["right"], obj["matches"]
        left, right, matches = obj
        return left, right, matches

    # ---------- Public entry point ----------
    def execute(self):
        # Build model with provided similarity map
        smap = SimilarityMap(self.similarity_map)
        self.model = DLMatchingModel(
            similarity_map=smap,
            initial_feature_width_scales=self.initial_feature_width_scales,
            feature_depths=self.feature_depths,
            initial_record_width_scale=self.initial_record_width_scale,
            record_depth=self.record_depth,
        )

        # Warmup pass to initialize shapes/BN, etc.
        # Use mismatch_share_1 if stage_1 is active, otherwise fall back to stage_2.
        warmup_mismatch_share = (
            self.mismatch_share_1 if self.stage_1 else self.mismatch_share_2
        )
        bsz_warm, _ = self.required_batch_size(
            len(self.matches_train),
            len(self.left_train),
            len(self.right_train),
            warmup_mismatch_share,
            desired_pos_per_batch=16,
        )
        self.model.compile()
        self.model.fit(
            self.left_train,
            self.right_train,
            self.matches_train,
            epochs=1,
            batch_size=bsz_warm,
            mismatch_share=0.05,
            shuffle=True,
        )

        # Count params
        P = self.count_trainable_params(self.model)

        # Round 1 (optional)
        if self.stage_1:
            self._round1(P)

        # Round 2 (optional)
        if self.stage_2:
            self._round2(P)

        # Save, evaluate, export
        self._finalize_and_report()

    # ---------- Rounds ----------
    def _round1(self, P: int):
        assert self.model is not None

        batch_size, expected_pos = self.required_batch_size(
            len(self.matches_train),
            len(self.left_train),
            len(self.right_train),
            self.mismatch_share_1,
            desired_pos_per_batch=16,
        )
        peak_lr = self.suggest_peak_lr_adamw(batch_size, expected_pos, P)

        no_obs_1 = (
            self.mismatch_share_1
            * (len(self.left_train) * len(self.right_train) - len(self.matches_train))
            + len(self.matches_train)
        )
        steps_per_epoch_1 = int(round(no_obs_1 / batch_size, 0))
        total_steps_1 = steps_per_epoch_1 * self.epochs_1
        warmup_steps_1 = int(0.1 * total_steps_1)

        lr_sched_1 = self.WarmupCosine(peak_lr, warmup_steps_1, total_steps_1, min_lr_ratio=0.001)
        wd_1 = self._suggest_weight_decay_adamw(batch_size, P, peak_lr)
        opt_1 = tf.keras.optimizers.AdamW(learning_rate=lr_sched_1, weight_decay=wd_1, clipnorm=1.0)

        loss_1 = self._get_stage1_loss()
        self.model.compile(loss=loss_1, optimizer=opt_1)

        saver = EpochEndSaver(base_dir=self.base_dir, model_name=self.model_name)
        history = self.model.fit(
            self.left_train,
            self.right_train,
            self.matches_train,
            epochs=self.epochs_1,
            batch_size=batch_size,
            mismatch_share=self.mismatch_share_1,
            shuffle=True,
            callbacks=[saver],
        )

        df_epoch = pd.DataFrame(history.history)
        df_epoch["epoch"] = pd.Series(history.epoch, dtype=int) + 1
        (self.base_dir / self.model_name / "checkpoints").mkdir(parents=True, exist_ok=True)
        df_epoch.to_csv(self.base_dir / self.model_name / "checkpoints" / "epoch_overview.csv", index=False)

        best_row = df_epoch.loc[df_epoch["f1"].idxmax()]
        best_epoch = int(best_row["epoch"])
        best_epoch_str = f"{best_epoch:02d}"

        # Load best R1 model
        best_path = self.base_dir / self.model_name / "checkpoints" / f"epoch_{best_epoch_str}"
        self.model = Model.load(best_path)
        print("[R1] best epoch:", best_epoch, "path:", best_path)

        # Sanity-evaluate the loaded model before Round 2 training
        probe_loss = self._get_stage1_loss()
        self.model.compile(loss=probe_loss, optimizer=tf.keras.optimizers.Adam())  # dummy opt
        eval_probe = self.model.evaluate(
            self.left_train,
            self.right_train,
            self.matches_train,
            mismatch_share=self.mismatch_share_1,
            verbose=0,
        )
        print("[R1->R2] probe metrics after load:", eval_probe)
load:", eval_probe) # Preserve Round 1 checkpoints old_path = self.base_dir / self.model_name / "checkpoints" new_path = self.base_dir / self.model_name / "checkpoints_stage_1" if old_path.exists(): old_path.rename(new_path) def _get_stage1_loss(self): """ Resolve the configured Stage 1 loss into a *callable* loss function. - If self.stage1_loss is "soft_f1": return soft_f1_loss() (this preserves the original behavior) - If self.stage1_loss is "binary_crossentropy": return a wrapped BCE that reshapes labels/preds to be compatible with NeerMatch's evaluate loop. - If self.stage1_loss is a callable (e.g. soft_f1_loss(), focal_loss(...)): return it as-is. """ # Case 1: user passed a ready-made callable, e.g. soft_f1_loss() or focal_loss(...) if callable(self.stage1_loss): return self.stage1_loss # Case 2: string selector if self.stage1_loss == "soft_f1": # Default behavior: identical to original code return soft_f1_loss() if self.stage1_loss == "binary_crossentropy": # Wrap BCE to be robust to whatever shapes NeerMatch passes in def bce_wrapped(y_true, y_pred): y_true = tf.cast(y_true, tf.float32) y_pred = tf.cast(y_pred, tf.float32) # Flatten to 1D so shapes always match y_true = tf.reshape(y_true, (-1,)) y_pred = tf.reshape(y_pred, (-1,)) return tf.keras.losses.binary_crossentropy(y_true, y_pred) return bce_wrapped # You can add more named options here if you like raise ValueError(f"Unknown stage1_loss string: {self.stage1_loss}") def _round2(self, P: int): assert self.model is not None # alpha bounded by your alpha_balanced (expects max_alpha argument in your implementation) alpha = alpha_balanced( left=self.left_train, right=self.right_train, matches=self.matches_train, mismatch_share=self.mismatch_share_2, max_alpha=self.max_alpha ) batch_size, expected_pos = self.required_batch_size( len(self.matches_train), len(self.left_train), len(self.right_train), self.mismatch_share_2, desired_pos_per_batch=self.no_tm_pbatch ) no_obs_2 = self.mismatch_share_2 * (len(self.left_train) * len(self.right_train) - len(self.matches_train)) + len(self.matches_train) steps_per_epoch_2 = int(round(no_obs_2 / batch_size, 0)) total_steps_2 = steps_per_epoch_2 * self.epochs_2 warmup_steps_2 = int(0.1 * total_steps_2) peak_lr_2 = self.suggest_peak_lr_adamw(batch_size, expected_pos, P) peak_lr_2 = min(peak_lr_2, 1e-4) # your cap lr_sched_2 = self.WarmupCosine(peak_lr_2, warmup_steps_2, total_steps_2, min_lr_ratio=0.1) wd_2 = self._suggest_weight_decay_adamw(batch_size, P, peak_lr_2) opt_2 = tf.keras.optimizers.AdamW(learning_rate=lr_sched_2, weight_decay=wd_2, clipnorm=1.0) self.model.compile(loss=focal_loss(alpha=alpha, gamma=self.gamma), optimizer=opt_2) saver = EpochEndSaver(base_dir=self.base_dir, model_name=self.model_name) history = self.model.fit( self.left_train, self.right_train, self.matches_train, epochs=self.epochs_2, batch_size=batch_size, mismatch_share=self.mismatch_share_2, shuffle=True, callbacks=[saver] ) df_epoch = pd.DataFrame(history.history) df_epoch["epoch"] = pd.Series(history.epoch, dtype=int) + 1 (self.base_dir / self.model_name / "checkpoints").mkdir(parents=True, exist_ok=True) df_epoch.to_csv(self.base_dir / self.model_name / "checkpoints" / "epoch_overview.csv", index=False) best_row = df_epoch.loc[df_epoch["f1"].idxmax()] best_epoch = int(best_row["epoch"]) best_epoch_str = f"{best_epoch:02d}" self.model = Model.load(self.base_dir / self.model_name / "checkpoints" / f"epoch_{best_epoch_str}") print("[R2] best epoch:", best_epoch, "path:", self.base_dir / self.model_name / 
"checkpoints" / f"epoch_{best_epoch_str}") # Re-compile for export/eval self.model.compile(loss=focal_loss(alpha=alpha, gamma=self.gamma), optimizer=opt_2) # Re-name checkpoints folder old_path = self.base_dir / self.model_name / "checkpoints" new_path = self.base_dir / self.model_name / "checkpoints_stage_2" if old_path.exists(): if new_path.exists(): shutil.rmtree(new_path) # optional: only needed if rerunning old_path.rename(new_path) # ---------- Save / evaluate ---------- def _finalize_and_report(self): assert self.model is not None and self.training_util is not None # Save the final bundle Model.save(model=self.model, target_directory=self.base_dir, name=self.model_name, save_architecture=self.save_architecture) # Evaluate on supplied splits perf_train = self.model.evaluate(self.left_train, self.right_train, self.matches_train, mismatch_share=1.0) perf_test = self.model.evaluate(self.left_test, self.right_test, self.matches_test, mismatch_share=1.0) # Export stats self.training_util.performance_statistics_export( model=self.model, model_name=self.model_name, target_directory=self.base_dir, evaluation_train=perf_train, evaluation_test=perf_test )