Source code for ssl_framework.strategies

# ABOUTME: Strategy classes for label selection and integration in SSL framework
# ABOUTME: Provides modular, swappable components for customizing semi-supervised learning behavior

from typing import Tuple, Optional
import numpy as np



[docs]
class ConfidenceThreshold:
    """Label selection strategy based on a confidence threshold.

    Selects the unlabeled samples whose maximum predicted class probability
    is strictly greater than ``threshold``.
    """

    def __init__(self, threshold: float = 0.95) -> None:
        """Initialize the confidence threshold strategy.

        Parameters
        ----------
        threshold : float, default=0.95
            Confidence threshold for selecting pseudo-labels.
            Samples with max probability > threshold will be selected.
        """
        self.threshold = threshold

    def select_labels(
        self, X_unlabeled: np.ndarray, y_proba: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Select samples whose top class probability exceeds the threshold.

        Parameters
        ----------
        X_unlabeled : ndarray of shape (n_unlabeled_samples, n_features)
            Unlabeled feature data.
        y_proba : ndarray of shape (n_unlabeled_samples, n_classes)
            Predicted class probabilities for unlabeled samples.

        Returns
        -------
        X_new_labeled : ndarray
            Rows of ``X_unlabeled`` for the selected samples.
        y_new_labels : ndarray
            Pseudo-labels for the selected samples, given as the column
            index of the largest probability in ``y_proba``.
        indices_to_remove : ndarray
            Row indices (into ``X_unlabeled``) of the selected samples,
            in ascending order.

        Notes
        -----
        When no sample passes the threshold all three arrays are empty.
        The empty feature array keeps the dtype and trailing dimensions of
        ``X_unlabeled``.
        """
        # The comparison is strict: a sample sitting exactly on the
        # threshold is not selected.
        max_proba = np.max(y_proba, axis=1)
        confident_indices = np.where(max_proba > self.threshold)[0]

        # Indexing with an empty integer index array yields correctly shaped
        # empty results, so the "nothing selected" case needs no special
        # handling and the feature dtype is preserved.
        y_new_labels = np.argmax(y_proba[confident_indices], axis=1)
        X_new_labeled = X_unlabeled[confident_indices]

        return X_new_labeled, y_new_labels, confident_indices





[docs]
class AppendAndGrow:
    """Label integration strategy that appends new labels to the existing set.

    The labeled dataset grows monotonically: newly pseudo-labeled samples
    are stacked below the current labeled set and nothing is ever removed.
    """

    def __init__(self) -> None:
        """Initialize the append-and-grow strategy (it holds no state)."""

    def integrate_labels(
        self,
        X_labeled: np.ndarray,
        y_labeled: np.ndarray,
        X_new_labeled: np.ndarray,
        y_new_labels: np.ndarray,
        **kwargs,
    ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]:
        """Integrate new pseudo-labeled samples by appending them.

        Parameters
        ----------
        X_labeled : ndarray
            Current labeled feature data.
        y_labeled : ndarray
            Current labeled targets.
        X_new_labeled : ndarray
            New pseudo-labeled feature data.
        y_new_labels : ndarray
            New pseudo-labels.
        **kwargs
            Additional parameters (ignored).

        Returns
        -------
        X_labeled_next : ndarray
            Updated labeled feature data.
        y_labeled_next : ndarray
            Updated labeled targets.
        sample_weights_next : None
            Sample weights (always None for this strategy).
        """
        if len(X_new_labeled) == 0:
            # Nothing to add: hand back the inputs themselves (no copy).
            return X_labeled, y_labeled, None

        # Features are stacked row-wise, targets end-to-end.
        X_labeled_next = np.vstack([X_labeled, X_new_labeled])
        y_labeled_next = np.hstack([y_labeled, y_new_labels])

        return X_labeled_next, y_labeled_next, None





[docs]
class TopKFixedCount:
    """Label selection strategy that selects the top K most confident samples.

    Each call selects the ``k`` samples with the highest maximum predicted
    probability, regardless of any confidence threshold. Fewer than ``k``
    samples are returned when fewer are available, and nothing is selected
    when ``k`` is zero or negative.
    """

    def __init__(self, k: int = 10) -> None:
        """Initialize the top-K strategy.

        Parameters
        ----------
        k : int, default=10
            Number of samples to select in each iteration.
        """
        self.k = k

    def select_labels(
        self, X_unlabeled: np.ndarray, y_proba: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Select the K most confident samples.

        Parameters
        ----------
        X_unlabeled : ndarray of shape (n_unlabeled_samples, n_features)
            Unlabeled feature data.
        y_proba : ndarray of shape (n_unlabeled_samples, n_classes)
            Predicted class probabilities for unlabeled samples.

        Returns
        -------
        X_new_labeled : ndarray
            Feature data for newly selected samples.
        y_new_labels : ndarray
            Pseudo-labels for the selected samples, given as the column
            index of the largest probability in ``y_proba``.
        indices_to_remove : ndarray
            Row indices (into ``X_unlabeled``) of the selected samples,
            ordered from most to least confident.
        """
        # Never ask for more samples than are available.
        k_actual = min(self.k, len(X_unlabeled))

        if k_actual <= 0:
            # Covers an empty unlabeled pool and a non-positive k. The guard
            # is required: with k == 0 the slice ``[-0:]`` below would be
            # ``[0:]`` and silently select every sample.
            return (
                X_unlabeled[:0],
                np.empty(0, dtype=int),
                np.empty(0, dtype=int),
            )

        max_proba = np.max(y_proba, axis=1)

        # argpartition places the k_actual largest confidences in the tail
        # without fully sorting the array.
        top_k_indices = np.argpartition(max_proba, -k_actual)[-k_actual:]
        # Order only the selected few by confidence (highest first).
        top_k_indices = top_k_indices[np.argsort(max_proba[top_k_indices])[::-1]]

        # Get pseudo-labels and corresponding features
        y_new_labels = np.argmax(y_proba[top_k_indices], axis=1)
        X_new_labeled = X_unlabeled[top_k_indices]

        return X_new_labeled, y_new_labels, top_k_indices





[docs]
class FullReLabeling:
    """Label integration strategy that rebuilds the labeled set each iteration.

    Instead of growing the labeled set monotonically, every call returns the
    original labeled data plus only the pseudo-labeled samples passed to that
    call. The current labeled set handed to ``integrate_labels`` is ignored,
    so pseudo-labels from earlier calls are kept only if they are passed in
    again.
    """

    def __init__(self, X_original: np.ndarray, y_original: np.ndarray) -> None:
        """Initialize the full re-labeling strategy.

        Parameters
        ----------
        X_original : ndarray
            Original labeled feature data.
        y_original : ndarray
            Original labeled targets.
        """
        # Private copies, so later changes to the caller's arrays cannot
        # alter the reference data.
        self.X_original = X_original.copy()
        self.y_original = y_original.copy()

    def integrate_labels(
        self,
        X_labeled: np.ndarray,
        y_labeled: np.ndarray,
        X_new_labeled: np.ndarray,
        y_new_labels: np.ndarray,
        **kwargs,
    ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]:
        """Integrate labels by concatenating with the original data only.

        Parameters
        ----------
        X_labeled : ndarray
            Current labeled feature data (ignored).
        y_labeled : ndarray
            Current labeled targets (ignored).
        X_new_labeled : ndarray
            New pseudo-labeled feature data.
        y_new_labels : ndarray
            New pseudo-labels.
        **kwargs
            Additional parameters (ignored).

        Returns
        -------
        X_labeled_next : ndarray
            Original data concatenated with new pseudo-labeled data.
        y_labeled_next : ndarray
            Original labels concatenated with new pseudo-labels.
        sample_weights_next : None
            Sample weights (always None for this strategy).
        """
        if len(X_new_labeled) == 0:
            # No new samples: fall back to the original data. Copies are
            # returned so a caller mutating the result cannot corrupt the
            # stored originals (the stacking branch below always allocates
            # new arrays as well).
            return self.X_original.copy(), self.y_original.copy(), None

        # Concatenate original data with new pseudo-labeled data
        X_labeled_next = np.vstack([self.X_original, X_new_labeled])
        y_labeled_next = np.hstack([self.y_original, y_new_labels])

        return X_labeled_next, y_labeled_next, None





[docs]
class ConfidenceWeighting:
    """Label integration strategy that weights samples by their confidence.

    Newly pseudo-labeled samples receive a weight equal to their maximum
    predicted probability. Every sample already in the labeled set receives
    weight 1.0 on each call; weights returned by earlier calls are not
    carried over.
    """

    def __init__(self) -> None:
        """Initialize the confidence weighting strategy (it holds no state)."""

    def integrate_labels(
        self,
        X_labeled: np.ndarray,
        y_labeled: np.ndarray,
        X_new_labeled: np.ndarray,
        y_new_labels: np.ndarray,
        y_proba: Optional[np.ndarray] = None,
        indices: Optional[np.ndarray] = None,
        **kwargs,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Integrate labels with confidence-based weighting.

        Parameters
        ----------
        X_labeled : ndarray
            Current labeled feature data.
        y_labeled : ndarray
            Current labeled targets.
        X_new_labeled : ndarray
            New pseudo-labeled feature data.
        y_new_labels : ndarray
            New pseudo-labels.
        y_proba : ndarray, optional
            Predicted probabilities for all unlabeled samples.
        indices : ndarray, optional
            Indices of selected samples in y_proba.
        **kwargs
            Additional parameters (ignored). Accepted so this strategy can
            be called the same way as the other integration strategies.

        Returns
        -------
        X_labeled_next : ndarray
            Updated labeled feature data.
        y_labeled_next : ndarray
            Updated labeled targets.
        sample_weights_next : ndarray
            One weight per row of ``X_labeled_next``: 1.0 for the existing
            samples, followed by the confidence of each new sample (or 1.0
            when ``y_proba`` or ``indices`` is not supplied).

        Raises
        ------
        ValueError
            If ``y_proba[indices]`` does not provide exactly one confidence
            value per row of ``X_new_labeled``.
        """
        # Samples already in the labeled set always get weight 1.0.
        existing_weights = np.ones(len(X_labeled))

        if len(X_new_labeled) == 0:
            # No new samples: return the current data with unit weights.
            return X_labeled, y_labeled, existing_weights

        if y_proba is not None and indices is not None:
            # Confidence of a sample = its largest class probability.
            new_weights = np.max(y_proba, axis=1)[indices]
            if np.size(new_weights) != len(X_new_labeled):
                # Fail here rather than return a weight vector whose length
                # disagrees with the data it is meant to weight.
                raise ValueError(
                    "indices selects %d confidence values but X_new_labeled "
                    "has %d samples" % (np.size(new_weights), len(X_new_labeled))
                )
        else:
            # Fallback: without probabilities, new samples get weight 1.0.
            new_weights = np.ones(len(X_new_labeled))

        # Concatenate new data with existing labeled data
        X_labeled_next = np.vstack([X_labeled, X_new_labeled])
        y_labeled_next = np.hstack([y_labeled, y_new_labels])
        sample_weights_next = np.hstack([existing_weights, new_weights])

        return X_labeled_next, y_labeled_next, sample_weights_next