Source code for ssl_framework.strategies

# ABOUTME: Strategy classes for label selection and integration in SSL framework
# ABOUTME: Provides modular, swappable components for customizing semi-supervised learning behavior

from typing import Tuple, Optional
import numpy as np


class ConfidenceThreshold:
    """Pseudo-label selector driven by a fixed confidence cut-off.

    An unlabeled sample is picked when its largest predicted class
    probability is strictly greater than ``threshold``.
    """

    def __init__(self, threshold=0.95):
        """Store the confidence cut-off.

        Parameters
        ----------
        threshold : float, default=0.95
            Cut-off applied to each sample's maximum class probability.
            The comparison is strict (``>``), so a sample sitting exactly
            on the cut-off is not selected.
        """
        self.threshold = threshold

    def select_labels(self, X_unlabeled, y_proba):
        """Pick every unlabeled sample whose confidence beats the cut-off.

        Parameters
        ----------
        X_unlabeled : ndarray of shape (n_unlabeled_samples, n_features)
            Unlabeled feature data.
        y_proba : ndarray of shape (n_unlabeled_samples, n_classes)
            Predicted class probabilities for the unlabeled samples.

        Returns
        -------
        X_new_labeled : ndarray
            Feature rows of the selected samples. When nothing is selected
            this is a new empty array of shape ``(0, n_features)``.
        y_new_labels : ndarray
            Arg-max class index of each selected sample.
        indices_to_remove : ndarray
            Row positions (into ``X_unlabeled``) of the selected samples,
            in ascending order.
        """
        # A sample's confidence is the largest probability in its row.
        row_confidence = np.max(y_proba, axis=1)
        chosen = np.nonzero(row_confidence > self.threshold)[0]

        if len(chosen) > 0:
            features = X_unlabeled[chosen]
            pseudo_labels = np.argmax(y_proba[chosen], axis=1)
            return features, pseudo_labels, chosen

        # Nothing cleared the cut-off: hand back correctly shaped empties.
        n_features = X_unlabeled.shape[1]
        return (
            np.empty((0, n_features)),
            np.empty(0, dtype=int),
            np.empty(0, dtype=int),
        )
class AppendAndGrow:
    """Integration strategy that only ever adds to the labeled set.

    Each call stacks the newly pseudo-labeled samples underneath the
    labeled data it is given, so the labeled set grows monotonically.
    """

    def __init__(self):
        """Create the strategy; there is nothing to configure."""

    def integrate_labels(self, X_labeled, y_labeled, X_new_labeled, y_new_labels, **kwargs):
        """Append the new pseudo-labeled samples to the current labeled set.

        Parameters
        ----------
        X_labeled : ndarray
            Current labeled feature data.
        y_labeled : ndarray
            Current labeled targets.
        X_new_labeled : ndarray
            Feature data of the newly pseudo-labeled samples.
        y_new_labels : ndarray
            Pseudo-labels of the new samples.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        X_labeled_next : ndarray
            Labeled features followed by the new features. When there are
            no new samples, ``X_labeled`` itself is returned.
        y_labeled_next : ndarray
            Labeled targets followed by the new pseudo-labels. When there
            are no new samples, ``y_labeled`` itself is returned.
        sample_weights_next : None
            Always ``None``; this strategy does not weight samples.
        """
        if len(X_new_labeled) > 0:
            # Rows are stacked vertically, targets end to end.
            grown_X = np.vstack((X_labeled, X_new_labeled))
            grown_y = np.hstack((y_labeled, y_new_labels))
        else:
            # Nothing to add: pass the inputs straight through.
            grown_X, grown_y = X_labeled, y_labeled

        return grown_X, grown_y, None
class TopKFixedCount:
    """Label selection strategy that selects the top K most confident samples.

    Each call selects the ``k`` samples with the highest maximum predicted
    probability, regardless of any confidence threshold. Fewer than ``k``
    samples are returned when fewer are available, and nothing is selected
    when ``k`` is zero or negative.
    """

    def __init__(self, k=10):
        """Initialize the top-K strategy.

        Parameters
        ----------
        k : int, default=10
            Number of samples to select in each iteration. Values ``<= 0``
            result in an empty selection.
        """
        self.k = k

    def select_labels(self, X_unlabeled, y_proba) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Select the K most confident samples.

        Parameters
        ----------
        X_unlabeled : ndarray of shape (n_unlabeled_samples, n_features)
            Unlabeled feature data.
        y_proba : ndarray of shape (n_unlabeled_samples, n_classes)
            Predicted class probabilities for unlabeled samples.

        Returns
        -------
        X_new_labeled : ndarray
            Feature data for newly selected samples, most confident first.
        y_new_labels : ndarray
            Predicted labels (arg-max class index) for the selected samples.
        indices_to_remove : ndarray
            Indices of samples to remove from the unlabeled set, ordered by
            decreasing confidence.
        """
        # Clamp to [0, n]: never ask for more than is available, and treat a
        # non-positive k as "select nothing".
        k_actual = max(0, min(self.k, len(X_unlabeled)))

        if k_actual == 0:
            # This guard is required for correctness, not just speed:
            # argpartition(..., -0)[-0:] slices [0:] and would return *every*
            # sample instead of none.
            return (
                np.empty((0, X_unlabeled.shape[1])),
                np.empty(0, dtype=int),
                np.empty(0, dtype=int),
            )

        # Confidence of a sample is its largest class probability.
        max_proba = np.max(y_proba, axis=1)

        # argpartition finds the k largest without fully sorting all samples.
        top_k_indices = np.argpartition(max_proba, -k_actual)[-k_actual:]

        # Order just those k by confidence, highest first.
        top_k_indices = top_k_indices[np.argsort(max_proba[top_k_indices])[::-1]]

        # Get pseudo-labels and corresponding features.
        y_new_labels = np.argmax(y_proba[top_k_indices], axis=1)
        X_new_labeled = X_unlabeled[top_k_indices]

        return X_new_labeled, y_new_labels, top_k_indices
class FullReLabeling:
    """Label integration strategy that rebuilds the labeled set each iteration.

    Instead of growing the labeled set monotonically, this strategy always
    returns the original labeled data plus the pseudo-labeled samples passed
    to the current call; whatever was accumulated previously is discarded.
    """

    def __init__(self, X_original, y_original):
        """Initialize the full re-labeling strategy.

        Parameters
        ----------
        X_original : ndarray
            Original labeled feature data.
        y_original : ndarray
            Original labeled targets.
        """
        # Private copies, so later changes to the caller's arrays cannot
        # alter the baseline this strategy rebuilds from.
        self.X_original = X_original.copy()
        self.y_original = y_original.copy()

    def integrate_labels(
        self, X_labeled, y_labeled, X_new_labeled, y_new_labels, **kwargs
    ) -> Tuple[np.ndarray, np.ndarray, None]:
        """Integrate labels by concatenating with the original data only.

        Parameters
        ----------
        X_labeled : ndarray
            Current labeled feature data (ignored).
        y_labeled : ndarray
            Current labeled targets (ignored).
        X_new_labeled : ndarray
            New pseudo-labeled feature data.
        y_new_labels : ndarray
            New pseudo-labels.
        **kwargs
            Additional parameters (ignored).

        Returns
        -------
        X_labeled_next : ndarray
            Original data concatenated with new pseudo-labeled data.
        y_labeled_next : ndarray
            Original labels concatenated with new pseudo-labels.
        sample_weights_next : None
            Sample weights (None for this strategy).

        Notes
        -----
        The returned arrays are always new objects, never the stored
        originals, so callers may modify them in place.
        """
        if len(X_new_labeled) == 0:
            # No new samples: return copies of the baseline so that in-place
            # edits by the caller cannot corrupt the stored originals.
            return self.X_original.copy(), self.y_original.copy(), None

        # vstack/hstack allocate fresh arrays, so no aliasing here either.
        X_labeled_next = np.vstack([self.X_original, X_new_labeled])
        y_labeled_next = np.hstack([self.y_original, y_new_labels])

        return X_labeled_next, y_labeled_next, None
class ConfidenceWeighting:
    """Label integration strategy that weights samples by their confidence.

    Newly pseudo-labeled samples are assigned a weight equal to their maximum
    predicted probability, while every sample already in the labeled set is
    given weight 1.0.
    """

    def __init__(self):
        """Initialize the confidence weighting strategy."""

    def integrate_labels(
        self,
        X_labeled,
        y_labeled,
        X_new_labeled,
        y_new_labels,
        y_proba: Optional[np.ndarray] = None,
        indices: Optional[np.ndarray] = None,
        **kwargs,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Integrate labels with confidence-based weighting.

        Parameters
        ----------
        X_labeled : ndarray
            Current labeled feature data.
        y_labeled : ndarray
            Current labeled targets.
        X_new_labeled : ndarray
            New pseudo-labeled feature data.
        y_new_labels : ndarray
            New pseudo-labels.
        y_proba : ndarray, optional
            Predicted probabilities for all unlabeled samples.
        indices : ndarray, optional
            Indices of selected samples in y_proba.
        **kwargs
            Additional parameters (ignored). Accepted so this strategy can be
            called with the same keyword set as the other integration
            strategies in this module.

        Returns
        -------
        X_labeled_next : ndarray
            Updated labeled feature data.
        y_labeled_next : ndarray
            Updated labeled targets.
        sample_weights_next : ndarray
            One weight per row of ``X_labeled_next``.

        Notes
        -----
        Weights are rebuilt from scratch on every call: each row of
        ``X_labeled`` receives 1.0, including rows that were pseudo-labeled
        by an earlier call. Only the samples added by the current call carry
        a confidence weight.
        """
        # Everything already in the labeled set gets full weight.
        original_weights = np.ones(len(X_labeled))

        if len(X_new_labeled) == 0:
            # No new samples, return current data with unit weights.
            return X_labeled, y_labeled, original_weights

        # Concatenate new data with existing labeled data.
        X_labeled_next = np.vstack([X_labeled, X_new_labeled])
        y_labeled_next = np.hstack([y_labeled, y_new_labels])

        if y_proba is not None and indices is not None:
            # Confidence of a sample is its largest class probability.
            new_weights = np.max(y_proba, axis=1)[indices]
        else:
            # Fallback: without probabilities the new samples get weight 1.0.
            new_weights = np.ones(len(X_new_labeled))

        sample_weights_next = np.hstack([original_weights, new_weights])

        return X_labeled_next, y_labeled_next, sample_weights_next