
Techniques API Reference

Complete API reference for all resampling techniques.

Base API

All techniques follow the scikit-learn estimator style and expose imbalanced-learn's fit_resample interface:

sampler = Technique(parameters)
X_resampled, y_resampled = sampler.fit_resample(X, y)
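
For example, a minimal end-to-end sketch. It assumes fairsample is installed; scikit-learn's make_classification is used only to fabricate a toy imbalanced dataset:

from collections import Counter

from sklearn.datasets import make_classification
from fairsample.techniques import RFCL

# Toy dataset: ~90% majority class, ~10% minority class
X, y = make_classification(n_samples=1000, n_features=10,
                           weights=[0.9, 0.1], random_state=42)

sampler = RFCL(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)

print(Counter(y))            # class counts before resampling
print(Counter(y_resampled))  # class counts after resampling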

Overlap-Based Undersampling

RFCL

fairsample.techniques.RFCL

Bases: BaseSampler

Random Forest Cleaning Rule (RFCL).

This technique uses Random Forest to identify and remove noisy/overlapping majority class samples that are likely to be misclassified.

Parameters:

Name          Type  Description                           Default
------------  ----  ------------------------------------  -------
n_estimators  int   Number of trees in the random forest  100
random_state  int   Random state for reproducibility      None
cv            int   Number of cross-validation folds      3
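
A hedged usage sketch; only the RFCL signature comes from this page, and the dataset below is illustrative:

from sklearn.datasets import make_classification
from fairsample.techniques import RFCL

X, y = make_classification(n_samples=500, weights=[0.85, 0.15], random_state=0)

# More trees and more CV folds make the out-of-fold predictions that drive
# the cleaning rule more stable, at extra computational cost
sampler = RFCL(n_estimators=200, cv=5, random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)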
Source code in fairsample/techniques/rfcl.py
class RFCL(BaseSampler):
    """
    Random Forest Cleaning Rule (RFCL).

    This technique uses Random Forest to identify and remove noisy/overlapping
    majority class samples that are likely to be misclassified.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees in the random forest
    random_state : int, default=None
        Random state for reproducibility
    cv : int, default=3
        Number of cross-validation folds
    """

    def __init__(self, n_estimators=100, random_state=None, cv=3):
        super().__init__(random_state=random_state)
        self.n_estimators = n_estimators
        self.cv = cv
        self._sampling_type = 'undersampling'

    def fit_resample(self, X, y):
        """
        Resample the dataset using RFCL.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        X_resampled : array-like
            Resampled training data
        y_resampled : array-like
            Resampled target values
        """
        X, y = self._validate_input(X, y)

        # Get minority and majority class indices
        minority_indices, majority_indices, minority_class, majority_class = \
            self._get_minority_majority_indices(y)

        # If dataset is already balanced or minority is larger, return as is
        if len(minority_indices) >= len(majority_indices):
            return X.copy(), y.copy()

        # Create Random Forest classifier
        rf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            random_state=self.random_state
        )

        # Get out-of-fold predictions; fall back to in-sample predictions
        # if cross-validation fails (e.g. too few samples per fold)
        try:
            y_pred = cross_val_predict(rf, X, y, cv=self.cv)
        except Exception:
            # Fallback: fit on full data and predict
            rf.fit(X, y)
            y_pred = rf.predict(X)

        # Identify correctly classified majority samples
        correctly_classified_majority = majority_indices[
            y_pred[majority_indices] == y[majority_indices]
        ]

        # Keep all minority samples and correctly classified majority samples
        keep_indices = np.concatenate([minority_indices, correctly_classified_majority])

        # If we removed too many samples, keep some randomly
        if len(correctly_classified_majority) < len(minority_indices):
            # Keep at least as many majority samples as minority samples
            n_additional = len(minority_indices) - len(correctly_classified_majority)
            incorrectly_classified_majority = majority_indices[
                y_pred[majority_indices] != y[majority_indices]
            ]

            if len(incorrectly_classified_majority) > 0:
                # Use a local Generator instead of reseeding the global RNG
                rng = np.random.default_rng(self.random_state)
                additional_indices = rng.choice(
                    incorrectly_classified_majority,
                    size=min(n_additional, len(incorrectly_classified_majority)),
                    replace=False
                )
                keep_indices = np.concatenate([keep_indices, additional_indices])

        # Sort indices to maintain order
        keep_indices = np.sort(keep_indices)

        return X[keep_indices], y[keep_indices]

fit_resample(X, y)

Resample the dataset using RFCL.

Parameters:

Name  Type                                          Description    Default
----  --------------------------------------------  -------------  --------
X     array-like of shape (n_samples, n_features)   Training data  required
y     array-like of shape (n_samples,)              Target values  required

Returns:

Name         Type        Description
-----------  ----------  -----------------------
X_resampled  array-like  Resampled training data
y_resampled  array-like  Resampled target values


NUS

fairsample.techniques.NUS

Bases: BaseSampler

Neighbourhood-based Under-Sampling (NUS).

This technique removes majority class samples that are in the neighborhood of minority class samples, helping to reduce class overlap.

Parameters:

Name          Type  Description                       Default
------------  ----  --------------------------------  -------
n_neighbors   int   Number of neighbors to consider   3
random_state  int   Random state for reproducibility  None
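
A hedged usage sketch; the toy overlapping dataset is illustrative only:

from sklearn.datasets import make_classification
from fairsample.techniques import NUS

# class_sep < 1 makes the two classes overlap, the scenario NUS targets
X, y = make_classification(n_samples=500, weights=[0.9, 0.1],
                           class_sep=0.5, random_state=0)

# n_neighbors controls how large a minority neighbourhood is consulted
sampler = NUS(n_neighbors=5, random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)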
Source code in fairsample/techniques/nus.py
class NUS(BaseSampler):
    """
    Neighbourhood-based Under-Sampling (NUS).

    This technique removes majority class samples that are in the neighborhood
    of minority class samples, helping to reduce class overlap.

    Parameters
    ----------
    n_neighbors : int, default=3
        Number of neighbors to consider
    random_state : int, default=None
        Random state for reproducibility
    """

    def __init__(self, n_neighbors=3, random_state=None):
        super().__init__(random_state=random_state)
        self.n_neighbors = n_neighbors
        self._sampling_type = 'undersampling'

    def fit_resample(self, X, y):
        """
        Resample the dataset using NUS.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        X_resampled : array-like
            Resampled training data
        y_resampled : array-like
            Resampled target values
        """
        X, y = self._validate_input(X, y)

        # Get minority and majority class indices
        minority_indices, majority_indices, minority_class, majority_class = \
            self._get_minority_majority_indices(y)

        # If dataset is already balanced or minority is larger, return as is
        if len(minority_indices) >= len(majority_indices):
            return X.copy(), y.copy()

        # Fit k-NN on minority samples
        X_minority = X[minority_indices]

        # Cap n_neighbors at the number of available minority samples
        k = min(self.n_neighbors, len(X_minority))
        if k <= 0:
            return X.copy(), y.copy()

        # Majority queries are not part of the fitted (minority) set, so the
        # first returned neighbour is already the nearest minority sample
        knn = NearestNeighbors(n_neighbors=k)
        knn.fit(X_minority)

        # Distance from each majority sample to its nearest minority neighbours
        X_majority = X[majority_indices]
        distances, _ = knn.kneighbors(X_majority)
        nearest_distances = distances[:, 0]

        # Remove majority samples that are too close to minority samples,
        # using the median nearest-neighbour distance as the threshold
        median_distance = np.median(nearest_distances)

        # Keep majority samples that are far enough from minority samples
        keep_majority_mask = nearest_distances > median_distance
        keep_majority_indices = majority_indices[keep_majority_mask]

        # Ensure we keep at least as many majority samples as minority samples
        if len(keep_majority_indices) < len(minority_indices):
            # If we removed too many, add back the farthest of the removed ones
            n_additional = len(minority_indices) - len(keep_majority_indices)
            removed_indices = majority_indices[~keep_majority_mask]

            if len(removed_indices) > 0:
                # Sort by distance and keep the farthest ones among the removed
                removed_distances = nearest_distances[~keep_majority_mask]
                sorted_indices = np.argsort(removed_distances)[::-1]  # Descending order
                additional_indices = removed_indices[sorted_indices[:n_additional]]
                keep_majority_indices = np.concatenate([keep_majority_indices, additional_indices])

        # Combine minority and selected majority samples
        keep_indices = np.concatenate([minority_indices, keep_majority_indices])
        keep_indices = np.sort(keep_indices)

        return X[keep_indices], y[keep_indices]

fit_resample(X, y)

Resample the dataset using NUS.

Parameters:

Name  Type                                          Description    Default
----  --------------------------------------------  -------------  --------
X     array-like of shape (n_samples, n_features)   Training data  required
y     array-like of shape (n_samples,)              Target values  required

Returns:

Name         Type        Description
-----------  ----------  -----------------------
X_resampled  array-like  Resampled training data
y_resampled  array-like  Resampled target values


URNS

fairsample.techniques.URNS

Bases: BaseSampler

Undersampling based on Recursive Neighbourhood Search (URNS).

This technique recursively removes majority class samples that are in dense regions and close to the decision boundary.

Parameters:

Name            Type  Description                             Default
--------------  ----  --------------------------------------  -------
n_neighbors     int   Number of neighbors to consider         5
random_state    int   Random state for reproducibility        None
max_iterations  int   Maximum number of recursive iterations  10
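
A hedged usage sketch; the toy dataset is illustrative only:

from sklearn.datasets import make_classification
from fairsample.techniques import URNS

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)

# max_iterations bounds the recursion; each pass removes at most ~25% of the
# majority samples whose neighbourhoods are purest
sampler = URNS(n_neighbors=5, max_iterations=10, random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)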
Source code in fairsample/techniques/urns.py
class URNS(BaseSampler):
    """
    Undersampling based on Recursive Neighbourhood Search (URNS).

    This technique recursively removes majority class samples that are
    in dense regions and close to the decision boundary.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to consider
    random_state : int, default=None
        Random state for reproducibility
    max_iterations : int, default=10
        Maximum number of recursive iterations
    """

    def __init__(self, n_neighbors=5, random_state=None, max_iterations=10):
        super().__init__(random_state=random_state)
        self.n_neighbors = n_neighbors
        self.max_iterations = max_iterations
        self._sampling_type = 'undersampling'

    def fit_resample(self, X, y):
        """
        Resample the dataset using URNS.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        X_resampled : array-like
            Resampled training data
        y_resampled : array-like
            Resampled target values
        """
        X, y = self._validate_input(X, y)

        # Get minority and majority class indices
        minority_indices, majority_indices, minority_class, majority_class = \
            self._get_minority_majority_indices(y)

        # If dataset is already balanced or minority is larger, return as is
        if len(minority_indices) >= len(majority_indices):
            return X.copy(), y.copy()

        # Start with all samples
        current_indices = np.arange(len(X))
        target_majority_size = len(minority_indices)

        for iteration in range(self.max_iterations):
            # Get current majority indices
            current_y = y[current_indices]
            current_majority_mask = current_y == majority_class
            current_majority_indices = current_indices[current_majority_mask]

            # If we've reached the target size, stop
            if len(current_majority_indices) <= target_majority_size:
                break

            # Fit k-NN on current data
            X_current = X[current_indices]
            k = min(self.n_neighbors, len(X_current) - 1)
            if k <= 0:
                break

            knn = NearestNeighbors(n_neighbors=k + 1)
            knn.fit(X_current)

        # Find the neighbours of each sample; the query set equals the fitted
        # set here, so column 0 of `indices` is the point itself
        _, indices = knn.kneighbors(X_current)

            # Calculate neighborhood purity for majority samples
            majority_scores = []
            current_majority_local_indices = np.where(current_majority_mask)[0]

            for local_idx in current_majority_local_indices:
                # Get neighbors (excluding self)
                neighbor_indices = indices[local_idx, 1:]
                neighbor_labels = current_y[neighbor_indices]

                # Calculate purity (fraction of majority class neighbors)
                purity = np.sum(neighbor_labels == majority_class) / len(neighbor_labels)
                majority_scores.append((local_idx, purity))

            if not majority_scores:
                break

            # Sort by purity (highest first) and remove samples with highest purity
            majority_scores.sort(key=lambda x: x[1], reverse=True)

            # Remove a fraction of the most pure majority samples
            n_to_remove = min(
                len(current_majority_indices) - target_majority_size,
                max(1, len(majority_scores) // 4)  # Remove 25% at most
            )

            indices_to_remove = [score[0] for score in majority_scores[:n_to_remove]]
            global_indices_to_remove = current_indices[indices_to_remove]

            # Update current indices
            current_indices = np.setdiff1d(current_indices, global_indices_to_remove)

        return X[current_indices], y[current_indices]

fit_resample(X, y)

Resample the dataset using URNS.

Parameters:

Name  Type                                          Description    Default
----  --------------------------------------------  -------------  --------
X     array-like of shape (n_samples, n_features)   Training data  required
y     array-like of shape (n_samples,)              Target values  required

Returns:

Name         Type        Description
-----------  ----------  -----------------------
X_resampled  array-like  Resampled training data
y_resampled  array-like  Resampled target values


DeviOCSVM

fairsample.techniques.DeviOCSVM

Bases: RandomUnderSampler

Devi et al. One-Class SVM method (placeholder implementation).

Currently uses random undersampling as a placeholder. TODO: Implement the actual Devi OCSVM algorithm.

Source code in fairsample/techniques/devi_ocsvm.py
class DeviOCSVM(RandomUnderSampler):
    """
    Devi et al. One-Class SVM method (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual Devi OCSVM algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'undersampling'

FCMBoostOBU

fairsample.techniques.FCMBoostOBU

Bases: RandomUnderSampler

Fuzzy C-Means Boosted Overlap-Based Undersampling (placeholder implementation).

Currently uses random undersampling as a placeholder. TODO: Implement the actual FCM Boost OBU algorithm.

Source code in fairsample/techniques/fcm_boost_obu.py
class FCMBoostOBU(RandomUnderSampler):
    """
    Fuzzy C-Means Boosted Overlap-Based Undersampling (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual FCM Boost OBU algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'

Hybrid Methods

SVDDWSMOTE

fairsample.techniques.SVDDWSMOTE

Bases: RandomOverSampler

SVDD-based overlap handler (placeholder implementation).

Currently uses random oversampling as a placeholder. TODO: Implement the actual SVDD WSMOTE algorithm.

Source code in fairsample/techniques/svddwsmote.py
class SVDDWSMOTE(RandomOverSampler):
    """
    SVDD-based overlap handler (placeholder implementation).

    Currently uses random oversampling as a placeholder.
    TODO: Implement the actual SVDD WSMOTE algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'

ODBOT

fairsample.techniques.ODBOT

Bases: RandomOverSampler

Outlier Detection-Based Oversampling Technique (placeholder implementation).

Currently uses random oversampling as a placeholder. TODO: Implement the actual ODBOT algorithm.

Source code in fairsample/techniques/odbot.py
class ODBOT(RandomOverSampler):
    """
    Outlier Detection-Based Oversampling Technique (placeholder implementation).

    Currently uses random oversampling as a placeholder.
    TODO: Implement the actual ODBOT algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'oversampling'

EHSO

fairsample.techniques.EHSO

Bases: RandomUnderSampler

Evolutionary Hybrid Sampling in Overlapping scenarios (placeholder implementation).

Currently uses random undersampling as a placeholder. TODO: Implement the actual EHSO algorithm.

Source code in fairsample/techniques/ehso.py
class EHSO(RandomUnderSampler):
    """
    Evolutionary Hybrid Sampling in Overlapping scenarios (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual EHSO algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'

Clustering-Based

NBUS

fairsample.techniques.NBUS

Bases: NUS

Neighbourhood-Based Undersampling (placeholder implementation).

Currently uses NUS as a placeholder. TODO: Implement the actual NBUS variants.

Source code in fairsample/techniques/nbus.py
class NBUS(NUS):
    """
    Neighbourhood-Based Undersampling (placeholder implementation).

    Currently uses NUS as a placeholder.
    TODO: Implement the actual NBUS variants.
    """

    def __init__(self, variant='basic', random_state=None):
        super().__init__(n_neighbors=3, random_state=random_state)
        self.variant = variant
        self._sampling_type = 'undersampling'

KMeansUndersampling

fairsample.techniques.KMeansUndersampling

Bases: RandomUnderSampler

K-Means based undersampling (placeholder implementation).

Currently uses random undersampling as a placeholder. TODO: Implement the actual K-Means undersampling variants.

Source code in fairsample/techniques/kmeans_undersampling.py
class KMeansUndersampling(RandomUnderSampler):
    """
    K-Means based undersampling (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual K-Means undersampling variants.
    """

    def __init__(self, variant='basic', random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self.variant = variant
        self._sampling_type = 'undersampling'

Comprehensive

OSM

fairsample.techniques.OSM

Bases: RandomUnderSampler

Overlap-Separating Model (placeholder implementation).

Currently uses random undersampling as a placeholder. TODO: Implement the actual OSM algorithm.

Source code in fairsample/techniques/osm.py
class OSM(RandomUnderSampler):
    """
    Overlap-Separating Model (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual OSM algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'
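
Because every technique above, placeholders included, exposes the same fit_resample interface, samplers can be swapped in and out of an experiment without code changes. A sketch, using toy data and four techniques documented on this page:

from collections import Counter

from sklearn.datasets import make_classification
from fairsample.techniques import RFCL, NUS, URNS, OSM

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)

for sampler in (RFCL(random_state=0), NUS(random_state=0),
                URNS(random_state=0), OSM(random_state=0)):
    X_res, y_res = sampler.fit_resample(X, y)
    print(type(sampler).__name__, Counter(y_res))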

Baselines

RandomOverSampler

From imbalanced-learn:

from imblearn.over_sampling import RandomOverSampler

sampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)

RandomUnderSampler

From imbalanced-learn:

from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)
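
Both baselines follow imbalanced-learn's sampler interface, so they can be combined with an estimator in an imblearn Pipeline, which applies resampling during fit only and never at prediction time. A sketch:

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=42)

pipe = Pipeline([
    ("sampler", RandomUnderSampler(random_state=42)),
    ("clf", LogisticRegression(max_iter=1000)),
])
pipe.fit(X, y)            # the sampler resamples the training data here
y_pred = pipe.predict(X)  # no resampling happens at prediction time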