Techniques API Reference
Complete API reference for all resampling techniques.
Base API
All techniques follow the scikit-learn/imbalanced-learn sampler API: instantiate with parameters, then call fit_resample to obtain the resampled data:
sampler = Technique(parameters)
X_resampled, y_resampled = sampler.fit_resample(X, y)
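For a concrete example on synthetic data (a sketch, assuming RFCL is importable from fairsample.techniques as documented below):

import numpy as np
from sklearn.datasets import make_classification
from fairsample.techniques import RFCL  # import path as documented below

# Roughly 1000 majority vs. 100 minority samples
X, y = make_classification(n_samples=1100, weights=[10 / 11], random_state=42)
sampler = RFCL(n_estimators=100, cv=3, random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)
print(np.bincount(y), "->", np.bincount(y_resampled))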
Overlap-Based Undersampling
RFCL
fairsample.techniques.RFCL
Bases: BaseSampler
Random Forest Cleaning Rule (RFCL).
This technique uses Random Forest to identify and remove noisy/overlapping
majority class samples that are likely to be misclassified.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| n_estimators | int | Number of trees in the random forest | 100 |
| random_state | int | Random state for reproducibility | None |
| cv | int | Number of cross-validation folds | 3 |
Source code in fairsample/techniques/rfcl.py
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

from .base import BaseSampler  # assumed location of BaseSampler


class RFCL(BaseSampler):
    """
    Random Forest Cleaning Rule (RFCL).

    This technique uses Random Forest to identify and remove noisy/overlapping
    majority class samples that are likely to be misclassified.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees in the random forest
    random_state : int, default=None
        Random state for reproducibility
    cv : int, default=3
        Number of cross-validation folds
    """

    def __init__(self, n_estimators=100, random_state=None, cv=3):
        super().__init__(random_state=random_state)
        self.n_estimators = n_estimators
        self.cv = cv
        self._sampling_type = 'undersampling'

    def fit_resample(self, X, y):
        """
        Resample the dataset using RFCL.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        X_resampled : array-like
            Resampled training data
        y_resampled : array-like
            Resampled target values
        """
        X, y = self._validate_input(X, y)

        # Get minority and majority class indices
        minority_indices, majority_indices, minority_class, majority_class = \
            self._get_minority_majority_indices(y)

        # If dataset is already balanced or minority is larger, return as is
        if len(minority_indices) >= len(majority_indices):
            return X.copy(), y.copy()

        # Create Random Forest classifier
        rf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            random_state=self.random_state
        )

        # Get out-of-fold predictions via cross-validation
        try:
            y_pred = cross_val_predict(rf, X, y, cv=self.cv)
        except Exception:
            # Fallback (e.g. too few samples per class for the requested
            # folds): fit on the full data and predict
            rf.fit(X, y)
            y_pred = rf.predict(X)

        # Identify correctly classified majority samples
        correctly_classified_majority = majority_indices[
            y_pred[majority_indices] == y[majority_indices]
        ]

        # Keep all minority samples and correctly classified majority samples
        keep_indices = np.concatenate([minority_indices, correctly_classified_majority])

        # If cleaning removed too many samples, add some back at random
        if len(correctly_classified_majority) < len(minority_indices):
            # Keep at least as many majority samples as minority samples
            n_additional = len(minority_indices) - len(correctly_classified_majority)
            incorrectly_classified_majority = majority_indices[
                y_pred[majority_indices] != y[majority_indices]
            ]
            if len(incorrectly_classified_majority) > 0:
                rng = np.random.RandomState(self.random_state)
                additional_indices = rng.choice(
                    incorrectly_classified_majority,
                    size=min(n_additional, len(incorrectly_classified_majority)),
                    replace=False
                )
                keep_indices = np.concatenate([keep_indices, additional_indices])

        # Sort indices to maintain order
        keep_indices = np.sort(keep_indices)

        return X[keep_indices], y[keep_indices]
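The heart of the cleaning rule can be seen in isolation: majority samples that a cross-validated Random Forest misclassifies out-of-fold are treated as overlapping. A minimal sketch of that criterion using scikit-learn directly (class 0 is the majority class in this synthetic dataset):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=1100, weights=[10 / 11],
                           class_sep=0.5, random_state=0)
# Out-of-fold predictions approximate how each sample is classified unseen
y_pred = cross_val_predict(RandomForestClassifier(random_state=0), X, y, cv=3)
flagged = np.where((y == 0) & (y_pred != y))[0]
print(f"{len(flagged)} overlapping majority samples would be removed")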
fit_resample(X, y)
Resample the dataset using RFCL.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| X | array-like of shape (n_samples, n_features) | Training data | required |
| y | array-like of shape (n_samples,) | Target values | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| X_resampled | array-like | Resampled training data |
| y_resampled | array-like | Resampled target values |
NUS
fairsample.techniques.NUS
Bases: BaseSampler
Neighbourhood-based Under-Sampling (NUS).
This technique removes majority class samples that are in the neighborhood
of minority class samples, helping to reduce class overlap.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| n_neighbors | int | Number of neighbors to consider | 3 |
| random_state | int | Random state for reproducibility | None |
Source code in fairsample/techniques/nus.py
import numpy as np
from sklearn.neighbors import NearestNeighbors

from .base import BaseSampler  # assumed location of BaseSampler


class NUS(BaseSampler):
    """
    Neighbourhood-based Under-Sampling (NUS).

    This technique removes majority class samples that are in the neighborhood
    of minority class samples, helping to reduce class overlap.

    Parameters
    ----------
    n_neighbors : int, default=3
        Number of neighbors to consider
    random_state : int, default=None
        Random state for reproducibility
    """

    def __init__(self, n_neighbors=3, random_state=None):
        super().__init__(random_state=random_state)
        self.n_neighbors = n_neighbors
        self._sampling_type = 'undersampling'

    def fit_resample(self, X, y):
        """
        Resample the dataset using NUS.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        X_resampled : array-like
            Resampled training data
        y_resampled : array-like
            Resampled target values
        """
        X, y = self._validate_input(X, y)

        # Get minority and majority class indices
        minority_indices, majority_indices, minority_class, majority_class = \
            self._get_minority_majority_indices(y)

        # If dataset is already balanced or minority is larger, return as is
        if len(minority_indices) >= len(majority_indices):
            return X.copy(), y.copy()

        # Fit k-NN on minority samples
        X_minority = X[minority_indices]

        # Adjust n_neighbors if we have fewer minority samples
        k = min(self.n_neighbors, len(X_minority) - 1)
        if k <= 0:
            return X.copy(), y.copy()

        knn = NearestNeighbors(n_neighbors=k)
        knn.fit(X_minority)

        # Distance from each majority sample to its nearest minority neighbor.
        # Majority points are not in the fitted (minority) set, so column 0 is
        # already the nearest minority neighbor, not the query point itself.
        X_majority = X[majority_indices]
        distances, indices = knn.kneighbors(X_majority)

        # Remove majority samples that are too close to minority samples,
        # using the median nearest-neighbor distance as the threshold
        median_distance = np.median(distances[:, 0])

        # Keep majority samples that are far enough from minority samples
        keep_majority_mask = distances[:, 0] > median_distance
        keep_majority_indices = majority_indices[keep_majority_mask]

        # Ensure we keep at least as many majority samples as minority samples
        if len(keep_majority_indices) < len(minority_indices):
            # If we removed too many, re-add some of the removed samples
            n_additional = len(minority_indices) - len(keep_majority_indices)
            removed_indices = majority_indices[~keep_majority_mask]
            if len(removed_indices) > 0:
                # Sort by distance, descending, and re-add the farthest ones
                removed_distances = distances[~keep_majority_mask, 0]
                sorted_indices = np.argsort(removed_distances)[::-1]
                additional_indices = removed_indices[sorted_indices[:n_additional]]
                keep_majority_indices = np.concatenate([keep_majority_indices, additional_indices])

        # Combine minority and selected majority samples
        keep_indices = np.concatenate([minority_indices, keep_majority_indices])
        keep_indices = np.sort(keep_indices)

        return X[keep_indices], y[keep_indices]
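A usage sketch (the dataset is synthetic; NUS and its parameters are as documented above):

from collections import Counter
from sklearn.datasets import make_classification
from fairsample.techniques import NUS

X, y = make_classification(n_samples=1100, weights=[10 / 11], random_state=0)
sampler = NUS(n_neighbors=3, random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)
# Majority samples whose nearest-minority distance falls below the median
# are dropped, so roughly half of the majority class is removed
print(Counter(y), "->", Counter(y_resampled))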
fit_resample(X, y)
Resample the dataset using NUS.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| X | array-like of shape (n_samples, n_features) | Training data | required |
| y | array-like of shape (n_samples,) | Target values | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| X_resampled | array-like | Resampled training data |
| y_resampled | array-like | Resampled target values |
URNS
fairsample.techniques.URNS
Bases: BaseSampler
Undersampling based on Recursive Neighbourhood Search (URNS).
This technique recursively removes majority class samples that are
in dense regions and close to the decision boundary.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| n_neighbors | int | Number of neighbors to consider | 5 |
| random_state | int | Random state for reproducibility | None |
| max_iterations | int | Maximum number of recursive iterations | 10 |
import numpy as np
from sklearn.neighbors import NearestNeighbors

from .base import BaseSampler  # assumed location of BaseSampler


class URNS(BaseSampler):
    """
    Undersampling based on Recursive Neighbourhood Search (URNS).

    This technique recursively removes majority class samples that are
    in dense regions and close to the decision boundary.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to consider
    random_state : int, default=None
        Random state for reproducibility
    max_iterations : int, default=10
        Maximum number of recursive iterations
    """

    def __init__(self, n_neighbors=5, random_state=None, max_iterations=10):
        super().__init__(random_state=random_state)
        self.n_neighbors = n_neighbors
        self.max_iterations = max_iterations
        self._sampling_type = 'undersampling'

    def fit_resample(self, X, y):
        """
        Resample the dataset using URNS.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        X_resampled : array-like
            Resampled training data
        y_resampled : array-like
            Resampled target values
        """
        X, y = self._validate_input(X, y)

        # Get minority and majority class indices
        minority_indices, majority_indices, minority_class, majority_class = \
            self._get_minority_majority_indices(y)

        # If dataset is already balanced or minority is larger, return as is
        if len(minority_indices) >= len(majority_indices):
            return X.copy(), y.copy()

        # Start with all samples
        current_indices = np.arange(len(X))
        target_majority_size = len(minority_indices)

        for iteration in range(self.max_iterations):
            # Get current majority indices
            current_y = y[current_indices]
            current_majority_mask = current_y == majority_class
            current_majority_indices = current_indices[current_majority_mask]

            # If we've reached the target size, stop
            if len(current_majority_indices) <= target_majority_size:
                break

            # Fit k-NN on the current data
            X_current = X[current_indices]
            k = min(self.n_neighbors, len(X_current) - 1)
            if k <= 0:
                break

            knn = NearestNeighbors(n_neighbors=k + 1)
            knn.fit(X_current)

            # Find neighbors for each sample (column 0 is the point itself,
            # since the query set equals the fitted set)
            distances, indices = knn.kneighbors(X_current)

            # Calculate neighborhood purity for majority samples
            majority_scores = []
            current_majority_local_indices = np.where(current_majority_mask)[0]

            for local_idx in current_majority_local_indices:
                # Get neighbors (excluding self)
                neighbor_indices = indices[local_idx, 1:]
                neighbor_labels = current_y[neighbor_indices]

                # Calculate purity (fraction of majority class neighbors)
                purity = np.sum(neighbor_labels == majority_class) / len(neighbor_labels)
                majority_scores.append((local_idx, purity))

            if not majority_scores:
                break

            # Sort by purity (highest first): the purest majority samples sit
            # in dense majority regions and are removed first
            majority_scores.sort(key=lambda x: x[1], reverse=True)

            # Remove at most 25% of the scored majority samples per iteration
            n_to_remove = min(
                len(current_majority_indices) - target_majority_size,
                max(1, len(majority_scores) // 4)
            )

            indices_to_remove = [score[0] for score in majority_scores[:n_to_remove]]
            global_indices_to_remove = current_indices[indices_to_remove]

            # Update current indices
            current_indices = np.setdiff1d(current_indices, global_indices_to_remove)

        return X[current_indices], y[current_indices]
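A usage sketch (synthetic data again):

from collections import Counter
from sklearn.datasets import make_classification
from fairsample.techniques import URNS

X, y = make_classification(n_samples=1100, weights=[10 / 11], random_state=0)
sampler = URNS(n_neighbors=5, max_iterations=10, random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)
# Each iteration prunes at most 25% of the remaining majority samples,
# starting with those whose neighborhoods are purely majority class
print(Counter(y), "->", Counter(y_resampled))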
fit_resample(X, y)
Resample the dataset using URNS.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| X | array-like of shape (n_samples, n_features) | Training data | required |
| y | array-like of shape (n_samples,) | Target values | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| X_resampled | array-like | Resampled training data |
| y_resampled | array-like | Resampled target values |
DeviOCSVM
fairsample.techniques.DeviOCSVM
Bases: RandomUnderSampler
Devi et al. One-Class SVM method (placeholder implementation).
Currently uses random undersampling as a placeholder.
TODO: Implement the actual Devi OCSVM algorithm.
Source code in fairsample/techniques/devi_ocsvm.py
from imblearn.under_sampling import RandomUnderSampler


class DeviOCSVM(RandomUnderSampler):
    """
    Devi et al. One-Class SVM method (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual Devi OCSVM algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'undersampling'
FCMBoostOBU
fairsample.techniques.FCMBoostOBU
Bases: RandomUnderSampler
Fuzzy C-Means Boosted Overlap-Based Undersampling (placeholder implementation).
Currently uses random undersampling as a placeholder.
TODO: Implement the actual FCM Boost OBU algorithm.
Source code in fairsample/techniques/fcm_boost_obu.py
from imblearn.under_sampling import RandomUnderSampler


class FCMBoostOBU(RandomUnderSampler):
    """
    Fuzzy C-Means Boosted Overlap-Based Undersampling (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual FCM Boost OBU algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'
Hybrid Methods
SVDDWSMOTE
fairsample.techniques.SVDDWSMOTE
Bases: RandomOverSampler
SVDD-based overlap handler (placeholder implementation).
Currently uses random oversampling as a placeholder.
TODO: Implement the actual SVDD WSMOTE algorithm.
Source code in fairsample/techniques/svddwsmote.py
from imblearn.over_sampling import RandomOverSampler


class SVDDWSMOTE(RandomOverSampler):
    """
    SVDD-based overlap handler (placeholder implementation).

    Currently uses random oversampling as a placeholder.
    TODO: Implement the actual SVDD WSMOTE algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'
ODBOT
fairsample.techniques.ODBOT
Bases: RandomOverSampler
Outlier Detection-Based Oversampling Technique (placeholder implementation).
Currently uses random oversampling as a placeholder.
TODO: Implement the actual ODBOT algorithm.
Source code in fairsample/techniques/odbot.py
from imblearn.over_sampling import RandomOverSampler


class ODBOT(RandomOverSampler):
    """
    Outlier Detection-Based Oversampling Technique (placeholder implementation).

    Currently uses random oversampling as a placeholder.
    TODO: Implement the actual ODBOT algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'oversampling'
EHSO
fairsample.techniques.EHSO
Bases: RandomUnderSampler
Evolutionary Hybrid Sampling in Overlapping scenarios (placeholder implementation).
Currently uses random undersampling as a placeholder.
TODO: Implement the actual EHSO algorithm.
Source code in fairsample/techniques/ehso.py
from imblearn.under_sampling import RandomUnderSampler


class EHSO(RandomUnderSampler):
    """
    Evolutionary Hybrid Sampling in Overlapping scenarios (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual EHSO algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'
Clustering-Based
NBUS
fairsample.techniques.NBUS
Bases: NUS
Neighbourhood-Based Undersampling (placeholder implementation).
Currently uses NUS as a placeholder.
TODO: Implement the actual NBUS variants.
Source code in fairsample/techniques/nbus.py
from .nus import NUS  # assumed module path


class NBUS(NUS):
    """
    Neighbourhood-Based Undersampling (placeholder implementation).

    Currently uses NUS as a placeholder.
    TODO: Implement the actual NBUS variants.
    """

    def __init__(self, variant='basic', random_state=None):
        super().__init__(n_neighbors=3, random_state=random_state)
        self.variant = variant
        self._sampling_type = 'undersampling'
KMeansUndersampling
fairsample.techniques.KMeansUndersampling
Bases: RandomUnderSampler
K-Means based undersampling (placeholder implementation).
Currently uses random undersampling as a placeholder.
TODO: Implement the actual K-Means undersampling variants.
Source code in fairsample/techniques/kmeans_undersampling.py
from imblearn.under_sampling import RandomUnderSampler


class KMeansUndersampling(RandomUnderSampler):
    """
    K-Means based undersampling (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual K-Means undersampling variants.
    """

    def __init__(self, variant='basic', random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self.variant = variant
        self._sampling_type = 'undersampling'
Comprehensive
OSM
fairsample.techniques.OSM
Bases: RandomUnderSampler
Overlap-Separating Model (placeholder implementation).
Currently uses random undersampling as a placeholder.
TODO: Implement the actual OSM algorithm.
Source code in fairsample/techniques/osm.py
from imblearn.under_sampling import RandomUnderSampler


class OSM(RandomUnderSampler):
    """
    Overlap-Separating Model (placeholder implementation).

    Currently uses random undersampling as a placeholder.
    TODO: Implement the actual OSM algorithm.
    """

    def __init__(self, random_state=None):
        super().__init__(sampling_strategy='auto', random_state=random_state)
        self._sampling_type = 'hybrid'
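Every technique above, placeholders included, exposes the same fit_resample interface, so they can be compared side by side. A sketch (the technique list and dataset are illustrative):

from collections import Counter
from sklearn.datasets import make_classification
from fairsample.techniques import RFCL, NUS, URNS, OSM

X, y = make_classification(n_samples=1100, weights=[10 / 11], random_state=0)
for Technique in (RFCL, NUS, URNS, OSM):
    X_res, y_res = Technique(random_state=0).fit_resample(X, y)
    print(Technique.__name__, Counter(y_res))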
Baselines
RandomOverSampler
From imbalanced-learn:
from imblearn.over_sampling import RandomOverSampler
sampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)
RandomUnderSampler
From imbalanced-learn:
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)
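Any of these samplers can also be chained with a classifier using imbalanced-learn's Pipeline, which applies resampling only during fitting, so validation folds stay untouched. A sketch:

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=1100, weights=[10 / 11], random_state=0)
pipe = Pipeline([
    ('sampler', RandomUnderSampler(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000)),
])
scores = cross_val_score(pipe, X, y, cv=5, scoring='balanced_accuracy')
print(scores.mean())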