比较欠采样采样器#

以下示例旨在对 imbalanced-learn 包中提供的不同欠采样算法进行定性比较。

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
# Echo the module docstring so the example's description appears in the output.
print(__doc__)

import seaborn as sns

# Select seaborn's "poster" plotting context for the figures drawn below.
sns.set_context("poster")

以下函数将用于创建玩具数据集。它使用了来自scikit-learn的make_classification,但固定了一些参数。

from sklearn.datasets import make_classification


def create_dataset(
    n_samples=1000,
    weights=(0.01, 0.01, 0.98),
    n_classes=3,
    class_sep=0.8,
    n_clusters=1,
):
    """Build a reproducible two-feature toy classification problem.

    Thin wrapper around ``make_classification`` that pins the feature layout
    (two informative features, no redundant or repeated ones) and the random
    seed, leaving only the arguments below adjustable.

    Parameters
    ----------
    n_samples : int, default=1000
        Forwarded as ``n_samples``.
    weights : sequence of float, default=(0.01, 0.01, 0.98)
        Forwarded as ``weights`` after conversion to a list.
    n_classes : int, default=3
        Forwarded as ``n_classes``.
    class_sep : float, default=0.8
        Forwarded as ``class_sep``.
    n_clusters : int, default=1
        Forwarded as ``n_clusters_per_class``.

    Returns
    -------
    The value returned by ``make_classification`` (callers in this file
    unpack it as ``X, y``).
    """
    # Settings every dataset in this example shares.
    pinned = {
        "n_features": 2,
        "n_informative": 2,
        "n_redundant": 0,
        "n_repeated": 0,
        "random_state": 0,
    }
    # Settings the caller is allowed to vary.
    tunable = {
        "n_samples": n_samples,
        "n_classes": n_classes,
        "n_clusters_per_class": n_clusters,
        "weights": list(weights),
        "class_sep": class_sep,
    }
    return make_classification(**pinned, **tunable)

以下函数将用于绘制重采样后的样本空间,以说明算法的特性。

def plot_resampling(X, y, sampler, ax, title=None):
    """Scatter-plot the samples that ``sampler`` keeps or produces.

    Parameters
    ----------
    X, y
        Data handed to ``sampler.fit_resample``; the first two columns of the
        resampled ``X`` are used as plot coordinates.
    sampler
        Object exposing ``fit_resample(X, y)``.
    ax
        Axes-like object to draw on.
    title : str, optional
        Axes title. When omitted, a title naming the sampler's class is used.
    """
    resampled_X, resampled_y = sampler.fit_resample(X, y)
    ax.scatter(
        resampled_X[:, 0],
        resampled_X[:, 1],
        c=resampled_y,
        alpha=0.8,
        edgecolor="k",
    )
    ax.set_title(
        f"Resampling with {sampler.__class__.__name__}" if title is None else title
    )
    sns.despine(ax=ax, offset=10)

以下函数将用于绘制给定一些数据的分类器的决策函数。

import numpy as np


def plot_decision_function(X, y, clf, ax, title=None):
    """Draw the predictions of ``clf`` over the plane together with the data.

    ``clf.predict`` is evaluated on a regular grid (spacing 0.02) that covers
    the bounding box of the first two columns of ``X`` enlarged by one unit on
    every side. The predictions are drawn as filled contours and the samples
    are overlaid, coloured by ``y``.

    Parameters
    ----------
    X
        Array whose first two columns are the plot coordinates.
    y
        Values used to colour the scattered samples.
    clf
        Object exposing ``predict``; called once on the flattened grid.
    ax
        Axes-like object to draw on.
    title : str, optional
        Axes title; the title is left untouched when ``None``.
    """
    step = 0.02
    first, second = X[:, 0], X[:, 1]

    # One axis of grid coordinates per feature, padded by 1 on both ends.
    grid_x = np.arange(first.min() - 1, first.max() + 1, step)
    grid_y = np.arange(second.min() - 1, second.max() + 1, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict on the flattened grid, then restore the 2-D grid layout.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predictions = clf.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, predictions, alpha=0.4)
    ax.scatter(first, second, alpha=0.8, c=y, edgecolor="k")
    if title is None:
        return
    ax.set_title(title)
from sklearn.linear_model import LogisticRegression

# Single classifier instance used as the final step of every pipeline built
# in the comparisons below.
clf = LogisticRegression()

原型生成:通过生成新样本进行欠采样#

ClusterCentroids 通过用找到的簇的质心替换原始样本来进行欠采样。

import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans

from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import ClusterCentroids

# 400-sample, three-class toy problem with class weights 5% / 15% / 80%.
X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

# Kept as a list, not a set: a set gives no ordering guarantee, so the
# subplot rows could swap between runs. With a list the identity resampler
# is always drawn on the first row and ClusterCentroids on the second.
samplers = [
    FunctionSampler(),  # identity resampler
    ClusterCentroids(
        estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=0
    ),
]

# One row per sampler: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])

fig.tight_layout()
Decision function with ClusterCentroids, Resampling with ClusterCentroids, Decision function with FunctionSampler, Resampling with FunctionSampler

原型选择:通过选择现有样本进行欠采样#

执行原型选择的算法可以分为两类:(i) 受控欠采样方法和 (ii) 清理欠采样方法。

使用受控的欠采样方法,可以指定要选择的样本数量。 RandomUnderSampler 是通过随机选择目标类别的给定数量的样本来执行此类选择的最简单方法。

from imblearn.under_sampling import RandomUnderSampler

# 400-sample, three-class toy problem with class weights 5% / 15% / 80%.
X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

# Kept as a list, not a set: a set gives no ordering guarantee, so the
# subplot rows could swap between runs. With a list the identity resampler
# is always drawn on the first row and RandomUnderSampler on the second.
samplers = [
    FunctionSampler(),  # identity resampler
    RandomUnderSampler(random_state=0),
]

# One row per sampler: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])

fig.tight_layout()
Decision function with RandomUnderSampler, Resampling with RandomUnderSampler, Decision function with FunctionSampler, Resampling with FunctionSampler

NearMiss 算法实现了一些启发式规则来选择样本。NearMiss-1 从多数类中选择那些与少数类的 \(k\) 个最近样本的平均距离最小的样本。NearMiss-2 从多数类中选择那些与负类的最远样本的平均距离最小的样本。NearMiss-3 是一个两步算法:首先,对于每个少数类样本,保留它们的 \(m\) 个最近邻居;然后,选择的多数类样本是那些与 \(k\) 个最近邻居的平均距离最大的样本。

from imblearn.under_sampling import NearMiss

# 1000-sample toy problem (class weights 5% / 15% / 80%) with a larger class
# separation than the default.
X, y = create_dataset(class_sep=1.5, weights=(0.05, 0.15, 0.8), n_samples=1000)

# One sampler per NearMiss variant, in version order.
samplers = [NearMiss(version=number) for number in (1, 2, 3)]

# One row per variant: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25))
for (decision_ax, resampling_ax), sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X,
        y,
        model,
        decision_ax,
        title=f"Decision function for {sampler.__class__.__name__}-{sampler.version}",
    )
    plot_resampling(
        X,
        y,
        sampler,
        resampling_ax,
        title=f"Resampling using {sampler.__class__.__name__}-{sampler.version}",
    )
fig.tight_layout()
Decision function for NearMiss-1, Resampling using NearMiss-1, Decision function for NearMiss-2, Resampling using NearMiss-2, Decision function for NearMiss-3, Resampling using NearMiss-3
/home/circleci/project/imblearn/under_sampling/_prototype_selection/_nearmiss.py:206: UserWarning: The number of the samples to be selected is larger than the number of samples available. The balancing ratio cannot be ensure and all samples will be returned.
  warnings.warn(
/home/circleci/project/imblearn/under_sampling/_prototype_selection/_nearmiss.py:206: UserWarning: The number of the samples to be selected is larger than the number of samples available. The balancing ratio cannot be ensure and all samples will be returned.
  warnings.warn(
/home/circleci/project/imblearn/under_sampling/_prototype_selection/_nearmiss.py:206: UserWarning: The number of the samples to be selected is larger than the number of samples available. The balancing ratio cannot be ensure and all samples will be returned.
  warnings.warn(
/home/circleci/project/imblearn/under_sampling/_prototype_selection/_nearmiss.py:206: UserWarning: The number of the samples to be selected is larger than the number of samples available. The balancing ratio cannot be ensure and all samples will be returned.
  warnings.warn(

EditedNearestNeighbours 移除多数类中那些类别与其最近邻不同的样本。这个筛选过程可以重复进行,这就是 RepeatedEditedNearestNeighbours 的原理。AllKNN 与 RepeatedEditedNearestNeighbours 略有不同,它会改变内部最近邻算法的 \(k\) 参数,并在每次迭代中增大该参数。

from imblearn.under_sampling import (
    AllKNN,
    EditedNearestNeighbours,
    RepeatedEditedNearestNeighbours,
)

# 500-sample, three-class toy problem with class weights 20% / 30% / 50%.
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

samplers = [
    EditedNearestNeighbours(),
    RepeatedEditedNearestNeighbours(),
    AllKNN(allow_minority=True),
]

# One row per sampler: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(3, 2, figsize=(15, 25))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    # Plot with the fitted pipeline itself, as the other comparisons in this
    # file do, rather than relying on the shared ``clf`` having been fitted
    # as a side effect of fitting the pipeline.
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function for \n{sampler.__class__.__name__}"
    )
    plot_resampling(
        X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}"
    )

fig.tight_layout()
Decision function for  EditedNearestNeighbours, Resampling using  EditedNearestNeighbours, Decision function for  RepeatedEditedNearestNeighbours, Resampling using  RepeatedEditedNearestNeighbours, Decision function for  AllKNN, Resampling using  AllKNN

CondensedNearestNeighbour 使用 1-NN 来迭代决定一个样本是否应该保留在数据集中。问题是 CondensedNearestNeighbour 对噪声敏感,因为它保留了噪声样本。OneSidedSelection 也使用了 1-NN,并使用 TomekLinks 来移除被认为是噪声的样本。NeighbourhoodCleaningRule 使用 EditedNearestNeighbours 来移除一些样本。此外,它们使用 3 个最近邻来移除不符合此规则的样本。

from imblearn.under_sampling import (
    CondensedNearestNeighbour,
    NeighbourhoodCleaningRule,
    OneSidedSelection,
)

# 500-sample, three-class toy problem with class weights 20% / 30% / 50%.
X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8)

# One row per sampler: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 25))

samplers = [
    CondensedNearestNeighbour(random_state=0),
    OneSidedSelection(random_state=0),
    NeighbourhoodCleaningRule(n_neighbors=11),
]

for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    # Plot with the fitted pipeline itself, as the other comparisons in this
    # file do, rather than relying on the shared ``clf`` having been fitted
    # as a side effect of fitting the pipeline.
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function for \n{sampler.__class__.__name__}"
    )
    plot_resampling(
        X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}"
    )
fig.tight_layout()
Decision function for  CondensedNearestNeighbour, Resampling using  CondensedNearestNeighbour, Decision function for  OneSidedSelection, Resampling using  OneSidedSelection, Decision function for  NeighbourhoodCleaningRule, Resampling using  NeighbourhoodCleaningRule

InstanceHardnessThreshold 使用分类器的预测来排除样本。所有以较低概率被分类的样本都将被移除。

from imblearn.under_sampling import InstanceHardnessThreshold

# NOTE: no new dataset is created here; ``X`` and ``y`` are whatever the
# preceding comparison left in scope.

# Kept as a list, not a set: a set gives no ordering guarantee, so the
# subplot rows could swap between runs. With a list the identity resampler
# is always drawn on the first row and InstanceHardnessThreshold on the
# second.
samplers = [
    FunctionSampler(),  # identity resampler
    InstanceHardnessThreshold(
        estimator=LogisticRegression(),
        random_state=0,
    ),
]

# One row per sampler: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X,
        y,
        model,
        ax[0],
        title=f"Decision function with \n{sampler.__class__.__name__}",
    )
    plot_resampling(
        X, y, sampler, ax[1], title=f"Resampling using \n{sampler.__class__.__name__}"
    )

fig.tight_layout()
plt.show()
Decision function with  FunctionSampler, Resampling using  FunctionSampler, Decision function with  InstanceHardnessThreshold, Resampling using  InstanceHardnessThreshold

脚本的总运行时间: (0 分钟 9.198 秒)

预计内存使用量: 227 MB

图库由Sphinx-Gallery生成