集成学习(Ensemble Learning)

在机器学习领域,单一模型往往难以在所有场景下都达到最佳效果。因此,集成学习(Ensemble Learning)通过组合多个弱学习器(单个模型)来构建更强大的模型,提高预测性能和泛化能力。它的核心思想是集众之智,让多个模型协同工作,从而减少单一模型的偏差和方差,提高整体性能。其主要优势包括:

  • 提高准确率:通过多个模型的组合降低误差。
  • 减少过拟合:不同模型的多样性有助于提高泛化能力。
  • 增强稳定性:避免单个模型在特定情况下表现不佳。

1. Bagging

Bagging(自助聚合) 通过随机抽样生成多个训练数据集,并对每个数据集训练一个基学习器,最终通过投票或平均的方式得出最终预测。Bagging 的关键特性:

  • 样本采样:从训练数据集中有放回地随机抽取多个子集。
  • 模型训练:每个子集训练一个独立的学习器。
  • 结果融合:分类任务用投票,回归任务用均值。

随机森林(Random Forest)是 Bagging 的经典应用,通过组合多个决策树降低过拟合。

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def test():
    """Bag three different base classifiers on iris and print their scores.

    The iris data is split 80/20 with a fixed seed. For each base model a
    ``BaggingClassifier`` with three members is fitted on the training part;
    the fitted members and the test-set score are printed.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    estimators = [DecisionTreeClassifier(), SVC(), RandomForestClassifier()]

    for base_estimator in estimators:
        # Three clones of the base model, each trained on its own resample.
        estimator = BaggingClassifier(estimator=base_estimator, n_estimators=3)
        estimator.fit(X_train, y_train)
        print(estimator.estimators_)

        score = estimator.score(X_test, y_test)
        print(score)


if __name__ == '__main__':
    test()
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def test():
    """Bag three different base classifiers on iris and print their scores.

    Uses an 80/20 split with a fixed seed; every ``BaggingClassifier`` has
    three members. Prints the fitted members and the test-set score for each
    base model in turn.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    estimators = [DecisionTreeClassifier(), SVC(), RandomForestClassifier()]

    for base_estimator in estimators:
        estimator = BaggingClassifier(estimator=base_estimator, n_estimators=3)
        estimator.fit(X_train, y_train)
        print(estimator.estimators_)

        score = estimator.score(X_test, y_test)
        print(score)


if __name__ == '__main__':
    test()
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




def test():
    """Demonstrate bagging on the iris dataset with several base learners.

    Holds out 20% of the samples (seed 42) for evaluation. A decision tree,
    an SVC and a random forest are each wrapped in a three-member
    ``BaggingClassifier``; after fitting, the list of fitted members and the
    score on the held-out samples are printed.
    """
    iris = load_iris()
    features, labels = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    base_models = (DecisionTreeClassifier(), SVC(), RandomForestClassifier())

    for base_model in base_models:
        bagger = BaggingClassifier(estimator=base_model, n_estimators=3)
        bagger.fit(X_train, y_train)

        # Show the individual fitted members before the aggregate score.
        print(bagger.estimators_)
        print(bagger.score(X_test, y_test))


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    test()

2. Boosting

Boosting(提升)通过串行地训练一系列弱学习器来逐步提升整体效果。不同于 Bagging 中各学习器相互独立地训练,Boosting 让后续模型重点关注前一轮误分类的样本(AdaBoost 通过调整样本权重实现,Gradient Boosting 则通过拟合前一轮的残差实现),逐步降低整体误差。Boosting 的关键特性:

  • 加权训练:每轮训练时,调整样本权重,使错误分类的样本在下一轮更受关注。
  • 弱学习器组合:一般使用简单模型,如决策树(弱学习器)。
  • 最终决策:按各个学习器的权重进行加权投票。

典型算法

  • AdaBoost(Adaptive Boosting)
  • Gradient Boosting
  • XGBoost(Extreme Gradient Boosting)
  • LightGBM(Light Gradient Boosting Machine)
  • CatBoost(Category Boosting)

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
def test():
    """Fit AdaBoost and gradient boosting on iris and print both scores.

    The data is split 80/20 with a fixed seed; each model is fitted on the
    training part and its test-set score is printed.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    # Boosting driven by sample weights.
    # `algorithm` is deliberately not passed: "SAMME.R" is deprecated in
    # scikit-learn 1.4 and no longer accepted from 1.6 onwards.
    estimator = AdaBoostClassifier(estimator=DecisionTreeClassifier(),
                                   n_estimators=10,
                                   learning_rate=1.0,
                                   random_state=42)
    estimator.fit(X_train, y_train)
    print(estimator.score(X_test, y_test))

    # Boosting driven by fitting residuals.
    estimator = GradientBoostingClassifier()
    estimator.fit(X_train, y_train)
    print(estimator.score(X_test, y_test))


if __name__ == '__main__':
    test()
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def test():
    """Fit AdaBoost and gradient boosting on iris and print both scores.

    Uses an 80/20 split with a fixed seed; each model is fitted on the
    training part and its test-set score is printed.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    # Boosting driven by sample weights.
    # `algorithm` is deliberately not passed: "SAMME.R" is deprecated in
    # scikit-learn 1.4 and no longer accepted from 1.6 onwards.
    estimator = AdaBoostClassifier(estimator=DecisionTreeClassifier(),
                                   n_estimators=10,
                                   learning_rate=1.0,
                                   random_state=42)
    estimator.fit(X_train, y_train)
    print(estimator.score(X_test, y_test))

    # Boosting driven by fitting residuals.
    estimator = GradientBoostingClassifier()
    estimator.fit(X_train, y_train)
    print(estimator.score(X_test, y_test))


if __name__ == '__main__':
    test()
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def test():
    """Compare two boosting classifiers on the iris dataset.

    Holds out 20% of the samples (seed 42), then fits an
    ``AdaBoostClassifier`` and a ``GradientBoostingClassifier`` on the
    training part and prints the test-set score of each.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    # Boosting driven by sample weights.
    # `algorithm` is deliberately not passed: "SAMME.R" is deprecated in
    # scikit-learn 1.4 and no longer accepted from 1.6 onwards, so hard-coding
    # it makes the example fail on current releases. The library default
    # works on every version that accepts `estimator=`.
    # NOTE(review): an unconstrained DecisionTreeClassifier is not a weak
    # learner (scikit-learn's default base estimator is a depth-1 stump);
    # confirm the deep tree is intended here.
    estimator = AdaBoostClassifier(estimator=DecisionTreeClassifier(),
                                   n_estimators=10,
                                   learning_rate=1.0,
                                   random_state=42)
    estimator.fit(X_train, y_train)
    print(estimator.score(X_test, y_test))

    # Boosting driven by fitting residuals.
    estimator = GradientBoostingClassifier()
    estimator.fit(X_train, y_train)
    print(estimator.score(X_test, y_test))


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    test()

3. Stacking

Stacking(堆叠) 通过不同类型的模型组合,使用更高级的模型(元学习器,Meta Learner)对基础模型的输出进行学习。Stacking 的关键特性:

  • 多层结构:第一层由多个不同类型的学习器组成,第二层使用一个新的学习器进行融合。
  • 数据流:第一层模型的输出作为下一层模型的输入。
  • 提升泛化能力:融合多个不同模型的优势。

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def test():
    """Fit a stacking classifier on iris and print its test-set score.

    The first layer holds a decision tree, an SVC and a random forest; a
    logistic regression is the final estimator. The data is split 80/20 with
    a fixed seed.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    # (name, model) pairs forming the first layer of the stack.
    estimators = [('dt', DecisionTreeClassifier()), ('svc', SVC()), ('rf', RandomForestClassifier())]
    estimator = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    estimator.fit(X_train, y_train)
    score = estimator.score(X_test, y_test)
    print(score)


if __name__ == '__main__':
    test()
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def test():
    """Fit a stacking classifier on iris and print its test-set score.

    The first layer holds a decision tree, an SVC and a random forest; a
    logistic regression is the final estimator. The data is split 80/20 with
    a fixed seed.
    """
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42
    )

    # (name, model) pairs forming the first layer of the stack.
    estimators = [('dt', DecisionTreeClassifier()), ('svc', SVC()), ('rf', RandomForestClassifier())]
    estimator = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    estimator.fit(X_train, y_train)
    score = estimator.score(X_test, y_test)
    print(score)


if __name__ == '__main__':
    test()
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




def test():
    """Demonstrate stacking on the iris dataset.

    Holds out 20% of the samples (seed 42). A decision tree, an SVC and a
    random forest form the first layer; a logistic regression is passed as
    the final estimator. Prints the stacked model's score on the held-out
    samples.
    """
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42
    )

    # First layer: heterogeneous models, each tagged with a short name.
    first_layer = [
        ('dt', DecisionTreeClassifier()),
        ('svc', SVC()),
        ('rf', RandomForestClassifier()),
    ]
    meta_learner = LogisticRegression()

    stack = StackingClassifier(estimators=first_layer, final_estimator=meta_learner)
    stack.fit(X_train, y_train)
    print(stack.score(X_test, y_test))


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    test()

未经允许不得转载:一亩三分地 » 集成学习(Ensemble Learning)
评论 (0)

2 + 7 =