XGBoost 学习调参的例子

发现后面设置参数的时候，原生接口和sklearn的参数混在一起了，现在修改为

def run_xgboost(data_x,data_y,random_state_num):
    train_x,valid_x,train_y,valid_y = train_test_split(data_x.values,data_y.values,test_size=0.25,random_state=random_state_num)
    print('开始训练模型')
    start = time.time()
    #转换成xgb运算格式
    d_train = xgb.DMatrix(train_x,train_y)
    d_valid = xgb.DMatrix(valid_x,valid_y)
    watchlist = [(d_train,'train'),(d_valid,'valid')]
    #参数设置（未调箱前的参数）
    params={
        'eta':0.2,                        #特征权重，取值范围0~1，通常最后设置eta为0.01~0.2
        'max_depth':3,                    #树的深度，通常取值3-10，过大容易过拟合，过小欠拟合
        'min_child_weight':1,             #最小样本的权重，调大参数可以繁殖过拟合
        'gamma':0.4,                      #控制是否后剪枝，越大越保守，一般0.1、 0.2的样子
        'subsample':0.8,                  #随机取样比例
        'colsample_bytree':0.8 ,          #默认为1，取值0~1，对特征随机采集比例
        'lambda':0.8,
        'alpha':0.6,
        'n_estimators':500,
        'booster':'gbtree',               #迭代树
        'objective':'binary:logistic',    #逻辑回归，输出为概率
        'nthread':6,                      #设置最大的进程量，若不设置则会使用全部资源
        'scale_pos_weight':1,             #默认为0,1可以处理类别不平衡

        'seed':1234,                      #随机树种子
        'silent':1,                       #0表示输出结果
        'eval_metric':'auc'               #评分指标
    }
    bst = xgb.train(params, d_train,1000,watchlist,early_stopping_rounds=100, verbose_eval=5)   #最大迭代次数1000次
    print(time.time()-start)
    tree_nums = bst.best_ntree_limit
    print('最优模型树的数量：%s,最优迭代次数：%s,auc: %s' %(bst.best_ntree_limit,bst.best_iteration,bst.best_score))
    bst = xgb.train(params, d_train,tree_nums,watchlist,early_stopping_rounds=100, verbose_eval=10) #最优模型迭代次数去训练
    
#     feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
#     #新版需要转换成dict or list 
#     #feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#     #plt.bar(feat_imp.index, feat_imp)
#     feat_imp.plot(kind='bar', title='Feature Importances')
    #展示特征重要性排名
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(),key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp,columns=['feature','fscore'])
    #每个特征被调用的次数/所有特征被调用总次数
    df['fscore'] = df['fscore']/df['fscore'].sum()
    #分数高的排在前面,展示前40个重要特征排名
    df = df.sort_values(by='fscore',ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar',x='feature',y='fscore',legend=True,figsize=(32,10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

XGBoost 其实也是GBDT的一种，本编就说一下代码

导入模块

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  #画特征重要性的函数
#from imblearn.ensemble import EasyEnsemble  #还有模块木有安装
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib 已经改成了下面这种方式
import joblib
from sklearn.metrics import auc,roc_curve  #说明是分类
plt.rc('font',family='SimHei',size=13)   #使画出的图形中能正常显示中文
%matplotlib inline

EDA数据探索性分析

#训练数据、线上数据(无Y)、验证数据
train_data = pd.read_csv('F:\\win10 升级桌面数据备份\\3.学习模型\\train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]),len(train_data[train_data['label']==0]))  # 1: 815  0:42688
online_data = pd.read_csv('F:\\win10 升级桌面数据备份\\3.学习模型\\online_user_model_feat.csv')
valid_data = pd.read_csv('F:\\win10 升级桌面数据备份\\3.学习模型\\valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]),len(valid_data[valid_data['label']==0]))  # 1:892   0:39302

拆分特征和标签

train_y = train_data[['label']]
train_y.columns = ['y']
train_x = train_data.drop(['label','user_id'],axis=1)

valid_y = valid_data[['label']]
valid_y.columns = ['y']
valid_x = valid_data.drop(['label','user_id'],axis=1)
# 
file_xgboost_model='./xgboost_model' #模型文件
file_xgboost_columns='./columns.csv' #最终使用的特征
file_xgboost_model_auc_ks='./xgboost_model_auc_ks.png' #模型AUC和KS值
file_xgboost_model_score='./xgboost_model_score.png' # 模型预测用户的评分分布
file_xgboost_model_prob='./xgboost_model_prob.png' #模型预测用户的概率分布

网格搜索法调参

#coding=utf-8
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV  #网格搜索法
import xgboost as xgb
def xgbpa(trainX, trainY):
    # init ，分类
    xgb1 = XGBClassifier(  
        learning_rate=0.3,
        n_estimators=200,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=6
    )

    # max_depth 和 min_weight 参数调优   这个用给网格搜索法时的参数，数的层数由3-6
    param1 = {'max_depth': list(range(3, 7)), 'min_child_weight': list(range(1, 5, 2))}

    from sklearn import svm, datasets
    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(trainX, trainY)
    print(gsearch1.scorer_)
    print(gsearch1.best_params_, gsearch1.best_score_)  #最佳参数（形式是字典型），最高分数（就一个值）
    best_max_depth = gsearch1.best_params_['max_depth'] #输出的max_depth的values
    best_min_child_weight = gsearch1.best_params_['min_child_weight']  #同理上面

    # gamma参数调优
    param2 = {'gamma': [i / 10.0 for i in range(0, 5, 2)]}
    gsearch2 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,  # 如同学习率
            n_estimators=150,  # 树的个数
            max_depth=best_max_depth,  #同时替换上面的值
            min_child_weight=best_min_child_weight,  #同理上面
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch2.fit(trainX, trainY)
    print(gsearch2.scorer_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    best_gamma = gsearch2.best_params_['gamma']

    # 调整subsample 和 colsample_bytree参数
    param3 = {'subsample': [i / 10.0 for i in range(6, 9)], 'colsample_bytree': [i / 10.0 for i in range(6, 9)]}
    gsearch3 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch3.fit(trainX, trainY)
    print(gsearch3.scorer_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    best_subsample = gsearch3.best_params_['subsample']
    best_colsample_bytree = gsearch3.best_params_['colsample_bytree']

    # 正则化参数调优
    param4 = {'reg_alpha': [i / 10.0 for i in range(2, 10, 2)], 'reg_lambda': [i / 10.0 for i in range(2, 10, 2)]}
    gsearch4 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch4.fit(trainX, trainY)
    print(gsearch4.scorer_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    best_reg_alpha = gsearch4.best_params_['reg_alpha']
    best_reg_lambda = gsearch4.best_params_['reg_lambda']


    param5= {'scale_pos_weight': [i for i in [0.5, 1, 2]]}

    gsearch5 = GridSearchCV(
        estimator = XGBClassifier(
            learning_rate = 0.3,
            n_estimators = 150,
            max_depth = best_max_depth,
            min_child_weight = best_min_child_weight,
            gamma = best_gamma,
            subsample = best_subsample,
            colsample_bytree = best_colsample_bytree,
            reg_alpha = best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = 1,
            seed = 6
            ),
        param_grid = param5, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch5.fit(trainX, trainY)
    print(gsearch5.best_params_, gsearch5.best_score_)
    best_scale_pos_weight = gsearch5.best_params_['scale_pos_weight']

    # 降低学习速率，数的数量
    param6 = [{'learning_rate': [0.01, 0.05, 0.1, 0.2], 'n_estimators': [800, 1000, 1200]}]

    gsearch6 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = best_scale_pos_weight,
            seed = 6
    ),
    param_grid = param6, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch6.fit(trainX, trainY)
    print(gsearch6.scorer_)
    print(gsearch6.best_params_, gsearch6.best_score_)
    best_learning_rate = gsearch6.best_params_['learning_rate']
    best_n_estimators = gsearch6.best_params_['n_estimators']
    print('最好参数集：')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    print(gsearch5.best_params_, gsearch5.best_score_)
    print(gsearch6.best_params_, gsearch6.best_score_)


if __name__ == '__main__':
    # user_model cv
    #调参前得保持模型训练样本和调参样本数据一致
    print('--------------开始调参---------------')
    start = time.time()
    data_x,temp_x,data_y,temp_y = train_test_split(train_x,train_y,test_size=0.25,random_state=1234)
    xgbpa(data_x,data_y.y) #标签值有要是数组型的，不能是df，所以就.y了
    print('调参用时：%s'%(time.time()-start))

这个数据要跑挺久的（>0.5h）要留足时间去运行

--------------开始调参---------------
make_scorer(roc_auc_score, needs_threshold=True)
{'max_depth': 3, 'min_child_weight': 3} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'gamma': 0.0} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'colsample_bytree': 0.8, 'subsample': 0.8} 0.8169763045780181
make_scorer(roc_auc_score, needs_threshold=True)
{'reg_alpha': 0.6, 'reg_lambda': 0.8} 0.8148521719194484
{'scale_pos_weight': 0.5} 0.8155242908735241
make_scorer(roc_auc_score, needs_threshold=True)
{'learning_rate': 0.01, 'n_estimators': 1200} 0.8467294278425243
最好参数集：
{'max_depth': 3, 'min_child_weight': 3} 0.8169763045780181
{'gamma': 0.0} 0.8169763045780181
{'colsample_bytree': 0.8, 'subsample': 0.8} 0.8169763045780181
{'reg_alpha': 0.6, 'reg_lambda': 0.8} 0.8148521719194484
{'scale_pos_weight': 0.5} 0.8155242908735241
{'learning_rate': 0.01, 'n_estimators': 1200} 0.8467294278425243
调参用时：1126.5513534545898

特征列集索引表的建立

def create_feature_map(features):
    outfile = open('xgb.txt', 'w')  #写，新建一个叫xgb.txt的文件
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))   #格式为 0  feature  q  \t是分隔符，为空  就是说第一列是序号，第二列是特征名称，第三列是q,不知道需要这个q干吗，可以是多写了，先要着吧，后面再看看吧
        i = i + 1
    outfile.close()
create_feature_map(train_x.columns)

具体为什么要使用q，看这个https://blog.csdn.net/ai_XX/article/details/102778684?utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~all~first_rank_v2~rank_v25-3-102778684.nonecase&utm_term=python%E7%9A%84fmap%E6%96%87%E4%BB%B6

使用XGBoost训练模型

只是用了一部分，还有一些参数没有根据最优参数来使用，但是大部分都已经运用进去了

#运行XGBoost,输出特征重要性排名
def run_xgboost(data_x,data_y,random_state_num):
    train_x,valid_x,train_y,valid_y = train_test_split(data_x.values,data_y.values,test_size=0.25,random_state=random_state_num)
    print('开始训练模型')
    start = time.time()
    #转换成xgb运算格式
    d_train = xgb.DMatrix(train_x,train_y)
    d_valid = xgb.DMatrix(valid_x,valid_y)
    watchlist = [(d_train,'train'),(d_valid,'valid')]
    #参数设置（未调箱前的参数）
    params={
        'eta':0.2,                        #特征权重，取值范围0~1，通常最后设置eta为0.01~0.2
        'max_depth':3,                    #树的深度，通常取值3-10，过大容易过拟合，过小欠拟合
        'min_child_weight':1,             #最小样本的权重，调大参数可以繁殖过拟合
        'gamma':0.4,                      #控制是否后剪枝，越大越保守，一般0.1、 0.2的样子
        'subsample':0.8,                  #随机取样比例
        'colsample_bytree':0.8 ,          #默认为1，取值0~1，对特征随机采集比例
        'reg_lambda':0.8,
        'reg_alpha':0.6,
        'learning_rate':0.1,
        'n_estimators':1000,
        'booster':'gbtree',               #迭代树
        'objective':'binary:logistic',    #逻辑回归，输出为概率
        'nthread':6,                      #设置最大的进程量，若不设置则会使用全部资源
        'scale_pos_weight':1,             #默认为0,1可以处理类别不平衡
        'lambda':1,                       #默认为1，用于L2平滑处理项，避免模型过拟合
        'seed':1234,                      #随机树种子
        'silent':1,                       #0表示输出结果
        'eval_metric':'auc'               #评分指标
    }
    bst = xgb.train(params, d_train,1000,watchlist,early_stopping_rounds=100, verbose_eval=5)   #最大迭代次数1000次
    print(time.time()-start)
    tree_nums = bst.best_ntree_limit
    print('最优模型树的数量：%s,最优迭代次数：%s,auc: %s' %(bst.best_ntree_limit,bst.best_iteration,bst.best_score))
    bst = xgb.train(params, d_train,tree_nums,watchlist,early_stopping_rounds=100, verbose_eval=10) #最优模型迭代次数去训练
    
#     feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
#     #新版需要转换成dict or list 
#     #feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#     #plt.bar(feat_imp.index, feat_imp)
#     feat_imp.plot(kind='bar', title='Feature Importances')
    #展示特征重要性排名
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(),key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp,columns=['feature','fscore'])
    #每个特征被调用的次数/所有特征被调用总次数
    df['fscore'] = df['fscore']/df['fscore'].sum()
    #分数高的排在前面,展示前40个重要特征排名
    df = df.sort_values(by='fscore',ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar',x='feature',y='fscore',legend=True,figsize=(32,10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

绘制roc曲线函数

# 绘制ROC曲线函数
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)  #roc的几个参数
    roc_auc = auc(false_positive_rate, true_positive_rate)  #直接计算auc
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# 绘制K-S函数 从大到小排序，分10等分
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  #reverse参数为True意味着按照降序排序，这是画ks时要求的
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S曲线')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('累计占比')
    plt.xlabel('分组编号')

# 绘制一张图，包含训练和测试集的ROC、AUC、K-S图形指标。
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

保存模型、评价指标、选择变量等

#保存模型、评价指标、选择变量到D盘
def run_main(data_x,data_y):
    global bst
    start=time.time()
    bst=run_xgboost(data_x,data_y,random_state_num=1234)  #为什么要是1234，因为调参时候就是=1234
    joblib.dump(bst, file_xgboost_model)  #joblib的用法https://www.cnblogs.com/wzdLY/p/9630671.html 将模型保存
    print('模型已成功保存在 %s'%(file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('模型评价指标已保存在：%s'%(file_xgboost_model_auc_ks))
    print('运行共花费时间：%s'%(time.time()-start))

if __name__=='__main__':
    run_main(train_x, train_y)

XGBoost 学习调参的例子

分别是训练集和测试集的auc和ks，还有特征重要性的排列

用验证集数据验证模型效果

# 绘制ROC曲线函数
def plot_test_roc(test_x, test_y,filename):
    bst = joblib.load(filename)
    predictions = bst.predict(xgb.DMatrix(test_x.values))
    false_positive_rate,true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

if __name__=='__main__':
    plot_test_roc(valid_x,valid_y,file_xgboost_model)

XGBoost 学习调参的例子

下面附上全部代码

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 10 19:01:07 2021

@author: Administrator
"""



#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import time
import xgboost as xgb
from xgboost import plot_importance  #画特征重要性的函数
#from imblearn.ensemble import EasyEnsemble  #还有模块木有安装
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib 已经改成了下面这种方式
import joblib
from sklearn.metrics import auc,roc_curve  #说明是分类
plt.rc('font',family='SimHei',size=13)   #使画出的图形中能正常显示中文
%matplotlib inline



#%%
#训练数据、线上数据(无Y)、验证数据
train_data = pd.read_csv('D:/迅雷下载/3.学习模型/3.学习模型/train_user_model_feat.csv')
print(len(train_data[train_data['label']==1]),len(train_data[train_data['label']==0]))  # 1: 815  0:42688
online_data = pd.read_csv('D:/迅雷下载/3.学习模型/3.学习模型/online_user_model_feat.csv')
valid_data = pd.read_csv('D:/迅雷下载/3.学习模型/3.学习模型/valid_user_model_feat.csv')
print(len(valid_data[valid_data['label']==1]),len(valid_data[valid_data['label']==0]))  # 1:892   0:39302



#%%
train_y = train_data[['label']]
train_y.columns = ['y']
train_x = train_data.drop(['label','user_id'],axis=1)

valid_y = valid_data[['label']]
valid_y.columns = ['y']
valid_x = valid_data.drop(['label','user_id'],axis=1)
# 
file_xgboost_model='./xgboost_model' #模型文件
file_xgboost_columns='./columns.csv' #最终使用的特征
file_xgboost_model_auc_ks='./xgboost_model_auc_ks.png' #模型AUC和KS值
file_xgboost_model_score='./xgboost_model_score.png' # 模型预测用户的评分分布
file_xgboost_model_prob='./xgboost_model_prob.png' #模型预测用户的概率分布


#%%
#coding=utf-8
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV  #网格搜索法
import xgboost as xgb
def xgbpa(trainX, trainY):
    # init ，分类
    xgb1 = XGBClassifier(  
        learning_rate=0.3,
        n_estimators=150,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=6
    )

    # max_depth 和 min_weight 参数调优   这个用给网格搜索法时的参数，数的层数由3-6
    param1 = {'max_depth': list(range(3, 7)), 'min_child_weight': list(range(1, 5, 2))}

    from sklearn import svm, datasets
    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(trainX, trainY)
    print(gsearch1.scorer_)
    print(gsearch1.best_params_, gsearch1.best_score_)  #最佳参数（形式是字典型），最高分数（就一个值）
    best_max_depth = gsearch1.best_params_['max_depth'] #输出的max_depth的values
    best_min_child_weight = gsearch1.best_params_['min_child_weight']  #同理上面

    # gamma参数调优
    param2 = {'gamma': [i / 10.0 for i in range(0, 5, 2)]}
    gsearch2 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,  # 如同学习率
            n_estimators=150,  # 树的个数
            max_depth=best_max_depth,  #同时替换上面的值
            min_child_weight=best_min_child_weight,  #同理上面
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch2.fit(trainX, trainY)
    print(gsearch2.scorer_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    best_gamma = gsearch2.best_params_['gamma']

    # 调整subsample 和 colsample_bytree参数
    param3 = {'subsample': [i / 10.0 for i in range(6, 9)], 'colsample_bytree': [i / 10.0 for i in range(6, 9)]}
    gsearch3 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch3.fit(trainX, trainY)
    print(gsearch3.scorer_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    best_subsample = gsearch3.best_params_['subsample']
    best_colsample_bytree = gsearch3.best_params_['colsample_bytree']

    # 正则化参数调优
    param4 = {'reg_alpha': [i / 10.0 for i in range(2, 10, 2)], 'reg_lambda': [i / 10.0 for i in range(2, 10, 2)]}
    gsearch4 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=6
        ),
        param_grid=param4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch4.fit(trainX, trainY)
    print(gsearch4.scorer_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    best_reg_alpha = gsearch4.best_params_['reg_alpha']
    best_reg_lambda = gsearch4.best_params_['reg_lambda']


    param5= {'scale_pos_weight': [i for i in [0.5, 1, 2]]}

    gsearch5 = GridSearchCV(
        estimator = XGBClassifier(
            learning_rate = 0.3,
            n_estimators = 150,
            max_depth = best_max_depth,
            min_child_weight = best_min_child_weight,
            gamma = best_gamma,
            subsample = best_subsample,
            colsample_bytree = best_colsample_bytree,
            reg_alpha = best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = 1,
            seed = 6
            ),
        param_grid = param5, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch5.fit(trainX, trainY)
    print(gsearch5.best_params_, gsearch5.best_score_)
    best_scale_pos_weight = gsearch5.best_params_['scale_pos_weight']

    # 降低学习速率，数的数量
    param6 = [{'learning_rate': [0.01, 0.05, 0.1, 0.2], 'n_estimators': [800, 1000, 1200]}]

    gsearch6 = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.3,
            n_estimators=150,
            max_depth=best_max_depth,
            min_child_weight=best_min_child_weight,
            gamma=best_gamma,
            subsample=best_subsample,
            colsample_bytree=best_colsample_bytree,
            reg_alpha=best_reg_alpha,
            reg_lambda = best_reg_lambda,
            objective = 'binary:logistic',
            nthread = 4,
            scale_pos_weight = best_scale_pos_weight,
            seed = 6
    ),
    param_grid = param6, scoring = 'roc_auc', n_jobs = 4, iid = False, cv = 5)
    gsearch6.fit(trainX, trainY)
    print(gsearch6.scorer_)
    print(gsearch6.best_params_, gsearch6.best_score_)
    best_learning_rate = gsearch6.best_params_['learning_rate']
    best_n_estimators = gsearch6.best_params_['n_estimators']
    print('最好参数集：')
    print(gsearch1.best_params_, gsearch1.best_score_)
    print(gsearch2.best_params_, gsearch2.best_score_)
    print(gsearch3.best_params_, gsearch3.best_score_)
    print(gsearch4.best_params_, gsearch4.best_score_)
    print(gsearch5.best_params_, gsearch5.best_score_)
    print(gsearch6.best_params_, gsearch6.best_score_)


if __name__ == '__main__':
    # user_model cv
    #调参前得保持模型训练样本和调参样本数据一致
    print('--------------开始调参---------------')
    start = time.time()
    data_x,temp_x,data_y,temp_y = train_test_split(train_x,train_y,test_size=0.25,random_state=1234)
    xgbpa(data_x,data_y.y) #标签值有要是数组型的，不能是df，所以就.y了
    print('调参用时：%s'%(time.time()-start))

#%%
def create_feature_map(features):
    outfile = open('xgb.txt', 'w')  #写，新建一个叫xgb.txt的文件
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))   #格式为 0  feature  q  \t是分隔符，为空  就是说第一列是序号，第二列是特征名称，第三列是q,不知道需要这个q干吗，可以是多写了，先要着吧，后面再看看吧
        i = i + 1
    outfile.close()
create_feature_map(train_x.columns)

#%%
#运行XGBoost,输出特征重要性排名
def run_xgboost(data_x,data_y,random_state_num):
    train_x,valid_x,train_y,valid_y = train_test_split(data_x.values,data_y.values,test_size=0.25,random_state=random_state_num)
    print('开始训练模型')
    start = time.time()
    #转换成xgb运算格式
    d_train = xgb.DMatrix(train_x,train_y)
    d_valid = xgb.DMatrix(valid_x,valid_y)
    watchlist = [(d_train,'train'),(d_valid,'valid')]
    #参数设置（未调箱前的参数）
    params={
        'eta':0.2,                        #特征权重，取值范围0~1，通常最后设置eta为0.01~0.2
        'max_depth':3,                    #树的深度，通常取值3-10，过大容易过拟合，过小欠拟合
        'min_child_weight':1,             #最小样本的权重，调大参数可以繁殖过拟合
        'gamma':0.4,                      #控制是否后剪枝，越大越保守，一般0.1、 0.2的样子
        'subsample':0.8,                  #随机取样比例
        'colsample_bytree':0.8 ,          #默认为1，取值0~1，对特征随机采集比例
        'reg_lambda':0.8,
        'reg_alpha':0.6,
        'learning_rate':0.1,
        'n_estimators':1000,
        'booster':'gbtree',               #迭代树
        'objective':'binary:logistic',    #逻辑回归，输出为概率
        'nthread':6,                      #设置最大的进程量，若不设置则会使用全部资源
        'scale_pos_weight':1,             #默认为0,1可以处理类别不平衡
        'lambda':1,                       #默认为1，用于L2平滑处理项，避免模型过拟合
        'seed':1234,                      #随机树种子
        'silent':1,                       #0表示输出结果
        'eval_metric':'auc'               #评分指标
    }
    bst = xgb.train(params, d_train,1000,watchlist,early_stopping_rounds=100, verbose_eval=5)   #最大迭代次数1000次
    print(time.time()-start)
    tree_nums = bst.best_ntree_limit
    print('最优模型树的数量：%s,最优迭代次数：%s,auc: %s' %(bst.best_ntree_limit,bst.best_iteration,bst.best_score))
    bst = xgb.train(params, d_train,tree_nums,watchlist,early_stopping_rounds=100, verbose_eval=10) #最优模型迭代次数去训练
    
#     feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
#     #新版需要转换成dict or list 
#     #feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#     #plt.bar(feat_imp.index, feat_imp)
#     feat_imp.plot(kind='bar', title='Feature Importances')
    #展示特征重要性排名
    feat_imp = bst.get_fscore(fmap='xgb.txt')
    feat_imp = sorted(feat_imp.items(),key=operator.itemgetter(1))
    df = pd.DataFrame(feat_imp,columns=['feature','fscore'])
    #每个特征被调用的次数/所有特征被调用总次数
    df['fscore'] = df['fscore']/df['fscore'].sum()
    #分数高的排在前面,展示前40个重要特征排名
    df = df.sort_values(by='fscore',ascending=False)
    df = df.iloc[:40]
    plt.figure()
    df.plot(kind='bar',x='feature',y='fscore',legend=True,figsize=(32,10))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig('feature_importance_xgb.png')
    plt.show()
    return bst

#%%
# 绘制ROC曲线函数
def plot_roc(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions)  #roc的几个参数
    roc_auc = auc(false_positive_rate, true_positive_rate)  #直接计算auc
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r.')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')

# 绘制K-S函数 从大到小排序，分10等分
def plot_ks(test_x, test_y):
    predictions = bst.predict(xgb.DMatrix(test_x))
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_y, predictions, drop_intermediate=False)
    pre = sorted(predictions, reverse=True)  #reverse参数为True意味着按照降序排序，这是画ks时要求的
    num = []
    for i in range(10):
        num.append((i) * int(len(pre) / 10))
    num.append(len(pre) - 1)
    df = pd.DataFrame()
    df['false_positive_rate'] = false_positive_rate
    df['true_positive_rate'] = true_positive_rate
    df['thresholds'] = thresholds
    data_ks = []
    for i in num:
        data_ks.append(list(df[df['thresholds'] == pre[i]].values[0]))
    data_ks = pd.DataFrame(data_ks)
    data_ks.columns = ['fpr', 'tpr', 'thresholds']
    ks = max(data_ks['tpr'] - data_ks['fpr'])
    plt.title('K-S曲线')
    plt.plot(np.array(range(len(num))), data_ks['tpr'])
    plt.plot(np.array(range(len(num))), data_ks['fpr'])
    plt.plot(np.array(range(len(num))), data_ks['tpr'] - data_ks['fpr'], label='K-S = %0.4f' % ks)
    plt.legend(loc='lower right')
    plt.xlim([0, 10])
    plt.ylim([0.0, 1.0])
    plt.ylabel('累计占比')
    plt.xlabel('分组编号')

# 绘制一张图，包含训练和测试集的ROC、AUC、K-S图形指标。
def auc_ks(train_x, test_x, train_y, test_y):
    plt.figure(figsize=(15, 15))
    plt.subplot(221)
    plot_roc(train_x, train_y)
    plt.subplot(222)
    plot_roc(test_x, test_y)
    plt.subplot(223)
    plot_ks(train_x, train_y)
    plt.subplot(224)
    plot_ks(test_x, test_y)
    plt.savefig(file_xgboost_model_auc_ks)
    plt.show()

#%%
#保存模型、评价指标、选择变量到D盘
def run_main(data_x,data_y):
    global bst
    start=time.time()
    bst=run_xgboost(data_x,data_y,random_state_num=1234)  #为什么要是1234，因为调参时候就是=1234
    joblib.dump(bst, file_xgboost_model)  #joblib的用法https://www.cnblogs.com/wzdLY/p/9630671.html 将模型保存
    print('模型已成功保存在 %s'%(file_xgboost_model))
    train_x, test_x, train_y, test_y = train_test_split(data_x.values, data_y.values, test_size=0.25, random_state=1234)
    auc_ks(train_x, test_x, train_y, test_y)
    print('模型评价指标已保存在：%s'%(file_xgboost_model_auc_ks))
    print('运行共花费时间：%s'%(time.time()-start))

if __name__=='__main__':
    run_main(train_x, train_y)
    
# 绘制ROC曲线函数
def plot_test_roc(test_x, test_y,filename):
    bst = joblib.load(filename)
    predictions = bst.predict(xgb.DMatrix(test_x.values))
    false_positive_rate,true_positive_rate, thresholds = roc_curve(test_y, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('Fall-out')
    plt.show()

if __name__=='__main__':
    plot_test_roc(valid_x,valid_y,file_xgboost_model)

View Code