概述
文章目录
- LightGBM
- xgboost
- Learning Curves
LightGBM
# Hold out 20% of the non-null training rows as a validation set.
from sklearn.model_selection import train_test_split

# Every column except the target, the date, and the row id is a feature.
col = [c for c in train_notnull.columns if c not in {'sales', 'date', 'id'}]
y = 'sales'
train_x, test_x, train_y, test_y = train_test_split(
    train_notnull[col], train_notnull[y], test_size=0.2, random_state=2018)
import lightgbm as lgb
from bayes_opt import BayesianOptimization
# select the best parameter of LGBM
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5,
                            random_seed=6, n_estimators=10000,
                            learning_rate=0.02, output_process=False):
    """Tune LightGBM hyper-parameters for a regression model with Bayesian optimization.

    Parameters
    ----------
    X, y : training features and continuous target.
    init_round : random exploration points before the Bayesian phase.
    opt_round : Bayesian optimization iterations.
    n_folds : CV folds used to score each candidate.
    random_seed : seed for the CV split.
    n_estimators : max boosting rounds (early stopping trims this).
    learning_rate : fixed learning rate during the search.
    output_process : when True, dump the full search history to CSV.

    Returns
    -------
    dict : the best hyper-parameter values found.
    """
    # prepare data
    train_data = lgb.Dataset(data=X, label=y)

    # Objective for the optimizer (higher is better): negative CV MAE.
    # The original scored this regression_l1 model with AUC and a
    # stratified split — both are classification-only and stratified CV
    # raises on continuous labels.
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth,
                 lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application': 'regression_l1',
                  'num_iterations': n_estimators,
                  'learning_rate': learning_rate,
                  'early_stopping_round': 100,
                  'metric': 'l1'}  # MAE, matching the regression_l1 objective
        # BayesianOptimization only proposes floats; round/clip them into
        # the forms LightGBM accepts.
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds,
                           seed=random_seed, stratified=False,
                           verbose_eval=200, metrics=['l1'])
        # NOTE(review): the result key is 'l1-mean' on classic lgb.cv
        # output — confirm against the installed LightGBM version.
        return -min(cv_result['l1-mean'])

    # search space
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50)},
                                 random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # Output the optimization process.  points_to_csv() was removed from
    # bayes_opt, so write the history ourselves.
    if output_process:
        pd.DataFrame(lgbBO.res).to_csv("bayes_opt_result.csv", index=False)

    # Best parameters — modern bayes_opt API; res['max']['max_params']
    # no longer exists.
    return lgbBO.max['params']
# Run the search with small budgets so the demo finishes quickly.
opt_params = bayes_parameter_opt_lgb(
    train_x, train_y,
    init_round=5, opt_round=10, n_folds=3,
    random_seed=6, n_estimators=100, learning_rate=0.02)
print(opt_params)
def model(train_x, train_y, test_x, test_y, col):
    """Train a LightGBM regressor with early stopping on a validation split.

    Parameters
    ----------
    train_x, train_y : training features / target.
    test_x, test_y : validation features / target used for early stopping.
    col : unused, kept for interface compatibility with existing callers.

    Returns
    -------
    The fitted LightGBM Booster (best iteration picked on the valid set).
    """
    # Hyper-parameter values largely taken from the Bayesian search above.
    params = {
        'nthread': 10,
        'max_depth': 9,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'mape',  # LightGBM's MAPE: abs(a-e)/max(1, abs(a))
        'num_leaves': 26,
        'learning_rate': 0.2,
        'feature_fraction': 0.12341870437127458,
        'bagging_fraction': 0.8292882963367367,
        'bagging_freq': 5,
        'lambda_l1': 4.214312104137376,
        'lambda_l2': 2.84705737213127,
        'verbose': -1,
        'min_child_weight': 5.489438096050738,
        'min_split_gain': 0.014593823181544105,
    }
    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_valid = lgb.Dataset(test_x, test_y)
    # Use a local name distinct from this function's name to avoid shadowing.
    booster = lgb.train(params, lgb_train, 3000,
                        valid_sets=[lgb_train, lgb_valid],
                        early_stopping_rounds=50, verbose_eval=50)
    return booster
# NOTE(review): this rebinding shadows the `model` function defined above;
# the name is kept because later code may reference `model` at module level.
model = model(train_x, train_y, test_x, test_y, col)
y_pred = model.predict(test[col])
# DataFrame/Series.as_matrix() was removed in pandas 1.0 — use to_numpy().
result = pd.DataFrame({'id': sample_submission['id'].to_numpy(),
                       'sales': y_pred.astype(np.int32)})
result.to_csv('submission.csv', index=False)
xgboost
调参流程:
- 设置初始值,如最重要的learning_rate和estimator,由于后续grid search时间较长,所以learning_rate要较大,一般可以取值为0.1;
- 保持learning rate和其他booster相关的参数不变,用cv的方法调节estimators的参数;
- 保持estimator和learning rate不变,调节booster相关的参数,可以从影响最大的max_depth和min_child_weight开始,然后是gamma, subsample, colsample_bytree和Regularization Parameters等,逐步调节所有可能有影响的booster参数,调节时,可以先粗粒度,确定粗粒度范围后,再细粒度调节,在调节过程中,每得到一组最佳参数后,可以尝试不断用cv的方法调节estimator参数;
- 最后缩小learning rate,此时estimator的数量会增加,训练时间也会较长,得到最佳的learning rate和estimator的值;
- 得到一组较好的参数。
import xgboost as xgb
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
# Drop rows with any missing values before training; keep the raw test frame.
train_xgb = train.dropna(axis=0, how='any')
test_xgb = test
# Split features from the target explicitly.  The original one-liner relied
# on Python evaluating drop(...) before the mutating pop(...) in the same
# call; the net effect (train_xgb loses its 'sales' column) is preserved.
xgb_features = train_xgb.drop('sales', axis=1)
xgb_target = train_xgb.pop('sales')
x_train, x_test, y_train, y_test = train_test_split(
    xgb_features, xgb_target, random_state=123, test_size=0.2)
def XGBmodel(x_train, x_test, y_train, y_test):
    """Train an XGBoost regressor with MAE eval and early stopping.

    Returns the fitted Booster; early stopping monitors the test split.
    """
    matrix_train = xgb.DMatrix(x_train, label=y_train)
    matrix_test = xgb.DMatrix(x_test, label=y_test)
    # 'reg:linear' was renamed to 'reg:squarederror' (same squared-error
    # loss) and is rejected by modern xgboost releases.
    booster = xgb.train(params={'objective': 'reg:squarederror',
                                'eval_metric': 'mae'},
                        dtrain=matrix_train,
                        num_boost_round=500,
                        early_stopping_rounds=20,
                        evals=[(matrix_test, 'test')])
    return booster
m_xgb = XGBmodel(x_train, x_test, y_train, y_test)
# Plot which features the model relied on most.
plot_importance(m_xgb)
# Predict on the test set, truncated at the best early-stopped iteration.
y_pred = m_xgb.predict(xgb.DMatrix(test_xgb), ntree_limit=m_xgb.best_ntree_limit)
# Series.as_matrix() was removed in pandas 1.0 — use to_numpy().
result = pd.DataFrame({'id': sample_submission['id'].to_numpy(),
                       'sales': y_pred.astype(np.int32)})
result.to_csv('submission.csv', index=False)
Learning Curves
# learning_curve moved to sklearn.model_selection in scikit-learn 0.18;
# the old sklearn.learning_curve module no longer exists.
from sklearn.model_selection import learning_curve


# Compute training/CV scores with sklearn's learning_curve and plot the
# learning curve with matplotlib.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0,
                        plot=True):
    """Plot the learning curve of *estimator* on the given data.

    Parameters
    ----------
    estimator : scikit-learn style estimator to evaluate.
    title : figure title.
    X : feature matrix (numpy array).
    y : target vector.
    ylim : optional (ymin, ymax) bounds for the y axis.
    cv : number of CV folds (one fold validates, the rest train; default 3).
    n_jobs : number of parallel jobs (default 1).

    Returns
    -------
    (midpoint, diff) : midpoint between the final train/CV score bands and
    the gap between them — a rough over-/under-fitting indicator.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        verbose=verbose)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("num of train")
        plt.ylabel("scores")
        # The original called plt.gca().invert_yaxis() both before and
        # after plotting; the two toggles cancel out, so both are removed.
        plt.grid()
        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b",
                 label=u"scores of train")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r",
                 label=u"scores of cross validation")
        plt.legend(loc="best")
        plt.draw()
        plt.show()
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
                (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = ((train_scores_mean[-1] + train_scores_std[-1]) -
            (test_scores_mean[-1] - test_scores_std[-1]))
    return midpoint, diff
plot_learning_curve(m_xgb, "Learning Curves", x_train.as_matrix(), y_train.as_matrix())
最后
以上就是沉静丝袜为你收集整理的常用模型使用LightGBMxgboostLearning Curves的全部内容,希望文章能够帮你解决常用模型使用LightGBMxgboostLearning Curves所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复