概述
DeepCTR包主要是对目前的一些“基于深度学习的点击率预测算法”进行了实现,官方文档参考
本文主要记录DeepFM算法的相关操作细节。
实验数据
prefix:用户输入(query前缀)
query_prediction:预测的用户完整需求查询词,最多10条;预测的查询词可能是前缀本身,数字为统计概率
title:文章标题
tag:文章类型
label:是否点击0/1
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
train_data = pd.read_table('./data/oppo_round1_train_20180929.txt',
names= ['prefix','query_prediction','title','tag','label'], header= None, encoding='utf-8').astype(str)
print(train_data.shape)
val_data = pd.read_table('./data/oppo_round1_vali_20180929.txt',
names = ['prefix','query_prediction','title','tag','label'], header = None, encoding='utf-8').astype(str)
print(val_data.shape)
# O:(1999999, 5)
# O:(50000, 5)
vctr = train_data['label'].value_counts() #可以看到标签异常值和样本不均衡
vcvl = val_data['label'].value_counts()
vctr[0]/vctr[1], vcvl[0]/vcvl[1]
# O:(1.6875350690893836, 1.690341673392521)
train_data = train_data[train_data['label'] != '音乐' ]
train_data = pd.concat([train_data,val_data]) #把train和val集合拼接在一起
train_data['label'] = train_data['label'].apply(lambda x: int(x))
items = ['prefix', 'title', 'tag'] #query前缀、文章标题、文章类型
for item in items: #分别按各item列分组 按label取值0/1 统计搜索次数、点击次数、点击率
temp = train_data.groupby(item, as_index = False)['label'].agg({item+'_click':'sum', item+'_count':'count'})
temp[item+'_ctr'] = temp[item+'_click']/(temp[item+'_count'])
train_data = pd.merge(train_data, temp, on=item, how='left')
for i in range(len(items)):
for j in range(i+1, len(items)):
item_g = [items[i], items[j]] #三选二,作为分组指标
temp = train_data.groupby(item_g, as_index=False)['label'].agg({'_'.join(item_g)+'_click': 'sum','_'.join(item_g)+'count':'count'})
temp['_'.join(item_g)+'_ctr'] = temp['_'.join(item_g)+'_click']/(temp['_'.join(item_g)+'count']+3)
train_data = pd.merge(train_data, temp, on=item_g, how='left')
from sklearn import preprocessing
import time
# 转换tag
encoder = preprocessing.LabelEncoder()
train_data['tag'] = encoder.fit_transform(train_data['tag'])
encoder = preprocessing.LabelEncoder()
train_data['prefix'] = encoder.fit_transform(train_data['prefix'])
encoder = preprocessing.LabelEncoder()
train_data['title'] = encoder.fit_transform(train_data['title'])
train_data_ = train_data.drop(['query_prediction'], axis = 1)
dense_features = ['prefix_click', 'prefix_count', 'prefix_ctr',
'title_click', 'title_count', 'title_ctr',
'tag_click', 'tag_count', 'tag_ctr',
'prefix_title_click', 'prefix_titlecount', 'prefix_title_ctr',
'prefix_tag_click', 'prefix_tagcount', 'prefix_tag_ctr',
'title_tag_click', 'title_tagcount', 'title_tag_ctr'] #类别型特征
sparse_features = ['prefix', 'title', 'tag'] #数值型特征
train_data_[sparse_features] = train_data_[sparse_features].fillna('-1', )
train_data_[dense_features] = train_data_[dense_features].fillna(0, )
target = ['label']
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
for feat in sparse_features:
lbe = LabelEncoder()
train_data_[feat] = lbe.fit_transform(train_data_[feat]) #将类别转化成id编码
mms = MinMaxScaler(feature_range=(0, 1))
train_data_[dense_features] = mms.fit_transform(train_data_[dense_features])
train_data_.head()
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=train_data_[feat].nunique(),embedding_dim=4)
for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns #类别型特征DNN
linear_feature_columns = fixlen_feature_columns #连续型特征线性FM
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
train, test = train_test_split(train_data_, test_size=0.2)
train_model_input = {name:train[name] for name in feature_names} #每项index:该列取值
test_model_input = {name:test[name] for name in feature_names}
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
from sklearn.metrics import f1_score
pred_ans = model.predict(test_model_input, batch_size=256)
f1_score(test[target].values, np.where(pred_ans>0.5, 1, 0))
# O:0.7944898855019384
特征
类别特征embedding:(1)FM二阶项(2)DNN网络
连续值特征:FM一阶项
最后
以上就是舒适鲜花为你收集整理的深度推荐模型包DeepCTR的全部内容,希望文章能够帮你解决深度推荐模型包DeepCTR所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复