我是靠谱客的博主 贪玩画板,最近开发中收集的这篇文章主要介绍写个数仓吧(7) 用户行为数据仓—— DWS层,用户活跃主题需求一:用户活跃主题,觉得挺不错的,现在分享给大家,希望可以做个参考。

概述

需求一:用户活跃主题

分层:DWS层
目标:统计当日、当周、当月活动的每个设备明细

每日活跃:

建表语句

hive (gmall)>
drop table if exists dws_uv_detail_day;
create   table dws_uv_detail_day( 
 `mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识', 
`version_code` string COMMENT '程序版本号', 
 `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT '活跃用户按天明细'
PARTITIONED BY ( `dt` string)
stored as  parquet
location '/warehouse/gmall/dws/dws_uv_detail_day/'
;

2)数据导入
以用户单日访问为key进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一

hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;  //非严格动态分区

insert overwrite table dws_uv_detail_day  partition(dt)
select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0]lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0]app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0]lng,
    collect_set(lat)[0]lat,
    '2019-02-10'
from dwd_start_log   //启动日志
where dt='2019-02-10'  
group by mid_id;

3)查询导入结果

hive (gmall)> select * from dws_uv_detail_day limit 1;
hive (gmall)> select count(*) from dws_uv_detail_day;

不同渠道来源的每日活跃数统计怎么计算?

增加一个区段字段再使用group by 分组 ,或者使用partition分区将不同渠道获得的数据放在一起。

每周活跃

根据日用户访问明细 ,获得周用户访问明细
1)建表语句

hive (gmall)>
drop table if exists dws_uv_detail_wk;

create table dws_uv_detail_wk( 
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
    `monday_date` string COMMENT '周一日期',
    `sunday_date` string COMMENT  '周日日期' 
) COMMENT '活跃用户按周明细'
PARTITIONED BY (`wk_dt` string)
stored as  parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/'
;

2)数据导入

hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;

insert  overwrite table dws_uv_detail_wk partition(wk_dt)
select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0]lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0]app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0]lng,
    collect_set(lat)[0]lat,
   date_add(next_day('2019-02-10','MO'),-7),
   date_add(next_day('2019-02-10','MO'),-1),
   concat(date_add( next_day('2019-02-10','MO'),-7), '_' , date_add(next_day('2019-02-10','MO'),-1) 
)   //字段的连接,表示一周七天,第一个date_add表示取上星期一,第二个表示取上星期天
from dws_uv_detail_day 
where dt>=date_add(next_day('2019-02-10','MO'),-7) and dt<=date_add(next_day('2019-02-10','MO'),-1) 
group by mid_id; 

3)查询导入结果

hive (gmall)> select * from dws_uv_detail_wk limit 1;
hive (gmall)> select count(*) from dws_uv_detail_wk;

每月活跃设备明细

hive (gmall)>
drop table if exists dws_uv_detail_mn;

create  external table dws_uv_detail_mn( 
    `mid_id` string COMMENT '设备唯一标识',
    `user_id` string COMMENT '用户标识', 
    `version_code` string COMMENT '程序版本号', 
    `version_name` string COMMENT '程序版本名', 
`lang` string COMMENT '系统语言', 
`source` string COMMENT '渠道号', 
`os` string COMMENT '安卓系统版本', 
`area` string COMMENT '区域', 
`model` string COMMENT '手机型号', 
`brand` string COMMENT '手机品牌', 
`sdk_version` string COMMENT 'sdkVersion', 
`gmail` string COMMENT 'gmail', 
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT '活跃用户按月明细'
PARTITIONED BY (`mn` string)
stored as  parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/'
;

2)数据导入

hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;

insert  overwrite table dws_uv_detail_mn  partition(mn)
select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0]lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0]app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0]lng,
    collect_set(lat)[0]lat,
date_format('2019-02-10','yyyy-MM')
from dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('2019-02-10','yyyy-MM')   
group by mid_id;

3)查询导入结果

hive (gmall)> select * from dws_uv_detail_mn limit 1;
hive (gmall)> select count(*) from dws_uv_detail_mn ;

载入数据的脚本

在hadoop102的/home/atguigu/bin目录下创建脚本

[atguigu@hadoop102 bin]$ vim dws.sh
在脚本中编写如下内容

#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n $1 ] ;then
	log_date=$1
else 
	log_date=`date -d "-1 day" +%F`  
fi 


sql="
  set hive.exec.dynamic.partition.mode=nonstrict;

  insert overwrite table "$APP".dws_uv_detail_day partition(dt='$log_date')
  select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0]lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0]app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0]lng,
    collect_set(lat)[0]lat
  from "$APP".dwd_start_log
  where dt='$log_date'  
  group by mid_id;


  insert  overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
  select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0]lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0]app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0]lng,
    collect_set(lat)[0]lat,
   date_add(next_day('$log_date','MO'),-7),
   date_add(next_day('$log_date','SU'),-7),
   concat(date_add( next_day('$log_date','MO'),-7), '_' , date_add(next_day('$log_date','MO'),-1) 
  )
  from "$APP".dws_uv_detail_day 
  where dt>=date_add(next_day('$log_date','MO'),-7) and dt<=date_add(next_day('$log_date','MO'),-1) 
  group by mid_id,lang,gmail,app_time,lng,lat; 


  insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
  select  
    mid_id,
    collect_set(user_id)[0] user_id,
    collect_set(version_code)[0] version_code,
    collect_set(version_name)[0] version_name,
    collect_set(lang)[0]lang,
    collect_set(source)[0] source,
    collect_set(os)[0] os,
    collect_set(area)[0] area, 
    collect_set(model)[0] model,
    collect_set(brand)[0] brand,
    collect_set(sdk_version)[0] sdk_version,
    collect_set(gmail)[0] gmail,
    collect_set(height_width)[0] height_width,
    collect_set(app_time)[0]app_time,
    collect_set(network)[0] network,
    collect_set(lng)[0]lng,
    collect_set(lat)[0]lat,
    date_format('$log_date','yyyy-MM')
  from "$APP".dws_uv_detail_day
  where date_format(dt,'yyyy-MM') = date_format('$log_date','yyyy-MM')   
  group by mid_id,lang,gmail,app_time,lng,lat;
"

$hive -e "$sql"

2)增加脚本执行权限

[atguigu@hadoop102 bin]$ chmod 777 dws.sh

3)脚本使用

[atguigu@hadoop102 module]$ dws.sh 2019-02-11

4)查询结果

hive (gmall)> select count(*) from dws_uv_detail_day;
hive (gmall)> select count(*) from dws_uv_detail_wk;
hive (gmall)> select count(*) from dws_uv_detail_mn ;

最后

以上就是贪玩画板为你收集整理的写个数仓吧(7) 用户行为数据仓—— DWS层,用户活跃主题需求一:用户活跃主题的全部内容,希望文章能够帮你解决写个数仓吧(7) 用户行为数据仓—— DWS层,用户活跃主题需求一:用户活跃主题所遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(56)

评论列表共有 0 条评论

立即
投稿
返回
顶部