我是靠谱客的博主 现实刺猬,最近开发中收集的这篇文章主要介绍hive中路径转换(漏斗模型),觉得挺不错的,现在分享给大家,希望可以做个参考。

概述

1 加载数据

load data local inpath '/root/hivedata/click-part-r-00000' overwrite into table ods_click_pageviews partition(datestr='20181103');

2 查询每一个步骤的总访问人数

UNION All将多个SELECT语句的结果集合并为一个独立的结果集

create table dw_oute_numbs as 
select 'step1' as step,count(distinct remote_addr)  as numbs from ods_click_pageviews where datestr='20181103' and request like '/item%'
union all
select 'step2' as step,count(distinct remote_addr)  as numbs from ods_click_pageviews where datestr='20181103' and request like '/category%'
union all
select 'step3' as step,count(distinct remote_addr)  as numbs from ods_click_pageviews where datestr='20181103' and request like '/order%'
union all
select 'step4' as step,count(distinct remote_addr)  as numbs from ods_click_pageviews where datestr='20181103' and request like '/index%';

3 结果

+---------------------+----------------------+--+
| dw_oute_numbs.step  | dw_oute_numbs.numbs  |
+---------------------+----------------------+--+
| step1               | 1029                 |
| step2               | 1029                 |
| step3               | 1028                 |
| step4               | 1018                 |
+---------------------+----------------------+--+

4 查询每一步骤相对于路径起点人数的比例级联查询,自己跟自己join

select 
	* 
from 
	(select 
		rn.step as rnstep,
		rn.numbs as rnnumbs,
		rr.step as rrstep,
		rr.numbs as rrnumbs  
	from 
		dw_oute_numbs rn
	inner join 
	dw_oute_numbs rr) tmp 
where 
	rrstep ="step1";
select 
	(rnnumbs/rrnumbs)*100 
from 
	(select 
		rn.step as rnstep,
		rn.numbs as rnnumbs,
		rr.step as rrstep,
		rr.numbs as rrnumbs  
	from 
		dw_oute_numbs rn
	inner join 
		dw_oute_numbs rr) tmp 
where 
	rrstep ="step1";

5 自join后结果如下所示:


+---------+----------+---------+----------+--+
| rnstep  | rnnumbs  | rrstep  | rrnumbs  |
+---------+----------+---------+----------+--+
| step1   | 1029     | step1   | 1029     |
| step2   | 1029     | step1   | 1029     |
| step3   | 1028     | step1   | 1029     |
| step4   | 1018     | step1   | 1029     |
| step1   | 1029     | step2   | 1029     |
| step2   | 1029     | step2   | 1029     |
| step3   | 1028     | step2   | 1029     |
| step4   | 1018     | step2   | 1029     |
| step1   | 1029     | step3   | 1028     |
| step2   | 1029     | step3   | 1028     |
| step3   | 1028     | step3   | 1028     |
| step4   | 1018     | step3   | 1028     |
| step1   | 1029     | step4   | 1018     |
| step2   | 1029     | step4   | 1018     |
| step3   | 1028     | step4   | 1018     |
| step4   | 1018     | step4   | 1018     |
+---------+----------+---------+----------+--+

6 每一步的人数/第一步的人数==每一步相对起点人数比例

select 
	tmp.rnstep,
	tmp.rnnumbs/tmp.rrnumbs as ratio
from
	(select 
		rn.step as rnstep,
		rn.numbs as rnnumbs,
		rr.step as rrstep,
		rr.numbs as rrnumbs  
	from 
		dw_oute_numbs rn
	inner join 
		dw_oute_numbs rr) tmp
where 
	tmp.rrstep='step1';--tmp

7 结果

+---------+----------+---------+----------+--+
| rnstep  | rnnumbs  | rrstep  | rrnumbs  |
+---------+----------+---------+----------+--+
| step1   | 1029     | step1   | 1029     |
| step2   | 1029     | step1   | 1029     |
| step3   | 1028     | step1   | 1029     |
| step4   | 1018     | step1   | 1029     |

8 查询每一步骤相对于上一步骤的漏出率首先通过自join表过滤出每一步跟上一步的记录

select 
	rn.step as rnstep,
	rn.numbs as rnnumbs,
	rr.step as rrstep,
	rr.numbs as rrnumbs  
from 
	dw_oute_numbs rn
inner join 
dw_oute_numbs rr
where cast(substr(rn.step,5,1) as int)=cast(substr(rr.step,5,1) as int)-1;

9 注意:cast为Hive内置函数 类型转换

select cast(1 as float); --1.0
select cast(‘2016-05-22’ as date); --2016-05-22

10 结果

+---------+----------+---------+----------+--+
| rnstep  | rnnumbs  | rrstep  | rrnumbs  |
+---------+----------+---------+----------+--+
| step1   | 1029     | step2   | 1029     |
| step2   | 1029     | step3   | 1028     |
| step3   | 1028     | step4   | 1018     |
+---------+----------+---------+----------+--+

11 然后就可以非常简单的计算出每一步相对上一步的漏出率

select 
	tmp.rrstep as step,
	tmp.rrnumbs/tmp.rnnumbs as leakage_rate
from
	(
	select 
		rn.step as rnstep,
		rn.numbs as rnnumbs,
		rr.step as rrstep,rr.numbs as rrnumbs  
	from 
		dw_oute_numbs rn
	inner join 
		dw_oute_numbs rr) tmp
where 
	cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1;

12 汇总以上两种指标

select 
	abs.step,
	abs.numbs,
	abs.rate as abs_ratio,
	rel.rate as leakage_rate
from 
	(select 
		tmp.rnstep as step,
		tmp.rnnumbs as numbs,
		tmp.rnnumbs/tmp.rrnumbs as rate
	from
		(select 
			rn.step as rnstep,
			rn.numbs as rnnumbs,
			rr.step as rrstep,
			rr.numbs as rrnumbs  
		from 
			dw_oute_numbs rn
		inner join 
			dw_oute_numbs rr) tmp
		where 
			tmp.rrstep='step1') abs
		left outer join
		(select 
			tmp.rrstep as step,
			tmp.rrnumbs/tmp.rnnumbs as rate
		from
			(select 
				rn.step as rnstep,
				rn.numbs as rnnumbs,
				rr.step as rrstep,
				rr.numbs as rrnumbs  
			from 
				dw_oute_numbs rn
			inner join 
				dw_oute_numbs rr) tmp
	where 
		cast(substr(tmp.rnstep,5,1) as int)=cast(substr(tmp.rrstep,5,1) as int)-1) rel
on 
	abs.step=rel.step;

最后

以上就是现实刺猬为你收集整理的hive中路径转换(漏斗模型)的全部内容,希望文章能够帮你解决hive中路径转换(漏斗模型)所遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(36)

评论列表共有 0 条评论

立即
投稿
返回
顶部