概述
前言
dplyr中的窗口函数主要分为4类:排名、迁移、聚合、分布
每一类的熟练掌握都可以使大家在实际的生产中提高效率100%+
如果有忘记dplyr及其拓展包的用法,可以参考下记文章:
【R】dplyr - 数据处理瑞士军刀
【R】dtplyr - 史上最速:dplyr与data.table共舞
【R】dbplyr - 数据库底层操作 SQL代码转译
【R】magrittr - 4种管道操作符活用教程
数据准备
pacman::p_load(tidyverse)
data <- nycflights13::flights %>%
tbl_df() %>%
select(tailnum, flight, year, month, day) %>%
arrange(tailnum) %>%
slice(1:10) %>%
mutate(date = str_c(year, month, day, sep = "-")) %>%
select(-year, -month, -day)
data$date <- as.Date(data$date)
#我们将nycflights13包中flights经过变换处理为一个小型数据集
> data
# A tibble: 10 x 3
tailnum flight date
<chr> <int> <date>
1 D942DN 2247 2013-02-11
2 D942DN 1685 2013-03-23
3 D942DN 1959 2013-03-24
4 D942DN 781 2013-07-05
5 N0EGMQ 4579 2013-01-01
6 N0EGMQ 4584 2013-01-01
7 N0EGMQ 4610 2013-01-02
8 N0EGMQ 4662 2013-01-02
9 N0EGMQ 4661 2013-01-04
10 N0EGMQ 4610 2013-01-05
#只剩航班号tailnum,飞行时间flight,日期date 3列10行
排 名
排名窗口函数,用于给分组后每组内的值按顺序编码
1
顺序编码 - row_number
data %>%
group_by(tailnum) %>% #按tailnum分组
mutate(f_rank = row_number(flight)) %>% #以flight升序有序编码赋值到f_rank
arrange(tailnum, flight) #结果根据tailnum, flight升序排序
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date f_rank
<chr> <int> <date> <int>
1 D942DN 781 2013-07-05 1
2 D942DN 1685 2013-03-23 2
3 D942DN 1959 2013-03-24 3
4 D942DN 2247 2013-02-11 4
5 N0EGMQ 4579 2013-01-01 1
6 N0EGMQ 4584 2013-01-01 2
7 N0EGMQ 4610 2013-01-02 3
8 N0EGMQ 4610 2013-01-05 4
9 N0EGMQ 4661 2013-01-04 5
10 N0EGMQ 4662 2013-01-02 6
2
并列跳过编码 - min_rank
#观察数据集中f_rank列第7、8行的值:均等于3
#因为我们是按照flight来进行排序编码的,第7、8行的flight值相等,所以是并列第3。第9行flight的值是5,是由于并列编码占用的位置会被跳过
data %>%
group_by(tailnum) %>%
mutate(f_rank = min_rank(flight)) %>%
arrange(tailnum, flight)
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date f_rank
<chr> <int> <date> <int>
1 D942DN 781 2013-07-05 1
2 D942DN 1685 2013-03-23 2
3 D942DN 1959 2013-03-24 3
4 D942DN 2247 2013-02-11 4
5 N0EGMQ 4579 2013-01-01 1
6 N0EGMQ 4584 2013-01-01 2
7 N0EGMQ 4610 2013-01-02 3
8 N0EGMQ 4610 2013-01-05 3
9 N0EGMQ 4661 2013-01-04 5
10 N0EGMQ 4662 2013-01-02 6
3
并列连续编码 - dense_rank
#相反的,在使用dense_rank窗口函数的情况下,f_rank列的第9行值为4。
#表示并列编码占用的位置不会被跳过,继续连续编码
data %>%
group_by(tailnum) %>%
mutate(f_rank = dense_rank(flight)) %>%
arrange(tailnum, flight)
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date f_rank
<chr> <int> <date> <int>
1 D942DN 781 2013-07-05 1
2 D942DN 1685 2013-03-23 2
3 D942DN 1959 2013-03-24 3
4 D942DN 2247 2013-02-11 4
5 N0EGMQ 4579 2013-01-01 1
6 N0EGMQ 4584 2013-01-01 2
7 N0EGMQ 4610 2013-01-02 3
8 N0EGMQ 4610 2013-01-05 3
9 N0EGMQ 4661 2013-01-04 4
10 N0EGMQ 4662 2013-01-02 5
4
组内分区 - ntile
#此时f_rank列内的值代表每组行数的平均分后的第几份
#ntile(flight, 2)表示按flight升序将记录平均分成2份
data %>%
group_by(tailnum) %>%
mutate(f_rank = ntile(flight, 2)) %>%
arrange(tailnum, flight)
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date f_rank
<chr> <int> <date> <int>
1 D942DN 781 2013-07-05 1
2 D942DN 1685 2013-03-23 1
3 D942DN 1959 2013-03-24 2
4 D942DN 2247 2013-02-11 2
5 N0EGMQ 4579 2013-01-01 1
6 N0EGMQ 4584 2013-01-01 1
7 N0EGMQ 4610 2013-01-02 1
8 N0EGMQ 4610 2013-01-05 2
9 N0EGMQ 4661 2013-01-04 2
10 N0EGMQ 4662 2013-01-02 2
偏 移
偏移窗口函数,指定某列内的记录向上或向下偏移
1
组内向上偏移 - lead
#lead(date, 1, order_by = date)),表示根据date列并按照date升序将此列值向上平移1行
#此时我们看到lead_date列的日期正好是date列对应的下一个日期
#diff_date = difftime(lead_date, date, units = "days")是将每次日期间隔的天数计算出
data %>%
group_by(tailnum) %>%
mutate(lead_date = lead(date, 1, order_by = date)) %>%
arrange(tailnum, date) %>%
mutate(diff_date = difftime(lead_date, date, units = "days"))
# A tibble: 10 x 5
# Groups: tailnum [2]
tailnum flight date lead_date diff_date
<chr> <int> <date> <date> <drtn>
1 D942DN 2247 2013-02-11 2013-03-23 40 days
2 D942DN 1685 2013-03-23 2013-03-24 1 days
3 D942DN 1959 2013-03-24 2013-07-05 103 days
4 D942DN 781 2013-07-05 NA NA days
5 N0EGMQ 4579 2013-01-01 2013-01-01 0 days
6 N0EGMQ 4584 2013-01-01 2013-01-02 1 days
7 N0EGMQ 4610 2013-01-02 2013-01-02 0 days
8 N0EGMQ 4662 2013-01-02 2013-01-04 2 days
9 N0EGMQ 4661 2013-01-04 2013-01-05 1 days
10 N0EGMQ 4610 2013-01-05 NA NA days
2
组内向下偏移 - lag
#相反的,lag函数表示向下偏移
#此时lag_date列是对应date列的上一个日期
data %>%
group_by(tailnum) %>%
mutate(lag_date = lag(date, 1, order_by = date)) %>%
arrange(tailnum, date) %>%
mutate(diff_date = difftime(lag_date, date, units = "days"))
# A tibble: 10 x 5
# Groups: tailnum [2]
tailnum flight date lag_date diff_date
<chr> <int> <date> <date> <drtn>
1 D942DN 2247 2013-02-11 NA NA days
2 D942DN 1685 2013-03-23 2013-02-11 -40 days
3 D942DN 1959 2013-03-24 2013-03-23 -1 days
4 D942DN 781 2013-07-05 2013-03-24 -103 days
5 N0EGMQ 4579 2013-01-01 NA NA days
6 N0EGMQ 4584 2013-01-01 2013-01-01 0 days
7 N0EGMQ 4610 2013-01-02 2013-01-01 -1 days
8 N0EGMQ 4662 2013-01-02 2013-01-02 0 days
9 N0EGMQ 4661 2013-01-04 2013-01-02 -2 days
10 N0EGMQ 4610 2013-01-05 2013-01-04 -1 days
3
取组内首位 - first
#将每组内按date列升序排序后的首位值提取出
data %>%
group_by(tailnum) %>%
mutate(first_date = first(date, order_by = date)) %>%
arrange(tailnum, date) %>%
mutate(diff_date = difftime(first_date, date, units = "days"))
# A tibble: 10 x 5
# Groups: tailnum [2]
tailnum flight date first_date diff_date
<chr> <int> <date> <date> <drtn>
1 D942DN 2247 2013-02-11 2013-02-11 0 days
2 D942DN 1685 2013-03-23 2013-02-11 -40 days
3 D942DN 1959 2013-03-24 2013-02-11 -41 days
4 D942DN 781 2013-07-05 2013-02-11 -144 days
5 N0EGMQ 4579 2013-01-01 2013-01-01 0 days
6 N0EGMQ 4584 2013-01-01 2013-01-01 0 days
7 N0EGMQ 4610 2013-01-02 2013-01-01 -1 days
8 N0EGMQ 4662 2013-01-02 2013-01-01 -1 days
9 N0EGMQ 4661 2013-01-04 2013-01-01 -3 days
10 N0EGMQ 4610 2013-01-05 2013-01-01 -4 days
4
取组内末位 - last
#相反的,我们提取出了每组排序后的末尾值
data %>%
group_by(tailnum) %>%
mutate(last_date = last(date, order_by = date)) %>%
arrange(tailnum, date) %>%
mutate(diff_date = difftime(last_date, date, units = "days"))
# A tibble: 10 x 5
# Groups: tailnum [2]
tailnum flight date last_date diff_date
<chr> <int> <date> <date> <drtn>
1 D942DN 2247 2013-02-11 2013-07-05 144 days
2 D942DN 1685 2013-03-23 2013-07-05 104 days
3 D942DN 1959 2013-03-24 2013-07-05 103 days
4 D942DN 781 2013-07-05 2013-07-05 0 days
5 N0EGMQ 4579 2013-01-01 2013-01-05 4 days
6 N0EGMQ 4584 2013-01-01 2013-01-05 4 days
7 N0EGMQ 4610 2013-01-02 2013-01-05 3 days
8 N0EGMQ 4662 2013-01-02 2013-01-05 3 days
9 N0EGMQ 4661 2013-01-04 2013-01-05 1 days
10 N0EGMQ 4610 2013-01-05 2013-01-05 0 days
5
取组内固定位 - nth
#取每组内排序后固定第2位的值
data %>%
group_by(tailnum) %>%
mutate(nth_date = nth(date, 2, order_by = date)) %>%
arrange(tailnum, date) %>%
mutate(diff_date = difftime(nth_date, date, units = "days"))
# A tibble: 10 x 5
# Groups: tailnum [2]
tailnum flight date nth_date diff_date
<chr> <int> <date> <date> <drtn>
1 D942DN 2247 2013-02-11 2013-03-23 40 days
2 D942DN 1685 2013-03-23 2013-03-23 0 days
3 D942DN 1959 2013-03-24 2013-03-23 -1 days
4 D942DN 781 2013-07-05 2013-03-23 -104 days
5 N0EGMQ 4579 2013-01-01 2013-01-01 0 days
6 N0EGMQ 4584 2013-01-01 2013-01-01 0 days
7 N0EGMQ 4610 2013-01-02 2013-01-01 -1 days
8 N0EGMQ 4662 2013-01-02 2013-01-01 -1 days
9 N0EGMQ 4661 2013-01-04 2013-01-01 -3 days
10 N0EGMQ 4610 2013-01-05 2013-01-01 -4 days
聚 合
聚合窗口函数,用于组内算数&累计计算
1
分组求和 - sum
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(sum_flight = sum(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date sum_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 6672
2 D942DN 1685 2013-03-23 6672
3 D942DN 1959 2013-03-24 6672
4 D942DN 781 2013-07-05 6672
5 N0EGMQ 4579 2013-01-01 27706
6 N0EGMQ 4584 2013-01-01 27706
7 N0EGMQ 4610 2013-01-02 27706
8 N0EGMQ 4662 2013-01-02 27706
9 N0EGMQ 4661 2013-01-04 27706
10 N0EGMQ 4610 2013-01-05 27706
2
分组累计求和 - sum
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(cumsum_flight = cumsum(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date cumsum_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 2247
2 D942DN 1685 2013-03-23 3932
3 D942DN 1959 2013-03-24 5891
4 D942DN 781 2013-07-05 6672
5 N0EGMQ 4579 2013-01-01 4579
6 N0EGMQ 4584 2013-01-01 9163
7 N0EGMQ 4610 2013-01-02 13773
8 N0EGMQ 4662 2013-01-02 18435
9 N0EGMQ 4661 2013-01-04 23096
10 N0EGMQ 4610 2013-01-05 27706
3
分组计算最小值 - min
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(min_flight = min(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date min_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 781
2 D942DN 1685 2013-03-23 781
3 D942DN 1959 2013-03-24 781
4 D942DN 781 2013-07-05 781
5 N0EGMQ 4579 2013-01-01 4579
6 N0EGMQ 4584 2013-01-01 4579
7 N0EGMQ 4610 2013-01-02 4579
8 N0EGMQ 4662 2013-01-02 4579
9 N0EGMQ 4661 2013-01-04 4579
10 N0EGMQ 4610 2013-01-05 4579
4
分组计算累计最小值 - cummin
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(cummin_flight = cummin(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date cummin_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 2247
2 D942DN 1685 2013-03-23 1685
3 D942DN 1959 2013-03-24 1685
4 D942DN 781 2013-07-05 781
5 N0EGMQ 4579 2013-01-01 4579
6 N0EGMQ 4584 2013-01-01 4579
7 N0EGMQ 4610 2013-01-02 4579
8 N0EGMQ 4662 2013-01-02 4579
9 N0EGMQ 4661 2013-01-04 4579
10 N0EGMQ 4610 2013-01-05 4579
5
分组计算最大值 - max
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(max_flight = max(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date max_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 2247
2 D942DN 1685 2013-03-23 2247
3 D942DN 1959 2013-03-24 2247
4 D942DN 781 2013-07-05 2247
5 N0EGMQ 4579 2013-01-01 4662
6 N0EGMQ 4584 2013-01-01 4662
7 N0EGMQ 4610 2013-01-02 4662
8 N0EGMQ 4662 2013-01-02 4662
9 N0EGMQ 4661 2013-01-04 4662
10 N0EGMQ 4610 2013-01-05 4662
6
分组计算累计最大值 - cummax
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(cummax_flight = cummax(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date cummax_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 2247
2 D942DN 1685 2013-03-23 2247
3 D942DN 1959 2013-03-24 2247
4 D942DN 781 2013-07-05 2247
5 N0EGMQ 4579 2013-01-01 4579
6 N0EGMQ 4584 2013-01-01 4584
7 N0EGMQ 4610 2013-01-02 4610
8 N0EGMQ 4662 2013-01-02 4662
9 N0EGMQ 4661 2013-01-04 4662
10 N0EGMQ 4610 2013-01-05 4662
7
分组计算均值 - mean
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(mean_flight = mean(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date mean_flight
<chr> <int> <date> <dbl>
1 D942DN 2247 2013-02-11 1668
2 D942DN 1685 2013-03-23 1668
3 D942DN 1959 2013-03-24 1668
4 D942DN 781 2013-07-05 1668
5 N0EGMQ 4579 2013-01-01 4618.
6 N0EGMQ 4584 2013-01-01 4618.
7 N0EGMQ 4610 2013-01-02 4618.
8 N0EGMQ 4662 2013-01-02 4618.
9 N0EGMQ 4661 2013-01-04 4618.
10 N0EGMQ 4610 2013-01-05 4618.
8
分组计算累计均值 - cummean
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(cummean_flight = cummean(flight))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date cummean_flight
<chr> <int> <date> <dbl>
1 D942DN 2247 2013-02-11 2247
2 D942DN 1685 2013-03-23 1966
3 D942DN 1959 2013-03-24 1964.
4 D942DN 781 2013-07-05 1668
5 N0EGMQ 4579 2013-01-01 4579
6 N0EGMQ 4584 2013-01-01 4582.
7 N0EGMQ 4610 2013-01-02 4591
8 N0EGMQ 4662 2013-01-02 4609.
9 N0EGMQ 4661 2013-01-04 4619.
10 N0EGMQ 4610 2013-01-05 4618.
9
分组计算记录数 - n
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(n_flight = n())
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date n_flight
<chr> <int> <date> <int>
1 D942DN 2247 2013-02-11 4
2 D942DN 1685 2013-03-23 4
3 D942DN 1959 2013-03-24 4
4 D942DN 781 2013-07-05 4
5 N0EGMQ 4579 2013-01-01 6
6 N0EGMQ 4584 2013-01-01 6
7 N0EGMQ 4610 2013-01-02 6
8 N0EGMQ 4662 2013-01-02 6
9 N0EGMQ 4661 2013-01-04 6
10 N0EGMQ 4610 2013-01-05 6
分 布
1
首位包含型分布 - cume_dist
#以D942DN组为例:
组内date列存在4条记录,算法是将值域为1的空间4等分,得到分布小数0.25 0.5 0.75 1
首条记录包含在值域内(不为0)
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(cume_dist_date = cume_dist(date))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date cume_dist_date
<chr> <int> <date> <dbl>
1 D942DN 2247 2013-02-11 0.25
2 D942DN 1685 2013-03-23 0.5
3 D942DN 1959 2013-03-24 0.75
4 D942DN 781 2013-07-05 1
5 N0EGMQ 4579 2013-01-01 0.333
6 N0EGMQ 4584 2013-01-01 0.333
7 N0EGMQ 4610 2013-01-02 0.667
8 N0EGMQ 4662 2013-01-02 0.667
9 N0EGMQ 4661 2013-01-04 0.833
10 N0EGMQ 4610 2013-01-05 1
2
首位非包含型分布 - percent_rank
#相应的,首位非包含型分布是指:在记录数为n时,将值域为1的空间以n-1均等分,首条记录不包含在值域内(等于0)
data %>%
group_by(tailnum) %>%
arrange(tailnum, date) %>%
mutate(percent_rank_date = percent_rank(date))
# A tibble: 10 x 4
# Groups: tailnum [2]
tailnum flight date percent_rank_date
<chr> <int> <date> <dbl>
1 D942DN 2247 2013-02-11 0
2 D942DN 1685 2013-03-23 0.333
3 D942DN 1959 2013-03-24 0.667
4 D942DN 781 2013-07-05 1
5 N0EGMQ 4579 2013-01-01 0
6 N0EGMQ 4584 2013-01-01 0
7 N0EGMQ 4610 2013-01-02 0.4
8 N0EGMQ 4662 2013-01-02 0.4
9 N0EGMQ 4661 2013-01-04 0.8
10 N0EGMQ 4610 2013-01-05 1
·END·
R语言与数据分析
生产力干货
微信号:RforData最后
以上就是热心橘子为你收集整理的【R】dplyr进阶 - 窗口函数(排名、迁移、聚合、分布)的全部内容,希望文章能够帮你解决【R】dplyr进阶 - 窗口函数(排名、迁移、聚合、分布)所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复