// NOTE: `import spark.implicits._` works here because this runs in spark-shell /
// a notebook where the `spark` session is already predefined.
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
val builder = SparkSession
  .builder()
  .appName("learningScala")
  .config("spark.executor.heartbeatInterval", "60s")
  .config("spark.network.timeout", "120s")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .config("spark.kryoserializer.buffer.max", "512m")
  .config("spark.dynamicAllocation.enabled", false)
  .config("spark.sql.inMemoryColumnarStorage.compressed", true)
  .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
  .config("spark.sql.broadcastTimeout", 600)
  .config("spark.sql.autoBroadcastJoinThreshold", -1)
  .config("spark.sql.crossJoin.enabled", true)
  .master("local[*]")
val spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@2b380850
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7b0a688
var df1 = Seq(
  (1, "2019-04-01 11:45:50", 11.15, "2019-04-02 11:45:49"),
  (2, "2019-05-02 11:56:50", 10.37, "2019-05-02 11:56:51"),
  (3, "2019-07-21 12:45:50", 12.11, "2019-08-21 12:45:50"),
  (2, "2019-08-01 12:40:50", 14.50, "2020-08-03 12:40:50"),
  (5, "2019-01-06 10:00:50", 16.39, "2019-01-05 10:00:50")
).toDF("id", "startTimeStr", "payamount", "endTimeStr")
df1 = df1.withColumn("startTime", $"startTimeStr".cast("Timestamp"))
  .withColumn("endTime", $"endTimeStr".cast("Timestamp"))
df1.printSchema
df1.show()
root
 |-- id: integer (nullable = false)
 |-- startTimeStr: string (nullable = true)
 |-- payamount: double (nullable = false)
 |-- endTimeStr: string (nullable = true)
 |-- startTime: timestamp (nullable = true)
 |-- endTime: timestamp (nullable = true)

+---+-------------------+---------+-------------------+-------------------+-------------------+
| id|       startTimeStr|payamount|         endTimeStr|          startTime|            endTime|
+---+-------------------+---------+-------------------+-------------------+-------------------+
|  1|2019-04-01 11:45:50|    11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|
|  2|2019-05-02 11:56:50|    10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|
|  3|2019-07-21 12:45:50|    12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|
|  2|2019-08-01 12:40:50|     14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|
|  5|2019-01-06 10:00:50|    16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|
+---+-------------------+---------+-------------------+-------------------+-------------------+

df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
df1.createOrReplaceTempView("temp1")
val sql = s"""
SELECT *,
  collect_list(payamount) over(partition BY id ORDER BY startTimeStr) payamount_array
FROM temp1
"""
sql: String =
"
SELECT *,
  collect_list(payamount) over(partition BY id ORDER BY startTimeStr) payamount_array
FROM temp1
"
val dfCollect = spark.sql(sql)
dfCollect: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 5 more fields]
dfCollect.show()
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
| id|       startTimeStr|payamount|         endTimeStr|          startTime|            endTime|payamount_array|
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
|  1|2019-04-01 11:45:50|    11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|        [11.15]|
|  3|2019-07-21 12:45:50|    12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|        [12.11]|
|  5|2019-01-06 10:00:50|    16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|        [16.39]|
|  2|2019-05-02 11:56:50|    10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|        [10.37]|
|  2|2019-08-01 12:40:50|     14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|  [10.37, 14.5]|
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
The basic logic of the SQL: partition the rows by id, sort each partition by startTimeStr, and let collect_list build an array of payamount, so the elements inside payamount_array appear in ascending startTimeStr order. Because the window has an ORDER BY, the default frame is cumulative (from the first row of the partition up to the current row), which is why the first id = 2 row gets [10.37] while the second gets [10.37, 14.5].
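For comparison, the same window can also be expressed with the DataFrame API. The sketch below is a minimal example assuming the df1 built above; the names w, wFull and dfCollect2 are only illustrative, and the second variant shows how an explicit frame would give every row the full per-id array instead of the cumulative prefix.

import org.apache.spark.sql.expressions.Window

// Same logic as the SQL: partition by id, order by startTimeStr,
// collect payamount cumulatively into an array.
val w = Window.partitionBy("id").orderBy("startTimeStr")
val dfCollect2 = df1.withColumn("payamount_array", collect_list($"payamount").over(w))
dfCollect2.show()

// With an explicit frame, every row of a partition gets the complete array
// rather than the running prefix.
val wFull = w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df1.withColumn("payamount_array", collect_list($"payamount").over(wFull)).show()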
2020-05-28, Jiulonghu, Jiangning District, Nanjing