// NOTE: `import spark.implicits._` works here because this runs in spark-shell /
// a notebook where the `spark` session is already predefined.
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
val builder = SparkSession
  .builder()
  .appName("learningScala")
  .config("spark.executor.heartbeatInterval", "60s")
  .config("spark.network.timeout", "120s")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .config("spark.kryoserializer.buffer.max", "512m")
  .config("spark.dynamicAllocation.enabled", false)
  .config("spark.sql.inMemoryColumnarStorage.compressed", true)
  .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
  .config("spark.sql.broadcastTimeout", 600)
  .config("spark.sql.autoBroadcastJoinThreshold", -1)
  .config("spark.sql.crossJoin.enabled", true)
  .master("local[*]")
val spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@2b380850
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7b0a688
var df1 = Seq(
  (1, "2019-04-01 11:45:50", 11.15, "2019-04-02 11:45:49"),
  (2, "2019-05-02 11:56:50", 10.37, "2019-05-02 11:56:51"),
  (3, "2019-07-21 12:45:50", 12.11, "2019-08-21 12:45:50"),
  (2, "2019-08-01 12:40:50", 14.50, "2020-08-03 12:40:50"),
  (5, "2019-01-06 10:00:50", 16.39, "2019-01-05 10:00:50")
).toDF("id", "startTimeStr", "payamount", "endTimeStr")
df1 = df1.withColumn("startTime", $"startTimeStr".cast("Timestamp"))
  .withColumn("endTime", $"endTimeStr".cast("Timestamp"))
df1.printSchema
df1.show()
root
 |-- id: integer (nullable = false)
 |-- startTimeStr: string (nullable = true)
 |-- payamount: double (nullable = false)
 |-- endTimeStr: string (nullable = true)
 |-- startTime: timestamp (nullable = true)
 |-- endTime: timestamp (nullable = true)

+---+-------------------+---------+-------------------+-------------------+-------------------+
| id|       startTimeStr|payamount|         endTimeStr|          startTime|            endTime|
+---+-------------------+---------+-------------------+-------------------+-------------------+
|  1|2019-04-01 11:45:50|    11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|
|  2|2019-05-02 11:56:50|    10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|
|  3|2019-07-21 12:45:50|    12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|
|  2|2019-08-01 12:40:50|     14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|
|  5|2019-01-06 10:00:50|    16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|
+---+-------------------+---------+-------------------+-------------------+-------------------+

df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
df1.createOrReplaceTempView("temp1")
val sql = s"""
SELECT *,
  collect_list(payamount) over(partition BY id ORDER BY startTimeStr) payamount_array
FROM temp1
"""
sql: String =
"
SELECT *,
  collect_list(payamount) over(partition BY id ORDER BY startTimeStr) payamount_array
FROM temp1
"
val dfCollect = spark.sql(sql)
dfCollect: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 5 more fields]
dfCollect.show()
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
| id|       startTimeStr|payamount|         endTimeStr|          startTime|            endTime|payamount_array|
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
|  1|2019-04-01 11:45:50|    11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|        [11.15]|
|  3|2019-07-21 12:45:50|    12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|        [12.11]|
|  5|2019-01-06 10:00:50|    16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|        [16.39]|
|  2|2019-05-02 11:56:50|    10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|        [10.37]|
|  2|2019-08-01 12:40:50|     14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|  [10.37, 14.5]|
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
The basic logic of the SQL: partition the rows by id, sort each partition by startTimeStr, and let collect_list build an array of payamount, so the elements inside payamount_array appear in ascending startTimeStr order. Because the window has an ORDER BY, the default frame is cumulative (from the first row of the partition up to the current row), which is why the first id = 2 row gets [10.37] while the second gets [10.37, 14.5].
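For comparison, the same window can also be expressed with the DataFrame API. The sketch below is a minimal example assuming the df1 built above; the names w, wFull and dfCollect2 are only illustrative, and the second variant shows how an explicit frame would give every row the full per-id array instead of the cumulative prefix.

import org.apache.spark.sql.expressions.Window

// Same logic as the SQL: partition by id, order by startTimeStr,
// collect payamount cumulatively into an array.
val w = Window.partitionBy("id").orderBy("startTimeStr")
val dfCollect2 = df1.withColumn("payamount_array", collect_list($"payamount").over(w))
dfCollect2.show()

// With an explicit frame, every row of a partition gets the complete array
// rather than the running prefix.
val wFull = w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df1.withColumn("payamount_array", collect_list($"payamount").over(wFull)).show()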
2020-05-28, Jiulonghu, Jiangning District, Nanjing