概述
有一个日志文件,要完成数据清洗:
1593136280858|{"cm":{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"C6816QZ0@gmail.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},"ap":"app","et":[{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}]}
1593136280858|{"cm":{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"NW0S962J@gmail.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"},"ap":"app","et":[{"ett":"1593063223807","en":"loading","kv":{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}},{"ett":"1593095105466","en":"ad","kv":{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}},{"ett":"1593051718208","en":"notification","kv":{"ap_time":"1593095336265","action":"2","type":"3","content":""}},{"ett":"1593100021275","en":"comment","kv":{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}},{"ett":"1593105344120","en":"praise","kv":{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}}]}
选取其中一行分析:可以看到日志文件内容是json字符串格式的,找一个在线解析工具看一下,结果如下:
1593136280858 | {
"cm": {
"ln": "-55.0",
"sv": "V2.9.6",
"os": "8.0.4",
"g": "C6816QZ0@gmail.com",
"mid": "489",
"nw": "3G",
"l": "es",
"vc": "4",
"hw": "640*960",
"ar": "MX",
"uid": "489",
"t": "1593123253541",
"la": "5.2",
"md": "sumsung-18",
"vn": "1.3.4",
"ba": "Sumsung",
"sr": "I"
},
"ap": "app",
"et": [{
"ett": "1593050051366",
"en": "loading",
"kv": {
"extend2": "",
"loading_time": "14",
"action": "3",
"extend1": "",
"type": "2",
"type1": "201",
"loading_way": "1"
}
}, {
"ett": "1593108791764",
"en": "ad",
"kv": {
"activityId": "1",
"displayMills": "78522",
"entry": "1",
"action": "1",
"contentType": "0"
}
}, {
"ett": "1593111271266",
"en": "notification",
"kv": {
"ap_time": "1593097087883",
"action": "1",
"type": "1",
"content": ""
}
}, {
"ett": "1593066033562",
"en": "active_background",
"kv": {
"active_source": "3"
}
}, {
"ett": "1593135644347",
"en": "comment",
"kv": {
"p_comment_id": 1,
"addtime": "1593097573725",
"praise_count": 973,
"other_id": 5,
"comment_id": 9,
"reply_count": 40,
"userid": 7,
"content": "辑赤蹲慰鸽抿肘捎"
}
}]
}
先将文件上传到hdfs目录下:
[root@hadoop100 ~]# hdfs dfs -put /opt/kb09file/op.log /kb09file
使用spark-shell读取文件:
scala> sc.textFile("file:///opt/kb09file/op.log") //读取Linux系统文件
scala> val fileRDD = sc.textFile("hdfs://192.168.237.100:9000/kb09file/op.log") //读取hdfs文件
对日志文件分析:
日志格式为:1593136280858 (用户标识) | json字符串的形式 //分隔符为 | 管道符
使用split切割字符串:
//注意这里应该使用单引号而不是双引号。单引号表示是字符,双引号表示是字符串
scala> val jsonStrRDD = fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))
如果想要保留用户标识,添加到json字符串中,进行字符串拼接:
scala> val jsonRdd = jsonStrRDD.map(x=>{ var jsonStr=x._2 ; jsonStr = jsonStr.substring(0,jsonStr.length-1); jsonStr + ","id":"+ x._1 + "}" })
将Rdd转换成DataFrame:
scala> val jsonDF=jsonRdd.toDF
查看一下jsonDF的结果:
jsonDF.show() //只有一个列,列名为value
+--------------------+
| value|
+--------------------+
|{"cm":{"ln":"-55....|
|{"cm":{"ln":"-114...|
+--------------------+
添加需要的包:
import spark.implicits._ //隐式转换
import org.apache.spark.sql.functions._ //内置方法
import org.apache.spark.sql.types._ //类型
import org.apache.spark.sql._
获取json字符串的值的方法:
scala> jsonDF.select( get_json_object($"value","$.cm").alias("cm") ).show(false)
输出结果:
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|cm |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"C6816QZ0@gmail.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"} |
|{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"NW0S962J@gmail.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"}|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
日志文件共可以拆分成4个json字符串,每组字符串还可以继续拆分json格式
将json字符串{“cm”:“a1” , “ap”:“b1” , “et”:“c1” , “id”:“d1”} 结构化
表头 | cm | ap | et | id |
---|---|---|---|---|
列 | a1 | b1 | c1 | d1 |
//将json字符串{"cm":"a1" , "ap":"b1" , "et":"c1" , "id":"d1"} 结构化
// 表头 cm ap et id
// 列 a1 b1 c1 d1
val jsonDF2: DataFrame = jsonDF.select(get_json_object($"value", "$.cm").alias("cm")
, get_json_object($"value", "$.ap").alias("ap")
, get_json_object($"value", "$.et").alias("et")
, get_json_object($"value", "$.id").alias("id")
)
jsonDF2.select($"id", $"ap", $"cm", $"et").show()
输出结果:
+-------------+---+--------------------+--------------------+
| id| ap| cm| et|
+-------------+---+--------------------+--------------------+
|1593136280858|app|{"ln":"-55.0","sv...|[{"ett":"15930500...|
|1593136280858|app|{"ln":"-114.9","s...|[{"ett":"15930632...|
+-------------+---+--------------------+--------------------+
将cm继续拆分,把每个字段拆分成一个列:
val jsonDF3: DataFrame = jsonDF2.select($"id"
, $"ap"
, get_json_object($"cm", "$.ln").alias("ln")
, get_json_object($"cm", "$.sv").alias("sv")
, get_json_object($"cm", "$.os").alias("os")
, get_json_object($"cm", "$.g").alias("g")
, get_json_object($"cm", "$.mid").alias("mid")
, get_json_object($"cm", "$.nw").alias("nw")
, get_json_object($"cm", "$.l").alias("l")
, get_json_object($"cm", "$.vc").alias("vc")
, get_json_object($"cm", "$.hw").alias("hw")
, get_json_object($"cm", "$.ar").alias("ar")
, get_json_object($"cm", "$.uid").alias("uid")
, get_json_object($"cm", "$.t").alias("t")
, get_json_object($"cm", "$.la").alias("la")
, get_json_object($"cm", "$.md").alias("md")
, get_json_object($"cm", "$.vn").alias("vn")
, get_json_object($"cm", "$.ba").alias("ba")
, get_json_object($"cm", "$.sr").alias("sr")
, $"et")
jsonDF3.show()
输出结果:
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| et|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[{"ett":"15930500...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[{"ett":"15930632...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
idea整体代码如下:
package nj.zb.kb09.project
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
object OpLog {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").appName("clearDemo").getOrCreate()
import spark.implicits._
val sc: SparkContext = spark.sparkContext
val fileRDD: RDD[String] = sc.textFile("in/op.log")
val jsonStrRDD: RDD[(String, String)] = fileRDD.map(x => x.split('|')).map(x => (x(0), x(1)))
val jsonRdd: RDD[String] = jsonStrRDD
.map(x => {
var jsonStr = x._2
jsonStr = jsonStr.substring(0, jsonStr.length - 1)
jsonStr + ","id":" + x._1 + "}"
})
val jsonDF: DataFrame = jsonRdd.toDF
jsonDF.show()
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
jsonDF.select(get_json_object($"value", "$.cm").alias("cm")).show(false)
val jsonDF2: DataFrame = jsonDF.select(get_json_object($"value", "$.cm").alias("cm")
, get_json_object($"value", "$.ap").alias("ap")
, get_json_object($"value", "$.et").alias("et")
, get_json_object($"value", "$.id").alias("id")
)
jsonDF2.select($"id", $"ap", $"cm", $"et").show
val jsonDF3: DataFrame = jsonDF2.select($"id"
, $"ap"
, get_json_object($"cm", "$.ln").alias("ln")
, get_json_object($"cm", "$.sv").alias("sv")
, get_json_object($"cm", "$.os").alias("os")
, get_json_object($"cm", "$.g").alias("g")
, get_json_object($"cm", "$.mid").alias("mid")
, get_json_object($"cm", "$.nw").alias("nw")
, get_json_object($"cm", "$.l").alias("l")
, get_json_object($"cm", "$.vc").alias("vc")
, get_json_object($"cm", "$.hw").alias("hw")
, get_json_object($"cm", "$.ar").alias("ar")
, get_json_object($"cm", "$.uid").alias("uid")
, get_json_object($"cm", "$.t").alias("t")
, get_json_object($"cm", "$.la").alias("la")
, get_json_object($"cm", "$.md").alias("md")
, get_json_object($"cm", "$.vn").alias("vn")
, get_json_object($"cm", "$.ba").alias("ba")
, get_json_object($"cm", "$.sr").alias("sr")
, $"et")
jsonDF3.show()
}
}
对"et"字段分析,通过from_json方法把字符串"et"结构化:
from_json 把字符串
"et":[
{"ett":"a1","en":"a2","kv":{"a3"}}
,{"ett":"a1","en":"b2","kv":{"b3"}}
,{"ett":"a1","en":"c2","kv":{"c3"}}
]
结构化:
ett en kv
a1 a2 a3
b1 b2 b3
c1 c2 c3
在idea中出现错误,
回到黑窗口界面操作:
scala> val jsonDF4 = jsonDF3.select($"id", $"ap", $"ln", $"sv", $"os", $"g", $"mid",
$"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr",
from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField
("en",StringType)::StructField("kv",StringType)::Nil))).as("event"))
输出结果:
scala> jsonDF4.show()
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| event|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[[1593050051366, ...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[[1593063223807, ...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
jsonDF4大脑想到的结构是这样:
id | ap | mid | nw | *** | event |
---|---|---|---|---|---|
1593136280858 | app | 12345 | dsdwe | **** | [[ett en kv], [b1 b2 b3],[c1 c2 c3] ] |
期望得到的结果,将数组拆分成行,及数组中的每个字段拆分成列:
id | ap | mid | nw | *** | ett | en | kv |
---|---|---|---|---|---|---|---|
1593136280858 | app | 12345 | dsdwe | **** | ett | en | kv |
1593136280858 | app | 12345 | dsdwe | **** | b1 | b2 | b3 |
1593136280858 | app | 12345 | dsdwe | **** | c1 | c2 | c3 |
中间结果结果的效果,先将数组拆分,使用explode()方法(得到jsonDF5):
id | ap | mid | nw | *** | event |
---|---|---|---|---|---|
1593136280858 | app | 12345 | dsdwe | **** | [ett en kv] |
1593136280858 | app | 12345 | dsdwe | **** | [b1 b2 b3] |
1593136280858 | app | 12345 | dsdwe | **** | [c1 c2 c3] |
scala> val jsonDF5 = jsonDF4.select($"id", $"ap", $"ln", $"sv", $"os", $"g", $"mid",
$"nw" , $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md" , $"vn", $"ba",
$"sr", explode($"event").alias("event"))
输出结果:
scala> jsonDF5.show()
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| event|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[1593050051366, l...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[1593108791764, a...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[1593111271266, n...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[1593066033562, a...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[1593135644347, c...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[1593063223807, l...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[1593095105466, a...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[1593051718208, n...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[1593100021275, c...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[1593105344120, p...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
单独查看一下event列,做后续分析:
scala> jsonDF5.select("event").show(false)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|event |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1593050051366, loading, {"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}] |
|[1593108791764, ad, {"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}] |
|[1593111271266, notification, {"ap_time":"1593097087883","action":"1","type":"1","content":""}] |
|[1593066033562, active_background, {"active_source":"3"}] |
|[1593135644347, comment, {"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}] |
|[1593063223807, loading, {"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}] |
|[1593095105466, ad, {"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}] |
|[1593051718208, notification, {"ap_time":"1593095336265","action":"2","type":"3","content":""}] |
|[1593100021275, comment, {"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}]|
|[1593105344120, praise, {"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}] |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
将event列拆分成"ett",“en”,"kv"三列
scala> jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid"
,$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md"
,$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").show(false)
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |ap |ln |sv |os |g |mid|nw |l |vc |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |kv |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593050051366|loading |{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"} |
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593108791764|ad |{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"} |
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593111271266|notification |{"ap_time":"1593097087883","action":"1","type":"1","content":""} |
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593066033562|active_background|{"active_source":"3"} |
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593135644347|comment |{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"} |
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |pt |8 |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593063223807|loading |{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"} |
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |pt |8 |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593095105466|ad |{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"} |
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |pt |8 |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593051718208|notification |{"ap_time":"1593095336265","action":"2","type":"3","content":""} |
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |pt |8 |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593100021275|comment |{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |pt |8 |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593105344120|praise |{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8} |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
en有如下几种类型:
loading、ad、notification、active_background、comment、praise
scala> jsonDF5.select($"event.en").distinct().show()
+-----------------+
| en|
+-----------------+
| praise|
| notification|
| comment|
| ad|
|active_background|
| loading|
+-----------------+
通过filter过滤条件,可以生成不同的DataFrame
//举例过滤出en的值为loading的行,将结果生成一个DataFrame
val loadingDF = jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw"
,$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba"
,$"sr",$"event.ett",$"event.en",$"event.kv")
.filter($"event.en"==="loading")
查看结果:
scala> loadingDF.show(false)
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------------------------------------------------------------------------------------------------------+
|id |ap |ln |sv |os |g |mid|nw |l |vc |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |kv |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------------------------------------------------------------------------------------------------------+
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593050051366|loading|{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |pt |8 |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593063223807|loading|{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"} |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------------------------------------------------------------------------------------------------------+
单独查看kv列
scala> loadingDF.select("kv").show(false)
+-------------------------------------------------------------------------------------------------------+
|kv |
+-------------------------------------------------------------------------------------------------------+
|{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}|
|{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"} |
+-------------------------------------------------------------------------------------------------------+
可以看到kv列中仍然是json格式则字符串,有很多的kv组合,进行拆分成不同的列
scala> loadingDF.select(
get_json_object($"kv","$.extend2").alias("extend2")
,get_json_object($"kv","$.loading_time").alias("loading_time")
,get_json_object($"kv","$.action").alias("action")
,get_json_object($"kv","$.extend1").alias("extend1")
,get_json_object($"kv","$.type").alias("type")
,get_json_object($"kv","$.type1").alias("type1")
,get_json_object($"kv","$.loading_way").alias("loading_way")
).show(false)
查看结果:
+-------+------------+------+-------+----+-----+-----------+
|extend2|loading_time|action|extend1|type|type1|loading_way|
+-------+------------+------+-------+----+-----+-----------+
| |14 |3 | |2 |201 |1 |
| |0 |3 | |1 |102 |1 |
+-------+------------+------+-------+----+-----+-----------+
把前面的列加上,生成完整的loadingDF
scala> val loadingDF1 = loadingDF.select( $"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l"
,$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en"
,get_json_object($"kv","$.extend2").alias("extend2")
,get_json_object($"kv","$.loading_time").alias("loading_time")
,get_json_object($"kv","$.action").alias("action")
,get_json_object($"kv","$.extend1").alias("extend1")
,get_json_object($"kv","$.type").alias("type")
,get_json_object($"kv","$.type1").alias("type1")
,get_json_object($"kv","$.loading_way").alias("loading_way"))
查看结果:
scala> loadingDF1.show()
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en|extend2|loading_time|action|extend1|type|type1|loading_way|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593050051366|loading| | 14| 3| | 2| 201| 1|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|1593063223807|loading| | 0| 3| | 1| 102| 1|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
最后
以上就是整齐大碗为你收集整理的2020.11.19课堂笔记(日志文件完成数据清洗)的全部内容,希望文章能够帮你解决2020.11.19课堂笔记(日志文件完成数据清洗)所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复