Overview
1. Basic principle:
TF-IDF (term frequency–inverse document frequency): TF stands for term frequency and IDF for inverse document frequency. The core idea of TF-IDF is: if a word appears frequently in the current document (high TF) but rarely in other documents (high IDF), it can be treated as a keyword of that document, because it has strong discriminating and representative power.
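The idea above can be sketched without any framework. The following is a minimal illustration, not Spark's implementation: it uses a length-normalized TF (Spark's HashingTF keeps raw counts) together with the smoothed IDF formula log((|D| + 1) / (df(t) + 1)) that Spark ML's IDF also applies, so a term appearing in every document gets weight zero.

```scala
// Minimal TF-IDF sketch on a toy corpus (no Spark required).
object TfIdfSketch {
  /** For each document, returns a map from term to its TF-IDF weight. */
  def tfIdf(corpus: Seq[Seq[String]]): Seq[Map[String, Double]] = {
    val numDocs = corpus.size
    // df(t): number of documents that contain term t at least once.
    val docFreq: Map[String, Int] =
      corpus.flatMap(_.distinct).groupBy(identity).map { case (t, occ) => (t, occ.size) }
    corpus.map { doc =>
      val counts = doc.groupBy(identity).map { case (t, occ) => (t, occ.size) }
      counts.map { case (t, c) =>
        val tf  = c.toDouble / doc.size                           // normalized term frequency
        val idf = math.log((numDocs + 1.0) / (docFreq(t) + 1.0))  // smoothed inverse doc frequency
        (t, tf * idf)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val corpus = Seq(
      Seq("a", "b", "c", "a"),
      Seq("c", "b", "b", "c", "a"),
      Seq("a", "a", "c", "d"))
    tfIdf(corpus).zipWithIndex.foreach { case (weights, i) =>
      println(s"doc $i: $weights")
    }
  }
}
```

Note how "c", which occurs in all three documents, gets weight log(4/4) = 0, while "d", which occurs only in one, gets the largest IDF.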
2. Spark ML library:
TF: HashingTF is a Transformer. In text processing it takes a collection of terms and converts it into a fixed-length feature vector, using the hashing trick to map each term to an index while counting term frequencies.
IDF: IDF is an Estimator. Calling its fit() method on a dataset produces an IDFModel. The fit step computes each term's document frequency across the corpus; the resulting IDFModel then takes feature vectors (typically produced by HashingTF) and rescales each dimension, reducing the weight of terms that appear in many documents of the corpus.
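The hashing trick behind HashingTF can also be illustrated standalone: each term is hashed, the non-negative remainder modulo numFeatures picks the vector index, and colliding terms simply add their counts together. This sketch is a simplification and uses Scala's String.hashCode for brevity; Spark itself hashes with MurmurHash3, so the bucket indices here will not match Spark's output.

```scala
// Simplified hashing-TF: map each term to a bucket via hash mod numFeatures.
object HashingTfSketch {
  /** Non-negative remainder, since hashCode may be negative. */
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val r = x % mod
    if (r < 0) r + mod else r
  }

  /** Returns a sparse representation: bucket index -> term count. */
  def transform(words: Seq[String], numFeatures: Int = 1 << 20): Map[Int, Int] =
    words
      .map(w => nonNegativeMod(w.hashCode, numFeatures))
      .groupBy(identity)
      .map { case (idx, occ) => (idx, occ.size) }

  def main(args: Array[String]): Unit = {
    // "a" appears twice, so its bucket carries count 2.
    println(transform(Seq("a", "b", "c", "a")))
  }
}
```

The fixed vector length is why HashingTF needs no fitting step: the term-to-index mapping is pure computation, at the cost of occasional hash collisions.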
3. Spark example:
import java.io.{FileInputStream, FileNotFoundException, FileOutputStream, IOException, ObjectInputStream, ObjectOutputStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SQLContext

object TfIdfTest {
  def main(args: Array[String]): Unit = {
    val masterUrl = "local[2]"
    val appName = "tfidf_test"
    val sparkConf = new SparkConf().setMaster(masterUrl).setAppName(appName)
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Training corpus: each row is a document id plus its tokenized words.
    val df = sc.parallelize(Seq(
      (0, Array("a", "b", "c", "a")),
      (1, Array("c", "b", "b", "c", "a")),
      (2, Array("a", "a", "c", "d")),
      (3, Array("c", "a", "b", "a", "a")),
      (4, Array("我", "爱", "旅行", "土耳其", "大理", "云南")),
      (5, Array("我", "爱", "学习")),
      (6, Array("胡歌", "优秀", "演员", "幽默", "责任感"))
    )).toDF("id", "words")
    df.show(false) // show the input data

    // Term frequencies via the hashing trick, with 2^20 feature buckets.
    val hashModel = new HashingTF()
      .setInputCol("words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(Math.pow(2, 20).toInt)
    val featurizedData = hashModel.transform(df)
    featurizedData.show(false) // show the hashed term-frequency vectors

    // HashingTF is stateless, so it can be applied to new documents directly.
    val df3 = sc.parallelize(Seq(
      (0, Array("a", "a", "c", "d")),
      (1, Array("c", "a", "b", "a", "a"))
    )).toDF("id", "words")
    hashModel.transform(df3).show(false)

    // Fit IDF on the training corpus and rescale the TF vectors.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("words", "features").show(false)

    // Persist the fitted IDFModel with Java serialization.
    // (In Spark 2.x+, idfModel.save(path) / IDFModel.load(path) is the
    // idiomatic way to persist ML models.)
    try {
      val fileOut = new FileOutputStream("model.obj")
      val out = new ObjectOutputStream(fileOut)
      out.writeObject(idfModel)
      out.close()
      fileOut.close()
      println("\nSerialization successful... check the specified output file.\n")
    } catch {
      case foe: FileNotFoundException => foe.printStackTrace()
      case ioe: IOException => ioe.printStackTrace()
    }

    // Read the model back and apply it to unseen documents.
    val fis = new FileInputStream("model.obj")
    val ois = new ObjectInputStream(fis)
    val newModel = ois.readObject().asInstanceOf[IDFModel]
    ois.close()

    val df2 = sc.parallelize(Seq(
      (0, Array("a", "b", "c", "a")),
      (1, Array("c", "b", "b", "c", "a")),
      (2, Array("我", "爱", "旅行", "土耳其", "大理", "云南")),
      (3, Array("我", "爱", "工作")),
      (4, Array("胡歌", "优秀", "演员", "幽默", "责任感"))
    )).toDF("id", "words")
    val featurizedData2 = hashModel.transform(df2)
    val rescaledData2 = newModel.transform(featurizedData2)
    rescaledData2.select("words", "features").show(false)

    sc.stop()
  }
}
Finally
That concludes this walkthrough of article feature extraction with TF-IDF in Spark using Scala, collected and organized by 暴躁唇彩. Hopefully it helps you solve the development problems you encountered with TF-IDF feature extraction in Spark.