Spark的MLlib使用基本数据类型：向量、标签点、矩阵、稀疏格式文件libSVM

83 阅读 0 评论 55 点赞

我是靠谱客的博主落后发夹，这篇文章主要介绍Spark的MLlib使用基本数据类型：向量、标签点、矩阵、稀疏格式文件libSVM，现在分享给大家，希望可以做个参考。

Spark MLlib 库实现了很多机器学习算法，下面对其几类基本数据类型（本地向量、标签点、本地矩阵、分布式矩阵）逐一说明，并给出代码演示。

复制代码

import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry, RowMatrix}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.junit.Test
/**
 * Demonstrations of the basic data types of Spark MLlib (RDD-based API):
 * local vectors, labeled points, libSVM-format sparse data, local matrices and
 * the distributed matrices (RowMatrix, IndexedRowMatrix, CoordinateMatrix, BlockMatrix).
 *
 * Methods annotated with `@Test` are run by JUnit; the remaining methods are plain
 * examples that only construct the values they describe.
 */
class DataType {
  // Local master with 4 threads. "spark.testing.memory" is set to 471859200 bytes (450 MB);
  // presumably to satisfy Spark's minimum-memory check on small JVM heaps -- confirm.
  val conf: SparkConf = new SparkConf()
    .setMaster("local[4]")
    .setAppName("datatype")
    .set("spark.testing.memory", "471859200")

  // JUnit creates one instance of this class per @Test method. getOrCreate reuses the
  // context that is already running in the JVM instead of failing when a second
  // SparkContext would be constructed.
  val sc: SparkContext = SparkContext.getOrCreate(conf)

  /**
   * Local vector.
   *
   * A local vector has integer-typed, 0-based indices and double-typed values and is
   * stored on a single machine. MLlib supports two kinds of local vectors: dense and sparse.
   */
  def localVectorDemo(): Unit = {
    // Create the dense vector (1.0, 0.0, 3.0).
    val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
    // Create a sparse vector of size 3 whose positions (0, 2) hold the values (1.0, 3.0),
    // given as parallel index and value arrays.
    val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    // Create the same sparse vector from (index, value) pairs:
    // value 1.0 at position 0 and value 3.0 at position 2.
    val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
  }

  /**
   * Labeled point.
   *
   * A labeled point consists of a local vector (dense or sparse) and a class label.
   */
  def labelPointDemo(): Unit = {
    // Labeled point with the positive label (1.0) and a dense feature vector.
    val pos: LabeledPoint = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
    // Labeled point with the negative label (0.0) and a sparse feature vector.
    val neg: LabeledPoint = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
  }

  /**
   * Sparse data: reads training examples stored in libSVM format.
   *
   * libSVM is the default text format of LIBSVM and LIBLINEAR; each line represents one
   * labeled sparse feature vector in the form:
   * {{{
   * label index1:value1 index2:value2 ...
   * }}}
   * Loads the sample file and prints its first 10 examples.
   */
  @Test
  def sparseDateDemo(): Unit = {
    // The path is relative to the working directory of the test run.
    val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    examples.take(10).foreach(println)
  }

  /**
   * Local matrix.
   *
   * The examples build `Matrix` values through the `Matrices` factory. Entry values are
   * stored in a double array in column-major order.
   */
  def localMatrixDemo(): Unit = {
    // Dense 3x2 matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)); the array lists column 0
    // (1.0, 3.0, 5.0) followed by column 1 (2.0, 4.0, 6.0).
    val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    // Sparse 3x2 matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) given as column pointers,
    // row indices and values: column 0 holds 9.0 at row 0; column 1 holds 6.0 at row 2
    // and 8.0 at row 1.
    val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8))
  }

  // ---- Distributed matrices ----

  /**
   * RowMatrix: a distributed matrix built from an RDD of local vectors.
   *
   * Prints the number of rows and columns and then runs a QR decomposition.
   */
  @Test
  def rowMatrixDemo(): Unit = {
    // An RDD of local vectors: RDD[Vector].
    val rows: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0, 3.0),
      Vectors.dense(1.0, 4.0, 3.0),
      Vectors.dense(1.0, 5.0, 3.0),
      Vectors.dense(1.0, 6.0, 3.0)
    ))
    // Create the row matrix from the RDD[Vector].
    val mat: RowMatrix = new RowMatrix(rows)
    // Row and column counts of the row matrix.
    val m = mat.numRows()
    val n = mat.numCols()
    println(f"行数：${m}, 列数：${n}")
    // QR decomposition, also computing Q; the result is not used further in this demo.
    mat.tallSkinnyQR(computeQ = true)
  }

  /**
   * IndexedRowMatrix: similar to a RowMatrix, but its row indices are meaningful; it is
   * essentially a collection of rows that carry index information.
   *
   * It is created from an `RDD[IndexedRow]`, where `IndexedRow` wraps a (Long, Vector) pair.
   * Dropping the row indices of an IndexedRowMatrix yields a RowMatrix.
   */
  def indexdRowMatrixDemo: Unit = {
    // An RDD of indexed rows: RDD[IndexedRow].
    val rows: RDD[IndexedRow] = sc.parallelize(Seq(
      IndexedRow(1L, Vectors.dense(1.0, 0.0, 3.0)),
      IndexedRow(2L, Vectors.dense(1.0, 1.0, 3.0)),
      IndexedRow(3L, Vectors.dense(1.0, 2.0, 3.0))
    ))
    // Create an IndexedRowMatrix from the RDD[IndexedRow].
    val mat = new IndexedRowMatrix(rows)
    // Get its size.
    val m = mat.numRows()
    val n = mat.numCols()
    // Drop the row indices to obtain a plain RowMatrix.
    val rowMat: RowMatrix = mat.toRowMatrix()
  }

  /**
   * CoordinateMatrix: a distributed matrix whose entries form an RDD; each entry is an
   * (i: Long, j: Long, value: Double) triple where i is the row index, j is the column
   * index and value is the entry value.
   *
   * It is typically used for very sparse matrices and is built from an `RDD[MatrixEntry]`;
   * a `MatrixEntry` carries the row, the column and the value of one element.
   */
  def corrdinRDD(): Unit = {
    // An RDD[MatrixEntry].
    val entries: RDD[MatrixEntry] = sc.parallelize(Seq(
      MatrixEntry(0, 0, 1.1),
      MatrixEntry(0, 1, 1.2),
      MatrixEntry(1, 0, 1.3),
      MatrixEntry(1, 1, 1.4)
    ))
    // Create the coordinate matrix.
    val mat = new CoordinateMatrix(entries)
    // Get its size.
    val m = mat.numRows()
    val n = mat.numCols()
    // Convert the coordinate matrix to an IndexedRowMatrix.
    val indexedRowMatrixConv = mat.toIndexedRowMatrix()
  }

  /**
   * BlockMatrix: converts a CoordinateMatrix to a BlockMatrix, validates it and
   * computes A^T A.
   */
  def blockMatrixDemo(): Unit = {
    // An RDD[MatrixEntry].
    val entries: RDD[MatrixEntry] = sc.parallelize(Seq(
      MatrixEntry(0, 0, 1.1),
      MatrixEntry(0, 1, 1.2),
      MatrixEntry(1, 0, 1.3),
      MatrixEntry(1, 1, 1.4)
    ))
    // Create the coordinate matrix.
    val coordMat = new CoordinateMatrix(entries)
    // Transform the CoordinateMatrix to a BlockMatrix.
    val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
    // Validate whether the BlockMatrix is set up properly. Throws an Exception when it is
    // not valid. Nothing happens if it is valid.
    matA.validate()
    // Calculate A^T A.
    val ata = matA.transpose.multiply(matA)
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry, RowMatrix}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.junit.Test
/**
 * Demonstrations of the basic data types of Spark MLlib (RDD-based API):
 * local vectors, labeled points, libSVM-format sparse data, local matrices and
 * the distributed matrices (RowMatrix, IndexedRowMatrix, CoordinateMatrix, BlockMatrix).
 *
 * Methods annotated with `@Test` are run by JUnit; the remaining methods are plain
 * examples that only construct the values they describe.
 */
class DataType {
  // Local master with 4 threads. "spark.testing.memory" is set to 471859200 bytes (450 MB);
  // presumably to satisfy Spark's minimum-memory check on small JVM heaps -- confirm.
  val conf: SparkConf = new SparkConf()
    .setMaster("local[4]")
    .setAppName("datatype")
    .set("spark.testing.memory", "471859200")

  // JUnit creates one instance of this class per @Test method. getOrCreate reuses the
  // context that is already running in the JVM instead of failing when a second
  // SparkContext would be constructed.
  val sc: SparkContext = SparkContext.getOrCreate(conf)

  /**
   * Local vector.
   *
   * A local vector has integer-typed, 0-based indices and double-typed values and is
   * stored on a single machine. MLlib supports two kinds of local vectors: dense and sparse.
   */
  def localVectorDemo(): Unit = {
    // Create the dense vector (1.0, 0.0, 3.0).
    val dv: Vector = Vectors.dense(1.0, 0.0, 3.0)
    // Create a sparse vector of size 3 whose positions (0, 2) hold the values (1.0, 3.0),
    // given as parallel index and value arrays.
    val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    // Create the same sparse vector from (index, value) pairs:
    // value 1.0 at position 0 and value 3.0 at position 2.
    val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))
  }

  /**
   * Labeled point.
   *
   * A labeled point consists of a local vector (dense or sparse) and a class label.
   */
  def labelPointDemo(): Unit = {
    // Labeled point with the positive label (1.0) and a dense feature vector.
    val pos: LabeledPoint = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0))
    // Labeled point with the negative label (0.0) and a sparse feature vector.
    val neg: LabeledPoint = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)))
  }

  /**
   * Sparse data: reads training examples stored in libSVM format.
   *
   * libSVM is the default text format of LIBSVM and LIBLINEAR; each line represents one
   * labeled sparse feature vector in the form:
   * {{{
   * label index1:value1 index2:value2 ...
   * }}}
   * Loads the sample file and prints its first 10 examples.
   */
  @Test
  def sparseDateDemo(): Unit = {
    // The path is relative to the working directory of the test run.
    val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    examples.take(10).foreach(println)
  }

  /**
   * Local matrix.
   *
   * The examples build `Matrix` values through the `Matrices` factory. Entry values are
   * stored in a double array in column-major order.
   */
  def localMatrixDemo(): Unit = {
    // Dense 3x2 matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)); the array lists column 0
    // (1.0, 3.0, 5.0) followed by column 1 (2.0, 4.0, 6.0).
    val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    // Sparse 3x2 matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)) given as column pointers,
    // row indices and values: column 0 holds 9.0 at row 0; column 1 holds 6.0 at row 2
    // and 8.0 at row 1.
    val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(9, 6, 8))
  }

  // ---- Distributed matrices ----

  /**
   * RowMatrix: a distributed matrix built from an RDD of local vectors.
   *
   * Prints the number of rows and columns and then runs a QR decomposition.
   */
  @Test
  def rowMatrixDemo(): Unit = {
    // An RDD of local vectors: RDD[Vector].
    val rows: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0, 3.0),
      Vectors.dense(1.0, 4.0, 3.0),
      Vectors.dense(1.0, 5.0, 3.0),
      Vectors.dense(1.0, 6.0, 3.0)
    ))
    // Create the row matrix from the RDD[Vector].
    val mat: RowMatrix = new RowMatrix(rows)
    // Row and column counts of the row matrix.
    val m = mat.numRows()
    val n = mat.numCols()
    println(f"行数：${m}, 列数：${n}")
    // QR decomposition, also computing Q; the result is not used further in this demo.
    mat.tallSkinnyQR(computeQ = true)
  }

  /**
   * IndexedRowMatrix: similar to a RowMatrix, but its row indices are meaningful; it is
   * essentially a collection of rows that carry index information.
   *
   * It is created from an `RDD[IndexedRow]`, where `IndexedRow` wraps a (Long, Vector) pair.
   * Dropping the row indices of an IndexedRowMatrix yields a RowMatrix.
   */
  def indexdRowMatrixDemo: Unit = {
    // An RDD of indexed rows: RDD[IndexedRow].
    val rows: RDD[IndexedRow] = sc.parallelize(Seq(
      IndexedRow(1L, Vectors.dense(1.0, 0.0, 3.0)),
      IndexedRow(2L, Vectors.dense(1.0, 1.0, 3.0)),
      IndexedRow(3L, Vectors.dense(1.0, 2.0, 3.0))
    ))
    // Create an IndexedRowMatrix from the RDD[IndexedRow].
    val mat = new IndexedRowMatrix(rows)
    // Get its size.
    val m = mat.numRows()
    val n = mat.numCols()
    // Drop the row indices to obtain a plain RowMatrix.
    val rowMat: RowMatrix = mat.toRowMatrix()
  }

  /**
   * CoordinateMatrix: a distributed matrix whose entries form an RDD; each entry is an
   * (i: Long, j: Long, value: Double) triple where i is the row index, j is the column
   * index and value is the entry value.
   *
   * It is typically used for very sparse matrices and is built from an `RDD[MatrixEntry]`;
   * a `MatrixEntry` carries the row, the column and the value of one element.
   */
  def corrdinRDD(): Unit = {
    // An RDD[MatrixEntry].
    val entries: RDD[MatrixEntry] = sc.parallelize(Seq(
      MatrixEntry(0, 0, 1.1),
      MatrixEntry(0, 1, 1.2),
      MatrixEntry(1, 0, 1.3),
      MatrixEntry(1, 1, 1.4)
    ))
    // Create the coordinate matrix.
    val mat = new CoordinateMatrix(entries)
    // Get its size.
    val m = mat.numRows()
    val n = mat.numCols()
    // Convert the coordinate matrix to an IndexedRowMatrix.
    val indexedRowMatrixConv = mat.toIndexedRowMatrix()
  }

  /**
   * BlockMatrix: converts a CoordinateMatrix to a BlockMatrix, validates it and
   * computes A^T A.
   */
  def blockMatrixDemo(): Unit = {
    // An RDD[MatrixEntry].
    val entries: RDD[MatrixEntry] = sc.parallelize(Seq(
      MatrixEntry(0, 0, 1.1),
      MatrixEntry(0, 1, 1.2),
      MatrixEntry(1, 0, 1.3),
      MatrixEntry(1, 1, 1.4)
    ))
    // Create the coordinate matrix.
    val coordMat = new CoordinateMatrix(entries)
    // Transform the CoordinateMatrix to a BlockMatrix.
    val matA: BlockMatrix = coordMat.toBlockMatrix().cache()
    // Validate whether the BlockMatrix is set up properly. Throws an Exception when it is
    // not valid. Nothing happens if it is valid.
    matA.validate()
    // Calculate A^T A.
    val ata = matA.transpose.multiply(matA)
  }
}