概述
import jieba
import jieba.analyse
import jieba.posseg as pseg
from pyspark import SparkConf, SparkContext,SQLContext
from pyspark.ml.feature import Word2Vec,CountVectorizer
import pandas as pd
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils
conf = SparkConf().setAppName("cluster")
sc = SparkContext(conf=conf)
sqlContext=SQLContext(sc)
#my_df 加载数据
spark_df = sqlContext.createDataFrame(my_df)
#计算tfidf
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures")
cvmodel =cv.fit(spark_df);
cvResult= cvmodel.transform(spark_df);
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(cvResult)
cvResult = idfModel.transform(cvResult)
ddf = MLUtils.convertVectorColumnsFromML(cvResult, 'features')
ddf=ddf.select('features').rdd.map(lambda row : row[0])
mat = RowMatrix(ddf)
#奇异值分解
svd = mat.computeSVD(k=60, computeU=True)
#转成dataframe格式
svd_u = svd.U.rows.map(lambda row : row.tolist())
svd_df = sqlContext.createDataFrame(svd_u)
#kmeans聚类
kmeans = KMeans().setK(60).setSeed(1)
vecAssembler = VectorAssembler(inputCols=svd_df.schema.names, outputCol='features')
svd_df = vecAssembler.transform(svd_df)
#聚类结果
c_result = svd_df.select('features')
model = kmeans.fit(c_result)
results = model.transform(svd_df)
最后
以上就是笨笨芒果为你收集整理的spark进行svd降维和kmeans聚类的全部内容,希望文章能够帮你解决spark进行svd降维和kmeans聚类所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复