I'm 贪玩鸡, a blogger at 靠谱客. This article, which I collected during recent development, covers text similarity analysis in Python with gensim and jieba word segmentation. I found it quite useful and am sharing it here in the hope that it makes a handy reference.

Overview


# -*- coding: utf-8 -*-


import pymongo
import codecs,sys
from pymongo import MongoClient
import jieba
from gensim import corpora, models, similarities
import nltk
import jieba.analyse
from nltk.tokenize import word_tokenize
from pprint import pprint # pretty-printer

# Python 2 only: force UTF-8 as the default string encoding
reload(sys)
sys.setdefaultencoding('utf-8')

kickpath="" #"/root/python/"

dics=[]
dits={}
labels={}
count=1
mydoclist =[]
courses=[]
questions=[]
uuids=[]


# Tokenize the documents with jieba Chinese word segmentation
def jieba_preprocess_cn(courses, low_freq_filter = True):
    #jieba.analyse.set_stop_words("../extra_dict/stop_words.txt")
    #jieba.analyse.set_idf_path("../extra_dict/idf.txt.big");
    texts_tokenized = []
    for document in courses:
        # full-mode segmentation is computed but unused; only the TF-IDF keywords are kept
        words = jieba.cut(document, cut_all=True)
        tags = jieba.analyse.extract_tags(document, 500)
        texts_tokenized.append(tags)

    texts_filtered_stopwords = texts_tokenized
    pprint(texts_filtered_stopwords)

    # remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if not word in english_punctuations] for document in texts_filtered_stopwords]

    # drop words that occur too rarely
    if low_freq_filter:
        # remove words that appear only once
        from collections import defaultdict
        frequency = defaultdict(int)
        for text in texts_filtered:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1] for text in texts_filtered]
    else:
        texts = texts_filtered
        pprint(texts)
    return texts
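To make the tokenization step above concrete, here is a minimal standalone sketch of what jieba full-mode segmentation and jieba.analyse.extract_tags return for a short sample sentence. The sentence and the topK value of 5 are made up for illustration; the function above uses topK=500.

# -*- coding: utf-8 -*-
# standalone illustration of the tokenization step; the sample sentence is invented
import jieba
import jieba.analyse

sample = "糖尿病患者需要控制饮食并定期监测血糖"
# full-mode cut: every possible word, good recall but noisy
print("/".join(jieba.cut(sample, cut_all=True)))
# TF-IDF keyword extraction: what jieba_preprocess_cn actually feeds into gensim
print("/".join(jieba.analyse.extract_tags(sample, 5)))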

def train_by_lsi(lib_texts):
    # uncomment the two lines below to see progress logs
    #import logging
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts] # doc2bow(): convert each token list into a bag of words, i.e. (word_id, word_frequency) tuples
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # somewhat arbitrary choice: train an LSI model (the 10-topic setting is left commented out)
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary) #, num_topics=10)
    index = similarities.MatrixSimilarity(lsi[corpus]) # index is a gensim.similarities.docsim.MatrixSimilarity instance

    dictionary.save(kickpath+"kick.dict")
    lsi.save(kickpath+"kick.lsi")
    index.save(kickpath+"kick.index")
    return (index, dictionary, lsi)
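train_by_lsi builds and saves the index, but the similarity query itself is never shown. The sketch below, written against the objects returned above, is how a new piece of text would typically be scored: tokenize it the same way as the corpus, map it through the dictionary and LSI model, then look it up in the MatrixSimilarity index. The helper name and the topn parameter are my own, not from the original script.

def query_similar(query_text, index, dictionary, lsi, topn=5):
    # tokenize the query exactly like the corpus was tokenized
    query_tokens = jieba.analyse.extract_tags(query_text, 500)
    # bag-of-words -> LSI vector
    query_lsi = lsi[dictionary.doc2bow(query_tokens)]
    # cosine similarity of the query against every indexed document
    sims = index[query_lsi]
    # (document position, similarity score) pairs, most similar first
    return sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:topn]

The positions returned here follow the order of the courses list, so they can be mapped back to the uuids collected in the main block below.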


if __name__ == '__main__':
    conn = MongoClient("xxx", 27017)
    db = conn.health
    db.authenticate("xx", "xxx")
    content = db.kickchufang.find({'doctorId': 'huanghuang'})
    index = 0
    for i in content:
        line = str(i['desc']) #.decode("utf-8") #.encode("GB18030"))
        #print "line:", line
        uuid = i['uuid']
        uuids.append(uuid)
        #print uuid, line
        courses.append(line)
        print str(index)
        index = index + 1
        #if (index > 10):
        #    break

    man_file = open(kickpath + "kick.uuids", 'w')
    print >> man_file, uuids  # write the uuid list out so results can be mapped back later
    man_file.close()
    courses_name = courses

    # The corpus is built -- this part can involve a lot of data, so it can be preprocessed and stored ahead of time
    lib_texts = jieba_preprocess_cn(courses)
    (index, dictionary, lsi) = train_by_lsi(lib_texts)
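Because the dictionary, LSI model, and index are saved to kick.dict, kick.lsi, and kick.index, a separate process can reload them and run queries without retraining. Below is a rough sketch under that assumption; it also assumes kick.uuids holds the Python list literal written by the main block, and the query string is only a placeholder.

# reload the persisted artifacts and score a new text against the stored corpus (a sketch)
import ast
import jieba.analyse
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load("kick.dict")
lsi = models.LsiModel.load("kick.lsi")
index = similarities.MatrixSimilarity.load("kick.index")
uuids = ast.literal_eval(open("kick.uuids").read())  # list literal written by the main block

query = "..."  # the new text to compare
sims = index[lsi[dictionary.doc2bow(jieba.analyse.extract_tags(query, 500))]]
for pos, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:5]:
    print("%s  %.4f" % (uuids[pos], score))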

Reposted from: https://my.oschina.net/u/778683/blog/828670

Finally

That's all of 贪玩鸡's write-up on text similarity analysis in Python with gensim and jieba word segmentation. I hope it helps you with the related development problems you run into.
