概述
# -*- coding: utf-8 -*-
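# Keyword de-duplication pipeline: (1) strip special characters, (2) replace synonyms with one canonical word, (3) drop stop words, (4) merge keywords that became identical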
import re
from typing import Iterable
import time
import jieba
# Initialise jieba explicitly at import time.
# NOTE(review): jieba otherwise appears to load its dictionary lazily on first
# use; confirm against the jieba documentation.
jieba.initialize()
# Register the stop-word file as a jieba user dictionary, presumably so that
# each stop word is segmented as a token of its own.
# TODO confirm that stopword.txt follows jieba's user-dictionary line format.
jieba.load_userdict('./stopword.txt')
# Synonym table ("tongyici" = synonyms): each key is the canonical word and
# each value lists the variants that replace_word() rewrites to that key.
tongyici = {
'梦见': ['梦到', '做梦'],
'怎么': ['怎么样', '如何'],
'男孩': ['男孩子', '男宝宝', '男生'],
'女孩': ['女孩子', '女宝宝', '女生', '闺女'],
}
# Punctuation, symbols and spaces that are stripped from the text.  The hyphen
# is escaped so it is a literal "-" and not a "+"-to-"|" range, which would
# also delete every ASCII digit and letter.  Line breaks are not in the class,
# so multi-line input keeps its line structure.
_SPECIAL_CHARS = re.compile(r"[! ?:;$#^&*()@+\-|=_—…%¥!《》.,<>?。,:;’“【】、]+")
# One ASCII letter (either case) that has no letter directly before or after it.
_LONE_LETTER = re.compile(r"(?<![a-z])[a-z](?![a-z])", flags=re.I)


def remove_char(word_text: str) -> str:
    """Strip special characters and isolated single letters from *word_text*.

    Processing happens in two passes:

    1. every character listed in ``_SPECIAL_CHARS`` (ASCII and full-width
       punctuation, symbols, the space character) is removed;
    2. any ASCII letter that stands alone, i.e. is not adjacent to another
       letter in the result of pass 1, is removed.  Letters that are part of
       a longer run (``abc``) and all digits are kept.

    Args:
        word_text: Raw text, possibly spanning several lines.

    Returns:
        The cleaned text; line breaks are left untouched.
    """
    without_symbols = _SPECIAL_CHARS.sub("", word_text)
    return _LONE_LETTER.sub("", without_symbols)
# Collapse synonyms onto their canonical word, then split the text into keywords
def replace_word(tyc, word):
    """Replace every synonym in *word* with its canonical form and split by line.

    Args:
        tyc: Mapping of canonical word -> iterable of synonym strings.
        word: Text containing one keyword per line.

    Returns:
        A list with one entry per non-empty line of the rewritten text.
    """
    new_word = word
    for canonical, synonyms in tyc.items():
        # Try longer synonyms first so that a synonym which is a prefix of
        # another one cannot shadow it in the alternation.
        candidates = sorted((s for s in synonyms if s), key=len, reverse=True)
        if not candidates:
            # An empty pattern would match between every character and insert
            # the canonical word everywhere.
            continue
        # Synonyms are plain text, so escape any regex metacharacters.
        pattern = '|'.join(re.escape(s) for s in candidates)
        # A callable replacement inserts the canonical word literally, without
        # backslash/group-reference processing.
        new_word = re.sub(pattern, lambda _match: canonical, new_word)
    # splitlines() copes with "\n" and "\r\n"; blank lines carry no keyword.
    return [line for line in new_word.splitlines() if line]
# Remove stop words.  An earlier version ran re.sub() once per stop word for
# every keyword; that nested loop was very slow on large vocabularies and was
# replaced by the token-based approach below.
def repalce_stopword(stop_words, text):
    """Map each keyword in *text* to the same keyword with stop words removed.

    Each keyword is segmented with ``jieba.lcut`` and only whole tokens that
    are stop words are dropped.  Removing stop words as raw substrings could
    cut pieces out of longer words and leave an unreadable phrase.

    Args:
        stop_words: Iterable of stop-word strings (a set, list, ... all work).
        text: Iterable of keyword strings.

    Returns:
        A dict of original keyword -> keyword rebuilt from its non-stop-word
        tokens.  A keyword that occurs more than once yields a single entry.
    """
    # Build the lookup set once; membership tests inside the loop are then O(1)
    # regardless of the container type the caller passed in.
    stop_set = frozenset(stop_words)
    return {
        word: ''.join(token for token in jieba.lcut(word) if token not in stop_set)
        for word in text
    }
def quchong(word):
    """Invert *word*, grouping its keys by their value ("quchong" = de-duplicate).

    Args:
        word: Mapping of key -> hashable value.

    Returns:
        A dict of value -> list of the keys that mapped to it, in the order
        those keys were encountered.
    """
    grouped = {}
    for original, cleaned in word.items():
        # First occurrence of a value starts a new list; later ones extend it.
        grouped.setdefault(cleaned, []).append(original)
    return grouped
def keep_longest(groups):
    """Pick one representative keyword per group: the longest one.

    Args:
        groups: Mapping of cleaned keyword -> list of keywords that reduced
            to it (the shape returned by ``quchong``).

    Returns:
        A dict of cleaned keyword -> the longest keyword in its list.  On a
        tie the earliest keyword wins; groups with an empty list are skipped.
    """
    # max() returns the first maximal element, matching a strict ">" scan.
    return {
        cleaned: max(candidates, key=len)
        for cleaned, candidates in groups.items()
        if candidates
    }


def main():
    """Run the pipeline: keywords.txt + stopword.txt -> quchong.txt.

    Reads the keywords, strips special characters, replaces synonyms, removes
    stop words, merges keywords that became identical and writes one
    ``[cleaned]<TAB>[representative]`` line per group.  The elapsed time is
    printed at the end.
    """
    start_time = time.time()

    with open('keywords.txt', encoding='utf-8') as keywords_file:
        words = remove_char(keywords_file.read())
    next_text = replace_word(tongyici, words)

    with open('stopword.txt', encoding='utf-8') as stopword_file:
        stop_word = {line.strip() for line in stopword_file}
    # Blank lines in the stop-word file would otherwise add an empty entry.
    stop_word.discard('')

    next_stop = repalce_stopword(stop_word, next_text)
    qc_word = quchong(next_stop)

    # Keep the keyword with the most characters in each group.
    # NOTE(review): the original comment also mentions preferring keywords with
    # high search volume and low competition; no such data is available here.
    result = keep_longest(qc_word)

    with open('quchong.txt', "w", encoding='utf-8') as out_file:
        out_file.writelines(f"[{key}]\t[{val}]\n" for key, val in result.items())

    print(f'处理耗时:{time.time() - start_time:.3f}s')


if __name__ == '__main__':
    main()
最后
以上就是粗心小蝴蝶为你收集整理的“同义词替换、停词去除”的全部内容,希望本文能够帮你解决在同义词替换、停词去除中遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复