同义词替换，停词去除

168 阅读 0 评论 111 点赞

我是靠谱客的博主粗心小蝴蝶，这篇文章主要介绍同义词替换，停词去除，现在分享给大家，希望可以做个参考。

# -*- coding: utf-8 -*-
# Dedup pipeline: (1) merge synonyms and drop stop words, (2) delete special symbols, (3) delete duplicates
import re
from typing import Iterable
import time
import jieba

# Initialise jieba explicitly at import time.
# NOTE(review): per the jieba README this loads the dictionary eagerly rather
# than on the first cut — confirm against the installed version.
jieba.initialize()
# Register the stop-word file as a jieba user dictionary.
# NOTE(review): presumably so that stop words are segmented as whole tokens
# and can be matched exactly in repalce_stopword() — confirm.
jieba.load_userdict('./stopword.txt')

# Synonym table: each key is the canonical word and each value is the list of
# synonyms that replace_word() rewrites to that key.
tongyici = {
    '梦见': ['梦到', '做梦'],
    '怎么': ['怎么样', '如何'],
    '男孩': ['男孩子', '男宝宝', '男生'],
    '女孩': ['女孩子', '女宝宝', '女生', '闺女'],
}


# Punctuation and symbols stripped from keyword text.  The hyphen is escaped so
# that it is a literal "-" instead of opening a "+".."|" range, which would
# also swallow every ASCII digit and letter.  "\n" is not in the class, so the
# line structure of the input is preserved.
_SPECIAL_CHARS_RE = re.compile(
    r"[! ?:;$#^&*()@+\-|=_—…%￥！《》.,<>？。，：；’“【】、]+"
)
# An ASCII letter that has no ASCII letter directly before or after it.
_LONE_LETTER_RE = re.compile(r"(?<![A-Za-z])[A-Za-z](?![A-Za-z])")


def remove_char(word_text: str) -> str:
    """Strip special symbols and isolated single letters from *word_text*.

    Two passes are applied in order:

    1. every character listed in ``_SPECIAL_CHARS_RE`` (ASCII and full-width
       punctuation, plus the space character) is deleted;
    2. every ASCII letter that stands alone, i.e. is not adjacent to another
       ASCII letter after pass 1, is deleted.

    Digits, multi-letter ASCII words, CJK text and newlines are kept.

    Args:
        word_text: Raw text, typically one keyword per line.

    Returns:
        The cleaned text.
    """
    without_symbols = _SPECIAL_CHARS_RE.sub("", word_text)
    return _LONE_LETTER_RE.sub("", without_symbols)


def replace_word(tyc: dict, word: str) -> list:
    """Replace synonyms with their canonical word and split into lines.

    Args:
        tyc: Mapping of canonical word -> iterable of synonyms to rewrite to
            that canonical word.
        word: Text to normalise, one keyword per line.

    Returns:
        The non-empty lines of the normalised text, in their original order.
    """
    new_word = word
    for canonical, synonyms in tyc.items():
        # Longest synonym first, so a short synonym that is a prefix of a
        # longer one cannot pre-empt the longer match.
        alternatives = sorted((s for s in synonyms if s), key=len, reverse=True)
        if not alternatives:
            # An empty pattern matches between every character and would
            # inject the canonical word everywhere.
            continue
        # Synonyms are literal text, not regular expressions.
        pattern = '|'.join(re.escape(s) for s in alternatives)
        # A callable replacement keeps the canonical word literal as well
        # (no backslash/group-reference processing).
        new_word = re.sub(pattern, lambda _match, repl=canonical: repl, new_word)
    # Split on real newlines; blank lines carry no keyword and are dropped.
    return [line for line in new_word.split('\n') if line]



# Remove stop words. This nested-loop version (disabled below) builds the dict very inefficiently when the word list is large.
# def repalce_stopword(stop_words: Iterable, text: Iterable):
#     result = {}
#     for word in text:
#         new_word = word
#         for item in stop_words:
#             new_word = re.sub(item, '', new_word)
#         result[word] = new_word
    # 去除重复
    # return result

def repalce_stopword(stop_words: Iterable, text: Iterable) -> dict:
    """Map each keyword in *text* to the same keyword with stop words removed.

    Each keyword is segmented with ``jieba.lcut`` and only whole tokens that
    appear in *stop_words* are dropped.  Working on tokens rather than on raw
    substrings avoids deleting a stop word that merely occurs inside a longer
    word, which would leave the phrase unreadable.

    The misspelled function name is kept as-is because callers use it.

    Args:
        stop_words: Stop words; any iterable of strings (a set is used as-is).
        text: Keywords to process.

    Returns:
        Dict mapping each original keyword to its stop-word-free form.
    """
    # Normalise once so membership tests are O(1) for any input iterable.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = frozenset(stop_words)

    result2 = {}
    for word in text:
        tokens = jieba.lcut(word)
        result2[word] = ''.join(tok for tok in tokens if tok not in stop_set)
    return result2

def quchong(word):
    """Group the keys of *word* by their value.

    Returns a dict whose keys are the distinct values of *word* (in first-seen
    order) and whose values are lists of the original keys that mapped to
    them, in iteration order.
    """
    grouped = {}
    for original, reduced in word.items():
        grouped.setdefault(reduced, []).append(original)
    return grouped


def pick_longest(groups):
    """Return ``{key: longest word}`` for a ``{key: [words]}`` mapping.

    When several words share the maximum length the first one wins.  Keys
    whose word list is empty are omitted.
    """
    return {key: max(words, key=len) for key, words in groups.items() if words}


def main():
    """Clean, normalise and de-duplicate keywords.txt into quchong.txt.

    Steps: strip symbols, replace synonyms, remove stop words, group keywords
    that reduce to the same text, then write one line per group in the form
    ``[reduced]<TAB>[representative]``.
    """
    start_time = time.time()

    with open('keywords.txt', encoding='utf-8') as keyword_file:
        cleaned = remove_char(keyword_file.read())
    keywords = replace_word(tongyici, cleaned)

    with open('stopword.txt', encoding='utf-8') as stop_file:
        stripped = (line.strip() for line in stop_file)
        stop_words = {entry for entry in stripped if entry}

    groups = quchong(repalce_stopword(stop_words, keywords))

    # Original author's note: keep the longest word, the word with the highest
    # search volume, the word with the least competition.  Only the "longest
    # word" rule is implemented here.
    with open('quchong.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            f"[{key}]\t[{word}]\n" for key, word in pick_longest(groups).items()
        )

    print(f'处理耗时:{time.time() - start_time:.3f}s')


if __name__ == '__main__':
    main()