概述
from basicInfo import BasicInfo
import jieba
from pypinyin import pinyin, lazy_pinyin
from dataServer import DataServer
class ErrorRecovery(object):
    """Repair a mis-recognised name inside a Chinese question via pinyin matching.

    The sentence is segmented with jieba, adjacent segments are joined into
    bigrams, each bigram is converted to pinyin, and the result is compared
    with the pinyin lists stored in ``dict_gupiao_pin`` of the data server
    given to the constructor (presumably a stock code/name -> pinyin table;
    verify against ``DataServer``).
    """

    # Tokens containing any of these substrings are dropped from the result.
    # NOTE(review): the repeated entries (',', '?', ';', '!') look like
    # full-width variants that were normalised away somewhere; confirm
    # against the author's original list.
    _SYMBOLS = (',', ',', '。', '、', '.', '#.', '?', '?', ';', ':', ';', '=',
                '+', '-', '@', '#', '(', ')', '!', '!', '*', '%')

    def __init__(self, dataSer):
        """Create the recovery helper.

        Args:
            dataSer: Data server object exposing ``dict_gupiao_pin``, a
                mapping whose values are lists of pinyin syllables.
        """
        self.basic = BasicInfo()
        # Use the injected object; the original read a module-level global
        # named ``dataServer`` that only exists when run as a script.
        self.data = dataSer

    def recoveryMain(self, sentence):
        """Find the first dictionary entry matching a bigram and rewrite the sentence.

        Original author's note: the input sentence must include punctuation.

        Args:
            sentence: The raw question text.

        Returns:
            The token list with the matched bigram replaced by the dictionary
            key (see ``restoreQue``), or ``None`` when nothing matches.
        """
        word_list, word_gram_list = self.gram2Main(sentence)
        user_pin_list = [lazy_pinyin(gram, errors='ignore')
                         for gram in word_gram_list]
        for k, v in self.data.dict_gupiao_pin.items():
            for user_pin in user_pin_list:
                if v == user_pin:
                    # Exact pinyin match.
                    print("识别代码===$$$$$$=="+str(k))
                elif self.findok(v, user_pin):
                    # Fuzzy match: enough syllables in common.
                    print("识别代码===%%%%%%====="+str(k))
                else:
                    continue
                return self.restoreQue(user_pin, user_pin_list,
                                       word_gram_list, word_list, k)
        return None

    def restoreQue(self, user_pin, user_pin_list, word_gram_list, word_list, k):
        """Rebuild the question after the erroneous bigram has been identified.

        Args:
            user_pin: Pinyin list of the bigram that matched.
            user_pin_list: Pinyin lists of all bigrams, aligned with
                ``word_gram_list``.
            word_gram_list: Bigrams, where item ``i`` is
                ``word_list[i] + word_list[i + 1]`` (kept for interface
                compatibility; not needed to locate the tokens).
            word_list: Segmented tokens; modified in place.
            k: Dictionary key that replaces the matched bigram.

        Returns:
            ``word_list`` (the same object) with the two tokens of the bigram
            replaced by ``str(k)`` and every symbol-bearing token removed.

        Raises:
            ValueError: If ``user_pin`` is not in ``user_pin_list``.
        """
        index = user_pin_list.index(user_pin)
        # Bigram ``index`` is built from tokens ``index`` and ``index + 1``,
        # so address them directly. Re-segmenting the bigram and searching by
        # value hit the wrong token when a word repeated, and raised when the
        # re-segmentation differed from the original one.
        word_list[index] = str(k)
        del word_list[index + 1]
        # Filter in one pass; removing while iterating skipped the token
        # following every removed one.
        # NOTE(review): the original literal was '\ue', a truncated escape
        # that does not compile on Python 3. The literal text backslash-u-e
        # is kept here; if a private-use code point was intended, replace it.
        word_list[:] = [
            word for word in word_list
            if not (any(symbol in word for symbol in self._SYMBOLS)
                    or '\\ue' in word)
        ]
        return word_list

    def gram2Main(self, sentence):
        """Strip whitespace, segment the sentence and build its bigrams.

        Args:
            sentence: The raw question text.

        Returns:
            ``(word_list, word_gram_list)`` as produced by ``test2gram``.
        """
        print("原句===="+sentence)
        sentence = ''.join(sentence.split())
        word_list = list(jieba.cut(sentence, cut_all=False, HMM=True))
        return self.test2gram(word_list)

    def test2gram(self, list2=None):
        """Join every pair of adjacent tokens into a bigram.

        Args:
            list2: Token list, e.g. ``['请问', '这', '只', '股票']``. ``None``
                is treated as an empty list.

        Returns:
            ``(list2, bigrams)``: the token list itself (same object when one
            was given) and the list of ``list2[i] + list2[i + 1]`` strings.
        """
        if list2 is None:
            list2 = []
        word_gram_list = [left + right
                          for left, right in zip(list2, list2[1:])]
        return list2, word_gram_list

    def findok(self, v, user_pin):
        """Decide whether ``user_pin`` is close enough to dictionary entry ``v``.

        An entry of four or more syllables needs at least three shared
        syllables; an entry of exactly three needs at least two. Shorter
        entries never match fuzzily.

        Args:
            v: Pinyin list from the dictionary.
            user_pin: Pinyin list of a bigram from the user's sentence.

        Returns:
            ``True`` when the overlap reaches the threshold, else ``False``.
        """
        if len(v) >= 4 and self.unionlen(user_pin, v) >= 3:
            return True
        if len(v) == 3 and self.unionlen(user_pin, v) >= 2:
            return True
        return False

    def unionlen(self, alist, blist):
        """Count leading items of ``alist`` that occur anywhere in ``blist``.

        Only the first ``min(len(alist), len(blist))`` items of ``alist`` are
        examined, and membership ignores position.

        Args:
            alist: Pinyin list being checked.
            blist: Reference pinyin list (the dictionary side).

        Returns:
            Number of examined items of ``alist`` found in ``blist``.
        """
        return sum(1 for item in alist[:len(blist)] if item in blist)
if __name__ == '__main__':
    # Demo run: correct two sample questions and print the resulting tokens.
    # Other inputs the author tried:
    #   大脸有一,这只股票怎么样? / 浙江现xian,怎么,样?, / 航茶集怎么样呢 / 爱第二怎么样
    dataServer = DataServer()
    erRecovery = ErrorRecovery(dataServer)
    for sample in ("请问老师,爱第二怎么样?", "大脸有一,这只股票,怎么样?"):
        words = erRecovery.recoveryMain(sample)
        print(words)
最后
以上就是可靠御姐为你收集整理的智能纠错(N-gram、编辑距离、转化拼音)的全部内容,希望文章能够帮你解决在智能纠错(N-gram、编辑距离、转化拼音)中遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复