58 lines
1.5 KiB
Python
58 lines
1.5 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
@author:XuMing(xuming624@qq.com)
|
||
@description:
|
||
"""
|
||
|
||
import sys
|
||
|
||
sys.path.append("../")
|
||
import pycorrector
|
||
|
||
c = pycorrector.corrector
|
||
c.initialize_corrector()
|
||
c.initialize_detector()
|
||
same_pinyin = c.same_pinyin
|
||
same_shape = c.same_stroke
|
||
model = c.lm
|
||
|
||
|
||
def get_confusion_set(char):
|
||
confusion_set = same_pinyin.get(char, set()).union(same_shape.get(char, set()))
|
||
confusion_set.add(char)
|
||
return confusion_set
|
||
|
||
|
||
def get_score(c1, c2, c3, origin, lambd=1.5):
|
||
ret = model.score(" ".join([c1, c2, c3]))
|
||
if c3 == origin:
|
||
ret *= lambd
|
||
return ret
|
||
|
||
|
||
def trigram_correct(sentence):
|
||
print('sentence:', sentence)
|
||
min_p = 10000
|
||
sentence = list(sentence)
|
||
result = sentence
|
||
for i in range(1, len(sentence) - 1):
|
||
v1 = get_confusion_set(sentence[i - 1])
|
||
v2 = get_confusion_set(sentence[i])
|
||
v3 = get_confusion_set(sentence[i + 1])
|
||
for j in v1:
|
||
for k in v2:
|
||
for l in v3:
|
||
candidate = sentence[0:i - 1] + list(j) + list(k) + list(l) + sentence[i + 2:]
|
||
score = model.perplexity(" ".join(candidate))
|
||
print('candidate, score:', candidate, score)
|
||
if score < min_p:
|
||
result = candidate
|
||
min_p = score
|
||
result = "".join(result)
|
||
print('result:', result)
|
||
return result
|
||
|
||
|
||
if __name__ == '__main__':
|
||
trigram_correct("少先队员因该为老人让坐")
|