Add zhtools: Traditional/Simplified Chinese conversion tool. xuming 20180302

This commit is contained in:
xuming06 2018-03-02 13:45:57 +08:00
parent 4311a37f12
commit 7ee15f5223
8 changed files with 101 additions and 25701 deletions

46
detect.py Normal file
View File

@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
# Author: XuMing <xuming624@qq.com>
# Brief: error word detector
import os
import kenlm
import jieba
import pickle
import math
import wubi
import numpy as np
import pypinyin
from pypinyin import pinyin
from collections import Counter
import config
import re
def get_homophones_by_char(input_char):
    """
    Return every CJK unified ideograph pronounced like ``input_char``.

    Readings are compared in pypinyin's toneless ``NORMAL`` style, so
    characters that differ only in tone are treated as homophones.

    :param input_char: character to match; only the first reading that
        pypinyin returns for it is used.
    :return: list of single-character strings from U+4E00..U+9FA5 whose
        toneless pinyin equals that of ``input_char``; an empty list when
        ``input_char`` is empty.
    """
    if not input_char:
        # An empty input has no reading to compare against; indexing the
        # pinyin() result for it would raise IndexError.
        return []
    # Loop-invariant: look up the target reading once rather than once per
    # candidate code point.
    target = pinyin(input_char, style=pypinyin.NORMAL)[0][0]
    # The CJK Unified Ideographs range U+4E00..U+9FA5 holds the 20902
    # commonly cited Chinese characters.
    return [
        chr(code)
        for code in range(0x4e00, 0x9fa6)
        if pinyin([chr(code)], style=pypinyin.NORMAL)[0][0] == target
    ]
def get_homophones_by_pinyin(input_pinyin):
    """
    Collect the characters whose TONE2 pinyin equals ``input_pinyin``.

    :param input_pinyin: pinyin written in pypinyin's TONE2 style, where the
        tone digit follows the vowel (e.g. 中 is written ``zho1ng``).
    :return: list of matching single-character strings from U+4E00..U+9FA5.
    """
    # The CJK Unified Ideographs range U+4E00..U+9FA5 holds the 20902
    # commonly cited Chinese characters.
    candidates = (chr(code_point) for code_point in range(0x4e00, 0x9fa6))
    return [
        han
        for han in candidates
        if pinyin([han], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
numpy
pypinyin
Keras
scikit-learn
tensorflow

22
spell.py Normal file
View File

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Author: XuMing <xuming624@qq.com>
# Brief:
import os
import kenlm
import jieba
import pickle
import math
import wubi
import numpy as np
import pypinyin
from pypinyin import pinyin
from collections import Counter
import config
import re
def main():
    """Print the hard-coded sample sentence."""
    line = '我们现在使用的数学福号'
    # The format string needs a placeholder: '%' applied to a string without
    # one raises "TypeError: not all arguments converted".
    print('input sentence is: %s' % line)
    # TODO: enable once a corrector is available:
    # corrected_sent, correct_ranges = correct(line)

10
test/detect_test.py Normal file
View File

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# Author: XuMing <xuming624@qq.com>
# Brief:
from detect import *
# NOTE(review): the argument is an empty string -- presumably the sample
# character was lost somewhere; confirm the intended input.
homophones = get_homophones_by_char('')
print(homophones)
# Sample sentence kept for future detector tests:
# 回家的时候他们坐公车,张大林站着沾着,然后就问她。

18
test/util_test.py Normal file
View File

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# Author: XuMing <xuming624@qq.com>
# Brief:
from util import *
from pypinyin import pinyin, lazy_pinyin
# Traditional -> Simplified, then Simplified -> Traditional.
print(traditional2simplified('憂郁的臺灣烏龜'))
print(simplified2traditional('忧郁的台湾乌龟'))

# pypinyin output variants for the same word.
print(pinyin('中心'))  # with tone marks
print(pinyin('中心', heteronym=True))  # heteronyms (multiple readings)
print(lazy_pinyin('中心'))  # without tones

26
util.py
View File

@ -4,7 +4,6 @@
import re
from zhtools.langconv import Converter
from zhtools.xpinyin import Pinyin
# 去除标点符号
@ -30,28 +29,3 @@ def simplified2traditional(sentence):
'''
sentence = Converter('zh-hant').convert(sentence)
return sentence
if __name__ == "__main__":
    # Traditional <-> Simplified conversion demo.
    print(traditional2simplified('憂郁的臺灣烏龜'))
    print(simplified2traditional('忧郁的台湾乌龟'))

    # Pinyin demo: toneless, toned, initials, and pinyin -> hanzi lookups.
    converter = Pinyin()
    sample = '坐骑,你骑哪里了'
    print(converter.get_pinyin(sample))
    print(converter.get_pinyin(sample, tone=True))
    print(converter.get_initials(""))
    for syllable in ('shang4', 'wo'):
        print(''.join(converter.py2hz(syllable)))

File diff suppressed because it is too large Load Diff

View File

@ -1,126 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Translate chinese hanzi to pinyin by python
Created by Eric Lo on 2010-05-20.
Copyright (c) 2010 __lxneng@gmail.com__. http://lxneng.com All rights reserved.
Forked by skydarkchen <skydark2 at gmail>
"""
import os.path
# Python 2 compatibility: rebind chr to unichr where unichr exists.
try:
    unichr
except NameError:
    # No unichr (Python 3): keep the builtin chr.
    pass
else:
    chr = unichr

# Version string of this module.
VERSION = '0.3a'
class Pinyin(object):
    r"""Translate Chinese hanzi to pinyin, inspired by flyerhzm's
    `chinese\_pinyin`_ gem.

    .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin

    usage(python3)
    -----
    ::
        >>> p = Pinyin()
        >>> p.get_pinyin("上海")
        'shanghai'
        >>> p.get_pinyin("上海", tone=True)
        'shang4hai3'
        >>> p.get_initials("上")
        'S'
        >>> print(''.join(p.py2hz('shang4')))
        丄上姠尙尚蠰銄鑜
        >>> print(''.join(p.py2hz('a')))
        吖腌錒锕阿嗄阿阿啊阿
    """

    # Pinyin table located next to this module. Each line is parsed as
    # "<hex code point>\t<space-separated readings>".
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'Mandarin.dat')

    def __init__(self):
        """Load the table at ``data_path`` into forward and reverse maps."""
        self.dict = {}     # hanzi -> list of lower-cased readings
        self.revdict = {}  # reading -> list of hanzi, in file order
        # Use a context manager so the file handle is closed after loading.
        with open(self.data_path) as table:
            for line in table:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing the unpack.
                    continue
                code, readings = line.split('\t')
                readings = readings.lower().split(' ')
                hanzi = chr(int(code, 16))
                self.dict[hanzi] = readings
                for reading in readings:
                    self.revdict.setdefault(reading, []).append(hanzi)

    def py2hz(self, pinyin):
        """Return the list of hanzi that can be read as ``pinyin``.

        The lookup is case-insensitive. If ``pinyin`` ends in a digit it is
        looked up as given; otherwise the results for tone suffixes 1
        through 5 are concatenated in that order. An empty string yields
        ``[]``.
        """
        if pinyin == '':
            return []
        pinyin = pinyin.lower()
        if pinyin[-1].isdigit():
            return self.revdict.get(pinyin, [])
        ret = []
        for tone in range(1, 6):
            ret += self.revdict.get('%s%s' % (pinyin, tone), [])
        return ret

    def get_pinyin(self, chars='', splitter='', tone=False):
        """Convert each character of ``chars`` to pinyin and join the results.

        :param chars: text to convert; characters absent from the table are
            passed through unchanged.
        :param splitter: string placed between the per-character results.
        :param tone: when true, keep the trailing tone digit.
        :return: the joined string; only the first listed reading of each
            character is used.
        """
        result = []
        for char in chars:
            readings = self.dict.get(char)
            if readings:
                reading = readings[0]
                if not tone and reading[-1].isdigit():
                    reading = reading[:-1]
            else:
                reading = char
            result.append(reading)
        return splitter.join(result)

    def get_initials(self, char=''):
        """Return the upper-cased first letter of ``char``'s first reading.

        Returns ``''`` for an empty argument. For input missing from the
        table, the first character of ``char`` itself is upper-cased and
        returned.
        """
        if char == '':
            return ''
        return self.dict.get(char, [char])[0][0].upper()
if __name__ == '__main__':
    import unittest

    class PinyinTestCase(unittest.TestCase):
        """Unit tests for Pinyin, run when the module is executed directly."""

        def setUp(self):
            import sys
            py = sys.version_info
            self.py3k = py >= (3, 0, 0)
            self.py = Pinyin()

        def to_unicode(self, s):
            """Return ``s`` as text: unchanged on Python 3, decoded on 2."""
            if self.py3k:
                return s
            return s.decode('utf-8')

        def test_get_pinyin(self):  # test method names begin 'test*'
            s = self.to_unicode('上A2#海')
            a = self.to_unicode('shangA2#hai')
            aa = self.to_unicode('shang4A2#hai3')
            aaa = self.to_unicode('shang A 2 # hai')
            self.assertEqual(self.py.get_pinyin(s), a)
            self.assertEqual(self.py.get_pinyin(s, tone=True), aa)
            self.assertEqual(self.py.get_pinyin(s, splitter=' '), aaa)

        def test_get_initials(self):
            # The input must be a real character: get_initials('') returns
            # '', so an empty input could never produce the expected 'S'.
            s = self.to_unicode('上')
            a = self.to_unicode('S')
            self.assertEqual(self.py.get_initials(s), a)

        def test_py2hz(self):
            s1 = self.to_unicode('shang4')
            s2 = self.to_unicode('a')
            a1 = self.to_unicode('丄上姠尙尚蠰銄鑜')
            a2 = self.to_unicode('吖腌錒锕阿嗄阿阿啊阿')
            self.assertEqual(''.join(self.py.py2hz(s1)), a1)
            self.assertEqual(''.join(self.py.py2hz(s2)), a2)

    unittest.main()