add zhtools: traditional/simplified Chinese conversion tool. xuming 20180302
This commit is contained in:
parent
4311a37f12
commit
7ee15f5223
|
@ -0,0 +1,46 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Author: XuMing <xuming624@qq.com>
|
||||
# Brief: error word detector
|
||||
|
||||
import os
|
||||
import kenlm
|
||||
import jieba
|
||||
import pickle
|
||||
import math
|
||||
import wubi
|
||||
import numpy as np
|
||||
import pypinyin
|
||||
from pypinyin import pinyin
|
||||
from collections import Counter
|
||||
import config
|
||||
import re
|
||||
|
||||
|
||||
def get_homophones_by_char(input_char):
    """
    Return the CJK characters that share a pronunciation with ``input_char``.

    Readings are compared in pypinyin's ``NORMAL`` style, and only the first
    reading pypinyin reports (``[0][0]``) is used, both for ``input_char``
    and for every candidate character.

    :param input_char: the Chinese character whose homophones are wanted
    :return: list of single-character strings in code-point order; an empty
        list when pypinyin yields no reading for ``input_char``
    """
    # The target reading never changes inside the loop, so look it up once
    # instead of once per candidate character.
    target = pinyin(input_char, style=pypinyin.NORMAL)
    if not target:
        return []
    target_reading = target[0][0]

    # The CJK Unified Ideographs block spans 0x4E00-0x9FA5, i.e. the commonly
    # cited 20902 Chinese characters.
    return [
        chr(code)
        for code in range(0x4e00, 0x9fa6)
        if pinyin([chr(code)], style=pypinyin.NORMAL)[0][0] == target_reading
    ]
|
||||
|
||||
|
||||
def get_homophones_by_pinyin(input_pinyin):
    """
    Return the CJK characters whose pinyin equals ``input_pinyin``.

    Each candidate's first reading is rendered in pypinyin's ``TONE2`` style
    (for example 中 -> ``zho1ng``) and compared with ``input_pinyin`` verbatim.

    :param input_pinyin: pinyin string written in TONE2 notation
    :return: list of matching single-character strings in code-point order
    """
    # The CJK Unified Ideographs block spans 0x4E00-0x9FA5, i.e. the commonly
    # cited 20902 Chinese characters.
    candidates = (chr(code) for code in range(0x4e00, 0x9fa6))
    return [
        hanzi
        for hanzi in candidates
        if pinyin([hanzi], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
numpy
|
||||
pypinyin
|
||||
Keras
|
||||
scikit-learn
|
||||
tensorflow
|
|
@ -0,0 +1,22 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Author: XuMing <xuming624@qq.com>
|
||||
# Brief:
|
||||
import os
|
||||
import kenlm
|
||||
import jieba
|
||||
import pickle
|
||||
import math
|
||||
import wubi
|
||||
import numpy as np
|
||||
import pypinyin
|
||||
from pypinyin import pinyin
|
||||
from collections import Counter
|
||||
import config
|
||||
import re
|
||||
|
||||
|
||||
|
||||
def main():
    """Print the sample sentence that is meant to be fed to the corrector."""
    line = '我们现在使用的数学福号'
    # The format string needs a %s placeholder: applying % to a string with
    # no conversion specifier raises TypeError.
    print('input sentence is:%s' % line)
    # TODO: enable once correct() is available:
    # corrected_sent,correct_ranges = correct(line)
|
|
@ -0,0 +1,10 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Author: XuMing <xuming624@qq.com>
|
||||
# Brief:
|
||||
|
||||
from detect import *
|
||||
|
||||
# Collect every character pronounced like '长' and show the list.
pron = get_homophones_by_char('长')
print(pron)
|
||||
|
||||
# 回家的时候他们坐公车,张大林站着沾着,然后就问她。
|
|
@ -0,0 +1,18 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Author: XuMing <xuming624@qq.com>
|
||||
# Brief:
|
||||
from util import *
|
||||
from pypinyin import pinyin, lazy_pinyin
|
||||
|
||||
# traditional simplified
|
||||
# Traditional -> simplified: convert a sample sentence and show the result.
print(traditional2simplified('憂郁的臺灣烏龜'))

# Simplified -> traditional: convert the simplified form back.
simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

# pypinyin renderings of the same word.
print(pinyin('中心'))  # with tone marks
print(pinyin('中心', heteronym=True))  # heteronym (multiple-reading) output
print(lazy_pinyin('中心'))  # without tone marks
|
26
util.py
26
util.py
|
@ -4,7 +4,6 @@
|
|||
import re
|
||||
|
||||
from zhtools.langconv import Converter
|
||||
from zhtools.xpinyin import Pinyin
|
||||
|
||||
|
||||
# 去除标点符号
|
||||
|
@ -30,28 +29,3 @@ def simplified2traditional(sentence):
|
|||
'''
|
||||
sentence = Converter('zh-hant').convert(sentence)
|
||||
return sentence
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script conversion demo: traditional -> simplified, then the reverse.
    print(traditional2simplified('憂郁的臺灣烏龜'))
    print(simplified2traditional('忧郁的台湾乌龟'))

    # Pinyin demo: the same sentence without and then with tone digits.
    converter = Pinyin()
    sample = '坐骑,你骑哪里了'
    for with_tone in (False, True):
        print(converter.get_pinyin(sample, tone=with_tone))

    print(converter.get_initials("上"))

    # Reverse lookup: characters for a toned and for a toneless syllable.
    for syllable in ('shang4', 'wo'):
        print(''.join(converter.py2hz(syllable)))
|
||||
|
|
25549
zhtools/Mandarin.dat
25549
zhtools/Mandarin.dat
File diff suppressed because it is too large
Load Diff
|
@ -1,126 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""Translate chinese hanzi to pinyin by python
|
||||
Created by Eric Lo on 2010-05-20.
|
||||
Copyright (c) 2010 __lxneng@gmail.com__. http://lxneng.com All rights reserved.
|
||||
|
||||
Forked by skydarkchen <skydark2 at gmail>
|
||||
"""
|
||||
|
||||
import os.path
|
||||
|
||||
try:
|
||||
chr = unichr
|
||||
except NameError:
|
||||
pass
|
||||
|
||||
VERSION = '0.3a'
|
||||
|
||||
|
||||
class Pinyin(object):
    r"""translate chinese hanzi to pinyin by python, inspired by flyerhzm’s
    `chinese\_pinyin`_ gem
    .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin

    usage(python3)
    -----
    ::
        >>> p = Pinyin()
        >>> p.get_pinyin("上海")
        'shanghai'
        >>> p.get_pinyin("上海", tone=True)
        'shang4hai3'
        >>> p.get_initials("上")
        'S'
        >>> print(''.join(p.py2hz('shang4')))
        丄上姠尙尚蠰銄鑜
        >>> print(''.join(p.py2hz('a')))
        吖腌錒锕阿嗄阿阿啊阿
    """

    # Data file expected next to this module.
    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'Mandarin.dat')

    def __init__(self):
        """Load the lookup tables from ``data_path``.

        Every non-blank line must have the form
        ``<hex code point>\t<space-separated readings>``.  ``self.dict`` maps
        a character to its lower-cased readings; ``self.revdict`` maps a
        reading to the characters that carry it, in file order.

        :raises ValueError: if a non-blank line does not split into exactly
            two tab-separated fields, or the code point is not valid hex
        """
        self.dict = {}
        self.revdict = {}
        # Use a context manager so the file handle is closed even when a
        # malformed line aborts the load.
        with open(self.data_path) as data_file:
            for line in data_file:
                entry = line.strip()
                if not entry:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                k, v = entry.split('\t')
                v = v.lower().split(' ')
                hz = chr(int('0x%s' % k, 16))
                self.dict[hz] = v
                for vkey in v:
                    self.revdict.setdefault(vkey, []).append(hz)

    def py2hz(self, pinyin):
        """Return the characters that have the reading ``pinyin``.

        :param pinyin: a syllable, case-insensitive.  When it ends in a digit
            it is looked up as-is; otherwise the digits 1-5 are appended in
            turn and the results concatenated in that order.
        :return: list of characters; empty for ``''`` or an unknown reading
        """
        if pinyin == '':
            return []
        pinyin = pinyin.lower()
        if pinyin[-1].isdigit():
            return self.revdict.get(pinyin, [])
        ret = []
        for i in range(1, 6):
            key = '%s%s' % (pinyin, i)
            ret += self.revdict.get(key, [])
        return ret

    def get_pinyin(self, chars='', splitter='', tone=False):
        """Convert ``chars`` to pinyin, one item per input character.

        :param chars: text to convert
        :param splitter: string placed between the per-character items
        :param tone: keep the trailing tone digit when true
        :return: the joined string; characters without a table entry are
            passed through unchanged, and only the first reading of a
            character is used
        """
        result = []
        for char in chars:
            v = self.dict.get(char, None)
            if v:
                v = v[0]
                if not tone and v[-1].isdigit():
                    v = v[:-1]
            else:
                v = char
            result.append(v)
        return splitter.join(result)

    def get_initials(self, char=''):
        """Return the upper-cased first letter of ``char``'s first reading.

        :param char: character to look up
        :return: ``''`` for empty input; for a character without a table
            entry, the upper-cased character itself
        """
        if char == '':
            return ''
        return self.dict.get(char, [char])[0][0].upper()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import sys
    import unittest

    class PinyinTestCase(unittest.TestCase):
        """Checks for Pinyin, runnable on both Python 2 and Python 3."""

        def setUp(self):
            # Record the interpreter generation so literals can be decoded
            # on Python 2 only.
            self.py3k = sys.version_info >= (3, 0, 0)
            self.py = Pinyin()

        def to_unicode(self, s):
            """Return ``s`` as text: unchanged on py3, UTF-8 decoded on py2."""
            return s if self.py3k else s.decode('utf-8')

        def test_get_pinyin(self):  # test method names begin 'test*'
            text = self.to_unicode('上A2#海')
            cases = [
                ({}, 'shangA2#hai'),
                ({'tone': True}, 'shang4A2#hai3'),
                ({'splitter': ' '}, 'shang A 2 # hai'),
            ]
            for options, expected in cases:
                self.assertEqual(self.py.get_pinyin(text, **options),
                                 self.to_unicode(expected))

        def test_get_initials(self):
            hanzi = self.to_unicode('上')
            self.assertEqual(self.py.get_initials(hanzi),
                             self.to_unicode('S'))

        def test_py2hz(self):
            cases = [
                ('shang4', '丄上姠尙尚蠰銄鑜'),
                ('a', '吖腌錒锕阿嗄阿阿啊阿'),
            ]
            for syllable, expected in cases:
                found = self.py.py2hz(self.to_unicode(syllable))
                self.assertEqual(''.join(found), self.to_unicode(expected))

    unittest.main()
|
Loading…
Reference in New Issue