add zhtools: tradition simply chinese tool. xuming 20180302

2018-03-02 13:45:57 +08:00 · 2018-03-02 13:45:57 +08:00 · 7ee15f5223
parent 4311a37f12
commit 7ee15f5223
8 changed files with 101 additions and 25701 deletions
--- a/detect.py
+++ b/detect.py
@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+# Author: XuMing <xuming624@qq.com>
+# Brief: error word detector
+
+import os
+import kenlm
+import jieba
+import pickle
+import math
+import wubi
+import numpy as np
+import pypinyin
+from pypinyin import pinyin
+from collections import Counter
+import config
+import re
+
+
+def get_homophones_by_char(input_char):
+    """
+    Return the CJK characters whose toneless pinyin equals that of input_char.
+    :param input_char: Chinese character to match (its first reading is used)
+    :return: list of homophones; [] when pypinyin yields no reading for the input
+    """
+    readings = pinyin(input_char, style=pypinyin.NORMAL)  # hoisted: loop-invariant
+    if not readings:
+        return []
+    target = readings[0][0]
+    # 0x4E00-0x9FA5 is the CJK Unified Ideographs block (20902 characters).
+    return [chr(i) for i in range(0x4e00, 0x9fa6) if pinyin([chr(i)], style=pypinyin.NORMAL)[0][0] == target]
+
+
+def get_homophones_by_pinyin(input_pinyin):
+    """
+    Collect the CJK characters whose TONE2-style pinyin equals input_pinyin.
+    :param input_pinyin: reading in pypinyin TONE2 notation, e.g. 'zho1ng' for 中
+    :return: list of matching characters, in code-point order
+    """
+    # 0x4E00-0x9FA5 is the CJK Unified Ideographs block (20902 characters).
+    first_code, last_code = 0x4e00, 0x9fa5
+    candidates = (chr(code) for code in range(first_code, last_code + 1))
+    return [
+        hanzi for hanzi in candidates
+        if pinyin([hanzi], style=pypinyin.TONE2)[0][0] == input_pinyin
+    ]
+
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
+numpy
+pypinyin
+Keras
+scikit-learn
+tensorflow
--- a/spell.py
+++ b/spell.py
@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# Author: XuMing <xuming624@qq.com>
+# Brief: 
+import os
+import kenlm
+import jieba
+import pickle
+import math
+import wubi
+import numpy as np
+import pypinyin
+from pypinyin import pinyin
+from collections import Counter
+import config
+import re
+
+
+
+def main():
+    """Print the demo sentence (it contains a deliberate misspelling); correction is not wired in yet."""
+    line = '我们现在使用的数学福号'
+    # The format string needs a %s placeholder; without one, `% line` raises TypeError.
+    print('input sentence is:%s' % line)
+    # corrected_sent,correct_ranges = correct(line)
--- a/test/detect_test.py
+++ b/test/detect_test.py
@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# Author: XuMing <xuming624@qq.com>
+# Brief: 
+
+from detect import *
+
+# Smoke check: print the characters that share the toneless pinyin of '长'.
+pron = get_homophones_by_char('长')
+print(pron)
+
+# 回家的时候他们坐公车，张大林站着沾着，然后就问她。
--- a/test/util_test.py
+++ b/test/util_test.py
@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+# Author: XuMing <xuming624@qq.com>
+# Brief: 
+from util import *
+from pypinyin import pinyin, lazy_pinyin
+
+# Convert a traditional sentence to simplified, then a simplified sentence to traditional.
+traditional_sentence = '憂郁的臺灣烏龜'
+simplified_sentence = traditional2simplified(traditional_sentence)
+print(simplified_sentence)
+
+simplified_sentence = '忧郁的台湾乌龟'
+traditional_sentence = simplified2traditional(simplified_sentence)
+print(traditional_sentence)
+
+print(pinyin('中心'))  # with tone marks
+print(pinyin('中心', heteronym=True))  # heteronym mode (polyphonic characters)
+print(lazy_pinyin('中心'))  # without tones
--- a/util.py
+++ b/util.py
@ -4,7 +4,6 @@
 import re

 from zhtools.langconv import Converter
-from zhtools.xpinyin import Pinyin


 # 去除标点符号
@ -30,28 +29,3 @@ def simplified2traditional(sentence):
    '''
    sentence = Converter('zh-hant').convert(sentence)
    return sentence
-
-
-if __name__ == "__main__":
-    # traditional simplified
-    traditional_sentence = '憂郁的臺灣烏龜'
-    simplified_sentence = traditional2simplified(traditional_sentence)
-    print(simplified_sentence)
-
-    simplified_sentence = '忧郁的台湾乌龟'
-    traditional_sentence = simplified2traditional(simplified_sentence)
-    print(traditional_sentence)
-
-    # pinyin
-    p = Pinyin()
-    pinyin = p.get_pinyin('坐骑，你骑哪里了')
-    print(pinyin)
-
-    pinyin_tone = p.get_pinyin('坐骑，你骑哪里了', tone=True)
-    print(pinyin_tone)
-
-    print(p.get_initials("上"))
-
-    print(''.join(p.py2hz('shang4')))
-
-    print(''.join(p.py2hz('wo')))
--- a/zhtools/Mandarin.dat
+++ b/zhtools/Mandarin.dat
--- a/zhtools/xpinyin.py
+++ b/zhtools/xpinyin.py
@ -1,126 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""Translate chinese hanzi to pinyin by python
-Created by Eric Lo on 2010-05-20.
-Copyright (c) 2010 __lxneng@gmail.com__. http://lxneng.com All rights reserved.
-
-Forked by skydarkchen <skydark2 at gmail>
-"""
-
-import os.path
-
-try:
-    chr = unichr
-except NameError:
-    pass
-
-VERSION = '0.3a'
-
-
-class Pinyin(object):
-    """translate chinese hanzi to pinyin by python, inspired by flyerhzm’s
-    `chinese\_pinyin`_ gem
-    .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin
-
-    usage(python3)
-    -----
-    ::
-        >>> p = Pinyin()
-        >>> p.get_pinyin("上海")
-        'shanghai'
-        >>> p.get_pinyin("上海", tone=True)
-        'shang4hai3'
-        >>> p.get_initials("上")
-        'S'
-        >>> print(''.join(p.py2hz('shang4')))
-        丄上姠尙尚蠰銄鑜
-        >>> print(''.join(p.py2hz('a')))
-        吖腌錒锕阿嗄阿阿啊阿
-    """
-
-    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), \
-            'Mandarin.dat')
-
-    def __init__(self):
-        self.dict = {}
-        self.revdict = {}
-        for line in open(self.data_path):
-            k, v = line.strip().split('\t')
-            v = v.lower().split(' ')
-            hz = chr(int('0x%s' % k, 16))
-            self.dict[hz] = v
-            for vkey in v:
-                self.revdict.setdefault(vkey, [])
-                self.revdict[vkey].append(hz)
-
-    def py2hz(self, pinyin):
-        if pinyin == '':
-            return []
-        pinyin = pinyin.lower()
-        if pinyin[-1].isdigit():
-            return self.revdict.get(pinyin, [])
-        ret = []
-        for i in range(1, 6):
-            key = '%s%s' % (pinyin, i)
-            ret += self.revdict.get(key, [])
-        return ret
-
-    def get_pinyin(self, chars='', splitter='', tone=False):
-        result = []
-        for char in chars:
-            v = self.dict.get(char, None)
-            if v:
-                v = v[0]
-                if not tone and v[-1].isdigit():
-                    v = v[:-1]
-            else:
-                v = char
-            result.append(v)
-        return splitter.join(result)
-
-    def get_initials(self, char=''):
-        if char == '':
-            return ''
-        return self.dict.get(char, [char])[0][0].upper()
-
-
-if __name__ == '__main__':
-    import unittest
-
-    class PinyinTestCase(unittest.TestCase):
-        def setUp(self):
-            import sys
-            py = sys.version_info
-            self.py3k = py >= (3, 0, 0)
-
-            self.py = Pinyin()
-
-        def to_unicode(self, s):
-            if self.py3k:
-                return s
-            return s.decode('utf-8')
-
-        def test_get_pinyin(self):  ## test method names begin 'test*'
-            s = self.to_unicode('上A2#海')
-            a = self.to_unicode('shangA2#hai')
-            aa = self.to_unicode('shang4A2#hai3')
-            aaa = self.to_unicode('shang A 2 # hai')
-            self.assertEqual(self.py.get_pinyin(s), a)
-            self.assertEqual(self.py.get_pinyin(s, tone=True), aa)
-            self.assertEqual(self.py.get_pinyin(s, splitter=' '), aaa)
-
-        def test_get_initials(self):
-            s = self.to_unicode('上')
-            a = self.to_unicode('S')
-            self.assertEqual(self.py.get_initials(s), a)
-
-        def test_py2hz(self):
-            s1 = self.to_unicode('shang4')
-            s2 = self.to_unicode('a')
-            a1 = self.to_unicode('丄上姠尙尚蠰銄鑜')
-            a2 = self.to_unicode('吖腌錒锕阿嗄阿阿啊阿')
-            self.assertEqual(''.join(self.py.py2hz(s1)), a1)
-            self.assertEqual(''.join(self.py.py2hz(s2)), a2)
-
-    unittest.main()