2018-03-06 06:35:40 +00:00
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
# Author: XuMing <xuming624@qq.com>
|
|
|
|
|
# Brief: setuptools packaging script for the pycorrector distribution
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
from setuptools import setup, find_packages
|
2018-08-28 09:34:47 +00:00
|
|
|
|
from pycorrector import __version__
|
2018-03-06 06:35:40 +00:00
|
|
|
|
|
2018-03-14 07:28:36 +00:00
|
|
|
|
# Markdown text passed to setup() as the project's long description.
# It holds an English usage section followed by the Chinese project overview.
# The stray "|" rows and commit timestamps that had leaked into the literal
# are removed so that only the intended Markdown is published.
long_description = '''
## Usage

### install
* pip3 install pycorrector
* Or download https://github.com/shibing624/corrector Unzip and run python3 setup.py install

### correct
input:
```
import pycorrector

corrected_sent, detail = pycorrector.correct('少先队员因该为老人让坐')
print(corrected_sent, detail)

```

output:
```
少先队员应该为老人让座 [[('因该', '应该', 4, 6)], [('坐', '座', 10, 11)]]
```

----


# corrector
中文错别字纠正工具。音似、形似错字(或变体字)纠正,可用于中文拼音、笔画输入法的错误纠正。python开发。

**corrector**依据语言模型检测错别字位置,通过拼音音似特征、笔画五笔编辑距离特征及语言模型困惑度特征纠正错别字。

## 特征
### 语言模型
* Kenlm(统计语言模型工具)
* RNNLM(TensorFlow、PaddlePaddle均有实现栈式双向LSTM的语言模型)

## 使用说明

### 安装
* 全自动安装:pip3 install pycorrector
* 半自动安装:下载 https://github.com/shibing624/corrector 解压缩并运行 python3 setup.py install

### 纠错
使用示例:
```
import pycorrector

corrected_sent, detail = pycorrector.correct('少先队员因该为老人让坐')
print(corrected_sent, detail)

```

输出:
```
少先队员应该为老人让座 [[('因该', '应该', 4, 6)], [('坐', '座', 10, 11)]]
```

'''
|
2018-03-06 06:35:40 +00:00
|
|
|
|
|
2018-03-06 06:58:53 +00:00
|
|
|
|
# Register the pycorrector distribution with setuptools.
setup(
    name='pycorrector',
    # The version is taken from the package itself (imported at the top of this file).
    version=__version__,
    description='Chinese Text Error corrector',
    long_description=long_description,
    # long_description is Markdown; without an explicit content type PyPI
    # assumes reStructuredText and renders the headings/code fences incorrectly.
    long_description_content_type='text/markdown',
    author='XuMing',
    author_email='xuming624@qq.com',
    url='https://github.com/shibing624/corrector',
    license="Apache 2.0",
    classifiers=[
        'Intended Audience :: Developers',
        'Operating System :: OS Independent',
        'Natural Language :: Chinese (Simplified)',
        'Natural Language :: Chinese (Traditional)',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: Indexing',
        'Topic :: Text Processing :: Linguistic',
    ],
    keywords='NLP,correction,Chinese error corrector,corrector',
    # Runtime dependencies installed together with the package.
    install_requires=[
        'scipy',
        'scikit-learn',
        'pypinyin',
        'kenlm',
        'jieba',
        'tensorflow',
        'keras>=2.1.5',
    ],
    # Ship every discovered package except the test suite.
    packages=find_packages(exclude=['tests']),
    package_dir={'pycorrector': 'pycorrector'},
    # Non-Python files bundled inside the pycorrector package, matched by glob.
    package_data={
        'pycorrector': ['*.*', 'LICENSE', 'README.*', 'data/*', 'data/kenlm/*', 'utils/*'],
    }
)
|