48 lines
2.0 KiB
Python
48 lines
2.0 KiB
Python
|
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
import os, glob, re
|
||
|
|
||
|
|
||
|
|
||
|
# Merge the per-city typo / sensitive-word workbooks into one summary workbook.
def mergeCMC(keyword: str, strPathCBZ: str, strFnCbz: str) -> None:
    """Merge every matching per-city workbook into a single Excel file.

    Each ``*.xlsx`` file directly inside ``strPathCBZ`` whose name contains
    ``keyword`` is read, given a ``市州`` column holding the city/prefecture
    recognised from its file name, and stacked row-wise. The combined table
    is then written to ``strFnCbz``.

    A file whose name matches no known city is still merged; its ``市州``
    value is the empty string and a warning is printed. When no file matches
    the pattern at all, an empty workbook is written.

    Args:
        keyword: Substring a file name must contain to be picked up.
        strPathCBZ: Directory that is searched for the source workbooks.
        strFnCbz: Path of the merged workbook to write.
    """
    print(1, keyword, strPathCBZ, strFnCbz)

    # Evaluate the pattern once and reuse the result for both the log line
    # and the merge loop.
    lstFiles = glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx'))
    print('glob: ', lstFiles)

    # File-name fragment -> full city/prefecture name. Lookup is
    # first-match-wins in insertion order, so '新区' / '兰州新区' must stay
    # ahead of '兰州' (and 'XQ' / 'LZXQ' ahead of 'LZ').
    cityShorten = {
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
        'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
        'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
        'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市',
    }

    # Collect the frames and concatenate once at the end: DataFrame.append
    # no longer exists in pandas >= 2.0 and re-copied the whole table on
    # every iteration.
    frames = []
    total = 0
    for fn in lstFiles:
        f = os.path.basename(fn)
        if f.startswith('~$'):
            # Lock file Excel creates next to an open workbook; it matches
            # the pattern but is not a readable workbook.
            print('skip lock file: ', f)
            continue
        print(f)

        # First fragment contained in the file name decides the city.
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")

        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)

    # pd.concat rejects an empty list, so fall back to an empty frame to keep
    # writing an (empty) workbook when nothing matched.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
|
||
|
|
||
|
# Directory that is searched for the source .xlsx workbooks.
strPath = 'D:/Projects/POM/DATA/2023年3月/3月29日错敏词/敏感词/'

# Merged output workbooks: typos (CBZ) and sensitive words (MGC).
strFnCbz = strPath + '../汇总/CBZ.xlsx'
strFnMgc = strPath + '../汇总/MGC.xlsx'

# Process typos first, then sensitive words.
for _strKeyword, _strTarget in (("错别", strFnCbz), ("敏感", strFnMgc)):
    # Drop the output of a previous run before regenerating it.
    if os.path.isfile(_strTarget):
        os.remove(_strTarget)
    mergeCMC(_strKeyword, strPath, _strTarget)

# Do not leave the loop helpers behind as module-level names.
del _strKeyword, _strTarget
|