# pomscripts/cmcMerge.py


import pandas as pd
import numpy as np
import os, glob, re


# Merge the per-city typo / sensitive-word workbooks into one summary file
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge every matching workbook in a directory into one Excel file.

    Each ``*.xlsx`` file in ``strPathCBZ`` whose name contains ``keyword`` is
    read, given a ``市州`` column holding the city resolved from its file
    name (an empty string when no city key matches), and stacked onto the
    combined table, which is finally written to ``strFnCbz``.

    Args:
        keyword: Substring a file name must contain to be merged.
        strPathCBZ: Directory searched for the source workbooks.
        strFnCbz: Path of the Excel file the merged table is written to.
    """
    print(1, keyword, strPathCBZ, strFnCbz)
    # Resolve the file list once (sorted for a reproducible row order) so the
    # debug listing and the merge loop are guaranteed to see the same files.
    lstFiles = sorted(glob.glob(os.path.join(strPathCBZ, '*'+keyword+'*.xlsx')))
    print('glob: ', lstFiles)
    # Short name / pinyin abbreviation -> full city name.
    # Keys are tested in insertion order and the first hit wins, so the order
    # matters: '新区'/'XQ' must stay ahead of '兰州'/'LZ', otherwise a
    # "兰州新区"/"LZXQ" file would be tagged as '兰州市'.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
            '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
            '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
            'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
            'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0  # running row count, reported after each file
    for fn in lstFiles:
        f = os.path.basename(fn)
        print(f)
        # First key contained in the file name decides the city.
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # DataFrame.append was removed in pandas 2.0; concatenating once at the
    # end is the supported replacement and avoids recopying the accumulated
    # rows for every file. pd.concat rejects an empty list, so fall back to
    # an empty frame to keep writing an (empty) workbook when nothing matched.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)

# Directory searched for the source workbooks.
strPath = 'D:/Projects/POM/DATA/2023年3月/3月29日错敏词/敏感词/'

# Summary workbooks: typos (CBZ) and sensitive words (MGC).
strFnCbz = strPath + '../汇总/CBZ.xlsx'
strFnMgc = strPath + '../汇总/MGC.xlsx'

# For each category, delete any previous summary file and then rebuild it.
for _strKeyword, _strTarget in (("错别", strFnCbz), ("敏感", strFnMgc)):
    if os.path.isfile(_strTarget):
        os.remove(_strTarget)
    mergeCMC(_strKeyword, strPath, _strTarget)
+ 2023-04-04 04:15:34 +00:00
			`import pandas as pd`
			`import numpy as np`
			`import os, glob, re`



			`# 合并错别字文件`
			`def mergeCMC(keyword, strPathCBZ, strFnCbz):`
			`print(1, keyword, strPathCBZ, strFnCbz)`
			`print('glob: ', glob.glob(os.path.join(strPathCBZ, '*'+keyword+'*.xlsx')))`
			`# cityShorten`
			`cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',`
			`'临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',`
			`'兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',`
			`'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',`
			`'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',`
			`'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }`
			`df = pd.DataFrame()`
			`for fn in glob.glob(os.path.join(strPathCBZ, '*'+keyword+'*.xlsx')):`
			`p, f = os.path.split(fn)`
			`print(f)`
			`city=''`
			`for c in cityShorten.keys():`
			`if c in f:`
			`city = cityShorten[c]`
			`break`
			`if len(city)<1:`
			`print("!!!!! City Name not matched ( ", f, " )")`
			`dfn = pd.read_excel(fn)`
			`dfn['市州'] = city`
			`df = df.append(dfn, ignore_index=True)`
			`print(city, f, dfn.shape[0], '/', df.shape[0])`
			`df.to_excel(strFnCbz)`

			`strPath = 'D:/Projects/POM/DATA/2023年3月/3月29日错敏词/敏感词/'`

			`# 错别字`
			`strFnCbz = strPath + '../汇总/CBZ.xlsx'`
			`if(os.path.isfile(strFnCbz)):`
			`os.remove(strFnCbz)`
			`mergeCMC("错别", strPath, strFnCbz)`

			`# 敏感词`
			`strFnMgc = strPath + '../汇总/MGC.xlsx'`
			`if(os.path.isfile(strFnMgc)):`
			`os.remove(strFnMgc)`
			`mergeCMC("敏感", strPath, strFnMgc)`