# pomscripts/StatSeasonly4.py
#
# Repository viewer metadata captured with this file:
# 619 lines, 25 KiB, Python; last change 2023-04-04 04:15:34 +00:00.
# 1. 打开监测任务表格
import pandas as pd
import numpy as np
import os, glob, re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import datetime
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def fetch_chinese(s):
    """Return *s* with every character outside U+4E00..U+9FA5 removed."""
    non_chinese = r'[^\u4e00-\u9fa5]'
    return re.sub(non_chinese, '', s)
def toDate(strDT):
    """Format a date-like value as 'MM-DD'.

    The value is parsed with ``pd.to_datetime(..., errors='coerce')``; an
    empty string is returned when it cannot be parsed.
    """
    parsed = pd.to_datetime(strDT, errors='coerce')
    if pd.isna(parsed):
        return ''
    return parsed.strftime('%m-%d')
# word模板替换
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with *dContext* plus the five chart images.

    Args:
        tmep_path: Path of the .docx template.
        word_apth: Path the rendered report is saved to.
        dContext: Template context; it is updated in place with the image
            entries before rendering.
        pathImage: Directory holding the chart PNG files.
        city: Prefix of the chart file names (``<city><chart>.png``).
    """
    template = DocxTemplate(tmep_path)
    # Each chart is stored as "<city><key>.png" and is exposed to the template
    # under the key itself.
    chart_keys = (
        'annulusMediaCount',
        'annulusCountyCount',
        'annulusCountyArticle',
        'annulusResult',
        'barCountyRatio',
    )
    images = {
        key: InlineImage(template, os.path.join(pathImage, city + key + '.png'), width=Mm(120))
        for key in chart_keys
    }
    dContext.update(images)
    template.render(dContext)
    template.save(word_apth)
# 画柱状图
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios and optionally save it as an image.

    Args:
        data: Bar heights, one per label. The y axis is fixed to 0..1 and its
            ticks are rendered as percentages.
        recipe: Category labels, used as bar positions and as tick labels.
        title: Chart title.
        fn: Output image path. When empty, nothing is written (the same
            convention drawAnnulus uses).
    """
    plt.figure(figsize=(6, 4))
    try:
        # SimHei is needed to render the Chinese labels; with it the default
        # unicode minus sign must be disabled.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))

        def to_percent(temp, position):
            return '%2.0f' % (100 * temp) + '%'

        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        if fn:
            plt.savefig(fn)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close()
# 画环状图
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend and optionally save it.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge. Their number selects the canvas
            width, the number of legend columns and the legend font size.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is written when empty.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Layout by legend size: (canvas width, legend columns, legend font size).
    # Valid font sizes: xx-small, x-small, small, medium, large, x-large,
    # xx-large.
    label_count = len(recipe)
    if label_count > 40:
        canvas_width, legend_cols, legend_font = 16, 4, 'x-small'
    elif label_count > 20:
        # Was 'xmall', which matplotlib rejects as an unknown font size.
        canvas_width, legend_cols, legend_font = 16, 2, 'small'
    else:
        canvas_width, legend_cols, legend_font = 8, 1, 'medium'
    canvas_height = 4

    fig, ax = plt.subplots(figsize=(canvas_width, canvas_height), subplot_kw=dict(aspect="equal"))
    try:
        # wedgeprops' width turns the pie into a ring; startangle=0 starts
        # drawing at the positive x axis (counter-clockwise).
        ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)
        # The monitoring-result chart places its legend closer to the ring.
        legend_x = 1.0 if title == '政务新媒体监测结果' else 1.2
        plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(legend_x, 0.5), borderaxespad=0.,
                   ncol=legend_cols, fontsize=legend_font)
        if title:
            ax.set_title(title, fontsize=16, fontweight='heavy')
        plt.tight_layout()
        if fn:
            plt.savefig(fn)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
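# summaryCity builds the report for one reporting unit (province, city/prefecture or county).
# Example call:
#   summaryCity(info, city, df, dfW, dfS, strfnTemplate, os.path.join(dirReports, city + '.docx'), dirIntermediate)
# Arguments: report metadata, unit name, monitoring data, typo (CBZ) data, sensitive-word (MGC) data,
# Word template path, output report path, and a scratch directory for the chart images.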
# Recipient of the report for each reporting unit. The position of a key in
# this mapping is also the two-digit prefix of the report id, so the order of
# the entries must not change.
_CITY_CLIENTS = {
    '甘肃省': "甘肃省人民政府办公厅",
    '省直部门': "甘肃省人民政府办公厅",
    '白银市': "白银市人民政府办公室",
    '定西市': "定西市人民政府办公室",
    '临夏回族自治州': "临夏回族自治州人民政府办公室",
    '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
    "庆阳市": "庆阳市电子政务与信息资源管理办公室",
    '庆阳市华池县': "华池县人民政府办公室",
    '庆阳市宁县': "宁县人民政府办公室",
    "庆阳市镇原县": "镇原县人民政府办公室",
    "酒泉市": "酒泉市人民政府办公室",
    "天水市": "天水市人民政府办公室",
    "武威市": "武威市人民政府办公室",
    "金昌市": "金昌市人民政府办公室",
    "嘉峪关市": "嘉峪关市人民政府办公室",
    "兰州新区": "兰州新区管委会办公室",
    "陇南市": "陇南市政务服务中心",
    "张掖市": "张掖市政务服务中心",
    "甘南藏族自治州": "甘南藏族自治州政务服务中心",
    "兰州市": "兰州市政务服务中心",
}


def _select_city_frames(city, df, dfW, dfS):
    """Return (dfc, dfcw, dfcs, subordinate, subordinateName) for *city*.

    The three frames are copies of the monitoring, typo and sensitive-word
    rows of the unit; *subordinate* is the column the unit is broken down by
    and *subordinateName* the label used for it in chart titles.
    """
    subordinate = '区县/地方部门'
    subordinateName = '县区'
    if "庆阳市" in city:
        in_qingyang = df['市/省局'] == '庆阳市'
        for county in ('华池县', '宁县', '镇原县'):
            if county in city:
                dfc = df.loc[in_qingyang & (df['区县/地方部门'] == county)].copy()
                break
        else:
            dfc = df.loc[in_qingyang].copy()
        # NOTE(review): county-level reports still receive the typo and
        # sensitive-word rows of the whole city, as in the original code.
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city:
        # Province-wide report: all rows, broken down by city/prefecture.
        dfc = df.copy()
        dfcw = dfW.copy()
        dfcs = dfS.copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    else:
        dfc = df.loc[df['市/省局'] == city].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()
    return dfc, dfcw, dfcs, subordinate, subordinateName


def _county_result_rows(dfCountyAccount):
    """Build the per-county result table rows (tt3) from the pivot table."""
    columns = set(dfCountyAccount.columns.values.tolist())

    def cell(row, column):
        # Cells without accounts hold the pivot fill value '' and stay blank.
        if column not in columns:
            return ''
        value = row[column]
        return '' if isinstance(value, str) else int(value)

    rows = []
    for index, row in dfCountyAccount.iterrows():
        rows.append({
            'county': '总 计' if index == 'All' else index,
            'hg': cell(row, '合格'),
            'u2w': cell(row, '超过两周未更新'),
            'un': cell(row, '监测期间未更新'),
            'count': cell(row, 'All'),
        })
    return rows


def _account_rows(frame, subordinate):
    """Build the per-account detail rows used by the tt4 and tt5 tables."""
    rows = []
    for _, row in frame.iterrows():
        rows.append({
            'name': row['账号名称'],
            'type': row['账号类型'],
            'unit': row['开设主体'],
            'county': row[subordinate],
            'result': row['监测结果'],
            # Falsy values (0, '') are rendered as blank cells.
            'num': "%d" % row['更新次数'] if row['更新次数'] else '',
            'days': "%d" % row['最大静默日数'] if row['最大静默日数'] else '',
            'start': toDate(str(row['静默开始日期'])) if row['静默开始日期'] else '',
            'end': toDate(str(row['静默结束日期'])) if row['静默结束日期'] else '',
        })
    return rows


def _typo_rows(dfcw):
    """Build the typo table rows (tCbz) from the typo frame."""
    # Characters removed from the quoted sentence because they disturb the
    # table output of the template. Same pattern as before, written as a raw
    # string so it no longer relies on invalid escape sequences.
    strip_chars = r"""[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\[\]'"]"""
    has_title = '标题' in dfcw.columns
    rows = []
    for _, row in dfcw.iterrows():
        location = row['错误出现位置']
        sentence = '' if pd.isna(location) else re.sub(strip_chars, '', location)
        rows.append({
            'error': row['错误'],
            'tips': row['建议'],
            'sentence': sentence,
            'type': row['账号类型'],
            'name': row['账号名称'],
            'date': toDate(str(row['发文时间'])),
            'title': row['标题'] if has_title else '',
        })
    return rows


def _sensitive_rows(dfcs):
    """Build the sensitive-word table rows (tMgc); text keeps Chinese characters only."""
    has_title = '标题' in dfcs.columns
    rows = []
    for _, row in dfcs.iterrows():
        title = row['标题'] if has_title else ''
        rows.append({
            'error': row['错误'],
            'tips': row['建议'],
            'sentence': fetch_chinese(str(row['错误出现位置'])),
            'type': row['账号类型'],
            'name': row['账号名称'],
            'date': toDate(str(row['发文时间'])),
            'title': fetch_chinese(str(title)),
        })
    return rows


def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
    """Compute the statistics of one reporting unit and render its Word report.

    Args:
        info: Report metadata merged into the template context; ``info['num']``
            is appended to the unit's two-digit index to form the report id.
        city: Name of the reporting unit; must be a key of the client table.
        df: Monitoring data of all accounts.
        dfW: Typo records; the '市州' column names the unit.
        dfS: Sensitive-word records; the '市州' column names the unit.
        fnTemplate: Path of the Word template.
        fnReport: Path of the report to write.
        dirTemp: Directory the chart images are written to.

    Raises:
        ValueError: If *city* is not a known reporting unit.
    """
    print("----------------" + city + "----------------")
    # Report id and client.
    strID = "%02d" % (list(_CITY_CLIENTS).index(city))
    context = {
        "city": city,
        "client": _CITY_CLIENTS[city],
        "reportid": strID + info['num'],
    }
    context.update(info)

    dfc, dfcw, dfcs, subordinate, subordinateName = _select_city_frames(city, df, dfW, dfS)
    dCity = {}

    # ----- accounts per county and monitoring result -----
    dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'],
                                     aggfunc='count', fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
    context['tt3_contents'] = _county_result_rows(dfCountyAccount)

    # ----- accounts per media type -----
    dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count', fill_value='',
                             margins=True)
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    # Remove the 'All' margin row by label: after sorting it is not guaranteed
    # to come first when a single type holds every account (tie with the total).
    dfMediaTypes = dfMedia.drop('All').sort_values(by='账号名称', ascending=False)
    mediaTypes = dfMediaTypes.index.tolist()
    mediaCounts = dfMediaTypes['账号名称'].tolist()
    tt1_list = [{'type': m, 'count': str(n)} for m, n in zip(mediaTypes, mediaCounts)]
    dCity['sMediaCount'] = ','.join(row['type'] + row['count'] + '个' for row in tt1_list)
    context.update({'tt1_contents': tt1_list})

    # ----- updates per county -----
    dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum', fill_value='',
                                     margins=True)
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.loc['All', '更新次数']
    # Same reasoning as above: drop the margin row by label, not by position.
    dfArticles = dfCountyArticle.drop('All').sort_values(by='更新次数', ascending=False)
    articleCounties = dfArticles.index.tolist()
    articleCounts = dfArticles['更新次数'].tolist()
    dCity['countyMostArticle'] = articleCounties[0]
    dCity['countyMostArticleCount'] = "%d" % articleCounts[0]
    # NOTE(review): the trailing separator is kept; the original rstrip('')
    # never removed it.
    dCity['sCountyArticles'] = ''.join(c + "%d" % n + "次," for c, n in zip(articleCounties, articleCounts))

    # ----- pass rate per county -----
    # Counties without a qualified account hold '' in the pivot (or the column
    # is missing altogether); treat both as zero.
    if '合格' not in dfCountyAccount.columns:
        dfCountyAccount['合格'] = 0
    dfCountyAccount['合格'] = pd.to_numeric(dfCountyAccount['合格'], errors='coerce').fillna(0).astype('int')
    dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
    dfCountyAccount['rate'] = dfCountyAccount['合格'] / pd.to_numeric(dfCountyAccount['All'])
    dfResult = dfCountyAccount.copy()
    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])

    dfCountyAccount = dfCountyAccount.drop(['All'])  # remove the margin row
    counties = dfCountyAccount.index.tolist()
    countyCounts = dfCountyAccount['All'].values.tolist()
    print(countyCounts)
    print(counties)
    dCity['countyCount'] = "%d" % len(counties)
    dCity['sCounties'] = ''.join(counties)
    dCity['sCountyCount'] = ''.join(c + str(n) + '个,' for c, n in zip(counties, countyCounts))

    dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
    countieshege = dfCountyAccount.index.tolist()
    # Plain list of rates: avoids integer indexing into a label-indexed Series.
    countyRates = dfCountyAccount['rate'].tolist()
    tt2_list = [{'county': c, 'ratio': "%.1f" % (100.0 * r) + '%'} for c, r in zip(countieshege, countyRates)]
    dCity['sCountyRatio'] = ''.join(row['county'] + row['ratio'] for row in tt2_list)
    dCity['tt2_contents'] = tt2_list

    # ----- charts -----
    print(' 生成图片...')
    drawAnnulus(mediaCounts, mediaTypes,
                '政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
    print(countyCounts)
    print(counties)
    drawAnnulus(countyCounts, counties,
                subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
    drawAnnulus(articleCounts, articleCounties,
                subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))

    # ----- overall monitoring result -----
    dfResult = dfResult.drop(['All', 'rate'], axis=1)
    total = dCity['nmCount']
    qualified = dfResult.loc['All', '合格']
    dCity['resultQualified'] = "%d" % qualified
    dCity['resultQualifiedRatio'] = "%.1f%%" % (qualified / total * 100.0)
    dCity['resultUnqualified'] = "%d" % (total - qualified)

    resultLabels = dfResult.columns.values.tolist()
    if '监测期间未更新' in resultLabels:
        numNoupdated = dfResult.loc['All', '监测期间未更新']
        dCity['stringResultNoUpdated'] = "%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / total * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""
    if '超过两周未更新' in resultLabels:
        numNoupdated2W = dfResult.loc['All', '超过两周未更新']
        dCity['stringResultNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / total * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''

    resultCounts = dfResult.loc['All'].values.tolist()
    drawAnnulus(resultCounts, resultLabels,
                '政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
    drawBar(countyRates, countieshege,
            '政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))

    # ----- report tables -----
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格'].sort_values(by="监测结果", ascending=True)
    dfCityQulified = dfc[dfc['监测结果'] == '合格'].sort_values(by=subordinate, ascending=True)
    context['tt4_contents'] = _account_rows(dfCityUnqulified, subordinate)
    context['tt5_contents'] = _account_rows(dfCityQulified, subordinate)

    # Typos. fillna returns a new frame; the result was previously discarded,
    # so NaN cells reached the template.
    dfcw = dfcw.fillna('')
    context['tCbz_contents'] = _typo_rows(dfcw)
    if dfcw.shape[0] > 0:
        dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
    else:
        dCity['stringCbzCount'] = '本次检测未发现错别字。'

    # Sensitive words.
    dfcs = dfcs.fillna('')
    context['tMgc_contents'] = _sensitive_rows(dfcs)
    if dfcs.shape[0] > 0:
        dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
    else:
        dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'

    context.update(dCity)

    # ----- render the report from the template -----
    temp_word(fnTemplate, fnReport, context, dirTemp, city)
def createDir(dirP, dirS):
    """Ensure the directory ``dirP/dirS`` exists and return its path.

    Missing parent directories are created as well. If the directory cannot
    be created (for example because a file with that name exists), a message
    naming the failed path is printed and *dirP* is returned as a fallback.

    Args:
        dirP: Parent directory.
        dirS: Name of the sub-directory.

    Returns:
        The path of the sub-directory, or *dirP* when creation failed.
    """
    dirN = os.path.join(dirP, dirS)
    try:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(dirN, exist_ok=True)
    except OSError:
        print('Directory ' + dirN + ' cannot be created.')
        return dirP
    return dirN
# def createDir(dirP, dirS):
def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput, cities=None):
    """Load the workbooks, normalise the monitoring data and build the reports.

    Args:
        info: Report metadata passed on to summaryCity.
        strFnData: Excel file with the monitoring data.
        strFnW: Excel file with the merged typo records.
        strFnS: Excel file with the merged sensitive-word records.
        strfnTemplate: Path of the Word template.
        strPathOutput: Output location; the 'Reports' and 'Intermediate'
            directories are created under ``os.path.dirname(strPathOutput)``.
        cities: Reporting units to generate, as a name or an iterable of
            names known to summaryCity. Defaults to the province-wide report
            ('甘肃省') only, which is what the function previously hard-coded.
    """
    if cities is None:
        cities = ('甘肃省',)
    elif isinstance(cities, str):
        # A bare name would otherwise be iterated character by character.
        cities = (cities,)

    # Monitoring data, typos, sensitive words.
    df = pd.read_excel(strFnData)
    dfW = pd.read_excel(strFnW)
    dfS = pd.read_excel(strFnS)

    # Use one wording for the "not updated for two weeks" result.
    df.loc[df['监测结果'] == '连续两周未更新', '监测结果'] = '超过两周未更新'
    # Shorten very long county names so they fit into the charts.
    df.loc[df['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    df.loc[df['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'
    # Rows without a city belong to provincial departments; rows without a
    # county belong to city (or prefecture) departments.
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    df.loc[(df['市/省局'] == '临夏回族自治州') & (df['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
    # Clean-up: remove all whitespace and a leading "其他+" from text cells.
    df.replace(r'\s+', '', regex=True, inplace=True)
    df.replace(r'^其他\+', '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Report and scratch directories next to the output location.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')
    for city in cities:
        summaryCity(info, city, df, dfW, dfS, strfnTemplate, os.path.join(dirReports, city + '.docx'),
                    dirIntermediate)
# 合并错别字文件
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge the per-city Excel files matching *keyword* into one workbook.

    Every ``*<keyword>*.xlsx`` file in *strPathCBZ* is read, tagged with a
    '市州' column derived from the city shorthand found in its file name, and
    all rows are written to *strFnCbz*.

    Args:
        keyword: Substring the file names must contain.
        strPathCBZ: Directory holding the per-city files.
        strFnCbz: Path of the merged workbook to write.
    """
    # Shorthand (Chinese or pinyin initials) -> full unit name. The first
    # entry contained in the file name wins, so the order matters.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    # sorted() makes the row order of the merged file reproducible.
    for fn in sorted(glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx'))):
        f = os.path.basename(fn)
        if f.startswith('~$'):
            # Lock file of a workbook currently open in Excel; not readable.
            continue
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
#def mergeCMC
if __name__ == "__main__":
    # Convert the date columns of the Excel workbooks before running this.
    info = dict(
        year="2023",
        quarter="",
        dateCN="二〇二三年三月",
        dateStart="2023年1月1日",
        dateEnd="2023年3月20日",
        days="79",
        num="4",
    )
    # Data root directory.
    strPath = 'D:/Projects/POM/DATA/2023年S1/'
    # Monitoring data, Word template and output location.
    strFnMonitoring = strPath + '汇总/第一季度汇总数据_2023.3.xlsx'
    strPathTemplate = strPath + 'POM_ReportTemplate.docx'
    strPathOutput = strPath + '统计/'
    # Merged typo and sensitive-word workbooks; each is built from the
    # per-city files only when it does not exist yet.
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    strFnMgc = strPath + '汇总/MGC.xlsx'
    for keyword, merged_file in (("错别", strFnCbz), ("敏感", strFnMgc)):
        if not os.path.exists(merged_file):
            mergeCMC(keyword, strPath + '监测/', merged_file)
    summary(info, strFnMonitoring, strFnCbz, strFnMgc, strPathTemplate, strPathOutput)