"""Generate monthly government new-media monitoring reports.

Pipeline (see ``main``): read the monitoring spreadsheet plus the merged
typo / sensitive-word spreadsheets, and for every city or unit build summary
statistics and charts, render a Word report from a docxtpl template, let
Word refresh the table of contents and export a PDF, then overlay the seal
and signature on that PDF.

Word automation goes through COM (win32com), so the reporting step needs
Windows with Microsoft Word installed.
"""
import datetime
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Word automation: table-of-contents refresh and PDF export
import win32com
import win32com.client as win32
from docx.shared import Mm
# Word template rendering
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from matplotlib.ticker import FuncFormatter
# PDF stamping
from pikepdf import Pdf, Page, Rectangle
from win32com.client import constants

# PDF holding the overlays: page 1 is the seal, page 2 the signature.
STAMP_PDF_PATH = 'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf'

# Reports whose seal is placed lower on the cover page (sy=115 instead of 140).
LOW_SEAL_CITIES = {'庆阳市', '平凉市', '临夏回族自治州'}
LOW_SEAL_Y = 115

# Value of Word's WdSaveFormat.wdFormatPDF.
WD_FORMAT_PDF = 17

# Template variable name == chart file stem for every embedded picture.
CHART_KEYS = (
    'annulusMediaCount',
    'annulusCountyCount',
    'annulusCountyArticle',
    'annulusResult',
    'barCountyRatio',
)

# Report unit -> commissioning client. Insertion order matters: the position
# of a unit in this mapping is the two-digit prefix of its report id.
CITY_CLIENTS = {
    '甘肃省': "甘肃省人民政府办公厅",
    '省直部门': "甘肃省人民政府办公厅",
    '白银市': "白银市人民政府办公室",
    '定西市': "定西市人民政府办公室",
    '临夏回族自治州': "临夏回族自治州人民政府办公室",
    '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
    '庆阳市': "庆阳市电子政务与信息资源管理办公室",
    '庆阳市华池县': "华池县人民政府办公室",
    '庆阳市宁县': "宁县人民政府办公室",
    '庆阳市镇原县': "镇原县人民政府办公室",
    '酒泉市': "酒泉市人民政府办公室",
    '天水市': "天水市人民政府办公室",
    '武威市': "武威市人民政府办公室",
    '金昌市': "金昌市人民政府办公室",
    '嘉峪关市': "嘉峪关市人民政府办公室",
    '兰州新区': "兰州新区管委会办公室",
    '陇南市': "陇南市政务服务中心",
    '张掖市': "张掖市政务服务中心",
    '甘南藏族自治州': "甘南藏族自治州政务服务中心",
    '兰州市': "兰州市政务服务中心",
}

# Whether a report unit is flagged as having subordinate units; exposed to the
# template as 'havingSubordinateUnits' and used to relabel the '市直部门' row.
CITY_HAS_SUBORDINATES = {
    '甘肃省': True,
    '白银市': True,
    '定西市': True,
    '临夏回族自治州': True,
    '平凉市': True,
    '庆阳市': True,
    '酒泉市': True,
    '天水市': True,
    '陇南市': True,
    '张掖市': True,
    '甘南藏族自治州': True,
    '兰州市': True,
    '武威市': True,
    '金昌市': True,
    '省直部门': False,
    '兰州新区': False,
    '庆阳市华池县': False,
    '庆阳市宁县': False,
    '庆阳市镇原县': False,
    '嘉峪关市': False,
}

# Units aggregated into the province-level ('甘肃省') report.
# NOTE(review): '兰州市' is not in this set -- confirm that is intentional.
PROVINCE_UNITS = {
    '白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市',
    '定西市', '兰州新区', '嘉峪关市', '陇南市', '张掖市', '省直部门', '金昌市',
    '甘南藏族自治州',
}

# County-level units of 庆阳市 that get their own report.
QINGYANG_COUNTIES = ('华池县', '宁县', '镇原县')

# File-name fragment -> full unit name. Checked in insertion order and the
# first hit wins, so longer fragments ('兰州新区') precede shorter ones ('兰州').
CITY_ALIASES = {
    '白银': '白银市',
    '定西': '定西市',
    '酒泉': '酒泉市',
    '嘉峪关': '嘉峪关市',
    '陇南': '陇南市',
    '临夏': '临夏回族自治州',
    '平凉': '平凉市',
    '庆阳': '庆阳市',
    '天水': '天水市',
    '武威': '武威市',
    '新区': '兰州新区',
    '兰州新区': '兰州新区',
    '兰州': '兰州市',
    '张掖': '张掖市',
    '甘南': '甘南藏族自治州',
    '省直': '省直部门',
    '金昌': '金昌市',
    'BY': '白银市',
    'DX': '定西市',
    'JQ': '酒泉市',
    'JYG': '嘉峪关市',
    'LN': '陇南市',
    'LX': '临夏回族自治州',
    'PL': '平凉市',
    'QY': '庆阳市',
    'TS': '天水市',
    'WW': '武威市',
    'XQ': '兰州新区',
    'LZXQ': '兰州新区',
    'LZ': '兰州市',
    'ZY': '张掖市',
    'GN': '甘南藏族自治州',
    'SZ': '省直部门',
    'JC': '金昌市',
}

# Quotes and punctuation stripped from typo context sentences because they
# interfere with the table output of the Word template.
_TEMPLATE_NOISE = re.compile(
    r"[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]")


def _temp_pdf_path(docx_file):
    """Return the path of the unstamped PDF that Word exports for *docx_file*."""
    return docx_file.replace('.docx', '_.pdf')


def _chart_path(directory, city, key):
    """Return the PNG path of chart *key* for *city* inside *directory*."""
    return os.path.join(directory, city + key + '.png')


def _blank_na(value):
    """Return '' for a missing cell (NaN/NaT/None), otherwise *value* unchanged."""
    return '' if pd.isna(value) else value


def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
    """Overlay the seal and the signature on a report PDF.

    Args:
        target_pdf_path: PDF to stamp; it needs at least three pages.
        watermark_pdf_path: PDF whose first page is the seal and whose second
            page is the signature.
        output_pdf_path: Where the stamped PDF is written.
        sy: Lower edge (PDF units) of the seal on the first page.
    """
    # Both files are closed on exit, so the caller can delete the input
    # immediately afterwards (open files cannot be removed on Windows).
    with Pdf.open(target_pdf_path) as target_pdf, \
            Pdf.open(watermark_pdf_path) as watermark_pdf:
        seal_page = watermark_pdf.pages[0]
        signature_page = watermark_pdf.pages[1]

        # Seal on the first page.
        x, w, h = 240, 115, 115
        target_pdf.pages[0].add_overlay(
            seal_page, Rectangle(x, sy, x + w, sy + h))

        # Signature on the third page.
        x, y, w, h = 163, 573, 85, 50
        target_pdf.pages[2].add_overlay(
            signature_page, Rectangle(x, y, x + w, y + h))

        target_pdf.save(output_pdf_path)


def update_toc(docx_file):
    """Refresh the TOC page numbers of *docx_file* and export it as PDF.

    The PDF is written next to the document with the suffix '_.pdf'. Word is
    driven through COM and is always shut down, even when a call fails, so no
    hidden Word process is left behind.

    Args:
        docx_file: Path of the Word document.
    """
    word = win32com.client.DispatchEx("Word.Application")
    try:
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0
        doc = word.Documents.Open(docx_file)
        exported = False
        try:
            toc_count = doc.TablesOfContents.Count
            if toc_count == 0:
                print("无目录")
            elif toc_count == 1:
                # Only the page numbers are refreshed, not the whole table.
                doc.TablesOfContents(1).UpdatePageNumbers()
            doc.SaveAs(_temp_pdf_path(docx_file), FileFormat=WD_FORMAT_PDF)
            exported = True
        finally:
            # Keep the refreshed TOC only when the export succeeded.
            doc.Close(SaveChanges=exported)
    finally:
        word.Quit()


def toDate(strDT):
    """Format a date-like value as 'MM-DD'; return '' when it cannot be parsed."""
    dt = pd.to_datetime(strDT, errors='coerce')
    return '' if pd.isna(dt) else dt.strftime('%m-%d')


def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with the context and the chart images.

    Args:
        tmep_path: Path of the docxtpl template.
        word_apth: Path of the rendered document.
        dContext: Template variables; the dict is not modified.
        pathImage: Directory holding the chart PNGs.
        city: Unit name, used as the file-name prefix of the charts.
    """
    tpl = DocxTemplate(tmep_path)
    images = {
        key: InlineImage(tpl, _chart_path(pathImage, city, key), width=Mm(120))
        for key in CHART_KEYS
    }
    # Render from a merged copy so the caller's dict is left untouched.
    tpl.render({**dContext, **images})
    tpl.save(word_apth)


def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios (0..1) shown as percentages and save it.

    Args:
        data: Bar heights, one ratio per label.
        recipe: Bar labels.
        title: Chart title.
        fn: Output image path.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs
    plt.rcParams['axes.unicode_minus'] = False

    def to_percent(temp, position):
        return '%2.0f' % (100 * temp) + '%'

    fig = plt.figure(figsize=(6, 4))
    try:
        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))
        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        plt.savefig(fn)
    finally:
        plt.close(fig)  # release the figure even if saving failed


def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend and optionally save it.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is saved when empty.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs
    plt.rcParams['axes.unicode_minus'] = False

    # Long legends get a wider canvas, more legend columns and smaller text.
    width, height = 8, 4
    legend_cols = 1
    font_size = 'medium'
    if len(recipe) > 40:
        width, legend_cols, font_size = 16, 4, 'small'
    elif len(recipe) > 20:
        width, legend_cols, font_size = 16, 2, 'small'

    fig, ax = plt.subplots(figsize=(width, height),
                           subplot_kw=dict(aspect="equal"))
    try:
        # wedgeprops width=0.4 turns the pie into a ring.
        ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)

        # The monitoring-result chart keeps its legend closer to the ring.
        anchor_x = 1.0 if title == '政务新媒体监测结果' else 1.2
        plt.legend(labels=recipe, loc="center left",
                   bbox_to_anchor=(anchor_x, 0.5), borderaxespad=0.,
                   ncol=legend_cols, fontsize=font_size)
        if title:
            ax.set_title(title, fontsize=16, fontweight='heavy')
        plt.tight_layout()
        if fn:
            plt.savefig(fn)
    finally:
        plt.close(fig)  # release the figure even if saving failed


def _select_city_data(city, df, dfW, dfS):
    """Select the rows of the three input tables that belong to *city*.

    Returns:
        (dfc, dfcw, dfcs, subordinate, subordinateName): the monitoring, typo
        and sensitive-word rows, the column used to group subordinate units,
        and the display name of that grouping level.
    """
    subordinate = '区县/地方部门'
    subordinateName = '县区'

    if "庆阳市" in city:
        # Either 庆阳市 as a whole or one of its separately reported counties.
        mask = df['市/省局'] == '庆阳市'
        county = next((c for c in QINGYANG_COUNTIES if c in city), None)
        if county is not None:
            mask = mask & (df['区县/地方部门'] == county)
        dfc = df.loc[mask].copy()
        # The typo / sensitive-word tables are filtered by city only, so a
        # county report lists the findings of the whole city.
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city:
        # Province report: aggregate over cities, grouped by city.
        dfc = df.loc[df['市/省局'].isin(PROVINCE_UNITS)].copy()
        dfcw = dfW.loc[dfW['市州'].isin(PROVINCE_UNITS)].copy()
        dfcs = dfS.loc[dfS['市州'].isin(PROVINCE_UNITS)].copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    else:
        dfc = df.loc[df['市/省局'] == city].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()

    return dfc, dfcw, dfcs, subordinate, subordinateName


def _result_table_rows(pivot, city):
    """Build the template rows of the unit-by-monitoring-result table.

    *pivot* is the count pivot (units x results, with an 'All' margin row and
    column) whose empty cells hold ''; such cells stay '' in the output.
    """
    def cell(row, column):
        if column not in pivot.columns:
            return ''
        value = row[column]
        return '' if isinstance(value, str) else int(value)

    rows = []
    for index, row in pivot.iterrows():
        county = '总 计' if index == 'All' else index
        # Units flagged as having no subordinates show their own name instead
        # of the '市直部门' placeholder.
        if not CITY_HAS_SUBORDINATES[city] and county == '市直部门':
            county = city
        rows.append({
            'county': county,
            'hg': cell(row, '合格'),
            'u2w': cell(row, '超过两周未更新'),
            'un': cell(row, '监测期间未更新'),
            'count': cell(row, 'All'),
        })
    return rows


def _account_rows(frame, subordinate):
    """Build the template rows of an account list (qualified or unqualified)."""
    rows = []
    for _, row in frame.iterrows():
        rows.append({
            'name': row['账号名称'],
            'type': row['账号类型'],
            'unit': row['开设主体'],
            'county': row[subordinate],
            'result': row['监测结果'],
            # Empty or zero cells are shown blank.
            'num': "%d" % row['更新次数'] if row['更新次数'] else '',
            'days': "%d" % row['静默日数'] if row['静默日数'] else '',
            'start': toDate(str(row['静默开始日期'])) if row['静默开始日期'] else '',
            'end': toDate(str(row['静默结束日期'])) if row['静默结束日期'] else '',
        })
    return rows


def _text_issue_rows(frame, strip_noise):
    """Build the template rows of a typo or sensitive-word table.

    Missing cells are rendered as ''. With *strip_noise* set, punctuation that
    interferes with the template's table output is removed from the sentence.
    """
    has_title = '标题' in frame.columns
    rows = []
    for _, row in frame.iterrows():
        sentence = _blank_na(row['错误出现位置'])
        if strip_noise:
            sentence = _TEMPLATE_NOISE.sub('', str(sentence))
        rows.append({
            'error': _blank_na(row['错误']),
            'tips': _blank_na(row['建议']),
            'sentence': sentence,
            'type': _blank_na(row['账号类型']),
            'name': _blank_na(row['账号名称']),
            'date': toDate(str(row['发文时间'])),
            'title': _blank_na(row['标题']) if has_title else '',
        })
    return rows


def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
    """Build the statistics, charts, Word report and stamped PDF for one unit.

    Args:
        info: Report metadata; merged into the template context, and
            info['serialNum'] is appended to the report id.
        city: Report unit; must be a key of CITY_CLIENTS.
        df: Monitoring data for all units.
        dfW: Merged typo table (column '市州' names the unit).
        dfS: Merged sensitive-word table (column '市州' names the unit).
        fnTemplate: Path of the Word template.
        fnReport: Path of the .docx report; the PDF gets the same stem.
        dirTemp: Directory for the intermediate chart images.

    Raises:
        ValueError: If *city* is not a known report unit.
    """
    if city not in CITY_CLIENTS:
        raise ValueError('Unknown report unit: ' + city)

    print("----------------" + city + "----------------")

    # Report id: position of the unit in CITY_CLIENTS plus the serial number.
    strID = "%02d" % list(CITY_CLIENTS).index(city)
    context = {
        "city": city,
        "client": CITY_CLIENTS[city],
        "reportid": strID + info['serialNum'],
        'havingSubordinateUnits': CITY_HAS_SUBORDINATES[city],
        'havingBelowStandard': True,
        'havingUpStandard': True,
        'havingCbz': True,
        'havingMgc': True,
    }
    context.update(info)

    dfc, dfcw, dfcs, subordinate, subordinateName = _select_city_data(
        city, df, dfW, dfS)
    dCity = {}

    # ----- accounts per unit and monitoring result -----------------------
    dfCountyAccount = pd.pivot_table(
        dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'],
        aggfunc='count', fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
    context['tt3_contents'] = _result_table_rows(dfCountyAccount, city)

    # ----- accounts per media type ---------------------------------------
    dfMedia = pd.pivot_table(
        dfc, index=['账号类型'], values=['账号名称'], aggfunc='count',
        fill_value='', margins=True)
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    # Take the total by label and drop it before sorting: with a single media
    # type the total ties with that type and need not sort first.
    dfMedia = dfMedia.drop(index='All').sort_values(
        by='账号名称', ascending=False)
    mediaTypes = dfMedia.index.tolist()
    mediaCounts = dfMedia['账号名称'].tolist()
    dCity['sMediaCount'] = ','.join(
        m + str(n) + '个' for m, n in zip(mediaTypes, mediaCounts))
    context['tt1_contents'] = [
        {'type': m, 'count': str(n)} for m, n in zip(mediaTypes, mediaCounts)]

    # ----- update counts per unit ----------------------------------------
    dfCountyArticle = pd.pivot_table(
        dfc, index=[subordinate], values=['更新次数'], aggfunc='sum',
        fill_value='', margins=True)
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.loc['All', '更新次数']
    dfCountyArticle = dfCountyArticle.drop(index='All').sort_values(
        by='更新次数', ascending=False)
    articleUnits = dfCountyArticle.index.tolist()
    articleCounts = dfCountyArticle['更新次数'].tolist()
    dCity['countyMostArticle'] = articleUnits[0]
    dCity['countyMostArticleCount'] = "%d" % articleCounts[0]
    if len(articleUnits) > 1:
        # The per-unit breakdown is only meaningful with several units.
        dCity['sCountyArticles'] = ',按管理矩阵统计,' + ','.join(
            u + "%d" % c + "次" for u, c in zip(articleUnits, articleCounts))
    else:
        dCity['sCountyArticles'] = ''

    # ----- qualification rate per unit -----------------------------------
    # Empty pivot cells hold '' (and the column is absent when no account is
    # qualified); treat both as zero qualified accounts.
    if '合格' not in dfCountyAccount.columns:
        dfCountyAccount['合格'] = 0
    dfCountyAccount['合格'] = pd.to_numeric(
        dfCountyAccount['合格'], errors='coerce').fillna(0).astype(int)
    dfCountyAccount['All'] = pd.to_numeric(dfCountyAccount['All'])
    dfCountyAccount['rate'] = dfCountyAccount['合格'] / dfCountyAccount['All']
    dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False)

    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])

    # Per-result counts only; its 'All' row feeds the result ring chart.
    dfResult = dfCountyAccount.drop(columns=['All', 'rate'])

    # Units ordered by number of accounts.
    dfUnits = dfCountyAccount.drop(index='All')
    counties = dfUnits.index.tolist()
    countyCounts = dfUnits['All'].tolist()
    dCity['countyCount'] = "%d" % len(counties)
    dCity['sCounties'] = ','.join(counties)
    dCity['sCountyCount'] = ','.join(
        c + str(n) + '个' for c, n in zip(counties, countyCounts))

    # Units ordered by qualification rate.
    dfByRate = dfUnits.sort_values(by='rate', ascending=False)
    countieshege = dfByRate.index.tolist()
    countyRates = dfByRate['rate'].tolist()
    ratioTexts = ["%.1f" % (100.0 * r) + '%' for r in countyRates]
    dCity['sCountyRatio'] = ','.join(
        c + t for c, t in zip(countieshege, ratioTexts))
    dCity['tt2_contents'] = [
        {'county': c, 'ratio': t} for c, t in zip(countieshege, ratioTexts)]

    # ----- charts --------------------------------------------------------
    print(' 生成图片...')
    drawAnnulus(mediaCounts, mediaTypes, '政务新媒体账号类型',
                _chart_path(dirTemp, city, 'annulusMediaCount'))
    drawAnnulus(countyCounts, counties, subordinateName + '政务新媒体账号数量',
                _chart_path(dirTemp, city, 'annulusCountyCount'))
    drawAnnulus(articleCounts, articleUnits,
                subordinateName + '政务新媒体累计更新次数',
                _chart_path(dirTemp, city, 'annulusCountyArticle'))

    # ----- overall monitoring result -------------------------------------
    numQualified = dfResult.loc['All', '合格']
    dCity['resultQualified'] = "%d" % numQualified
    dCity['resultQualifiedRatio'] = "%.1f%%" % (
        numQualified / dCity['nmCount'] * 100.0)
    dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - numQualified)

    if '监测期间未更新' in dfResult.columns:
        numNoupdated = dfResult.loc['All', '监测期间未更新']
        dCity['stringResultNoUpdated'] = ";%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""

    if '超过两周未更新' in dfResult.columns:
        numNoupdated2W = dfResult.loc['All', '超过两周未更新']
        dCity['stringResultNoUpdated2W'] = ";%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''

    drawAnnulus(dfResult.loc['All'].tolist(), dfResult.columns.tolist(),
                '政务新媒体监测结果',
                _chart_path(dirTemp, city, 'annulusResult'))
    drawBar(countyRates, countieshege, '政务新媒体管理矩阵发布时效性合格率榜单',
            _chart_path(dirTemp, city, 'barCountyRatio'))

    # ----- account lists -------------------------------------------------
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格'].sort_values(
        by="监测结果", ascending=True)
    dfCityQulified = dfc[dfc['监测结果'] == '合格'].sort_values(
        by=subordinate, ascending=True)

    if dfCityUnqulified.empty:
        context['havingBelowStandard'] = False
    else:
        context['tt4_contents'] = _account_rows(dfCityUnqulified, subordinate)

    if dfCityQulified.empty:
        context['havingUpStandard'] = False
    else:
        context['tt5_contents'] = _account_rows(dfCityQulified, subordinate)

    # ----- typo table ----------------------------------------------------
    if dfcw.empty:
        context['havingCbz'] = False
        dCity['stringCbzCount'] = '本次检测未发现错别字。'
    else:
        dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
        context['tCbz_contents'] = _text_issue_rows(dfcw, strip_noise=True)

    # ----- sensitive-word table ------------------------------------------
    if dfcs.empty:
        context['havingMgc'] = False
        dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
    else:
        dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
        context['tMgc_contents'] = _text_issue_rows(dfcs, strip_noise=False)

    context.update(dCity)

    # ----- render, export, stamp -----------------------------------------
    temp_word(fnTemplate, fnReport, context, dirTemp, city)

    print(' 更新目录,转换为PDF...')
    update_toc(fnReport)

    print(' 签章...')
    fnTmp = _temp_pdf_path(fnReport)
    fnPDF = fnReport.replace('.docx', '.pdf')
    if city in LOW_SEAL_CITIES:
        addStamp(fnTmp, STAMP_PDF_PATH, fnPDF, LOW_SEAL_Y)
    else:
        addStamp(fnTmp, STAMP_PDF_PATH, fnPDF)
    os.remove(fnTmp)  # the unstamped PDF is only an intermediate file


def createDir(dirP, dirS):
    """Ensure the directory *dirS* exists under *dirP* and return its path.

    When *dirP* is not an existing directory, *dirP* itself is the directory
    to create. If the directory cannot be created, a message is printed and
    *dirP* is returned instead.
    """
    target = os.path.join(dirP, dirS) if os.path.isdir(dirP) else dirP
    try:
        os.makedirs(target, exist_ok=True)
    except OSError:
        # Also reached when *target* exists but is a regular file.
        print('Directory ' + target + ' cannot be created.')
        return dirP
    return target


def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge the per-unit typo or sensitive-word workbooks into one file.

    Every '*<keyword>*.xlsx' file in *strPathCBZ* is read and tagged with the
    unit recognised from its file name (column '市州'; '' plus a warning when
    no alias matches). The concatenation is written to *strFnCbz*.

    Args:
        keyword: File-name fragment selecting the workbooks to merge.
        strPathCBZ: Directory containing the per-unit workbooks.
        strFnCbz: Path of the merged workbook.
    """
    frames = []
    total = 0
    pattern = os.path.join(strPathCBZ, '*' + keyword + '*.xlsx')
    for fn in sorted(glob.glob(pattern)):
        f = os.path.basename(fn)
        city = next(
            (full for short, full in CITY_ALIASES.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)

    if frames:
        merged = pd.concat(frames, ignore_index=True)
    else:
        # Keep the column that later filters rely on, even with no input.
        merged = pd.DataFrame(columns=['市州'])
    merged.to_excel(strFnCbz)


def main():
    """Generate the reports of the configured month for every listed unit."""
    # Convert the date columns of the Excel files before running.
    info = {
        "year": "2023",
        "month": "6",
        "datePub": "二〇二三年七月",
        "dateStart": "2023年6月1日",
        "dateEnd": "2023年6月30日",
        "days": "30",
        "serialNum": "8",
    }

    # Data root directory and its working sub-directories.
    strPath = 'D:/Projects/POM/DATA/2023年7月/6月报告/'
    for sub in ('全文', '转发', '报告', '汇总', '监测'):
        createDir(strPath, sub)

    strFnMonitoring = strPath + '汇总/6月汇总数据_2023.6.xlsx'  # monitoring data
    strPathTemplate = strPath + 'POM_ReportTemplate.docx'  # Word template

    # Merged typo / sensitive-word workbooks are built once and then reused.
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    if not os.path.exists(strFnCbz):
        mergeCMC("错别", strPath + '监测/', strFnCbz)
    strFnMgc = strPath + '汇总/MGC.xlsx'
    if not os.path.exists(strFnMgc):
        mergeCMC("敏感", strPath + '监测/', strFnMgc)

    strPathOutput = strPath

    df = pd.read_excel(strFnMonitoring)
    dfW = pd.read_excel(strFnCbz)
    dfS = pd.read_excel(strFnMgc)

    # Use one wording for the same monitoring result.
    df.loc[df['监测结果'] == '连续两周未更新', '监测结果'] = '超过两周未更新'

    # Shorten very long county names so they fit into the charts.
    df.loc[df['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    df.loc[df['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'

    # Rows without a city / county belong to the provincial / municipal level.
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    df.loc[(df['市/省局'] == '临夏回族自治州')
           & (df['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'

    # Clean-up: drop whitespace inside names and the leading "其他+" of account
    # types, then blank out the remaining missing cells.
    df.replace(r'\s+', '', regex=True, inplace=True)
    df.replace(r'^其他\+', '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Units to report on; each must be a key of CITY_CLIENTS.
    cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州',
              '平凉市', '定西市', '兰州新区', '嘉峪关市', '庆阳市华池县',
              '庆阳市镇原县', '庆阳市宁县', '陇南市', '张掖市', '甘肃省'}

    # Output folders for the reports and the intermediate chart images.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')

    # Sorted so the processing order is the same on every run.
    for city in sorted(cities):
        summaryCity(
            info, city, df, dfW, dfS, strPathTemplate,
            os.path.join(dirReports, city + '政务新媒体监测报告_{}年{}月.docx'.format(
                info['year'], info['month'])),
            dirIntermediate)


if __name__ == "__main__":
    main()