This commit is contained in:
bob 2023-04-04 12:15:34 +08:00
commit 1368bf1f0f
12 changed files with 5294 additions and 0 deletions

759
StatMonthly202303.py Normal file
View File

@ -0,0 +1,759 @@
# 1. 打开监测任务表格
import pandas as pd
import numpy as np
import os, glob, re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import datetime
#word toc
import win32com
import win32com.client as win32
from win32com.client import constants
#pdf
from pikepdf import Pdf,Page,Rectangle
#word
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path):
    """Overlay the official seal and the signature onto a report PDF.

    The stamp file supplies two pages: its first page is used as the seal and
    its second page as the signature.  The seal is overlaid on the first page
    of the target PDF, the signature on its third page, and the result is
    saved to ``output_pdf_path``.

    Args:
        target_pdf_path: PDF to be stamped (its pages 1 and 3 are modified).
        watermark_pdf_path: PDF holding the seal (page 1) and signature (page 2).
        output_pdf_path: Path the stamped PDF is written to.
    """
    # Open both documents as context managers so the file handles are
    # released even when overlaying or saving fails; previously neither
    # document was ever closed.
    with Pdf.open(target_pdf_path) as target_pdf, \
            Pdf.open(watermark_pdf_path) as watermark_pdf:
        seal_page = watermark_pdf.pages[0]
        signature_page = watermark_pdf.pages[1]

        # Official seal: placed in a w x h box whose lower-left corner is (x, y).
        x, y, w, h = 240, 110, 115, 115
        target_pdf.pages[0].add_overlay(seal_page, Rectangle(x, y, x + w, y + h))

        # Signature: same placement scheme, on the third page.
        x, y, w, h = 163, 573, 85, 50
        target_pdf.pages[2].add_overlay(signature_page, Rectangle(x, y, x + w, y + h))

        target_pdf.save(output_pdf_path)
def update_toc(docx_file):
    """Refresh the table-of-contents page numbers of a .docx and export it as PDF.

    Drives Microsoft Word through COM: opens ``docx_file``, updates the page
    numbers of its table of contents when exactly one exists, saves a PDF next
    to it (same name, ``.docx`` replaced by ``.pdf``) and saves the document.

    Args:
        docx_file: Path of the Word document to process.
    """
    word = win32com.client.DispatchEx("Word.Application")
    try:
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0  # suppress modal dialogs
        doc = word.Documents.Open(docx_file)
        try:
            toc_count = doc.TablesOfContents.Count
            if toc_count == 0:
                # No table of contents in the document; creating one
                # automatically is not implemented.
                print("无目录")
            elif toc_count == 1:
                toc = doc.TablesOfContents(1)
                # Only page numbers are refreshed; toc.Update() would rebuild
                # the whole table.
                toc.UpdatePageNumbers()
            # FileFormat=17 is Word's wdFormatPDF.
            doc.SaveAs(docx_file.replace('.docx', '.pdf'), FileFormat=17)
        except Exception:
            # Do not persist a half-processed document, but still release it.
            doc.Close(SaveChanges=False)
            raise
        else:
            doc.Close(SaveChanges=True)
    finally:
        # Always terminate the background Word process; without this a
        # failure above left an invisible WINWORD instance running.
        word.Quit()
def toDate(strDT):
    """Return ``strDT`` formatted as ``MM-DD``, or '' when it is not a date.

    Parsing is delegated to ``pd.to_datetime`` with ``errors='coerce'``, so
    unparseable input becomes NaT and maps to the empty string.
    """
    parsed = pd.to_datetime(strDT, errors='coerce')
    return '' if pd.isna(parsed) else parsed.strftime('%m-%d')
# word模板替换
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with the report context and the chart images.

    Five chart PNGs named ``<city><key>.png`` are looked up in ``pathImage``
    and added to ``dContext`` (which is modified in place) under their key,
    each 120 mm wide; the template is then rendered and saved.

    Args:
        tmep_path: Path of the .docx template.
        word_apth: Path the rendered document is saved to.
        dContext: Template context; receives the image entries.
        pathImage: Directory containing the chart images.
        city: Prefix of the chart image file names.
    """
    tpl = DocxTemplate(tmep_path)
    chart_keys = (
        'annulusMediaCount',
        'annulusCountyCount',
        'annulusCountyArticle',
        'annulusResult',
        'barCountyRatio',
    )
    images = {
        key: InlineImage(tpl, os.path.join(pathImage, city + key + '.png'), width=Mm(120))
        for key in chart_keys
    }
    dContext.update(images)
    tpl.render(dContext)
    tpl.save(word_apth)
# 画柱状图
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios with a percentage y-axis and optionally save it.

    Args:
        data: Bar heights; the y-axis is fixed to 0..1 and labelled as percent.
        recipe: Category labels, one per bar (also used as x tick labels).
        title: Chart title.
        fn: Output image path.  When empty nothing is written, matching the
            behaviour of ``drawAnnulus``; previously the default ``''`` was
            handed straight to ``savefig`` and failed.
    """
    plt.figure(figsize=(6, 4))
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False
    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))

    def to_percent(temp, position):
        # Tick formatter: 0.25 -> '25%'.
        return '%2.0f' % (100 * temp) + '%'

    plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
    plt.title(title, fontsize=16)
    plt.tight_layout()
    if fn:
        plt.savefig(fn)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.cla()
    plt.clf()
    plt.close()
# 画环状图
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend on its right and optionally save it.

    The canvas width, legend column count and legend font size grow with the
    number of labels so that long legends still fit.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is saved when empty.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False

    # Layout presets keyed on legend length: (canvas width, legend columns, font size).
    label_count = len(recipe)
    if label_count > 40:
        canvas_w, legend_cols, legend_font = 16, 4, 'small'
    elif label_count > 20:
        canvas_w, legend_cols, legend_font = 16, 2, 'small'
    else:
        canvas_w, legend_cols, legend_font = 8, 1, 'medium'
    canvas_h = 4

    fig, ax = plt.subplots(figsize=(canvas_w, canvas_h), subplot_kw=dict(aspect="equal"))
    # wedgeprops width < radius turns the pie into a ring; drawing starts at
    # the positive x-axis (startangle=0).
    ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)

    # The monitoring-result chart places its legend slightly closer to the ring.
    legend_x = 1.0 if title == '政务新媒体监测结果' else 1.2
    plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(legend_x, 0.5),
               borderaxespad=0., ncol=legend_cols, fontsize=legend_font)

    if len(title) > 0:
        ax.set_title(title, fontsize=16, fontweight='heavy')
    plt.tight_layout()
    if len(fn) > 0:
        plt.savefig(fn)
    plt.cla()
    plt.clf()
    plt.close()
# summaryCity(city, dfc, dfcw, dfcs, context, strfnTemplate, os.path.join(strPathVerified,'Reports', city+'.docx'), strPathVerified )
# 汇总市州数据,
# 市州名称, 监测数据, cbz数据 mgc数据 context(编号、名称) word模板文件名称 输出word文件名称 临时文件目录
# 需要传入模板文件,数据、错别字、敏感词,单位名称等
def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
    """Build the monitoring report (charts, .docx, PDF, stamped PDF) for one unit.

    Filters the monitoring, typo and sensitive-word frames down to ``city``,
    derives the statistics and table rows used by the Word template, draws
    five charts into ``dirTemp``, renders ``fnTemplate`` to ``fnReport``,
    exports it to PDF via ``update_toc`` and stamps that PDF via ``addStamp``.

    Args:
        info: Report metadata merged into the template context;
            ``info['serialNum']`` is appended to the report id.
        city: Unit name; must be a key of ``dCityClient`` and
            ``dHavingSubordinateUnits`` below.
        df: Monitoring data.  Columns read here: '市/省局', '区县/地方部门',
            '监测结果', '账号名称', '账号类型', '更新次数', '最大静默日数',
            '静默开始日期', '静默结束日期', '开设主体'.
        dfW: Typo findings.  Columns read: '市州', '发文时间', '错误', '建议',
            '错误出现位置', '账号类型', '账号名称' and, if present, '标题'.
        dfS: Sensitive-word findings; same columns as ``dfW``.
        fnTemplate: Path of the .docx template.
        fnReport: Path of the .docx to write; the PDF names are derived from
            it by replacing '.docx'.
        dirTemp: Directory the chart PNGs are written to.

    NOTE(review): the indentation of this file was lost before review; the
    nesting below is reconstructed from the control flow.  Spots where the
    reconstruction is a judgement call are marked NOTE(review).
    """
    # Unit name -> commissioning organisation printed on the report.
    # NOTE(review): "陇南市" appears twice (same value); the duplicate is harmless.
    dCityClient = {
        '甘肃省': "甘肃省人民政府办公厅",
        '省直部门': "甘肃省人民政府办公厅",
        '白银市': "白银市人民政府办公室",
        '定西市': "定西市人民政府办公室",
        '临夏回族自治州': "临夏回族自治州人民政府办公室",
        '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
        "庆阳市": "庆阳市电子政务与信息资源管理办公室",
        '庆阳市华池县': "华池县人民政府办公室",
        '庆阳市宁县': "宁县人民政府办公室",
        "庆阳市镇原县": "镇原县人民政府办公室",
        "酒泉市": "酒泉市人民政府办公室",
        "天水市": "天水市人民政府办公室",
        "武威市": "武威市人民政府办公室",
        "金昌市": "金昌市人民政府办公室",
        "嘉峪关市": "嘉峪关市人民政府办公室",
        "兰州新区": "兰州新区管委会办公室",
        "陇南市": "陇南市政务服务中心",
        "张掖市": "张掖市政务服务中心",
        "甘南藏族自治州": "甘南藏族自治州政务服务中心",
        "兰州市": "兰州市政务服务中心",
        "陇南市": "陇南市政务服务中心",
    }
    # Flag passed to the template as 'havingSubordinateUnits'.
    dHavingSubordinateUnits = {'甘肃省': True, '白银市': True, '定西市': True,
                               '临夏回族自治州': True, '平凉市': True, "庆阳市": True, "酒泉市": True, "天水市": True,
                               "陇南市": True, "张掖市": True, "甘南藏族自治州": True, "兰州市": True, "陇南市": True,
                               "武威市": True, "金昌市": True,
                               '省直部门': False, "兰州新区": False, '庆阳市华池县': False,
                               '庆阳市宁县': False, "庆阳市镇原县": False, "嘉峪关市": False}
    print("----------------" + city + "----------------")
    # Report id prefix = position of the unit in dCityClient (insertion order),
    # so reordering that dict changes the report numbers.
    strID = "%02d" % (list(dCityClient).index(city))
    # print(strID)
    context = {
        "city": city,
        "client": dCityClient[city],
        "reportid": strID + info['serialNum'],
        'havingSubordinateUnits': dHavingSubordinateUnits[city],
        'havingBelowStandard': True,
        'havingUpStandard': True,
        'havingCbz': True,
        'havingMgc': True
    }
    context.update(info)
    # Column used to group by subordinate unit, and its display name in chart titles.
    subordinate = '区县/地方部门'
    subordinateName = '县区'
    # Select the rows belonging to this unit.
    if "庆阳市" in city:
        # Qingyang and its three separately reported counties.
        if "华池县" in city:
            dfc = df.loc[(df['市/省局'] == '庆阳市')
                         & (df['区县/地方部门'] == '华池县')].copy()
        elif "宁县" in city:
            dfc = df.loc[(df['市/省局'] == '庆阳市')
                         & (df['区县/地方部门'] == '宁县')].copy()
        elif "镇原县" in city:
            dfc = df.loc[(df['市/省局'] == '庆阳市')
                         & (df['区县/地方部门'] == '镇原县')].copy()
        else:
            dfc = df.loc[(df['市/省局'] == '庆阳市')].copy()
            # & (df['区县/地方部门']!='华池县')
            # & (df['区县/地方部门']!='宁县')
            # & (df['区县/地方部门']!='镇原县') ].copy()
        # Typo / sensitive-word rows are taken for the whole of Qingyang,
        # also for the county-level reports.
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city :
        # Province-wide report: restricted to the units listed here and
        # grouped by city instead of county.
        #dfc = df.copy()
        #dfcw = dfW.copy()
        #dfcs = dfS.copy()
        cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
                  '嘉峪关市', '陇南市', '张掖市'}
        dfc = df.loc[ df['市/省局'].isin(cities) ].copy()
        dfcw = dfW.loc[ dfW['市州'].isin(cities) ].copy()
        dfcs = dfS.loc[ dfS['市州'].isin(cities) ].copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    elif "省直部门" in city :
        dfc = df.loc[df['市/省局'] == city].copy()
        #dfcw = dfW.loc[dfW['市州'] == dictSC[city]].copy()
        #dfcs = dfS.loc[dfS['市州'] == dictSC[city]].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()
    else:
        dfc = df.loc[(df['市/省局'] == city)].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()
    # -----------------------
    # Statistics.  dCity collects scalar values/strings for the template and
    # is merged into `context` near the end.
    # NOTE(review): the seed entry {'1': '2'} has no visible use in this function.
    dCity = {'1': '2'}
    #
    # Subordinate unit x monitoring result
    #
    # Pivot: number of accounts per subordinate unit and monitoring result.
    # Missing combinations are filled with '' (hence the isinstance(str)
    # checks below); margins=True adds the 'All' row and column.
    dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'], aggfunc='count',
                                     fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
    # Rows for the template table 'tt3_contents'.
    tt3_list = []
    for index, row in dfCountyAccount.iterrows():
        county = ''
        if index == 'All':
            county = '总 计'
        else:
            county = index
        hg = ''
        u2w = ''
        un = ''
        count = ''
        # A result column exists only if at least one account has that result.
        if '合格' in dfCountyAccount.columns.values.tolist():
            if not isinstance(row['合格'], str):
                hg = int(row['合格'])
        if '监测期间未更新' in dfCountyAccount.columns.values.tolist():
            if not isinstance(row['监测期间未更新'], str):
                un = int(row['监测期间未更新'])
        if '超过两周未更新' in dfCountyAccount.columns.values.tolist():
            if not isinstance(row['超过两周未更新'], str):
                u2w = int(row['超过两周未更新'])
        if 'All' in dfCountyAccount.columns.values.tolist():
            if not isinstance(row['All'], str):
                count = int(row['All'])
        tt3_a = {'county': county, 'hg': hg, 'u2w': u2w, 'un': un, 'count': count}
        tt3_list.append(tt3_a)
    context['tt3_contents'] = tt3_list
    # dfCountyAccount.to_excel(dirTask+strPathCity+'县区监测结果.xlsx')
    # -----------------------
    #
    # Accounts per media type
    #
    # Pivot: number of accounts per account type.
    dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count', fill_value='', margins=True)
    # Total number of monitored accounts for this unit.
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    # Build the "type + count" text and the 'tt1_contents' table rows.
    dfMedia = dfMedia.sort_values(by='账号名称', ascending=False)
    lTableCs1 = []
    strMedia = ''
    i = 0
    tt1_list = []
    # The first row after the descending sort is treated as the 'All' total and skipped.
    # NOTE(review): this assumes 'All' sorts first — with a single account
    # type the total ties with that type; confirm the order is reliable.
    for m in dfMedia.index.tolist()[1:]:
        strNum = str(dfMedia.iloc[:, 0].tolist()[1:][i])
        strMedia = strMedia + m + strNum + '个,'
        tt1_a = {'type': m, 'count': strNum}
        tt1_list.append(tt1_a)
        i = i + 1
    # [:-1] drops the trailing separator; rstrip('') is a no-op as written.
    dCity['sMediaCount'] = strMedia[:-1].rstrip('')
    context.update({'tt1_contents': tt1_list})
    # -----------------------
    #
    # Update counts per subordinate unit
    #
    dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum', fill_value='',
                                     margins=True)
    dfCountyArticle = dfCountyArticle.sort_values(by='更新次数', ascending=False).copy()
    # Row 0 is read as the overall total, row 1 as the most active subordinate unit.
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.iloc[0, 0]
    dCity['countyMostArticle'] = dfCountyArticle.index.tolist()[1]
    dCity['countyMostArticleCount'] = "%d" % dfCountyArticle.iloc[1, 0]
    strCountyArticle = ''
    iiii = 0
    # Only list per-unit counts when there is more than one subordinate unit
    # (the index also contains the 'All' row).
    # NOTE(review): the last two statements are assumed to sit inside this
    # `if`, so 'sCountyArticles' is left unset for single-unit reports — confirm.
    if len(dfCountyArticle.index)>2:
        for cccc in dfCountyArticle.index.tolist()[1:]:
            iiii = iiii + 1
            strCountyArticle = strCountyArticle + cccc + "%d" % dfCountyArticle.iloc[iiii, 0] + "次,"
        strCountyArticle = strCountyArticle.rstrip('')
        dCity['sCountyArticles'] = ',按管理矩阵统计,' + strCountyArticle
    # Sort the per-unit result table by total account count.
    # NOTE(review): this requires a '合格' column, and astype('int') will
    # presumably fail on the '' fill value if a unit has no qualified
    # account — confirm against real data.
    dfCountyAccount.loc[:, '合格'] = dfCountyAccount['合格'].astype('int')
    dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
    # Qualification rate = qualified / total.
    dfCountyAccount.eval('rate = 合格 / All ', inplace=True)
    dfResult = dfCountyAccount.copy()
    # Overall qualification rate of the unit.
    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])
    # dfCountyAccount.to_excel(dirIntermediate+sFileBase+'县区合格率.xlsx')
    # dfMedia = dfMedia.drop(['All'])
    # Extract subordinate unit names, account counts and qualified counts.
    dfCountyAccount = dfCountyAccount.drop(['All'])  # remove the 'All' row
    counties = dfCountyAccount.index.tolist()
    countyCounts = dfCountyAccount['All'].values.tolist()
    countyHeges = dfCountyAccount['合格'].values.tolist()
    # Text listing units ordered by account count.
    strCountyCount = ''
    strCounties = ''
    i = 0
    for c in counties:
        # NOTE(review): the '' separator and the rstrip('') calls below do
        # nothing as written — possibly characters lost from the source.
        strCounties = strCounties + c + ''
        strCountyCount = strCountyCount + c + str(countyCounts[i]) + '个,'
        i = i + 1
    dCity['countyCount'] = "%d" % i
    dCity['sCounties'] = strCounties.rstrip('')
    dCity['sCountyCount'] = strCountyCount.rstrip('')
    # Ranking by qualification rate -> 'tt2_contents'.
    dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
    countieshege = dfCountyAccount.index.tolist()
    countyRates = dfCountyAccount['rate']
    strCountyRatio = ''
    i = 0
    tt2_list = []
    for c in countieshege:
        # NOTE(review): countyRates is a Series indexed by unit name; the
        # integer key relies on positional fallback — confirm with the
        # pandas version in use.
        strRatio = "%.1f" % (100.0 * countyRates[i])
        strCountyRatio = strCountyRatio + c + strRatio + '%'
        tt2_a = {'county': c, 'ratio': strRatio + '%'}
        tt2_list.append(tt2_a)
        i = i + 1
    dCity['sCountyRatio'] = strCountyRatio.rstrip('')
    dCity['tt2_contents'] = tt2_list
    # -----------------------
    #
    # Charts
    #
    print(' 生成图片...')
    drawAnnulus(dfMedia.iloc[:, 0].tolist()[1:], dfMedia.index.tolist()[1:],
                '政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
    drawAnnulus(countyCounts, counties,
                subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
    drawAnnulus(dfCountyArticle.iloc[:, 0].tolist()[1:], dfCountyArticle.index.tolist()[1:],
                subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))
    # Monitoring-result totals: keep only the result columns of the copy
    # taken before the 'All' row was dropped.
    dfResult = dfResult.drop('All', axis=1)
    dfResult = dfResult.drop('rate', axis=1)
    # Qualified count, qualified ratio, unqualified count.
    dCity['resultQualified'] = "%d" % (dfResult.loc['All', '合格'])
    dCity['resultQualifiedRatio'] = "%.1f%%" % (dfResult.loc['All', '合格'] / dCity['nmCount'] * 100.0)
    dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - dfResult.loc['All', '合格'])
    #
    # Sentences about accounts not updated during the monitoring period;
    # empty strings when no account has that result.
    if '监测期间未更新' in dfResult.columns.values.tolist():
        numNoupdated = dfResult.loc['All', '监测期间未更新']
        dCity['stringResultNoUpdated'] = "%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""
    # Same for accounts silent for more than two weeks.
    if '超过两周未更新' in dfResult.columns.values.tolist():
        numNoupdated2W = dfResult.loc['All', '超过两周未更新']
        dCity['stringResultNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''
    resultLabels = dfResult.columns.values.tolist()
    resultCounts = dfResult.loc['All'].values.tolist()
    drawAnnulus(resultCounts, resultLabels,
                '政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
    drawBar(countyRates, countieshege,
            '政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))
    # -----------------------
    #
    # Detail tables for the report
    #
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格']
    dfCityUnqulified = dfCityUnqulified.sort_values(by="监测结果", ascending=True)  # ascending by monitoring result
    #################################################
    dfCityQulified = dfc[dfc['监测结果'] == '合格']
    dfCityQulified = dfCityQulified.sort_values(by=subordinate, ascending=True)  # ascending by subordinate unit
    #
    # Unqualified accounts -> 'tt4_contents' (section disabled when there are none).
    if len(dfCityUnqulified)<1:
        context.update({'havingBelowStandard':False})
    else:
        tt4_list = []
        for index, row in dfCityUnqulified.iterrows():
            # Falsy cells (0 or '') are rendered as empty strings.
            count = ''
            if row['更新次数']:
                count = "%d" % row['更新次数']
            days = ''
            if row['最大静默日数']:
                days = "%d" % row['最大静默日数']
            sD1 = ''
            sD2 = ''
            if row['静默开始日期']:
                sD1 = toDate(str(row['静默开始日期']))
            if row['静默结束日期']:
                sD2 = toDate(str(row['静默结束日期']))
            tt4_a = {'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
                     'county': row[subordinate], 'result': row['监测结果'], 'num': count,
                     'days': days, 'start': sD1, 'end': sD2, }
            tt4_list.append(tt4_a)
        tt4_results = {'tt4_contents': tt4_list}
        context.update(tt4_results)
    #
    # Qualified accounts -> 'tt5_contents' (same row layout as tt4).
    if len(dfCityQulified)<1:
        context.update({'havingUpStandard':False})
    else:
        tt5_list = []
        for index, row in dfCityQulified.iterrows():
            count = ''
            if row['更新次数']:
                count = "%d" % row['更新次数']
            days = ''
            if row['最大静默日数']:
                days = "%d" % row['最大静默日数']
            sD1 = ''
            sD2 = ''
            if row['静默开始日期']:
                sD1 = toDate(str(row['静默开始日期']))
            if row['静默结束日期']:
                sD2 = toDate(str(row['静默结束日期']))
            tt5_a = {'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
                     'county': row[subordinate], 'result': row['监测结果'], 'num': count,
                     'days': days, 'start': sD1, 'end': sD2, }
            tt5_list.append(tt5_a)
        tt5_results = {'tt5_contents': tt5_list}
        context.update(tt5_results)
    #
    # Typo table -> 'tCbz_contents'.
    if dfcw.shape[0]<1:
        context.update({'havingCbz':False})
    else:
        tCbz_list = []
        # NOTE(review): the result of fillna is discarded, so dfcw is unchanged.
        dfcw.fillna('')
        for index, row in dfcw.iterrows():
            sTitle = ''
            sDate = toDate(str(row['发文时间']))
            if '标题' in dfcw.columns:
                sTitle = row['标题']
            # Strip quotes and other punctuation that disturbs the table template output.
            r = "[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]"
            if pd.isna(row['错误出现位置']):
                s = ''
            else:
                s = re.sub(r, '', row['错误出现位置'])
            a = {'error': row['错误'], 'tips': row['建议'], 'sentence': s, 'type': row['账号类型'], 'name': row['账号名称'],
                 'date': sDate, 'title': sTitle, }
            tCbz_list.append(a)
        # NOTE(review): assumed to be nested in this `else` because tCbz_list
        # only exists here; that makes the inner `else` unreachable and leaves
        # 'stringCbzCount' unset when there are no typos — confirm.
        if dfcw.shape[0] > 0:
            dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
        else:
            dCity['stringCbzCount'] = '本次检测未发现错别字。'
        tCbz_results = {'tCbz_contents': tCbz_list}
        context.update(tCbz_results)
    # Sensitive-word table -> 'tMgc_contents' (sentence text is used unfiltered).
    if dfcs.shape[0]<1:
        context.update({'havingMgc':False})
    else:
        tMgc_list = []
        # NOTE(review): the result of fillna is discarded here as well.
        dfcs.fillna('')
        for index, row in dfcs.iterrows():
            sTitle = ''
            sDate = toDate(str(row['发文时间']))
            if '标题' in dfcs.columns:
                sTitle = row['标题']
            a = {'error': row['错误'], 'tips': row['建议'], 'sentence': row['错误出现位置'], 'type': row['账号类型'], 'name': row['账号名称'],
                 'date': sDate, 'title': sTitle, }
            tMgc_list.append(a)
        # NOTE(review): same nesting assumption as in the typo section above.
        if dfcs.shape[0] > 0:
            dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
        else:
            dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
        tMgc_results = {'tMgc_contents': tMgc_list}
        context.update(tMgc_results)
    # Merge the scalar statistics into the template context.
    context.update(dCity)
    # -----------------------
    #
    # Render the report from the template
    #
    temp_word(fnTemplate,
              fnReport,
              context, dirTemp, city)
    # Refresh the table of contents and export the PDF.
    update_toc( fnReport )
    # Apply seal and signature; the stamp file path is hard-coded.
    addStamp(fnReport.replace('.docx', '.pdf'),'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' , fnReport.replace('.docx', '_Stamp.pdf'))
def createDir(dirP, dirS):
    """Ensure sub-directory ``dirS`` exists below ``dirP`` and return its path.

    Args:
        dirP: Existing parent directory.
        dirS: Name of the sub-directory to create.

    Returns:
        ``os.path.join(dirP, dirS)`` when that directory exists afterwards.
        ``dirP`` unchanged when ``dirP`` is not a directory, or when the
        sub-directory cannot be created (a message is printed in that case).
    """
    if not os.path.isdir(dirP):
        return dirP
    target = os.path.join(dirP, dirS)
    try:
        os.makedirs(target, exist_ok=True)
    except OSError:
        # Creation failed (e.g. a regular file has that name, or permission
        # was denied); reported below through the common fallback path
        # instead of aborting the run.
        pass
    if os.path.isdir(target):
        return target
    # Report the directory that actually failed; the message used to name
    # the parent because the variable had already been reset.
    print('Directory ' + target + ' cannot be created.')
    return dirP
# def createDir(dirP, dirS):
def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
    """Load the three workbooks, normalise the monitoring data and build every report.

    Args:
        info: Report metadata forwarded to ``summaryCity``.
        strFnData: Monitoring data workbook.
        strFnW: Merged typo workbook.
        strFnS: Merged sensitive-word workbook.
        strfnTemplate: Word template path.
        strPathOutput: Output location; 'Reports' and 'Intermediate' are
            created under ``os.path.dirname`` of this path.
    """
    monitoring = pd.read_excel(strFnData)
    typos = pd.read_excel(strFnW)
    sensitive = pd.read_excel(strFnS)

    result_col = '监测结果'
    city_col = '市/省局'
    county_col = '区县/地方部门'

    # Unify the wording of the monitoring result.
    monitoring.loc[monitoring[result_col] == '连续两周未更新', result_col] = '超过两周未更新'

    # Shorten over-long county names so they fit into the charts.
    short_names = (
        ('积石山保安族东乡族撒拉族自治县', '积石山县'),
        ('阿克塞哈萨克族自治县', '阿克塞自治县'),
    )
    for long_name, short_name in short_names:
        monitoring.loc[monitoring[county_col] == long_name, county_col] = short_name

    # Rows without a city / county belong to the provincial / municipal
    # (for Linxia: prefectural) departments.
    monitoring[city_col] = monitoring[city_col].fillna('省直部门')
    monitoring[county_col] = monitoring[county_col].fillna('市直部门')
    linxia_direct = (monitoring[city_col] == '临夏回族自治州') & (monitoring[county_col] == '市直部门')
    monitoring.loc[linxia_direct, county_col] = '州直部门'

    # Clean-up: drop all whitespace, drop a leading "其他+" prefix, and
    # replace missing values (update counts default to 0, the rest to '').
    monitoring.replace(r'\s+', '', regex=True, inplace=True)
    monitoring.replace(r'^其他\+', '', regex=True, inplace=True)
    monitoring['更新次数'] = monitoring['更新次数'].fillna(0)
    monitoring = monitoring.fillna(value='')

    # Units a report is generated for.
    cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
              '嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市', '张掖市', '甘肃省'}

    # Reports and intermediate chart images live next to the output path.
    output_root = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(output_root, 'Reports')
    dirIntermediate = createDir(output_root, 'Intermediate')
    for city in cities:
        report_path = os.path.join(dirReports, city + '.docx')
        summaryCity(info, city, monitoring, typos, sensitive, strfnTemplate, report_path, dirIntermediate)
# 合并错别字文件
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge all per-city ``*<keyword>*.xlsx`` workbooks of a folder into one file.

    The city of each workbook is inferred from its file name and stored in a
    new '市州' column; files whose name matches no known abbreviation are still
    merged, with an empty city, after printing a warning.

    Args:
        keyword: Substring the workbook file names must contain.
        strPathCBZ: Folder that is searched (non-recursively).
        strFnCbz: Path of the merged workbook to write.
    """
    # File-name fragment -> full city name.  The first fragment contained in
    # the file name wins, so the order matters: e.g. '新区' precedes '兰州'.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total_rows = 0
    for fn in glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx')):
        f = os.path.basename(fn)
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total_rows += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total_rows)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # pd.concat rejects an empty list, so keep the empty-frame result for
    # folders without matching workbooks.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
#def mergeCMC
if __name__ == "__main__":
    # Before running, convert the date columns of the Excel workbooks.
    # Report metadata handed to the Word template.
    info = {
        "year": "2023",
        "month": "3",
        "datePub": "二〇二三年四月",
        "dateStart": "2023年3月1日",
        "dateEnd": "2023年3月31日",
        "days": "31",
        "serialNum": "4",
    }
    # Data root directory and its working sub-directories.
    strPath = 'D:/Projects/POM/DATA/2023年4月/3月报告/'
    for subDir in ('全文', '转发', '报告', '汇总', '监测'):
        createDir(strPath, subDir)
    # Monitoring data
    strFnMonitoring = strPath + '汇总/3月汇总数据_2023.3.xlsx'
    # Word template
    strPathTemplate = strPath + 'POM_ReportTemplate.docx'
    # Typos: merge the per-city workbooks once, then reuse the merged file.
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    if not os.path.exists(strFnCbz):
        strPathCBZ = strPath + '监测/'
        mergeCMC("错别", strPathCBZ, strFnCbz)
    # Sensitive words: same scheme.
    strFnMgc = strPath + '汇总/MGC.xlsx'
    if not os.path.exists(strFnMgc):
        strPathMGC = strPath + '监测/'
        mergeCMC("敏感", strPathMGC, strFnMgc)
    # Output location
    strPathOutput = strPath
    # Loading, cleaning and per-city report generation used to be repeated
    # here as a copy of summary(); call it so the two cannot drift apart.
    summary(info, strFnMonitoring, strFnCbz, strFnMgc, strPathTemplate, strPathOutput)

604
StatSeasonly3.py Normal file
View File

@ -0,0 +1,604 @@
# 1. 打开监测任务表格
import pandas as pd
import numpy as np
import os, glob, re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import datetime
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def toDate(strDT):
    """Convert a date-like value to an ``MM-DD`` string ('' if it cannot be parsed)."""
    stamp = pd.to_datetime(strDT, errors='coerce')
    if pd.isna(stamp):
        # errors='coerce' turns unparseable input into NaT.
        return ''
    return stamp.strftime('%m-%d')
# word模板替换
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Fill the Word template with the context and the five chart images, then save it.

    The chart files are expected at ``pathImage/<city><name>.png``; they are
    inserted 120 mm wide under the template variable ``<name>``.  ``dContext``
    is updated in place with these image entries.

    Args:
        tmep_path: Template .docx path.
        word_apth: Output .docx path.
        dContext: Template context dictionary.
        pathImage: Folder holding the chart PNGs.
        city: File-name prefix of the charts.
    """
    tpl = DocxTemplate(tmep_path)

    def chart(name):
        # One inline picture per chart, all with the same width.
        return InlineImage(tpl, os.path.join(pathImage, city + name + '.png'), width=Mm(120))

    chart_names = ['annulusMediaCount', 'annulusCountyCount', 'annulusCountyArticle',
                   'annulusResult', 'barCountyRatio']
    dContext.update({name: chart(name) for name in chart_names})
    tpl.render(dContext)
    tpl.save(word_apth)
# 画柱状图
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios with a percentage y-axis and optionally save it.

    Args:
        data: Bar heights; the y-axis is fixed to 0..1 and labelled as percent.
        recipe: Category labels, one per bar (also used as x tick labels).
        title: Chart title.
        fn: Output image path.  When empty nothing is written, matching the
            behaviour of ``drawAnnulus``; previously the default ``''`` was
            handed straight to ``savefig`` and failed.
    """
    plt.figure(figsize=(6, 4))
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False
    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))

    def to_percent(temp, position):
        # Tick formatter: 0.25 -> '25%'.
        return '%2.0f' % (100 * temp) + '%'

    plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
    plt.title(title, fontsize=16)
    plt.tight_layout()
    if fn:
        plt.savefig(fn)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.cla()
    plt.clf()
    plt.close()
# 画环状图
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend on its right and optionally save it.

    The canvas width, legend column count and legend font size depend on the
    number of labels so that long legends still fit.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is saved when empty.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False
    xxx = 8  # canvas width
    yyy = 4  # canvas height
    nnncol = 1  # legend columns
    fs = 'medium'  # one of xx-small, x-small, small, medium, large, x-large, xx-large
    if len(recipe) > 20:
        if len(recipe) > 40:
            xxx = 16
            nnncol = 4
            fs = 'x-small'
        else:
            xxx = 16
            nnncol = 2
            # Was 'xmall', which is not a valid font size name and made the
            # legend call fail for charts with 21-40 entries.
            fs = 'small'
    fig, ax = plt.subplots(figsize=(xxx, yyy), subplot_kw=dict(aspect="equal"))
    # wedgeprops width < radius turns the pie into a ring; drawing starts at
    # the positive x-axis (startangle=0).
    ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)
    # The monitoring-result chart places its legend slightly closer to the ring.
    x = 1.0 if title == '政务新媒体监测结果' else 1.2
    plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(x, 0.5), borderaxespad=0., ncol=nnncol,
               fontsize=fs)
    if len(title) > 0:
        ax.set_title(title, fontsize=16, fontweight='heavy')
    plt.tight_layout()
    if len(fn) > 0:
        plt.savefig(fn)
    plt.cla()
    plt.clf()
    plt.close()
# summaryCity(city, dfc, dfcw, dfcs, context, strfnTemplate, os.path.join(strPathVerified,'Reports', city+'.docx'), strPathVerified )
# 汇总市州数据,
# 市州名称, 监测数据, cbz数据 mgc数据 context(编号、名称) word模板文件名称 输出word文件名称 临时文件目录
# 需要传入模板文件,数据、错别字、敏感词,单位名称等
def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
dCityClient = {
'甘肃省': "甘肃省人民政府办公厅",
'省直部门': "甘肃省人民政府办公厅",
'白银市': "白银市人民政府办公室",
'定西市': "定西市人民政府办公室",
'临夏回族自治州': "临夏回族自治州人民政府办公室",
'平凉市': "中共平凉市委网络安全和信息化委员会办公室",
"庆阳市": "庆阳市电子政务与信息资源管理办公室",
'庆阳市华池县': "华池县人民政府办公室",
'庆阳市宁县': "宁县人民政府办公室",
"庆阳市镇原县": "镇原县人民政府办公室",
"酒泉市": "酒泉市人民政府办公室",
"天水市": "天水市人民政府办公室",
"武威市": "武威市人民政府办公室",
"金昌市": "金昌市人民政府办公室",
"嘉峪关市": "嘉峪关市人民政府办公室",
"兰州新区": "兰州新区管委会办公室",
"陇南市": "陇南市人民政府办公室",
}
print("----------------" + city + "----------------")
# 报告编号、委托单位
strID = "%02d" % (list(dCityClient).index(city))
# print(strID)
context = {
"city": city,
"client": dCityClient[city],
"reportid": strID + info['num'],
}
context.update(info)
subordinate = '区县/地方部门'
subordinateName = '县区'
# 区县数据筛选
if "庆阳市" in city:
if "华池县" in city:
dfc = df.loc[(df['市/省局'] == '庆阳市')
& (df['区县/地方部门'] == '华池县')].copy()
elif "宁县" in city:
dfc = df.loc[(df['市/省局'] == '庆阳市')
& (df['区县/地方部门'] == '宁县')].copy()
elif "镇原县" in city:
dfc = df.loc[(df['市/省局'] == '庆阳市')
& (df['区县/地方部门'] == '镇原县')].copy()
else:
dfc = df.loc[(df['市/省局'] == '庆阳市')].copy()
# & (df['区县/地方部门']!='华池县')
# & (df['区县/地方部门']!='宁县')
# & (df['区县/地方部门']!='镇原县') ].copy()
dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
elif "甘肃" in city :
dfc = df.copy()
dfcw = dfW.copy()
dfcs = dfS.copy()
subordinate = '市/省局'
subordinateName = '市州'
elif "省直部门" in city :
dfc = df.loc[df['市/省局'] == city].copy()
dfcw = dfW.loc[dfW['市州'] == dictSC[city]].copy()
dfcs = dfS.loc[dfS['市州'] == dictSC[city]].copy()
else:
dfc = df.loc[(df['市/省局'] == city)].copy()
dfcw = dfW.loc[dfW['市州'] == city].copy()
dfcs = dfS.loc[dfS['市州'] == city].copy()
# -----------------------
# 统计结果分析
dCity = {'1': '2'}
#
# 县区-监测结果 统计
#
# 透视表, 按县区统计各个监测结果账号数量
dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'], aggfunc='count',
fill_value='', margins=True)
dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
# 准备模板中的表格
tt3_list = []
for index, row in dfCountyAccount.iterrows():
county = ''
if index == 'All':
county = '总 计'
else:
county = index
hg = ''
u2w = ''
un = ''
count = ''
if '合格' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['合格'], str):
hg = int(row['合格'])
if '监测期间未更新' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['监测期间未更新'], str):
un = int(row['监测期间未更新'])
if '超过两周未更新' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['超过两周未更新'], str):
u2w = int(row['超过两周未更新'])
if 'All' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['All'], str):
count = int(row['All'])
tt3_a = {'county': county, 'hg': hg, 'u2w': u2w, 'un': un, 'count': count}
tt3_list.append(tt3_a)
context['tt3_contents'] = tt3_list
# dfCountyAccount.to_excel(dirTask+strPathCity+'县区监测结果.xlsx')
# -----------------------
#
# 按媒体类型统计
#
# 透视表, 按账号类型统计账号数量
dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count', fill_value='', margins=True)
# 提取该市账号数量
dCity['nmCount'] = dfMedia.loc['All', '账号名称']
print(' 监测账号数:', dCity['nmCount'])
# 提取 账号类型-数量 拼成文本串
dfMedia = dfMedia.sort_values(by='账号名称', ascending=False)
lTableCs1 = []
strMedia = ''
i = 0
tt1_list = []
for m in dfMedia.index.tolist()[1:]: # 第一个是总数,不用取
strNum = str(dfMedia.iloc[:, 0].tolist()[1:][i])
strMedia = strMedia + m + strNum + '个,'
tt1_a = {'type': m, 'count': strNum}
tt1_list.append(tt1_a)
i = i + 1
dCity['sMediaCount'] = strMedia[:-1].rstrip('')
context.update({'tt1_contents': tt1_list})
# -----------------------
#
# 按县区-更新次数 统计
#
dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum', fill_value='',
margins=True)
dfCountyArticle = dfCountyArticle.sort_values(by='更新次数', ascending=False).copy()
dCity['cityArticleCount'] = "%d" % dfCountyArticle.iloc[0, 0]
dCity['countyMostArticle'] = dfCountyArticle.index.tolist()[1]
dCity['countyMostArticleCount'] = "%d" % dfCountyArticle.iloc[1, 0]
strCountyArticle = ''
iiii = 1
for cccc in dfCountyArticle.index.tolist()[1:]:
strCountyArticle = strCountyArticle + cccc + "%d" % dfCountyArticle.iloc[iiii, 0] + "次,"
iiii = iiii + 1
dCity['sCountyArticles'] = strCountyArticle.rstrip('')
# 市各县区监测结果按总数排序,
dfCountyAccount.loc[:, '合格'] = dfCountyAccount['合格'].astype('int')
dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
# 计算合格率
dfCountyAccount.eval('rate = 合格 / All ', inplace=True)
dfResult = dfCountyAccount.copy()
# 提取city合格率
dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
print(' 合格率:', dCity['cityRatio'])
# 导出文件
# dfCountyAccount.to_excel(dirIntermediate+sFileBase+'县区合格率.xlsx')
# dfMedia = dfMedia.drop(['All'])
# 提取县区名称,县区账号数, 县区合格率,转成字符串
dfCountyAccount = dfCountyAccount.drop(['All']) # 删除"All"行
counties = dfCountyAccount.index.tolist()
countyCounts = dfCountyAccount['All'].values.tolist()
countyHeges = dfCountyAccount['合格'].values.tolist()
print(countyCounts)
print(counties)
# 按县区账号数量排序
strCountyCount = ''
strCounties = ''
i = 0
for c in counties:
strCounties = strCounties + c + ''
strCountyCount = strCountyCount + c + str(countyCounts[i]) + '个,'
i = i + 1
dCity['countyCount'] = "%d" % i
dCity['sCounties'] = strCounties.rstrip('')
dCity['sCountyCount'] = strCountyCount.rstrip('')
# 按合格率排序
dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
countieshege = dfCountyAccount.index.tolist()
countyRates = dfCountyAccount['rate']
strCountyRatio = ''
i = 0
tt2_list = []
for c in countieshege:
strRatio = "%.1f" % (100.0 * countyRates[i])
strCountyRatio = strCountyRatio + c + strRatio + '%'
tt2_a = {'county': c, 'ratio': strRatio + '%'}
tt2_list.append(tt2_a)
i = i + 1
dCity['sCountyRatio'] = strCountyRatio.rstrip('')
dCity['tt2_contents'] = tt2_list
# -----------------------
#
# 绘图
#
print(' 生成图片...')
drawAnnulus(dfMedia.iloc[:, 0].tolist()[1:], dfMedia.index.tolist()[1:],
'政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
print(countyCounts)
print(counties)
drawAnnulus(countyCounts, counties,
subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
drawAnnulus(dfCountyArticle.iloc[:, 0].tolist()[1:], dfCountyArticle.index.tolist()[1:],
subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))
# {{resultNoUpdated}}个政务新媒体监测期间未更新,占监测总数的{{resultNoUpdatedRatio}}
# {{resultNoUpdated2W}}个政务新媒体连续未更新时间超过两周,占监测总数的{{resultNoUpdated2WRatio}}
# 政务新媒体监测结果
dfResult = dfResult.drop('All', axis=1)
dfResult = dfResult.drop('rate', axis=1)
# 合格数,合格率,不合格数
dCity['resultQualified'] = "%d" % (dfResult.loc['All', '合格'])
dCity['resultQualifiedRatio'] = "%.1f%%" % (dfResult.loc['All', '合格'] / dCity['nmCount'] * 100.0)
dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - dfResult.loc['All', '合格'])
#
# numNoupdated = 0
if '监测期间未更新' in dfResult.columns.values.tolist():
numNoupdated = dfResult.loc['All', '监测期间未更新']
dCity['stringResultNoUpdated'] = "%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
else:
dCity['stringResultNoUpdated'] = ''
dCity['stringNoUpdated'] = ""
# dCity['resultNoUpdated'] = "%d"%(numNoupdated)
# dCity['resultNoUpdatedRatio'] = "%.1f%%"%(numNoupdated/dCity['nmCount']*100.0)
# numNoupdated2W = 0
if '超过两周未更新' in dfResult.columns.values.tolist():
numNoupdated2W = dfResult.loc['All', '超过两周未更新']
dCity['stringResultNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
else:
dCity['stringResultNoUpdated2W'] = ''
dCity['stringNoUpdated2W'] = ''
# dCity['resultNoUpdated2W'] = "%d"%(numNoupdated2W)
# dCity['resultNoUpdated2WRatio'] = "%.1f%%"%(numNoupdated2W/dCity['nmCount']*100.0)
resultLabels = dfResult.columns.values.tolist()
resultCounts = dfResult.loc['All'].values.tolist()
drawAnnulus(resultCounts, resultLabels,
'政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
drawBar(countyRates, countieshege,
'政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))
# -----------------------
#
# 准备报告需要的数据
#
print(' 生成报告...')
dfCityUnqulified = dfc[dfc['监测结果'] != '合格']
dfCityUnqulified = dfCityUnqulified.sort_values(by="监测结果", ascending=True) # by指定按哪列排序。ascending表示是否升序=False
#################################################
dfCityQulified = dfc[dfc['监测结果'] == '合格']
dfCityQulified = dfCityQulified.sort_values(by=subordinate, ascending=True) # by指定按哪列排序。ascending表示是否升序=False
# 方法一
tt4_list = []
for index, row in dfCityUnqulified.iterrows():
count = ''
if row['更新次数']:
count = "%d" % row['更新次数']
days = ''
if row['最大静默日数']:
days = "%d" % row['最大静默日数']
sD1 = ''
sD2 = ''
if row['静默开始日期']:
sD1 = toDate(str(row['静默开始日期']))
if row['静默结束日期']:
sD2 = toDate(str(row['静默结束日期']))
tt4_a = {'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
'county': row[subordinate], 'result': row['监测结果'], 'num': count,
'days': days, 'start': sD1, 'end': sD2, }
tt4_list.append(tt4_a)
tt4_results = {'tt4_contents': tt4_list}
context.update(tt4_results)
tt5_list = []
for index, row in dfCityQulified.iterrows():
count = ''
if row['更新次数']:
count = "%d" % row['更新次数']
days = ''
if row['最大静默日数']:
days = "%d" % row['最大静默日数']
sD1 = ''
sD2 = ''
if row['静默开始日期']:
sD1 = toDate(str(row['静默开始日期']))
if row['静默结束日期']:
sD2 = toDate(str(row['静默结束日期']))
tt5_a = {'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
'county': row[subordinate], 'result': row['监测结果'], 'num': count,
'days': days, 'start': sD1, 'end': sD2, }
tt5_list.append(tt5_a)
tt5_results = {'tt5_contents': tt5_list}
context.update(tt5_results)
# 读取添加错别字表格
tCbz_list = []
dfcw.fillna('')
for index, row in dfcw.iterrows():
sTitle = ''
sDate = toDate(str(row['发文时间']))
if '标题' in dfcw.columns:
sTitle = row['标题']
# 去除引号等干扰表格模板输出的字符
r = "[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]"
if pd.isna(row['错误出现位置']):
s = ''
else:
s = re.sub(r, '', row['错误出现位置'])
a = {'error': row['错误'], 'tips': row['建议'], 'sentence': s, 'type': row['账号类型'], 'name': row['账号名称'],
'date': sDate, 'title': sTitle, }
tCbz_list.append(a)
if dfcw.shape[0] > 0:
dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
else:
dCity['stringCbzCount'] = '本次检测未发现错别字。'
tCbz_results = {'tCbz_contents': tCbz_list}
context.update(tCbz_results)
# 读取添加敏感词表格
tMgc_list = []
dfcs.fillna('')
for index, row in dfcs.iterrows():
sTitle = ''
sDate = toDate(str(row['发文时间']))
if '标题' in dfcs.columns:
sTitle = row['标题']
a = {'error': row['错误'], 'tips': row['建议'], 'sentence': row['错误出现位置'], 'type': row['账号类型'], 'name': row['账号名称'],
'date': sDate, 'title': sTitle, }
tMgc_list.append(a)
if dfcs.shape[0] > 0:
dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
else:
dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
tMgc_results = {'tMgc_contents': tMgc_list}
context.update(tMgc_results)
# table1
context.update(dCity)
# -----------------------
#
# 按模板生成报告
#
temp_word(fnTemplate,
fnReport,
context, dirTemp, city)
def createDir(dirP, dirS):
    """Make sure the sub-directory ``dirS`` exists under ``dirP``.

    Parameters:
        dirP: path of the parent directory.
        dirS: name of the sub-directory to create inside ``dirP``.

    Returns:
        ``os.path.join(dirP, dirS)`` when that directory exists after the
        call; otherwise ``dirP`` is returned as a fallback and a message is
        printed.
    """
    dirN = os.path.join(dirP, dirS)
    try:
        # makedirs creates a missing parent as well, and exist_ok=True makes
        # a second call on an existing directory a no-op.
        os.makedirs(dirN, exist_ok=True)
    except OSError:
        # Name the directory that actually failed before falling back.
        print('Directory ' + dirN + ' cannot be created.')
        return dirP
    return dirN
# def createDir(dirP, dirS):
def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
    """Load the three workbooks, normalise the monitoring data and build one report per city.

    Parameters:
        info: report metadata dict, forwarded unchanged to summaryCity().
        strFnData: path of the monitoring-data workbook.
        strFnW: path of the merged typo workbook.
        strFnS: path of the merged sensitive-word workbook.
        strfnTemplate: path of the Word template.
        strPathOutput: output path; 'Reports' and 'Intermediate' are created
            under its directory name.
    """
    # Monitoring data, typos, sensitive words.
    df, dfW, dfS = (pd.read_excel(path) for path in (strFnData, strFnW, strFnS))

    # Exact-value substitutions, applied in order: unify the result wording
    # and shorten two long county names so they fit in the charts.
    for column, old, new in (
            ('监测结果', '连续两周未更新', '超过两周未更新'),
            ('区县/地方部门', '积石山保安族东乡族撒拉族自治县', '积石山县'),
            ('区县/地方部门', '阿克塞哈萨克族自治县', '阿克塞自治县')):
        df.loc[df[column] == old, column] = new

    # Rows without a city / county belong to the provincial / city-level departments.
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    isLinxia = df['市/省局'] == '临夏回族自治州'
    isCityLevel = df['区县/地方部门'] == '市直部门'
    df.loc[isLinxia & isCityLevel, '区县/地方部门'] = '州直部门'

    # Clean-up: strip all whitespace, then drop a leading "其他+" prefix.
    for pattern in (r'\s+', r'^其他\+'):
        df.replace(pattern, '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Full list of report targets; overridden right below to a single target.
    cities = {'甘肃省', '白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
              '嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县'}
    cities = {'甘肃省'}  # only build the report(s) listed here

    # Reports and intermediate files live next to the output path.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports, dirIntermediate = (createDir(dirP, sub) for sub in ('Reports', 'Intermediate'))
    for city in cities:
        fnReport = os.path.join(dirReports, city + '.docx')
        summaryCity(info, city, df, dfW, dfS, strfnTemplate, fnReport, dirIntermediate)
# 合并错别字文件
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge per-city Excel workbooks into one workbook with a '市州' column.

    Parameters:
        keyword: substring a file name must contain to be merged.
        strPathCBZ: directory searched for ``*<keyword>*.xlsx`` files.
        strFnCbz: path of the merged workbook to write.

    The city is taken from the first key of ``cityShorten`` that occurs in the
    file name; when none matches, a warning is printed and the file is still
    merged with an empty city.
    """
    # Order matters: the first key found in the file name wins.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    for fn in glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx')):
        f = os.path.basename(fn)
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
#def mergeCMC
if __name__ == "__main__":
    # Before running, convert the date columns of the Excel files first.
    # Report metadata; summary() forwards it to summaryCity().
    # NOTE(review): these values describe 2022 (July to September) while the
    # data paths below point at '2023年S1' -- confirm they are not stale.
    info = {
        "year": "2022",
        "quarter": "",
        "dateCN": "二〇二二年九月",
        "dateStart": "2022年7月1日",
        "dateEnd": "2022年9月20日",
        "days": "81",
        "num": "11",
    }
    # Data root directory.
    strPath = 'D:/Projects/POM/DATA/2023年S1/'
    # Monitoring data workbook.
    strFnMonitoring = strPath + '汇总/第一季度汇总数据_2023.3.xlsx'
    # Word template file.
    strPathTemplate = strPath + 'POM_ReportTemplate0.docx'
    # Typos: the merged workbook is built only when it does not exist yet.
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    if not os.path.exists(strFnCbz):# merge the typo workbooks
        strPathCBZ = strPath + '监测/'
        mergeCMC("错别", strPathCBZ, strFnCbz)
    # Sensitive words: same approach as for the typos.
    strFnMgc = strPath + '汇总/MGC.xlsx'
    if not os.path.exists(strFnMgc):# merge the sensitive-word workbooks
        strPathMGC = strPath + '监测/'
        mergeCMC("敏感", strPathMGC, strFnMgc)
    # Output directory.
    strPathOutput = strPath + '统计/'
    summary(info, strFnMonitoring, strFnCbz, strFnMgc, strPathTemplate, strPathOutput)

618
StatSeasonly4.py Normal file
View File

@ -0,0 +1,618 @@
# 1. 打开监测任务表格
import pandas as pd
import numpy as np
import os, glob, re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import datetime
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def fetch_chinese(s):
    """Return ``s`` reduced to the characters in the range U+4E00..U+9FA5."""
    return ''.join(ch for ch in s if '\u4e00' <= ch <= '\u9fa5')
def toDate(strDT):
    """Format a date-like value as 'MM-DD'; return '' when it cannot be parsed."""
    parsed = pd.to_datetime(strDT, errors='coerce')
    # errors='coerce' turns unparseable input into NaT, rendered here as ''.
    if pd.isna(parsed):
        return ''
    return parsed.strftime('%m-%d')
# word模板替换
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with ``dContext`` plus the five chart images.

    Parameters:
        tmep_path: path of the docx template.
        word_apth: path of the rendered docx to save.
        dContext: template context; it is updated in place with the images.
        pathImage: directory that holds the chart PNG files.
        city: prefix of the chart file names (``<city><chart>.png``).
    """
    tpl = DocxTemplate(tmep_path)
    # Each context key is also the chart's file-name stem.
    chartKeys = ('annulusMediaCount', 'annulusCountyCount', 'annulusCountyArticle',
                 'annulusResult', 'barCountyRatio')
    dContext.update({
        key: InlineImage(tpl, os.path.join(pathImage, city + key + '.png'), width=Mm(120))
        for key in chartKeys
    })
    tpl.render(dContext)
    tpl.save(word_apth)
# 画柱状图
def drawBar(data, recipe, title='', fn=''):
    """Save a bar chart of ratios (y axis fixed to 0..1, shown as percentages).

    Parameters:
        data: bar heights, one per label.
        recipe: bar labels, also used as the x tick labels.
        title: chart title.
        fn: path of the image file to write.
    """
    plt.figure(figsize=(6, 4))
    # SimHei for CJK labels; keep the minus sign renderable with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))
    # Tick values are fractions; print them as whole percentages.
    percentFormatter = FuncFormatter(lambda temp, position: '%2.0f' % (100 * temp) + '%')
    plt.gca().yaxis.set_major_formatter(percentFormatter)
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)
    # Release the figure so later charts start from a clean state.
    plt.cla()
    plt.clf()
    plt.close()
# 画环状图
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend on the right and optionally save it.

    Parameters:
        data: wedge sizes, one per label.
        recipe: legend labels; their number selects the canvas width, the
            number of legend columns and the legend font size.
        title: chart title; omitted when empty.
        fn: path of the image file; nothing is saved when empty.
    """
    # SimHei for CJK labels; keep the minus sign renderable with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    xxx = 8  # canvas width
    yyy = 4  # canvas height
    nnncol = 1  # legend columns
    fs = 'medium'  # one of xx-small, x-small, small, medium, large, x-large, xx-large
    if len(recipe) > 40:
        xxx, nnncol, fs = 16, 4, 'x-small'
    elif len(recipe) > 20:
        # Was 'xmall', which matplotlib does not accept as a font size.
        xxx, nnncol, fs = 16, 2, 'small'
    _, ax = plt.subplots(figsize=(xxx, yyy), subplot_kw=dict(aspect="equal"))
    # wedgeprops sets the ring width; drawing starts at angle 0 and runs
    # counter-clockwise from the positive x axis.
    ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)
    # The monitoring-result chart places its legend closer to the ring.
    x = 1.0 if title == '政务新媒体监测结果' else 1.2
    plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(x, 0.5), borderaxespad=0., ncol=nnncol,
               fontsize=fs)
    if len(title) > 0:
        ax.set_title(title, fontsize=16, fontweight='heavy')
    plt.tight_layout()
    if len(fn) > 0:
        plt.savefig(fn)
    # Release the figure so later charts start from a clean state.
    plt.cla()
    plt.clf()
    plt.close()
# summaryCity(city, dfc, dfcw, dfcs, context, strfnTemplate, os.path.join(strPathVerified,'Reports', city+'.docx'), strPathVerified )
# 汇总市州数据,
# 市州名称, 监测数据, cbz数据 mgc数据 context(编号、名称) word模板文件名称 输出word文件名称 临时文件目录
# 需要传入模板文件,数据、错别字、敏感词,单位名称等
def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
    """Build the statistics, charts and Word report for one city/prefecture.

    Parameters:
        info: report metadata dict; merged into the template context, and
            ``info['num']`` is appended to the report id.
        city: key of the client table below; also selects the rows to use.
        df: monitoring data, one row per account.
        dfW: typo findings, filtered on the '市州' column.
        dfS: sensitive-word findings, filtered on the '市州' column.
        fnTemplate: path of the docx template handed to temp_word().
        fnReport: path of the docx report to write.
        dirTemp: directory that receives the generated chart images.

    Raises:
        ValueError: if ``city`` is not a key of the client table.
    """
    # Client (commissioning unit) per city; the key position gives the report id prefix.
    dCityClient = {
        '甘肃省': "甘肃省人民政府办公厅",
        '省直部门': "甘肃省人民政府办公厅",
        '白银市': "白银市人民政府办公室",
        '定西市': "定西市人民政府办公室",
        '临夏回族自治州': "临夏回族自治州人民政府办公室",
        '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
        "庆阳市": "庆阳市电子政务与信息资源管理办公室",
        '庆阳市华池县': "华池县人民政府办公室",
        '庆阳市宁县': "宁县人民政府办公室",
        "庆阳市镇原县": "镇原县人民政府办公室",
        "酒泉市": "酒泉市人民政府办公室",
        "天水市": "天水市人民政府办公室",
        "武威市": "武威市人民政府办公室",
        "金昌市": "金昌市人民政府办公室",
        "嘉峪关市": "嘉峪关市人民政府办公室",
        "兰州新区": "兰州新区管委会办公室",
        "陇南市": "陇南市政务服务中心",
        "张掖市": "张掖市政务服务中心",
        "甘南藏族自治州": "甘南藏族自治州政务服务中心",
        "兰州市": "兰州市政务服务中心",
    }
    print("----------------" + city + "----------------")
    # Report id and client.
    strID = "%02d" % (list(dCityClient).index(city))
    context = {
        "city": city,
        "client": dCityClient[city],
        "reportid": strID + info['num'],
    }
    context.update(info)
    subordinate = '区县/地方部门'
    subordinateName = '县区'
    # Select the rows that belong to this report.
    if "庆阳市" in city:
        isQingyang = df['市/省局'] == '庆阳市'
        # A county-level report uses only that county; otherwise the whole city.
        for county in ("华池县", "宁县", "镇原县"):
            if county in city:
                dfc = df.loc[isQingyang & (df['区县/地方部门'] == county)].copy()
                break
        else:
            dfc = df.loc[isQingyang].copy()
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city:
        # Province-wide report: all rows, grouped by city instead of county.
        dfc = df.copy()
        dfcw = dfW.copy()
        dfcs = dfS.copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    else:
        dfc = df.loc[df['市/省局'] == city].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()
    # -----------------------
    # Statistics
    dCity = {'1': '2'}
    #
    # County x monitoring result: number of accounts per result.
    #
    dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'], aggfunc='count',
                                     fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
    accountColumns = dfCountyAccount.columns.values.tolist()

    def countOrBlank(row, column):
        # A missing column or a '' fill value is rendered as an empty cell.
        if column in accountColumns and not isinstance(row[column], str):
            return int(row[column])
        return ''

    # Rows of the per-county result table in the template.
    tt3_list = []
    for index, row in dfCountyAccount.iterrows():
        county = '总 计' if index == 'All' else index
        tt3_list.append({'county': county,
                         'hg': countOrBlank(row, '合格'),
                         'u2w': countOrBlank(row, '超过两周未更新'),
                         'un': countOrBlank(row, '监测期间未更新'),
                         'count': countOrBlank(row, 'All')})
    context['tt3_contents'] = tt3_list
    # -----------------------
    #
    # Accounts per media type.
    #
    dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count', fill_value='', margins=True)
    # Total number of monitored accounts.
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    dfMedia = dfMedia.sort_values(by='账号名称', ascending=False)
    # The first row after the descending sort is skipped as the total.
    mediaLabels = dfMedia.index.tolist()[1:]
    mediaCounts = dfMedia.iloc[:, 0].tolist()[1:]
    strMedia = ''
    tt1_list = []
    for m, num in zip(mediaLabels, mediaCounts):
        strNum = str(num)
        strMedia = strMedia + m + strNum + '个,'
        tt1_list.append({'type': m, 'count': strNum})
    dCity['sMediaCount'] = strMedia[:-1].rstrip('')
    context.update({'tt1_contents': tt1_list})
    # -----------------------
    #
    # Update counts per county.
    #
    dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum', fill_value='',
                                     margins=True)
    dfCountyArticle = dfCountyArticle.sort_values(by='更新次数', ascending=False).copy()
    # Row 0 is used as the total, row 1 as the county with the most updates.
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.iloc[0, 0]
    dCity['countyMostArticle'] = dfCountyArticle.index.tolist()[1]
    dCity['countyMostArticleCount'] = "%d" % dfCountyArticle.iloc[1, 0]
    articleLabels = dfCountyArticle.index.tolist()[1:]
    articleCounts = dfCountyArticle.iloc[:, 0].tolist()[1:]
    strCountyArticle = ''
    for cccc, num in zip(articleLabels, articleCounts):
        strCountyArticle = strCountyArticle + cccc + "%d" % num + "次,"
    dCity['sCountyArticles'] = strCountyArticle.rstrip('')
    # The pivot fills empty cells with ''; treat them as 0 so that a county
    # without any qualified account does not break the integer conversion.
    dfCountyAccount['合格'] = pd.to_numeric(dfCountyAccount['合格'], errors='coerce').fillna(0).astype('int')
    # Sort the counties by number of accounts.
    dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
    # Qualification rate per row.
    dfCountyAccount['rate'] = dfCountyAccount['合格'] / dfCountyAccount['All']
    dfResult = dfCountyAccount.copy()
    # Overall rate of this report.
    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])
    # County names and account counts, without the 'All' row.
    dfCountyAccount = dfCountyAccount.drop(['All'])
    counties = dfCountyAccount.index.tolist()
    countyCounts = dfCountyAccount['All'].values.tolist()
    print(countyCounts)
    print(counties)
    # Text snippets in account-count order.
    strCountyCount = ''
    strCounties = ''
    for c, num in zip(counties, countyCounts):
        strCounties = strCounties + c + ''
        strCountyCount = strCountyCount + c + str(num) + '个,'
    dCity['countyCount'] = "%d" % len(counties)
    dCity['sCounties'] = strCounties.rstrip('')
    dCity['sCountyCount'] = strCountyCount.rstrip('')
    # Text snippets and table rows in qualification-rate order.
    dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
    countieshege = dfCountyAccount.index.tolist()
    countyRates = dfCountyAccount['rate']
    strCountyRatio = ''
    tt2_list = []
    # Iterate over the values: the Series is indexed by county name, so
    # integer subscripts are not valid labels.
    for c, rate in zip(countieshege, countyRates.tolist()):
        strRatio = "%.1f" % (100.0 * rate)
        strCountyRatio = strCountyRatio + c + strRatio + '%'
        tt2_list.append({'county': c, 'ratio': strRatio + '%'})
    dCity['sCountyRatio'] = strCountyRatio.rstrip('')
    dCity['tt2_contents'] = tt2_list
    # -----------------------
    #
    # Charts
    #
    print(' 生成图片...')
    drawAnnulus(mediaCounts, mediaLabels,
                '政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
    print(countyCounts)
    print(counties)
    drawAnnulus(countyCounts, counties,
                subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
    drawAnnulus(articleCounts, articleLabels,
                subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))
    # Monitoring result totals: only the result columns remain.
    dfResult = dfResult.drop('All', axis=1)
    dfResult = dfResult.drop('rate', axis=1)
    # Qualified count, qualified ratio, unqualified count.
    dCity['resultQualified'] = "%d" % (dfResult.loc['All', '合格'])
    dCity['resultQualifiedRatio'] = "%.1f%%" % (dfResult.loc['All', '合格'] / dCity['nmCount'] * 100.0)
    dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - dfResult.loc['All', '合格'])
    # The sentences below are left empty when the result does not occur at all.
    if '监测期间未更新' in dfResult.columns.values.tolist():
        numNoupdated = dfResult.loc['All', '监测期间未更新']
        dCity['stringResultNoUpdated'] = "%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""
    if '超过两周未更新' in dfResult.columns.values.tolist():
        numNoupdated2W = dfResult.loc['All', '超过两周未更新']
        dCity['stringResultNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''
    resultLabels = dfResult.columns.values.tolist()
    resultCounts = dfResult.loc['All'].values.tolist()
    drawAnnulus(resultCounts, resultLabels,
                '政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
    drawBar(countyRates, countieshege,
            '政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))
    # -----------------------
    #
    # Data for the report tables
    #
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格']
    dfCityUnqulified = dfCityUnqulified.sort_values(by="监测结果", ascending=True)
    dfCityQulified = dfc[dfc['监测结果'] == '合格']
    dfCityQulified = dfCityQulified.sort_values(by=subordinate, ascending=True)

    def accountRows(frame):
        # One template row per account; falsy cells are rendered as ''.
        rows = []
        for _, row in frame.iterrows():
            count = "%d" % row['更新次数'] if row['更新次数'] else ''
            days = "%d" % row['最大静默日数'] if row['最大静默日数'] else ''
            sD1 = toDate(str(row['静默开始日期'])) if row['静默开始日期'] else ''
            sD2 = toDate(str(row['静默结束日期'])) if row['静默结束日期'] else ''
            rows.append({'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
                         'county': row[subordinate], 'result': row['监测结果'], 'num': count,
                         'days': days, 'start': sD1, 'end': sD2, })
        return rows

    context.update({'tt4_contents': accountRows(dfCityUnqulified)})
    context.update({'tt5_contents': accountRows(dfCityQulified)})
    # Typo table.
    tCbz_list = []
    # fillna returns a new frame; keep it so that empty cells become ''.
    dfcw = dfcw.fillna('')
    # Characters removed from the quoted sentence because they disturb the
    # table output of the template.
    r = "[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]"
    for index, row in dfcw.iterrows():
        sTitle = ''
        sDate = toDate(str(row['发文时间']))
        if '标题' in dfcw.columns:
            sTitle = row['标题']
        s = re.sub(r, '', str(row['错误出现位置']))
        a = {'error': row['错误'], 'tips': row['建议'], 'sentence': s, 'type': row['账号类型'], 'name': row['账号名称'],
             'date': sDate, 'title': sTitle, }
        tCbz_list.append(a)
    if dfcw.shape[0] > 0:
        dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
    else:
        dCity['stringCbzCount'] = '本次检测未发现错别字。'
    context.update({'tCbz_contents': tCbz_list})
    # Sensitive-word table.
    tMgc_list = []
    dfcs = dfcs.fillna('')
    for index, row in dfcs.iterrows():
        sTitle = ''
        sDate = toDate(str(row['发文时间']))
        if '标题' in dfcs.columns:
            sTitle = row['标题']
        # Sentence and title are reduced to their Chinese characters.
        a = {'error': row['错误'], 'tips': row['建议'], 'sentence': fetch_chinese(str(row['错误出现位置'])), 'type': row['账号类型'], 'name': row['账号名称'],
             'date': sDate, 'title': fetch_chinese(str(sTitle)), }
        tMgc_list.append(a)
    if dfcs.shape[0] > 0:
        dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
    else:
        dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
    context.update({'tMgc_contents': tMgc_list})
    # Scalar values and text snippets.
    context.update(dCity)
    # -----------------------
    #
    # Render the report from the template.
    #
    temp_word(fnTemplate,
              fnReport,
              context, dirTemp, city)
def createDir(dirP, dirS):
    """Make sure the sub-directory ``dirS`` exists under ``dirP``.

    Parameters:
        dirP: path of the parent directory.
        dirS: name of the sub-directory to create inside ``dirP``.

    Returns:
        ``os.path.join(dirP, dirS)`` when that directory exists after the
        call; otherwise ``dirP`` is returned as a fallback and a message is
        printed.
    """
    dirN = os.path.join(dirP, dirS)
    try:
        # makedirs creates a missing parent as well, and exist_ok=True makes
        # a second call on an existing directory a no-op.
        os.makedirs(dirN, exist_ok=True)
    except OSError:
        # Name the directory that actually failed before falling back.
        print('Directory ' + dirN + ' cannot be created.')
        return dirP
    return dirN
# def createDir(dirP, dirS):
def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
    """Load the three workbooks, normalise the monitoring data and build one report per city.

    Parameters:
        info: report metadata dict, forwarded unchanged to summaryCity().
        strFnData: path of the monitoring-data workbook.
        strFnW: path of the merged typo workbook.
        strFnS: path of the merged sensitive-word workbook.
        strfnTemplate: path of the Word template.
        strPathOutput: output path; 'Reports' and 'Intermediate' are created
            under its directory name.
    """
    # Monitoring data, typos, sensitive words.
    df, dfW, dfS = (pd.read_excel(path) for path in (strFnData, strFnW, strFnS))

    # Exact-value substitutions, applied in order: unify the result wording
    # and shorten two long county names so they fit in the charts.
    for column, old, new in (
            ('监测结果', '连续两周未更新', '超过两周未更新'),
            ('区县/地方部门', '积石山保安族东乡族撒拉族自治县', '积石山县'),
            ('区县/地方部门', '阿克塞哈萨克族自治县', '阿克塞自治县')):
        df.loc[df[column] == old, column] = new

    # Rows without a city / county belong to the provincial / city-level departments.
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    isLinxia = df['市/省局'] == '临夏回族自治州'
    isCityLevel = df['区县/地方部门'] == '市直部门'
    df.loc[isLinxia & isCityLevel, '区县/地方部门'] = '州直部门'

    # Clean-up: strip all whitespace, then drop a leading "其他+" prefix.
    for pattern in (r'\s+', r'^其他\+'):
        df.replace(pattern, '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Full list of report targets; overridden right below to a single target.
    cities = {'甘肃省', '白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
              '嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市'}
    cities = {'甘肃省'}  # only build the report(s) listed here

    # Reports and intermediate files live next to the output path.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports, dirIntermediate = (createDir(dirP, sub) for sub in ('Reports', 'Intermediate'))
    for city in cities:
        fnReport = os.path.join(dirReports, city + '.docx')
        summaryCity(info, city, df, dfW, dfS, strfnTemplate, fnReport, dirIntermediate)
# 合并错别字文件
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge per-city Excel workbooks into one workbook with a '市州' column.

    Parameters:
        keyword: substring a file name must contain to be merged.
        strPathCBZ: directory searched for ``*<keyword>*.xlsx`` files.
        strFnCbz: path of the merged workbook to write.

    The city is taken from the first key of ``cityShorten`` that occurs in the
    file name; when none matches, a warning is printed and the file is still
    merged with an empty city.
    """
    # Order matters: the first key found in the file name wins.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    for fn in glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx')):
        f = os.path.basename(fn)
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
#def mergeCMC
if __name__ == "__main__":
    # Before running, convert the date columns of the Excel files first.
    # Report metadata; summary() forwards it to summaryCity().
    info = {
        "year": "2023",
        "quarter": "",
        "dateCN": "二〇二三年三月",
        "dateStart": "2023年1月1日",
        "dateEnd": "2023年3月20日",
        "days": "79",
        "num": "4",
    }
    # Data root directory.
    strPath = 'D:/Projects/POM/DATA/2023年S1/'
    # Monitoring data workbook.
    strFnMonitoring = strPath + '汇总/第一季度汇总数据_2023.3.xlsx'
    # Word template file.
    strPathTemplate = strPath + 'POM_ReportTemplate.docx'
    # Typos: the merged workbook is built only when it does not exist yet.
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    if not os.path.exists(strFnCbz):# merge the typo workbooks
        strPathCBZ = strPath + '监测/'
        mergeCMC("错别", strPathCBZ, strFnCbz)
    # Sensitive words: same approach as for the typos.
    strFnMgc = strPath + '汇总/MGC.xlsx'
    if not os.path.exists(strFnMgc):# merge the sensitive-word workbooks
        strPathMGC = strPath + '监测/'
        mergeCMC("敏感", strPathMGC, strFnMgc)
    # Output directory.
    strPathOutput = strPath + '统计/'
    summary(info, strFnMonitoring, strFnCbz, strFnMgc, strPathTemplate, strPathOutput)

48
cmcMerge.py Normal file
View File

@ -0,0 +1,48 @@
import pandas as pd
import numpy as np
import os, glob, re
# 合并错别字文件
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge per-city Excel workbooks into one workbook with a '市州' column.

    Parameters:
        keyword: substring a file name must contain to be merged.
        strPathCBZ: directory searched for ``*<keyword>*.xlsx`` files.
        strFnCbz: path of the merged workbook to write.

    The city is taken from the first key of ``cityShorten`` that occurs in the
    file name; when none matches, a warning is printed and the file is still
    merged with an empty city.
    """
    matches = glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx'))
    print(1, keyword, strPathCBZ, strFnCbz)
    print('glob: ', matches)
    # Order matters: the first key found in the file name wins.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    for fn in matches:
        f = os.path.basename(fn)
        print(f)
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
# Directory that is searched for the per-city workbooks.
strPath = 'D:/Projects/POM/DATA/2023年3月/3月29日错敏词/敏感词/'
# Typos: delete a previous merge result, then merge the files matching "错别".
strFnCbz = strPath + '../汇总/CBZ.xlsx'
if(os.path.isfile(strFnCbz)):
    os.remove(strFnCbz)
mergeCMC("错别", strPath, strFnCbz)
# Sensitive words: delete a previous merge result, then merge the files matching "敏感".
strFnMgc = strPath + '../汇总/MGC.xlsx'
if(os.path.isfile(strFnMgc)):
    os.remove(strFnMgc)
mergeCMC("敏感", strPath, strFnMgc)

1048
dataCheck.py Normal file

File diff suppressed because it is too large Load Diff

41
excelMerge.py Normal file
View File

@ -0,0 +1,41 @@
import pandas as pd
import numpy as np
import os, glob, re
strPath = 'D:/Projects/POM/DATA/2023年3月/两会/敏感词3.8/'
strFn = strPath + 'merged.xlsx'
# Remove a previous result first: it lives in the directory that is globbed
# below and would otherwise be merged into itself.
if(os.path.isfile(strFn)):
    os.remove(strFn)


def excelMerge(sPath, sFn):
    """Merge every ``*.xlsx`` file in ``sPath`` into the workbook ``sFn``.

    Files without a '市州' column get one, derived from the first key of
    ``cityShorten`` that occurs in the file name; when none matches, a warning
    is printed and the column is left empty.

    Parameters:
        sPath: directory that holds the workbooks to merge.
        sFn: path of the merged workbook to write (without the index column).
    """
    # Order matters: the first key found in the file name wins.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    for fn in glob.glob(os.path.join(sPath, '*.xlsx')):
        f = os.path.basename(fn)
        dfn = pd.read_excel(fn)
        # Add the city column when the workbook does not carry one.
        if '市州' not in dfn.columns:
            city = next((full for short, full in cityShorten.items() if short in f), '')
            if not city:
                print("!!!!! City Name not matched ( ", f, " )")
            dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(f, ' ', dfn.shape[0], '/', total)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(sFn, index=False)


excelMerge(strPath, strFn)

20
excelSheetSplit.py Normal file
View File

@ -0,0 +1,20 @@
import pandas as pd
# Earlier input location, superseded by the assignments that follow.
s = '2023-03-06_11.34.42'
strP = 'D:/Projects/POM/DATA/search/' + s + '/'
strFn = s + '_mm.xlsx'
# Workbook that is actually split.
strP = 'D:/Projects/POM/DATA/2023年3月/两会/敏感词3月13日/上报/'
strFn = 'MGC2023.3.13.xlsx'

df = pd.read_excel(strP + strFn)
print(strP + strFn)
print(df.shape)

# Write one workbook per distinct value of the '市州' column, next to the input.
cities = df['市州'].unique()
print(cities)
for city in cities:
    print(city)
    dft = df.loc[df['市州'].isin([city])]
    print(dft.shape)
    dft.to_excel(strP + city + '.xlsx', index=False)

21
excelSplit.py Normal file
View File

@ -0,0 +1,21 @@
import pandas as pd
strP = 'D:/Projects/POM/DATA/2023年3月/3月13日错敏词/'
strFn1 = '敏感词.xlsx'
strFn2 = '错别字.xlsx'
# sheet_name=None loads every sheet as {sheet name: DataFrame}.
sheets1 = pd.read_excel(strP+strFn1, sheet_name=None)
sheets2 = pd.read_excel(strP+strFn2, sheet_name=None)
# Walk the union of both workbooks' sheet names (first-seen order) so that a
# sheet that exists only in the second workbook is not dropped.
sheetnames = list(dict.fromkeys([*sheets1, *sheets2]))
for name in sheetnames:
    # Stack the rows of the same-named sheet from both workbooks.
    parts = [sheets[name] for sheets in (sheets1, sheets2) if name in sheets]
    v = pd.concat(parts, axis=0)
    v.to_excel(strP+name+'.xlsx', index=False)

541
searchALL.py Normal file
View File

@ -0,0 +1,541 @@
import pandas as pd
import numpy as np
import seaborn as sns
import datetime, time
import matplotlib.pyplot as plt
import re,os
# 18 digits, or 17 digits followed by an X/x check character.
# (The previous class [X|x] also accepted a literal '|'.)
regIDCard = r"\d{18}|\d{17}[Xx]"
# 11 digits starting with 13, 15, 18 or 14.
regCellPhone = r"1[3584]\d{9}"
# Alternation of known erroneous phrases to search for.
regSTR = '习近平总同志|习近同志|习近总书记|习平总书记|习近平主义|习总同志' + \
    '|习近平治国理政|中国是现代化' + \
    '|中华人名|中共民族|名族|中央人名|中华民主' + \
    '|中共共产党|中国共产党党章' + \
    '|伟大复习|建档伟业|建档百年' + \
    '|二十大大|二十精神|二十大开幕式|中国共产党第二十次代表大会|党二十大|第二十次全国人民代表大会' + \
    '|建党七十三周年|共产党成立七十三周年' + \
    '|大人代表|大人常委会|人大常委主任' + \
    '|爱爱服务|抗议英雄|反炸中心'
# Input directories holding the full-text data to scan.
paths = [
    'D:/Projects/POM/DATA/2022年10月/9月报告/全文/',
    'D:/Projects/POM/DATA/2022年9月/8月报告/全文/',
    'D:/Projects/POM/DATA/2022年8月/7月报告/全文/',
    'D:/Projects/POM/DATA/2022年7月/6月报告/全文/',
    'D:/Projects/POM/DATA/2022年6月/5月报告/全文/',
    'D:/Projects/POM/DATA/2022年5月/4月报告/全文/',
]
# The string literal below is not assigned to anything; it only parks
# alternative path lists that are currently not scanned.
"""
'''
#'D:/Projects/POM/DATA/2022年11月/10月报告/全文/',
#'D:/Projects/POM/DATA/2022年12月/11月报告/全文/',
#'D:/Projects/POM/DATA/2023年1月/12月报告/全文/',
#'D:/Projects/POM/DATA/2023年2月/1月报告/全文/',
#'D:/Projects/POM/DATA/2023年3月/2月报告/全文/',
'''
'''
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月6日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月7日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月8日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月9日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月9日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月10日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月11日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月12日',
'D:/Projects/POM/DATA/2023年3月/两会/全文数据3月13日',
''' """
# Output root directory.
pathO = 'D:/Projects/POM/DATA/search/'
# Per-source switches -- presumably WeChat / Weibo / Toutiao; TODO confirm
# against the code that reads them.
doWX = True
doWB = True
doTT = True
# NOTE(review): looks like this enables one output per city -- confirm at the use site.
splitByCity = True
# Sensitive-word findings already produced by earlier monitoring runs.
fFound = [
    #'D:/Projects/POM/DATA/2023年3月/2月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2023年2月/1月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2023年1月/12月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2022年12月/11月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2022年11月/10月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2022年10月/9月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2022年9月/8月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2022年8月/7月报告/汇总/mgc.xlsx',
    #'D:/Projects/POM/DATA/2022年7月/6月报告/汇总/mgc.xlsx'
    'D:/Projects/POM/DATA/2023年3月/两会/汇总/敏感词2023.3.5.xlsx',
    'D:/Projects/POM/DATA/2023年3月/两会/汇总/敏感词2023.3.6.xlsx',
    'D:/Projects/POM/DATA/2023年3月/两会/汇总/敏感词2023.3.7.xlsx',
    'D:/Projects/POM/DATA/2023年3月/两会/汇总/敏感词2023.3.9.xlsx',
    'D:/Projects/POM/DATA/2023年3月/两会/汇总/敏感词扫描结果.xlsx' ]
# Workbooks with the results published in official notices.
fInformed = ['D:/Projects/POM/DATA/国办通报/20230303错敏词.xlsx',
             'D:/Projects/POM/DATA/国办通报/20230304错敏词.xlsx']
######################################################################################
def getWBData(path, hasBody=False):
    """Collect every Weibo CSV export found under one city directory.

    The directory is walked as ``path/<batch>/<account>/<weiboID>.csv``.
    First-level folders whose name contains ``weixin`` or ``tt`` are skipped.
    Each CSV is read without a header, its first (header) row is dropped, and
    the rows are tagged with the weibo id (file name without ``.csv``), the
    account name (folder name) and the city.

    Args:
        path: City directory to walk. The city is derived from the first
            ``dictC`` key contained in the directory's own name, falling back
            to the full path when the name holds no known code.
        hasBody: Unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame with the 13 export columns plus ``weiboID``,
        ``账号名称`` and ``市州``. Empty (columns only) when nothing is found.
    """
    dictC = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市', 'TS':'天水市',
             'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市', 'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门',
             'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市', 'ts': '天水市',
             'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市', 'dx': '定西市', 'ww': '武威市', 'sz': '省直部门'
             }
    # Match the directory's own name first: matching the whole path lets an
    # unrelated parent folder (e.g. "ts" inside "Projects") pick the city.
    own_name = os.path.basename(os.path.normpath(path))
    strC = ''
    for candidate in (own_name, path):
        strC = next((v for k, v in dictC.items() if k in candidate), '')
        if strC:
            break
    print('-----------------------------------')
    print('CITY =', strC)

    # Column layout produced by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', '发布时间', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', '账号名称', '市州']

    frames = []
    total = 0
    for dirC in os.listdir(path):
        sc = os.path.join(path, dirC)
        # Only batch folders are walked; WeChat / Toutiao folders are skipped.
        if not os.path.isdir(sc):
            continue
        lowered = dirC.lower()
        if 'weixin' in lowered or 'tt' in lowered:
            continue

        batch = []
        for dirCT in os.listdir(sc):
            sct = os.path.join(sc, dirCT)
            if not os.path.isdir(sct):
                continue
            wbName = dirCT  # the account folder name is the account name
            for dirA in os.listdir(sct):
                if os.path.splitext(dirA)[1] != '.csv':
                    continue
                dfdfwb = pd.read_csv(os.path.join(sct, dirA), sep=',', header=None,
                                     names=cols, index_col=None)
                dfdfwb = dfdfwb[1:]  # drop the header row that was read as data
                dfdfwb["weiboID"] = dirA[:-4]
                dfdfwb["账号名称"] = wbName
                batch.append(dfdfwb)
                print('.', end='')

        rows = 0
        if batch:
            dfWBC = pd.concat(batch)
            dfWBC['市州'] = strC
            rows = dfWBC.shape[0]
            if rows:
                frames.append(dfWBC)
        total += rows
        print(' ')
        print('-', dirC, rows)

    print('', total)
    if not frames:
        return pd.DataFrame(columns=cs)
    return pd.concat(frames)[cs]
def getWBData_Province(path, hasBody=False):
    """Read the Weibo exports of every city folder under a province root.

    Each sub-directory of ``path`` is handed to ``getWBData``; the results are
    stacked, a blank ``标题`` column is added and the body/time columns are
    renamed to ``内容`` / ``日期``.

    Args:
        path: Province-level directory holding one folder per city.
        hasBody: Forwarded unchanged to ``getWBData``.

    Returns:
        pandas.DataFrame of all Weibo posts found (columns only when empty).
    """
    cs = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', '发布时间', '发布工具', '点赞数',
          '转发数', '评论数', 'weiboID', '账号名称', '市州']
    frames = []
    for dirC in os.listdir(path):
        sc = os.path.join(path, dirC)
        if not os.path.isdir(sc):
            continue
        city_frame = getWBData(sc, hasBody)
        if not city_frame.empty:
            frames.append(city_frame)
    # DataFrame.append no longer exists in pandas 2.x; stack once with concat.
    dfWB = pd.concat(frames) if frames else pd.DataFrame(columns=cs)
    dfWB['标题'] = ''
    dfWB.rename(columns={"微博正文": "内容", "发布时间": "日期"}, inplace=True)
    return dfWB
def getWXData_Province(path, hasBody=False):
    """Read every WeChat article workbook under a province root.

    Entries of ``path`` starting with ``.`` are ignored. A city folder
    contributes every ``.xlsx`` / ``.xls`` file directly inside it; a workbook
    lying directly in ``path`` is read as well. Every row is tagged with the
    city in a ``市州`` column.

    Args:
        path: Province-level directory.
        hasBody: Unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame with the expected article columns first, followed by
        any extra columns found in the workbooks. Columns only when empty.
    """
    cs = ['公众号', '链接', '日期', '标题', '内容', '阅读数', '在看数', '点赞数', 'get_time', '头条',]
    dictC = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市', 'TS':'天水市',
             'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市', 'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门',
             'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市', 'ts': '天水市',
             'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市', 'dx': '定西市', 'ww': '武威市', 'sz': '省直部门'
             }

    def _is_workbook(name):
        return name[-5:] == '.xlsx' or name[-4:] == '.xls'

    def _load(file_path, label, city_name):
        # Read one workbook, tag it with its city and log its row count.
        frame = pd.read_excel(file_path)
        frame['市州'] = city_name
        print(' ', label, frame.shape[0])
        return frame

    frames = []
    total = 0
    for dirC in os.listdir(path):
        if dirC[:1] == '.':
            continue
        sc = os.path.join(path, dirC)
        # Resolve the city from the entry's own name first; the full path is
        # only a fallback, so a parent folder cannot pick the wrong city.
        strC = ''
        for candidate in (dirC, sc):
            strC = next((v for k, v in dictC.items() if k in candidate), '')
            if strC:
                break
        print('-', strC)

        if os.path.isdir(sc):
            for dirCC in os.listdir(sc):
                scc = os.path.join(sc, dirCC)
                if dirCC[:1] == '.' or os.path.isdir(scc):
                    continue
                if _is_workbook(dirCC):
                    loaded = _load(scc, dirCC, strC)
                    frames.append(loaded)
                    total += loaded.shape[0]
                else:
                    print('something error 01: ', dirCC)
        elif _is_workbook(dirC):
            # The original tagged `dfcc` here (a frame from another branch),
            # which raised NameError or tagged the wrong data.
            loaded = _load(sc, dirC, strC)
            frames.append(loaded)
            total += loaded.shape[0]
        else:
            print('something error 02')
        print(' ', total)

    print('ALL WX data', total)
    if not frames:
        return pd.DataFrame(columns=cs)
    dfWX = pd.concat(frames)
    ordered = cs + [c for c in dfWX.columns if c not in cs]
    return dfWX.reindex(columns=ordered)
# 从数据目录中读取xlsx文件拼接到一起
def getTTData(path, cities, hasBody=False):
# cityShorten
cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
'临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
'兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
}
dirCs = os.listdir(path)
#account date title nread ncomment content url origin
cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']
dfTT = pd.DataFrame(columns=cs)
cityCount = 0
for dirC in dirCs:
if dirC[:1] == '.' \
or not os.path.isdir(os.path.join(path, dirC)) \
or 'weixin' in dirC.lower() \
or 'weibo' in dirC.lower() \
or not cityShorten[dirC] in cities:
continue
cityCount += 1
# City LN
dfTTC = pd.DataFrame(columns=cs)
dirCTs = os.listdir(os.path.join(path, dirC))
for dirCT in dirCTs:
if dirCT[:1] == '.' \
or not os.path.isdir(os.path.join(path, dirC, dirCT)) \
or 'weixin' in dirCT.lower() \
or 'weibo' in dirCT.lower():
continue
if 'tt' in dirCT.lower() or dirC.lower() in dirCT.lower():
fns = os.listdir(os.path.join(path, dirC, dirCT))
numc = 0
accounts = set()
for fn in fns:
if os.path.isdir(os.path.join(path, dirC, dirCT, fn)):
print(' >>',fn)
sds = os.listdir(os.path.join(path, dirC, dirCT, fn))
for sd in sds:
if sd[:1] == '.' or not sd[-5:] == '.xlsx' or sd.count('_') < 2:
continue
ttName = sd[sd.index('_')+1:]
ttName = ttName[:ttName.index('_')]
fileAs = os.path.join(path, dirC, dirCT, fn, sd)
#print(' ', ttName, fileAs)
if len(fileAs) > 0:
dfdftt = pd.read_excel(fileAs)
dfTTC = dfTTC.append(dfdftt, ignore_index=True)
numc = numc+1
accounts.add(ttName)
print('.', end='')
print(' ')
#
if fn[:1] == '.' or not fn[-5:] == '.xlsx' or fn.count('_') < 2:
continue
#print('---',fn)
# 账号名称
ttName = fn[fn.index('_')+1:]
ttName = ttName[:ttName.index('_')]
fileAs = os.path.join(path, dirC, dirCT, fn)
#print(' ', ttName, fileAs)
if len(fileAs) > 0:
try:
dfdftt = pd.read_excel(fileAs)
except:
print('')
print("!!!!!!! 读取头条文件出错: ", fileAs)
if not dfdftt.empty:
dfTTC = dfTTC.append(dfdftt, ignore_index=True)
numc = numc+1
accounts.add(ttName)
print('.', end='')
print(' ')
print(' +', cityShorten[dirC], 'dir:', dirC, '/', dirCT, '账号数', len(accounts),'文件数', numc, '文章数', dfTTC.shape[0])
dfTTC['city'] = cityShorten[dirC]
dfTT = dfTT.append(dfTTC)
print('Read TT DIR finished. cities', cityCount, '; lines', dfTT.shape)
#dfTT.to_excel("D:/Projects/POM/2021年7月/2021年上半年/WB_ALL.xlsx")
return dfTT
#######################################################
#######################################################
# Scan driver: read the enabled platforms, flag articles whose body matches
# regSTR, drop hits already found by monitoring or officially notified, and
# write the remainder to Excel (combined, and optionally one file per city).
t0 = datetime.datetime.now()

# Column layout of the result sheet (Weibo has no link column).
RESULT_COLUMNS = ['关键词', '上下文', '日期', '市州', '类型', '账号名称', '链接', '标题', '阅读数/评论数', '内容',]


def _find_keywords(text, pattern, margin=5):
    """Return (keywords, contexts) for every match of ``pattern`` in ``text``.

    Both values are ';'-joined strings; each context is the match widened by
    ``margin`` characters on both sides, clipped to the text.
    """
    text = str(text)
    hits = []
    contexts = []
    for m in re.finditer(pattern, text):
        hits.append(m.group())
        contexts.append(text[max(0, m.start() - margin):m.end() + margin])
    return ';'.join(hits), ';'.join(contexts)


def _scan_platform(frame, label, content_col, source_cols, renames, out_cols):
    """Select the rows of ``frame`` whose ``content_col`` matches regSTR.

    The result carries the platform ``label`` in ``类型`` plus the matched
    keywords and their contexts, renamed and ordered as ``out_cols``.
    Keywords are computed per row by position, so duplicate index labels in
    the stacked input cannot overwrite each other's results.
    """
    if content_col not in frame.columns:
        return pd.DataFrame(columns=out_cols)
    mask = frame[content_col].str.contains(regSTR, regex=True, na=False).to_numpy()
    found = frame.reindex(columns=source_cols).loc[mask].copy()
    found['类型'] = label
    pairs = [_find_keywords(body, regSTR) for body in found[content_col]]
    found['关键词'] = [p[0] for p in pairs]
    found['上下文'] = [p[1] for p in pairs]
    return found.rename(columns=renames)[out_cols]


def _identifier(names, dates):
    """Build the '<account>_<YYYYMMDD>' key used to match hits across sources."""
    stamp = dates.map(lambda x: str(x.year) + str(x.month).rjust(2, '0') + str(x.day).rjust(2, '0'))
    return names.map(str) + '_' + stamp.astype(str)


results = []

# WX (WeChat)
if doWX:
    wx_frames = []
    for path in paths:
        ddff = getWXData_Province(path)
        print(' read WX data', ddff.shape)
        wx_frames.append(ddff)
    dfWX = pd.concat(wx_frames) if wx_frames else pd.DataFrame()
    print('WX data ', dfWX.shape)
    dfwxd = _scan_platform(
        dfWX, '微信', '内容',
        ['市州', '公众号', '日期', '标题', '链接', '内容', '阅读数'],
        {"阅读数": "阅读数/评论数", "公众号": "账号名称"},
        RESULT_COLUMNS)
    print("Found ", dfwxd.shape)
    results.append(dfwxd)

# WB (Weibo)
if doWB:
    wb_frames = []
    for path in paths:
        dfWBff = getWBData_Province(path)
        print('read WB data', dfWBff.shape)
        wb_frames.append(dfWBff)
    dfWB = pd.concat(wb_frames) if wb_frames else pd.DataFrame()
    print("WB Data ", dfWB.shape)
    dfwbd = _scan_platform(
        dfWB, '微博', '内容',
        ['市州', '账号名称', '标题', '日期', '评论数', '内容'],
        {"评论数": "阅读数/评论数"},
        [c for c in RESULT_COLUMNS if c != '链接'])
    print("WB Found ", dfwbd.shape)
    results.append(dfwbd)

#######################################################
# TT (Toutiao)
if doTT:
    cities = [
        '临夏回族自治州',
        '白银市',
        '定西市',
        '酒泉市',
        '嘉峪关市',
        '平凉市',
        '庆阳市',
        '天水市',
        '武威市',
        '兰州新区',
        '陇南市',
        '兰州市', '张掖市', '甘南藏族自治州', '金昌市',
        '省直部门',
    ]
    tt_frames = []
    for strP in paths:
        print("read TT data ", strP)
        tt_frames.append(getTTData(strP, cities))
    dfTT = pd.concat(tt_frames) if tt_frames else pd.DataFrame()
    print("TT data", dfTT.shape)
    dfttd = _scan_platform(
        dfTT, '头条', 'content',
        ['city', 'account', 'date', 'title', 'url', 'content', 'nread'],
        {'city': "市州", 'account': "账号名称", 'date': "日期", 'title': "标题", 'url':'链接', 'content': "内容", "nread": "阅读数/评论数"},
        RESULT_COLUMNS)
    print("Found ", dfttd.shape)
    results.append(dfttd)

#################################
# An empty frame with the expected columns keeps the steps below working when
# no platform is enabled or nothing matched.
df = pd.concat(results) if results else pd.DataFrame(columns=RESULT_COLUMNS)
print('扫描完成,发现敏感词', df.shape[0])
df['date'] = pd.to_datetime(df['日期'])
df['identifier'] = _identifier(df['账号名称'], df['date'])

print('读取已发现的敏感词文件')
found_frames = []
for f in fFound:
    dff = pd.read_excel(f)
    print(' ', f, dff.shape)
    found_frames.append(dff)
# Expected columns include 账号名称 and 发文时间 (read below).
dfFound = pd.concat(found_frames, ignore_index=True) if found_frames \
    else pd.DataFrame(columns=['账号名称', '发文时间'])
dfFound['date'] = pd.to_datetime(dfFound['发文时间'])
dfFound['identifier'] = _identifier(dfFound['账号名称'], dfFound['date'])
print('', dfFound.shape[0])

print('读取国办通报数据')
informed_frames = []
for f in fInformed:
    dfff = pd.read_excel(f)
    print(' ', f, dfff.shape[0])
    informed_frames.append(dfff)
# Expected columns include 公众号名称 and 文章发布时间 (read below).
dfInformed = pd.concat(informed_frames, ignore_index=True) if informed_frames \
    else pd.DataFrame(columns=['公众号名称', '文章发布时间'])
dfInformed['date'] = pd.to_datetime(dfInformed['文章发布时间'])
dfInformed['identifier'] = _identifier(dfInformed['公众号名称'], dfInformed['date'])
print('', dfInformed.shape[0])

print('标记被监测出的和被通报的')
# Flag scan hits that monitoring already found or that were officially notified.
df['监测'] = df['identifier'].isin(dfFound['identifier'])
df['通报'] = df['identifier'].isin(dfInformed['identifier'])

print('去掉已标记内容')
dfO = df.loc[~df['监测'] & ~df['通报']].copy()
print(dfO.shape)

#################################
# Output
sss = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
pathO = pathO + sss + '/'
os.makedirs(pathO, exist_ok=True)
dfO = dfO.drop(columns=['date', 'identifier', '监测', '通报'])
dfO.to_excel(pathO + sss + ".xlsx", index=False)
if splitByCity:
    print('按市州输出')
    for city in dfO['市州'].dropna().unique():
        dfO[dfO['市州'].isin([city])].to_excel(pathO + str(city) + '.xlsx', index=False)

# Elapsed time as h:mm:ss. The original printed total minutes, not the
# minutes within the hour, and ran the three numbers together.
elapsed = int((datetime.datetime.now() - t0).total_seconds())
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)
print('用时 {}:{:02d}:{:02d}'.format(hours, minutes, seconds))

238
sendSMS.py Normal file
View File

@ -0,0 +1,238 @@
import http.client
from urllib import parse
import json
def tpl_send_sms(apikey, tpl_id, tpl_value, mobile):
    """Send one SMS through the gateway's template interface.

    Relies on the module-level names ``sms_host``, ``port`` and
    ``sms_tpl_send_uri``.

    Args:
        apikey: Gateway API key.
        tpl_id: Id of the SMS template to use.
        tpl_value: Mapping of template placeholders to values; it is
            URL-encoded on its own before being embedded in the request body.
        mobile: Recipient phone number.

    Returns:
        The raw response body as bytes.
    """
    params = parse.urlencode({
        'apikey': apikey,
        'tpl_id': tpl_id,
        'tpl_value': parse.urlencode(tpl_value),
        'mobile': mobile
    })
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain"
    }
    conn = http.client.HTTPSConnection(sms_host, port=port, timeout=30)
    try:
        conn.request("POST", sms_tpl_send_uri, params, headers)
        return conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
def send_sms(apikey, text, mobile):
    """Send one free-text SMS through the gateway's general interface.

    Relies on the module-level names ``sms_host``, ``port`` and
    ``sms_send_uri``.

    Args:
        apikey: Gateway API key.
        text: Message text.
        mobile: Recipient phone number.

    Returns:
        The raw response body as bytes.
    """
    params = parse.urlencode({'apikey': apikey, 'text': text, 'mobile': mobile})
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain"
    }
    conn = http.client.HTTPSConnection(sms_host, port=port, timeout=30)
    try:
        conn.request("POST", sms_send_uri, params, headers)
        return conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
def sendMessage(apikey = "304eb08353f7ebf00596737acfc31f53"):
    """Send the fixed notice template to every listed contact, one SMS each.

    The template takes no placeholders, so an empty value is passed. Each
    gateway reply is decoded and printed next to the contact.
    """
    # NOTE(review): the default API key is a credential embedded in source.
    # Template: notice that the warning cycle changes from 10 days to 7 days.
    tpl_id = 4621614
    # Phone numbers: company staff first, then contacts per city / county.
    dictGS = {
        'szq': '13359446622',
        'zyb': '13609346975'
    }
    dictCities = {
        '天水市': {'王慧': '18706936366', '王肖肖': '17793816150'},
        '白银市': {'高雅丽': '15393391905', '范小强': '13639306533', '张静静': '13830021006'},
        '定西市': {'党辉': '18893219695', '高刚': '18993265998'},
        '酒泉市': {'白苍松白秘书长': '13909371177', '吴建平': '13389370534'},
        '临夏州': {'周世泽': '13830103221', '马清明': '13993012391', '马静': '13993096392'},
        '平凉市': {'雷勇': '13809330195', '梁文芬': '13993366938'},
        '嘉峪关市': {'彭松涛':'18893605128'},
        '庆阳市': {'孙德勋': '13909342931', '闫红': '18993490882'},
        '华池县': {'李银粉': '13884192323'},
        '宁县': {'张虎帅': '13993434900'},
        '镇原县': {'刘主任': '13994327967', '苟罗文': '15268989815'},
    }
    # The company's own numbers form one more group, sent last.
    dictCities['大未来'] = dictGS
    for group, members in dictCities.items():
        print('----', group)
        for person, number in members.items():
            reply = tpl_send_sms(apikey, tpl_id, '', number)
            print(' ', person, number, reply.decode('utf-8'))
def sendReportMonthly(apikey, sYear, sMon):
    """Tell each city's contacts that the monthly report has been delivered.

    For every city the company's own numbers are merged into the contact
    list, then one template SMS (city, year, month) is sent per recipient and
    the decoded gateway reply is printed.
    """
    # Template: "<city> monitoring report for <year>/<month> has been sent".
    tpl_id = 4272748
    # Phone numbers
    dictGS = {
        'szq': '13359446622',
        'zyb': '13609346975'
    }
    dictCities = {
        '天水市': {'王慧': '18706936366', '王肖肖': '17793816150'},
        '白银市': {'高雅丽': '15393391905', '张静静': '13830021006'},
        '定西市': {'张勇':'13993200605', '高刚': '18993265998'},#'党辉': '18893219695',
        '酒泉市': {'吴建平': '13389370534'}, #'白苍松白秘书长': '13909371177',
        '临夏州': {'周世泽': '13830103221', '马清明': '13993012391', '任琴霞': '13909300361'},
        '平凉市': {'雷勇': '13809330195', '万朵': '15193383961'},
        '庆阳市': {'孙德勋': '13909342931', '闫红': '18993490882'},
        #'华池县': {},#'李银粉': '13884192323'},
        #'宁县': {},#'张虎帅': '13993434900'},
        #'镇原县': {}, #{'刘主任': '13994327967'}, #, '苟罗文': '15268989815'},
        '嘉峪关市': {'彭松涛': '18893605128'},
        '武威市': {'马巨龙': '15379291530'},
        '兰州新区': {'高天晓副主任':'13993685885', '刘玉明科长':'17726983336', '闫鹏':'15117091122', },
        '陇南市': {'王军主任':'18093988558', '杨帅兵':'13830941310'},
        '张掖市': {'李伟璟副主任':'13909365376', '张炜':'18993628432'}
    }
    # One pass per city; company staff are appended to each recipient list.
    for city_name, members in dictCities.items():
        print('----', city_name)
        members.update(dictGS)
        for person in members:
            number = members[person]
            tpl_value = {'#city#': city_name, '#year#': sYear, '#month#': sMon}
            reply = tpl_send_sms(apikey, tpl_id, tpl_value, number)
            print(' ', person, number, reply.decode('utf-8'))
def sendForewarning(apikey):
    """Print the weekly early-warning SMS text for every contact of each unit.

    Sending is disabled (the gateway calls are commented out): for each unit
    in the results table the message that would be sent is composed and
    printed per recipient. Units with a positive warning count get the
    warning text, units with zero get the "all normal" text.
    """
    # Template with findings (date range, city, account count, flagged count, reason).
    tpl_id = 4058906
    # Template without findings (date range, city, account count).
    tpl_id1 = 4348890
    # Phone numbers
    dDwlNamePhone = {
        'szq': '13359446622',
        'zyb': '13609346975'
    }
    dCityNamePhone = {
        '天水市': {'王慧': '18706936366', '王肖肖': '17793816150'},
        '白银市': {'高雅丽': '15393391905', '范小强': '13639306533', '张静静': '13830021006'},
        '定西市': {'张勇':'13993200605', '高刚': '18993265998'},
        '酒泉市': {'新领导': '13909371177', '吴建平': '13389370534'},
        '临夏州': {'周世泽': '13830103221', '马清明': '13993012391', '马静': '13993096392'},
        '平凉市': {'雷勇': '13809330195', '梁文芬': '13993366938'},
        '嘉峪关市': {'彭松涛':'18893605128'},
        '庆阳市': {'孙德勋': '13909342931', '闫红': '18993490882'},
        '华池县': {},#'李银粉': '13884192323'},
        '宁县': {'张虎帅': '13993434900'},
        '镇原县': {'刘主任': '13994327967', '苟罗文': '15268989815'},
    }
    ##############################################################################
    # Monitoring period and per-unit results
    dDate = {
        'dateStart': '6月24日',
        'dateEnd': '30日'
    }
    dResults = {
        '白银市': {'账号数量': '361', '预警原因': '无更新', '预警数量': '49'},
        '定西市': {'账号数量': '406', '预警原因': '无更新', '预警数量': '23'},
        '酒泉市': {'账号数量': '376', '预警原因': '无更新', '预警数量': '22'},
        '临夏州': {'账号数量': '295', '预警原因': '无更新', '预警数量': '13'},
        '平凉市': {'账号数量': '312', '预警原因': '无更新', '预警数量': '16'},
        '庆阳市': {'账号数量': '303', '预警原因': '无更新', '预警数量': '10'},
        '天水市': {'账号数量': '225', '预警原因': '无更新', '预警数量': '19'},
        # '嘉峪关市': {'账号数量': '97', '预警原因': '无更新', '预警数量': '7'},
        '华池县': {'账号数量': '38', '预警原因': '无更新', '预警数量': '2'},
        '宁县': {'账号数量': '36', '预警原因': '无更新', '预警数量': '1'},
        '镇原县': {'账号数量': '33', '预警原因': '', '预警数量': '0'},
    }
    ##############################################################################
    # One pass per unit in the results table
    for sCity, outcome in dResults.items():
        print('----', sCity)
        if sCity not in dCityNamePhone:
            continue
        recipients = dCityNamePhone[sCity]
        recipients.update(dDwlNamePhone)  # company staff receive every message too
        for sName, phone in recipients.items():
            flagged = int(outcome['预警数量'])
            if flagged > 0:
                tpl_value = {'#dateStart#': dDate['dateStart'], '#dateEnd#': dDate['dateEnd'],
                             '#city#': sCity, '#count#': outcome['账号数量'],
                             '#uq#': outcome['预警数量'], '#problem#': outcome['预警原因'], }
                ##sss = tpl_send_sms(apikey, tpl_id, tpl_value, phone)
                sss = ''.join([
                    '【甘肃大未来科技】政务新媒体监测预警:', dDate['dateStart'], '', dDate['dateEnd'],
                    '', sCity, '被监测的', outcome['账号数量'], '个政务新媒体账号中,有',
                    outcome['预警数量'], '个账号', outcome['预警原因'], ',具体名单将发至相关工作人员,请予以关注提醒。',
                ])
                print(' ', sName, phone, sss)
            elif flagged == 0:
                tpl_value = {'#dateStart#': dDate['dateStart'], '#dateEnd#': dDate['dateEnd'],
                             '#city#': sCity, '#amount#': outcome['账号数量'], }
                #sss = tpl_send_sms(apikey, tpl_id1, tpl_value, phone)
                sss = ''.join([
                    '【甘肃大未来科技】', dDate['dateStart'], '', dDate['dateEnd'], ',监测',
                    sCity, '政务新媒体账号', outcome['账号数量'], '个,更新频次和发布内容正常。',
                ])
                print(' ', sName, phone, sss)
            else:
                print('!!!!! ERROR !!!!!')
##########################
# SMS gateway address and port
##########################
# These names are read as globals by tpl_send_sms / send_sms. They are defined
# at module level (not under the __main__ guard) so the functions also work
# when this module is imported instead of run as a script.
# Service hosts
sms_host = "sms.yunpian.com"
voice_host = "voice.yunpian.com"
# Port
port = 443
# API version
version = "v2"
# URI for querying account information
user_get_uri = "/" + version + "/user/get.json"
# URI of the free-text (auto-matched template) SMS interface
sms_send_uri = "/" + version + "/sms/single_send.json"
# URI of the template SMS interface
sms_tpl_send_uri = "/" + version + "/sms/tpl_single_send.json"
# URI of the voice SMS interface
sms_voice_send_uri = "/" + version + "/voice/send.json"
# Voice verification code
voiceCode = 1234

if __name__ == "__main__":
    # NOTE(review): the API key is a credential embedded in source; consider
    # loading it from configuration or the environment instead.
    apikey = "304eb08353f7ebf00596737acfc31f53"
    # Send a notice to all members
    #sendMessage(apikey)
    # Send the monthly-report notice city by city
    sendReportMonthly(apikey, '2023', '1')
    # Send the early-warning messages city by city
    #sendForewarning(apikey)

1072
statForward202303.py Normal file

File diff suppressed because it is too large Load Diff

284
statWeekly.py Normal file
View File

@ -0,0 +1,284 @@
import pandas as pd
import numpy as np
import glob, os, re, time
from datetime import datetime
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt,RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
import http.client
from urllib import parse
################
################
# Run mode: True = test run, no SMS is sent; False = live run, SMS are sent.
TEST = False # True: test mode, no SMS sent. False: live mode, SMS sent.
################
################
# Monitoring period inserted into the warning texts.
dDate = {
'dateStart': '3月23日',
'dateEnd': '29日'
}
# Input workbook with the weekly monitoring results, and the output folder.
fn = 'D:/Projects/POM/DATA/2023年3月/3月31日预警/周预警_2023.3.29.xlsx'
outPath = 'D:/Projects/POM/DATA/2023年3月/3月31日预警/'
################
# Units to report on. This is a set, so the repeated entry is harmless and
# the processing order is not fixed.
cities = {'白银市', '武威市',
'庆阳市',
'酒泉市',
'天水市',
'临夏回族自治州', '平凉市', '定西市', '定西市', '嘉峪关市',
'兰州新区','陇南市', '张掖市', '庆阳市宁县', '庆阳市镇原县', } #
#cities = {'酒泉市'}
# Phone numbers: company staff, merged into every unit's recipient list.
contactsDWL = {
'szq': '13359446622',
'zyb': '13609346975'
}
# Phone numbers per unit, keyed by the same names used in `cities`.
contacts = {
'天水市': {'王慧': '18706936366', '王肖肖': '17793816150'},
'白银市': {'高雅丽': '15393391905', '张静静': '13830021006'},
'定西市': {'张勇':'13993200605', '高刚': '18993265998'},
'酒泉市': {'吴建平': '13389370534'},
'临夏回族自治州': {'周世泽': '13830103221', '马清明': '13993012391', '马静': '13993096392'},
'平凉市': {'雷勇': '13809330195', '万朵': '15193383961'},
'武威市': {'马巨龙': '15379291530'}, #'孙彪': '17793551918'},
'嘉峪关市': {'彭松涛': '18893605128'},
'庆阳市': {'孙德勋': '13909342931', '闫红': '18993490882'},
#'庆阳市华池县': {'李保宁': '13739343092'},
'庆阳市宁县': {'zyb': '13609346975'},
'庆阳市镇原县': {'zyb': '13609346975'},#'刘主任': '13994327967', '王怡文': '18219942918'
'兰州新区': {'高天晓副主任':'13993685885', '刘玉明科长':'17726983336', '闫鹏':'15117091122', },
'陇南市': {'王军主任':'18093988558', '杨帅兵':'13830941310'},
'张掖市': {'张炜':'18993628432', '李伟璟':'13909365376'}
}
# Load the weekly monitoring sheet and normalise its text cells.
df = pd.read_excel(fn)
# Strip all whitespace inside string cells (raw string: '\s' is a regex class,
# not a Python escape).
df.replace(r'\s+', '', regex=True, inplace=True)
# Both WeChat account flavours are reported simply as "微信".
df.loc[df['账号类型'].isin(['微信服务号', '微信订阅号']), '账号类型'] = '微信'
# Drop a leading "其他+" prefix. regex=True is required: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default.
df['账号类型'] = df['账号类型'].str.replace(r'^其他\+', '', regex=True)
# SMS
# SMS gateway endpoint (host, port and template-send URI) handed to
# tpl_send_sms by the sending loop.
sms_host = "sms.yunpian.com"
port = 443
sms_tpl_send_uri = "/v2/sms/tpl_single_send.json"
# NOTE(review): the API key is a credential embedded in source; consider
# loading it from configuration or the environment instead.
apikey = "304eb08353f7ebf00596737acfc31f53"
def tpl_send_sms(sms_host, port, sms_tpl_send_uri, apikey, tpl_id, tpl_value, mobile):
    """Send one SMS through the gateway's template interface.

    Args:
        sms_host: Gateway host name.
        port: Gateway HTTPS port.
        sms_tpl_send_uri: Request path of the template-send endpoint.
        apikey: Gateway API key.
        tpl_id: Id of the SMS template to use.
        tpl_value: Mapping of template placeholders to values; it is
            URL-encoded on its own before being embedded in the request body.
        mobile: Recipient phone number.

    Returns:
        The raw response body as bytes.
    """
    params = parse.urlencode({
        'apikey': apikey,
        'tpl_id': tpl_id,
        'tpl_value': parse.urlencode(tpl_value),
        'mobile': mobile
    })
    headers = {
        "Content-type": "application/x-www-form-urlencoded",
        "Accept": "text/plain"
    }
    conn = http.client.HTTPSConnection(sms_host, port=port, timeout=30)
    try:
        conn.request("POST", sms_tpl_send_uri, params, headers)
        return conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
# Per-unit weekly warning run: for every unit in `cities`, count monitored and
# non-compliant accounts, write a .docx summary, send (or preview) the SMS,
# and for 陇南市 also write a per-county Excel table.
aa = 0  # running total of non-compliant accounts over all units
bb = 0  # running total of monitored accounts over all units
# County-level units reported separately from their parent city 庆阳市.
county_units = {'庆阳市宁县': '宁县', '庆阳市华池县': '华池县', '庆阳市镇原县': '镇原县'}
for city in cities:
    print('~~~~~~~~~~~~~~~~~~~~~~~~')
    dfC = df.loc[df['市/省局'] == city].copy()
    cc = dfC.shape[0]
    if city == '庆阳市':
        # The separately reported counties are excluded from the rows.
        # NOTE(review): cc keeps the whole-city count here, as in the original
        # flow — confirm whether it should be the filtered count instead.
        dfC = df.loc[(df['市/省局'] == city)
                     & ~df['区县/地方部门'].isin(list(county_units.values()))].copy()
    elif city in county_units:
        dfC = df.loc[(df['市/省局'] == '庆阳市')
                     & (df['区县/地方部门'] == county_units[city])].copy()
        cc = dfC.shape[0]

    # Accounts whose monitoring result is anything other than "合格".
    dfCU = dfC.loc[dfC['监测结果'] != '合格'].copy()
    flagged = dfCU.shape[0]

    if flagged > 0:
        warningText = '【甘肃大未来科技】政务新媒体监测预警:{}{}{}被监测的{}个政务新媒体账号中,有{}个账号无更新,具体名单附后,请予以关注提醒。'.format(dDate['dateStart'], dDate['dateEnd'], city, cc, flagged)
    else:
        warningText = '【甘肃大未来科技】政务新媒体监测预警:{}{},监测{}政务新媒体账号{}个,更新频次和发布内容正常。'.format(dDate['dateStart'], dDate['dateEnd'], city, cc)
    print(warningText)

    # One line per account type listing the flagged account names.
    warningLists = []
    if flagged > 0:
        for acc_type, dfa in dfCU.groupby('账号类型'):
            s = '{}({}个): {}'.format(acc_type, dfa.shape[0], ', '.join(dfa['账号名称']))
            warningLists.append(s)
            print(s)
    print(' ')

    # Per-unit report document, kept as an archive copy.
    doc = Document()
    normal_style = doc.styles['Normal']
    normal_style.font.name = u'宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal_style.font.size = Pt(16)
    normal_style.font.color.rgb = RGBColor(0, 0, 0)
    p1 = doc.add_heading(city + '政务新媒体监测预警', 0)
    p1.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p2 = doc.add_paragraph(warningText)
    p2.paragraph_format.first_line_indent = 406400
    for s in warningLists:
        doc.add_paragraph(s)
    doc.save(outPath + city + '.docx')

    # SMS: unit contacts plus company staff. .get() keeps a unit without a
    # contact entry from aborting the whole run with a KeyError.
    recipients = {**contacts.get(city, {}), **contactsDWL}
    if flagged > 0:
        # Template with findings.
        tpl_id = 4058906
        tpl_value = {'#dateStart#': dDate['dateStart'], '#dateEnd#': dDate['dateEnd'], '#city#': city,
                     '#count#': cc,
                     '#uq#': flagged, '#problem#': '未更新', }
        preview = ''.join([
            '【甘肃大未来科技】政务新媒体监测预警:', dDate['dateStart'], '', dDate['dateEnd'],
            '', city, '被监测的', str(cc), '个政务新媒体账号中,有',
            str(flagged), '个账号', '未更新', ',未发现涉及敏感的错误内容。具体名单发至相关工作人员,请予以关注。',
        ])
    else:
        # Template without findings.
        tpl_id = 4348890
        tpl_value = {'#dateStart#': dDate['dateStart'], '#dateEnd#': dDate['dateEnd'], '#city#': city,
                     '#amount#': cc}
        preview = ''.join([
            '【甘肃大未来科技】政务新媒体监测预警:', dDate['dateStart'], '', dDate['dateEnd'],
            ',监测', city, '政务新媒体账号', str(cc), '个,更新频次和发布内容正常。',
        ])
    log = ''
    sss = ''
    for contact, phone in recipients.items():
        log += contact + phone + ', '
        sss = preview
        if not TEST:
            sss = tpl_send_sms(sms_host, port, sms_tpl_send_uri, apikey, tpl_id, tpl_value,
                               phone).decode('utf-8')
    print("sendSMS:", sss)
    print(" ", log[:-2])

    # Table output (only produced for 陇南市).
    if city in ['陇南市']:
        # Rows without a county belong to city-level departments.
        dfCU.loc[dfCU['区县/地方部门'] == '', '区县/地方部门'] = '市直单位'
        dfCU['区县/地方部门'] = dfCU['区县/地方部门'].fillna('市直单位')
        # Flagged accounts per county.
        dfLN = dfCU.groupby('区县/地方部门').agg({"账号名称": "count"})
        # Account names per county and platform, comma separated.
        dfLNR = dfCU.groupby(['区县/地方部门', '账号类型'])['账号名称'].apply(lambda x: x.str.cat(sep=', ')).reset_index()
        # Flagged accounts per platform.
        dfLNS = dfCU.groupby(['账号类型']).agg({"账号名称": "count"})
        print('-=-=-=')
        print(dfLNS)
        print('-=-=-=')

        types = ['微信', '新浪微博', '今日头条', '抖音短视频']
        header = ['区县', '未更新数'] + types

        # county -> [flagged count, names per platform...]; label-based access
        # replaces the deprecated positional r[0].
        d = {county: [r['账号名称'], '', '', '', ''] for county, r in dfLN.iterrows()}
        for _, r in dfLNR.iterrows():
            sCounty = r['区县/地方部门']
            sType = r['账号类型']
            sAccount = r['账号名称']
            print(' ', sCounty, sType, sAccount)
            if sType not in types:
                # A platform outside the table has no column to go into.
                print('!!!!! unexpected account type, not tabulated:', sType)
                continue
            d[sCounty][1 + types.index(sType)] = sAccount
        print(d)
        print('--')

        records = []
        for k, cells in d.items():
            print(k, cells[0], cells[1], cells[2], cells[3], cells[4])
            records.append(dict(zip(header, [k] + cells)))

        # Totals row: overall flagged count plus the count per platform.
        per_type = [dfLNS['账号名称'].get(t, 0) for t in types]
        total_row = {'区县': '总 计', '未更新数': sum(cells[0] for cells in d.values())}
        total_row.update(zip(types, per_type))
        records.append(total_row)
        dfw = pd.DataFrame(records, columns=header)
        print('====')
        print('==-==', per_type[0], per_type[1], per_type[2], per_type[3])

        # Write the table starting on row 2 and put the title on row 1.
        sFn = outPath + city + '周预警' + datetime.now().strftime('_%Y.%m.%d') + '.xlsx'
        print('======== write to ', sFn)
        title = '政务新媒体周预警未更新账号统计表({}-{}'.format(dDate['dateStart'], dDate['dateEnd'])
        # ExcelWriter.save() was removed in pandas 2.0; the context manager
        # saves and closes the workbook.
        with pd.ExcelWriter(sFn) as writer:
            dfw.to_excel(writer, index=False, startrow=1)
            ws = writer.sheets['Sheet1']
            # NOTE(review): write_string is the xlsxwriter worksheet API; this
            # assumes xlsxwriter is the engine pandas selects — confirm.
            ws.write_string(0, 0, title)

    aa += flagged
    bb += cc
    print('----{}----({}/{})'.format(city, flagged, cc))
print('----{}----({}/{})'.format('ALL', aa, bb))