This commit is contained in:
bob 2023-07-22 09:19:10 +08:00
parent 1368bf1f0f
commit 95575b137a
12 changed files with 4673 additions and 69 deletions

725
StatMonthly.py Normal file
View File

@ -0,0 +1,725 @@
# 1. 打开监测任务表格
import pandas as pd
import numpy as np
import os, glob, re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import datetime
#word toc
import win32com
import win32com.client as win32
from win32com.client import constants
#pdf
from pikepdf import Pdf,Page,Rectangle
#word
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
    """Overlay the official seal and the signature onto a report PDF.

    The first page of the stamp PDF is drawn on the first page of the
    target, the second page of the stamp PDF on the third page of the
    target, and the result is written to ``output_pdf_path``.

    Args:
        target_pdf_path: PDF that receives the overlays.
        watermark_pdf_path: PDF providing the two overlay pages (page 1 is
            used as the seal, page 2 as the signature).
        output_pdf_path: Path the stamped PDF is saved to.
        sy: Lower y coordinate of the seal rectangle on the first page.

    Raises:
        IndexError: If the target has fewer than three pages or the stamp
            PDF fewer than two (raised by the page lookups).
    """
    # Both documents are opened as context managers so their file handles are
    # released even when an overlay or the save fails; an open handle would
    # otherwise keep the caller from deleting the intermediate PDF on Windows.
    with Pdf.open(target_pdf_path) as target_pdf, \
            Pdf.open(watermark_pdf_path) as watermark_pdf:
        seal_page = watermark_pdf.pages[0]
        signature_page = watermark_pdf.pages[1]

        # Official seal on the first page; only its vertical position varies.
        x, y, w, h = 240, sy, 115, 115
        target_pdf.pages[0].add_overlay(seal_page, Rectangle(x, y, x + w, y + h))

        # Signature on the third page, at a fixed position.
        x, y, w, h = 163, 573, 85, 50
        target_pdf.pages[2].add_overlay(signature_page, Rectangle(x, y, x + w, y + h))

        # Save while the stamp PDF is still open: the overlays reference it.
        target_pdf.save(output_pdf_path)
def update_toc(docx_file):
    """Refresh the table-of-contents page numbers and export the document as PDF.

    The PDF is written next to the document, named by replacing ``.docx``
    with ``_.pdf`` in ``docx_file``.

    Args:
        docx_file: Path of the Word document to open.
    """
    word = win32com.client.DispatchEx("Word.Application")
    try:
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0
        doc = word.Documents.Open(docx_file)
        exported = False
        try:
            toc_count = doc.TablesOfContents.Count
            if toc_count == 0:
                # A missing table of contents is only reported, not created.
                print("无目录")
            elif toc_count == 1:
                # Only the page numbers are refreshed, not the whole table.
                doc.TablesOfContents(1).UpdatePageNumbers()
            doc.SaveAs(docx_file.replace('.docx', '_.pdf'), FileFormat=17)
            exported = True
        finally:
            # Save the document on success only; after a failure it is closed
            # without saving (presumably False maps to wdDoNotSaveChanges).
            doc.Close(SaveChanges=exported)
    finally:
        # Always quit, otherwise a failed run leaves a hidden Word process behind.
        word.Quit()
def toDate(strDT, fmt='%m-%d'):
    """Format a date-like value as text, or return '' when it cannot be parsed.

    Args:
        strDT: Value handed to ``pd.to_datetime`` (callers pass strings).
        fmt: ``strftime`` pattern for the result; defaults to month-day.

    Returns:
        The formatted date, or an empty string when parsing yields a
        missing value.
    """
    dt = pd.to_datetime(strDT, errors='coerce')
    if pd.isna(dt):
        return ''
    return dt.strftime(fmt)
# Fill the Word template
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with the report context plus the chart images.

    Args:
        tmep_path: Path of the docx template.
        word_apth: Path the rendered document is saved to.
        dContext: Template variables; left unmodified.
        pathImage: Directory holding the chart PNG files.
        city: Prefix of the chart file names (``<city><chart name>.png``).
    """
    tpl = DocxTemplate(tmep_path)
    chart_names = ('annulusMediaCount', 'annulusCountyCount', 'annulusCountyArticle',
                   'annulusResult', 'barCountyRatio')
    # Work on a copy so the caller's dict does not end up holding InlineImage
    # objects that are bound to this particular template.
    render_context = dict(dContext)
    for chart_name in chart_names:
        render_context[chart_name] = InlineImage(
            tpl, os.path.join(pathImage, city + chart_name + '.png'), width=Mm(120))
    tpl.render(render_context)
    tpl.save(word_apth)
# Draw a bar chart
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart whose y axis is shown in percent and save it.

    Args:
        data: Bar heights; the y axis is fixed to the range 0..1.
        recipe: Bar labels, also used as the x tick positions.
        title: Chart title.
        fn: Output image path; nothing is saved when it is empty (same
            convention as ``drawAnnulus``).
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    def to_percent(temp, position):
        # Tick formatter: 0.25 -> '25%'
        return '%2.0f' % (100 * temp) + '%'

    fig = plt.figure(figsize=(6, 4))
    try:
        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))
        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        if len(fn) > 0:
            plt.savefig(fn)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
# Draw a ring (donut) chart
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring chart with a legend to the right of it and save it.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is saved when it is empty.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Long legends get a wider canvas, more legend columns and a smaller font.
    fig_width, legend_cols, legend_font = 8, 1, 'medium'
    if len(recipe) > 40:
        fig_width, legend_cols, legend_font = 16, 4, 'small'
    elif len(recipe) > 20:
        fig_width, legend_cols, legend_font = 16, 2, 'small'

    fig, ax = plt.subplots(figsize=(fig_width, 4), subplot_kw=dict(aspect="equal"))
    try:
        # wedgeprops' width turns the pie into a ring; wedges start at angle 0.
        ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)
        # The monitoring-result chart places its legend closer to the ring.
        legend_x = 1.0 if title == '政务新媒体监测结果' else 1.2
        plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(legend_x, 0.5),
                   borderaxespad=0., ncol=legend_cols, fontsize=legend_font)
        if len(title) > 0:
            ax.set_title(title, fontsize=16, fontweight='heavy')
        plt.tight_layout()
        if len(fn) > 0:
            plt.savefig(fn)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
# Characters removed from typo sentences because they disturb the table
# template output. Raw string: same character set as before, without the
# invalid "\[" escape of the former plain literal.
_CBZ_PUNCTUATION = re.compile(r"[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\[\]'\"]")


def _accountRows(dfAccounts, subordinate):
    """Build the template rows of an account table (tt4 / tt5)."""
    rows = []
    for _, row in dfAccounts.iterrows():
        rows.append({
            'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
            'county': row[subordinate], 'result': row['监测结果'],
            # Falsy cells (0, '') are rendered as blanks.
            'num': "%d" % row['更新次数'] if row['更新次数'] else '',
            'days': "%d" % row['静默日数'] if row['静默日数'] else '',
            'start': toDate(str(row['静默开始日期'])) if row['静默开始日期'] else '',
            'end': toDate(str(row['静默结束日期'])) if row['静默结束日期'] else '',
        })
    return rows


def _issueRows(dfIssues, cleanSentence=False):
    """Build the template rows of a typo / sensitive-word table."""
    hasTitle = '标题' in dfIssues.columns
    rows = []
    for _, row in dfIssues.iterrows():
        sentence = row['错误出现位置']
        if cleanSentence:
            sentence = '' if pd.isna(sentence) else _CBZ_PUNCTUATION.sub('', sentence)
        rows.append({
            'error': row['错误'], 'tips': row['建议'], 'sentence': sentence,
            'type': row['账号类型'], 'name': row['账号名称'],
            'date': toDate(str(row['发文时间'])),
            'title': row['标题'] if hasTitle else '',
        })
    return rows


# Summarize one city/prefecture and produce its report.
def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp,
                fnStamp='D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf'):
    """Compute the statistics of one reporting unit and build its stamped PDF report.

    Steps: select the unit's rows, build the pivot statistics and text
    snippets, draw the five charts into ``dirTemp``, render the Word
    template to ``fnReport``, export it to PDF, stamp the PDF and delete
    the intermediate ``_.pdf`` file.

    Args:
        info: Report metadata; merged into the template context and must
            contain ``'serialNum'``.
        city: Reporting unit; must be a key of the client table below.
        df: Monitoring data of all units.
        dfW: Typo findings; filtered on its '市州' column.
        dfS: Sensitive-word findings; filtered on its '市州' column.
        fnTemplate: Path of the Word template.
        fnReport: Path of the ``.docx`` report to write.
        dirTemp: Directory for the chart images.
        fnStamp: PDF providing the seal and signature pages.

    Raises:
        ValueError: If ``city`` is not a known reporting unit.
    """
    # Client (commissioning body) per reporting unit. The insertion order is
    # significant: the position of a unit becomes the prefix of its report id.
    dCityClient = {
        '甘肃省': "甘肃省人民政府办公厅",
        '省直部门': "甘肃省人民政府办公厅",
        '白银市': "白银市人民政府办公室",
        '定西市': "定西市人民政府办公室",
        '临夏回族自治州': "临夏回族自治州人民政府办公室",
        '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
        "庆阳市": "庆阳市电子政务与信息资源管理办公室",
        '庆阳市华池县': "华池县人民政府办公室",
        '庆阳市宁县': "宁县人民政府办公室",
        "庆阳市镇原县": "镇原县人民政府办公室",
        "酒泉市": "酒泉市人民政府办公室",
        "天水市": "天水市人民政府办公室",
        "武威市": "武威市人民政府办公室",
        "金昌市": "金昌市人民政府办公室",
        "嘉峪关市": "嘉峪关市人民政府办公室",
        "兰州新区": "兰州新区管委会办公室",
        "陇南市": "陇南市政务服务中心",
        "张掖市": "张掖市政务服务中心",
        "甘南藏族自治州": "甘南藏族自治州政务服务中心",
        "兰州市": "兰州市政务服务中心",
    }
    # Whether the unit is broken down into subordinate units in the report.
    dHavingSubordinateUnits = {
        '甘肃省': True, '白银市': True, '定西市': True,
        '临夏回族自治州': True, '平凉市': True, "庆阳市": True, "酒泉市": True, "天水市": True,
        "陇南市": True, "张掖市": True, "甘南藏族自治州": True, "兰州市": True,
        "武威市": True, "金昌市": True,
        '省直部门': False, "兰州新区": False, '庆阳市华池县': False,
        '庆阳市宁县': False, "庆阳市镇原县": False, "嘉峪关市": False}
    print("----------------" + city + "----------------")

    # Report id and client
    strID = "%02d" % (list(dCityClient).index(city))
    context = {
        "city": city,
        "client": dCityClient[city],
        "reportid": strID + info['serialNum'],
        'havingSubordinateUnits': dHavingSubordinateUnits[city],
        'havingBelowStandard': True,
        'havingUpStandard': True,
        'havingCbz': True,
        'havingMgc': True
    }
    context.update(info)

    # Column used for the breakdown and the word used for it in chart titles.
    subordinate = '区县/地方部门'
    subordinateName = '县区'

    # ---- select the rows of this unit ----
    if "庆阳市" in city:
        isQingyang = df['市/省局'] == '庆阳市'
        # County-level reports of Qingyang only use that county's accounts.
        for countyName in ('华池县', '宁县', '镇原县'):
            if countyName in city:
                dfc = df.loc[isQingyang & (df['区县/地方部门'] == countyName)].copy()
                break
        else:
            dfc = df.loc[isQingyang].copy()
        # Typos and sensitive words are always taken for the whole of Qingyang.
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city:
        # The province report covers these units and is broken down by unit.
        provinceUnits = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
                         '嘉峪关市', '陇南市', '张掖市', '省直部门', '金昌市', '甘南藏族自治州'}
        dfc = df.loc[df['市/省局'].isin(provinceUnits)].copy()
        dfcw = dfW.loc[dfW['市州'].isin(provinceUnits)].copy()
        dfcs = dfS.loc[dfS['市州'].isin(provinceUnits)].copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    else:
        dfc = df.loc[(df['市/省局'] == city)].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()

    # Values computed below; merged into the template context at the end.
    dCity = {}

    # ---- accounts per subordinate unit and monitoring result ----
    dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'],
                                     aggfunc='count', fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
    resultColumns = dfCountyAccount.columns.values.tolist()

    def countCell(row, column):
        # Empty pivot cells hold '' (fill_value) and stay blank in the table.
        if column in resultColumns and not isinstance(row[column], str):
            return int(row[column])
        return ''

    tt3_list = []
    for index, row in dfCountyAccount.iterrows():
        county = '总 计' if index == 'All' else index
        if not dHavingSubordinateUnits[city] and county == '市直部门':
            county = city
        tt3_list.append({'county': county, 'hg': countCell(row, '合格'),
                         'u2w': countCell(row, '超过两周未更新'),
                         'un': countCell(row, '监测期间未更新'),
                         'count': countCell(row, 'All')})
    context['tt3_contents'] = tt3_list

    # ---- accounts per media type ----
    dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count', fill_value='',
                             margins=True)
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    # Drop the total row by label: after sorting it is not guaranteed to come
    # first when a single media type ties with the total.
    dfMedia = dfMedia.drop('All').sort_values(by='账号名称', ascending=False)
    mediaTypes = dfMedia.index.tolist()
    mediaCounts = dfMedia['账号名称'].tolist()
    tt1_list = [{'type': m, 'count': str(n)} for m, n in zip(mediaTypes, mediaCounts)]
    dCity['sMediaCount'] = ','.join(m + str(n) + '个' for m, n in zip(mediaTypes, mediaCounts))
    context.update({'tt1_contents': tt1_list})

    # ---- update counts per subordinate unit ----
    dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum', fill_value='',
                                     margins=True)
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.loc['All', '更新次数']
    # Same label-based removal of the total row as above.
    dfCountyArticle = dfCountyArticle.drop('All').sort_values(by='更新次数', ascending=False)
    articleUnits = dfCountyArticle.index.tolist()
    articleCounts = dfCountyArticle['更新次数'].tolist()
    dCity['countyMostArticle'] = articleUnits[0]
    dCity['countyMostArticleCount'] = "%d" % articleCounts[0]
    if len(articleUnits) > 1:
        # NOTE(review): the former .rstrip('') stripped nothing, so the
        # trailing separator is kept here and in the strings below - confirm
        # against the template whether it should be removed.
        strCountyArticle = ''.join(u + "%d" % n + "次," for u, n in zip(articleUnits, articleCounts))
        dCity['sCountyArticles'] = ',按管理矩阵统计,' + strCountyArticle
    else:
        # No breakdown sentence when there is only one subordinate unit.
        dCity['sCountyArticles'] = ''

    # ---- qualification rate per subordinate unit ----
    # Units without any qualified account have '' in the pivot (or the column
    # is missing altogether); treat both as 0 instead of failing on the cast.
    if '合格' not in dfCountyAccount.columns:
        dfCountyAccount['合格'] = 0
    dfCountyAccount['合格'] = pd.to_numeric(dfCountyAccount['合格'], errors='coerce').fillna(0).astype(int)
    dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
    dfCountyAccount['rate'] = dfCountyAccount['合格'] / pd.to_numeric(dfCountyAccount['All'])
    dfResult = dfCountyAccount.copy()
    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])

    # Unit names and account counts, ordered by account count.
    dfCountyAccount = dfCountyAccount.drop(['All'])
    counties = dfCountyAccount.index.tolist()
    countyCounts = dfCountyAccount['All'].values.tolist()
    dCity['countyCount'] = "%d" % len(counties)
    dCity['sCounties'] = ''.join(counties)
    dCity['sCountyCount'] = ''.join(c + str(n) + '个,' for c, n in zip(counties, countyCounts))

    # Ordered by qualification rate.
    dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
    countieshege = dfCountyAccount.index.tolist()
    countyRates = dfCountyAccount['rate']
    strCountyRatio = ''
    tt2_list = []
    # Iterate over the values: integer subscripts on this label-indexed
    # Series relied on the deprecated positional fallback.
    for c, rate in zip(countieshege, countyRates.tolist()):
        strRatio = "%.1f" % (100.0 * rate)
        strCountyRatio = strCountyRatio + c + strRatio + '%'
        tt2_list.append({'county': c, 'ratio': strRatio + '%'})
    dCity['sCountyRatio'] = strCountyRatio
    dCity['tt2_contents'] = tt2_list

    # ---- charts ----
    print(' 生成图片...')
    drawAnnulus(mediaCounts, mediaTypes,
                '政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
    drawAnnulus(countyCounts, counties,
                subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
    drawAnnulus(articleCounts, articleUnits,
                subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))

    # ---- overall monitoring result ----
    dfResult = dfResult.drop(['All', 'rate'], axis=1)
    numQualified = dfResult.loc['All', '合格']
    dCity['resultQualified'] = "%d" % numQualified
    dCity['resultQualifiedRatio'] = "%.1f%%" % (numQualified / dCity['nmCount'] * 100.0)
    dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - numQualified)
    resultLabels = dfResult.columns.values.tolist()
    if '监测期间未更新' in resultLabels:
        numNoupdated = dfResult.loc['All', '监测期间未更新']
        dCity['stringResultNoUpdated'] = "%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""
    if '超过两周未更新' in resultLabels:
        numNoupdated2W = dfResult.loc['All', '超过两周未更新']
        dCity['stringResultNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''
    resultCounts = dfResult.loc['All'].values.tolist()
    drawAnnulus(resultCounts, resultLabels,
                '政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
    drawBar(countyRates, countieshege,
            '政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))

    # ---- tables of the report ----
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格'].sort_values(by="监测结果", ascending=True)
    dfCityQulified = dfc[dfc['监测结果'] == '合格'].sort_values(by=subordinate, ascending=True)
    # Accounts below standard
    if len(dfCityUnqulified) < 1:
        context.update({'havingBelowStandard': False})
    else:
        context.update({'tt4_contents': _accountRows(dfCityUnqulified, subordinate)})
    # Accounts meeting the standard
    if len(dfCityQulified) < 1:
        context.update({'havingUpStandard': False})
    else:
        context.update({'tt5_contents': _accountRows(dfCityQulified, subordinate)})
    # Typos
    if dfcw.shape[0] < 1:
        context.update({'havingCbz': False})
        # Formerly unreachable: the message was never set for an empty table.
        dCity['stringCbzCount'] = '本次检测未发现错别字。'
    else:
        dfcw = dfcw.fillna('')  # the result used to be discarded
        dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
        context.update({'tCbz_contents': _issueRows(dfcw, cleanSentence=True)})
    # Sensitive words
    if dfcs.shape[0] < 1:
        context.update({'havingMgc': False})
        dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
    else:
        dfcs = dfcs.fillna('')  # the result used to be discarded
        dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
        context.update({'tMgc_contents': _issueRows(dfcs)})
    context.update(dCity)

    # ---- render the template ----
    temp_word(fnTemplate, fnReport, context, dirTemp, city)

    # Refresh the table of contents and export to '<report>_.pdf'.
    print(' 更新目录转换为PDF...')
    update_toc(fnReport)

    # Stamp the exported PDF; these three units get a lower seal position.
    print(' 签章...')
    fnTmp = fnReport.replace('.docx', '_.pdf')
    fnPDF = fnReport.replace('.docx', '.pdf')
    if city in {'庆阳市', '平凉市', '临夏回族自治州'}:
        addStamp(fnTmp, fnStamp, fnPDF, 115)
    else:
        addStamp(fnTmp, fnStamp, fnPDF)
    # The unstamped PDF is only an intermediate file.
    os.remove(fnTmp)
def createDir(dirP, dirS):
    """Make sure sub-directory ``dirS`` exists below ``dirP`` and return its path.

    Args:
        dirP: Parent directory.
        dirS: Name of the sub-directory to create.

    Returns:
        The path of the sub-directory, or ``dirP`` unchanged when ``dirP``
        is not a directory or the sub-directory cannot be created.
    """
    if not os.path.isdir(dirP):
        return dirP
    dirN = os.path.join(dirP, dirS)
    try:
        # exist_ok avoids the check-then-create race; an existing regular
        # file of that name still raises and is handled below.
        os.makedirs(dirN, exist_ok=True)
    except OSError:
        # Report the directory that failed, then fall back to the parent.
        print('Directory ' + dirN + ' cannot be created.')
        return dirP
    return dirN
# Merge the per-city finding workbooks (typos / sensitive words)
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge all ``*<keyword>*.xlsx`` workbooks of a directory into one workbook.

    Every source workbook gets a '市州' column holding the city derived from
    its file name, then all rows are concatenated and written out.

    Args:
        keyword: Text that must occur in the file name.
        strPathCBZ: Directory that is searched (not recursively).
        strFnCbz: Path of the merged workbook to write.
    """
    # Short name / pinyin initials found in file names -> full unit name.
    # The first key found in the file name wins, so the order matters.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    # Sorted so the row order of the merged workbook is reproducible.
    for fn in sorted(glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx'))):
        f = os.path.basename(fn)
        if f.startswith('~$'):
            # Lock file of a workbook currently open in Excel, not a workbook.
            continue
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # One concat instead of DataFrame.append per file (removed in pandas 2.0).
    # Without any input the '市州' column is still written, so readers can
    # filter on it.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=['市州'])
    df.to_excel(strFnCbz)
if __name__ == "__main__":
    # NOTE: convert the date columns of the Excel workbooks before running.
    info = {
        "year": "2023",
        "month": "6",
        "datePub": "二〇二三年七月",
        "dateStart": "2023年6月1日",
        "dateEnd": "2023年6月30日",
        "days": "30",
        "serialNum": "8",
    }
    # Data root directory and its working sub-directories
    strPath = 'D:/Projects/POM/DATA/2023年7月/6月报告/'
    for strSubDir in ('全文', '转发', '报告', '汇总', '监测'):
        createDir(strPath, strSubDir)
    # Monitoring data
    strFnMonitoring = strPath + '汇总/6月汇总数据_2023.6.xlsx'
    # Word template
    strPathTemplate = strPath + 'POM_ReportTemplate.docx'
    # Typos: merged from the per-city workbooks unless already present
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    if not os.path.exists(strFnCbz):
        mergeCMC("错别", strPath + '监测/', strFnCbz)
    # Sensitive words: merged the same way
    strFnMgc = strPath + '汇总/MGC.xlsx'
    if not os.path.exists(strFnMgc):
        mergeCMC("敏感", strPath + '监测/', strFnMgc)
    # Output root
    strPathOutput = strPath

    # Load monitoring data, typos and sensitive words
    df = pd.read_excel(strFnMonitoring)
    dfW = pd.read_excel(strFnCbz)
    dfS = pd.read_excel(strFnMgc)

    # Strip whitespace first: the exact-match replacements below would
    # otherwise miss cells that carry stray spaces, line breaks or tabs.
    df.replace(r'\s+', '', regex=True, inplace=True)
    df.replace(r'^其他\+', '', regex=True, inplace=True)  # drop a leading "其他+"
    # Use one wording for the monitoring result
    df.loc[df['监测结果'] == '连续两周未更新', '监测结果'] = '超过两周未更新'
    # Shorten over-long names so they fit into the charts
    df.loc[df['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    df.loc[df['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'
    # Rows without unit / sub-unit belong to the provincial, municipal or
    # prefectural departments themselves
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    df.loc[(df['市/省局'] == '临夏回族自治州') & (df['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Units to report on. The set previously assigned here and immediately
    # overwritten was: 白银市, 武威市, 庆阳市, 酒泉市, 天水市, 临夏回族自治州,
    # 平凉市, 定西市, 兰州新区, 嘉峪关市, 庆阳市华池县, 庆阳市镇原县, 庆阳市宁县,
    # 陇南市, 张掖市, 甘肃省.
    cities = {'甘肃省'}

    # Report and intermediate (chart image) directories below the output root
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')
    for city in cities:
        fnReport = os.path.join(
            dirReports, city + '政务新媒体监测报告_{}{}月.docx'.format(info['year'], info['month']))
        summaryCity(info, city, df, dfW, dfS, strPathTemplate, fnReport, dirIntermediate)

View File

@ -16,7 +16,7 @@ from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path):
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
#选择需要添加水印的pdf文件
target_pdf = Pdf.open(target_pdf_path)
#读取水印pdf文件并提取水印
@ -25,7 +25,7 @@ def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path):
watermark_page_wyt = watermark_pdf.pages[1]
#加公章
x=240; y=110; w=115; h=115
x=240; y=sy; w=115; h=115
target_pdf.pages[0].add_overlay(watermark_page_seal, Rectangle(x,y, x+w, y+h))
#加签字
@ -580,10 +580,19 @@ def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
context, dirTemp, city)
#更新目录并另存为pdf
print(' 更新目录转换为PDF...')
update_toc( fnReport )
#签章
addStamp(fnReport.replace('.docx', '.pdf'),'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' , fnReport.replace('.docx', '_Stamp.pdf'))
print(' 签章...')
if city in {'庆阳市', '平凉市', '临夏回族自治州'}:
addStamp(fnReport.replace('.docx', '.pdf'),
'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' ,
fnReport.replace('.docx', '_Stamp.pdf'), 115)
else:
addStamp(fnReport.replace('.docx', '.pdf'),
'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' ,
fnReport.replace('.docx', '_Stamp.pdf'))
@ -639,7 +648,7 @@ def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
#cities = cities | {'陇南市'}#, '兰州市'}, '省直部门'}
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '张掖市', '甘肃省', '省直部门'}
#
#cities = {'白银市','甘肃省'} # 只统计特定市州
cities = {'庆阳市', '庆阳市宁县', '甘肃省'} # 只统计特定市州
# strPathOutput目录下生成报告目录和临时文件目录Reports 和 Intermediate
dirP = os.path.abspath(os.path.dirname(strPathOutput))
@ -749,7 +758,7 @@ if __name__ == "__main__":
#cities = cities | {'陇南市'}#, '兰州市'}, '省直部门'}
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '张掖市', '甘肃省', '省直部门'}
#
#cities = {'兰州新区','白银市','庆阳市'} # 只统计特定市州
#cities = {'定西市'} # 只统计特定市州
# strPathOutput目录下生成报告目录和临时文件目录Reports 和 Intermediate
dirP = os.path.abspath(os.path.dirname(strPathOutput))

773
StatSeasonly2023s2.py Normal file
View File

@ -0,0 +1,773 @@
# 1. 打开监测任务表格
import pandas as pd
import numpy as np
import os, glob, re
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import datetime
#word toc
import win32com
import win32com.client as win32
from win32com.client import constants
#pdf
from pikepdf import Pdf,Page,Rectangle
#word
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
    """Overlay the official seal and the signature onto a report PDF.

    The first page of the stamp PDF is drawn on the first page of the
    target, the second page of the stamp PDF on the third page of the
    target, and the result is written to ``output_pdf_path``.

    Args:
        target_pdf_path: PDF that receives the overlays.
        watermark_pdf_path: PDF providing the two overlay pages (page 1 is
            used as the seal, page 2 as the signature).
        output_pdf_path: Path the stamped PDF is saved to.
        sy: Lower y coordinate of the seal rectangle on the first page.

    Raises:
        IndexError: If the target has fewer than three pages or the stamp
            PDF fewer than two (raised by the page lookups).
    """
    # Both documents are opened as context managers so their file handles are
    # released even when an overlay or the save fails.
    with Pdf.open(target_pdf_path) as target_pdf, \
            Pdf.open(watermark_pdf_path) as watermark_pdf:
        seal_page = watermark_pdf.pages[0]
        signature_page = watermark_pdf.pages[1]

        # Official seal on the first page; only its vertical position varies.
        x, y, w, h = 240, sy, 115, 115
        target_pdf.pages[0].add_overlay(seal_page, Rectangle(x, y, x + w, y + h))

        # Signature on the third page, at a fixed position.
        x, y, w, h = 163, 573, 85, 50
        target_pdf.pages[2].add_overlay(signature_page, Rectangle(x, y, x + w, y + h))

        # Save while the stamp PDF is still open: the overlays reference it.
        target_pdf.save(output_pdf_path)
def update_toc(docx_file):
    """Refresh the table-of-contents page numbers and export the document as PDF.

    The PDF is written next to the document, named by replacing ``.docx``
    with ``_.pdf`` in ``docx_file``.

    Args:
        docx_file: Path of the Word document to open.
    """
    word = win32com.client.DispatchEx("Word.Application")
    try:
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0
        doc = word.Documents.Open(docx_file)
        exported = False
        try:
            toc_count = doc.TablesOfContents.Count
            if toc_count == 0:
                # A missing table of contents is only reported, not created.
                print("无目录")
            elif toc_count == 1:
                # Only the page numbers are refreshed, not the whole table.
                doc.TablesOfContents(1).UpdatePageNumbers()
            doc.SaveAs(docx_file.replace('.docx', '_.pdf'), FileFormat=17)
            exported = True
        finally:
            # Save the document on success only; after a failure it is closed
            # without saving (presumably False maps to wdDoNotSaveChanges).
            doc.Close(SaveChanges=exported)
    finally:
        # Always quit, otherwise a failed run leaves a hidden Word process behind.
        word.Quit()
def toDate(strDT, fmt='%m-%d'):
    """Format a date-like value as text, or return '' when it cannot be parsed.

    Args:
        strDT: Value handed to ``pd.to_datetime`` (callers pass strings).
        fmt: ``strftime`` pattern for the result; defaults to month-day.

    Returns:
        The formatted date, or an empty string when parsing yields a
        missing value.
    """
    dt = pd.to_datetime(strDT, errors='coerce')
    if pd.isna(dt):
        return ''
    return dt.strftime(fmt)
# Fill the Word template
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with the report context plus the chart images.

    Args:
        tmep_path: Path of the docx template.
        word_apth: Path the rendered document is saved to.
        dContext: Template variables; left unmodified.
        pathImage: Directory holding the chart PNG files.
        city: Prefix of the chart file names (``<city><chart name>.png``).
    """
    tpl = DocxTemplate(tmep_path)
    chart_names = ('annulusMediaCount', 'annulusCountyCount', 'annulusCountyArticle',
                   'annulusResult', 'barCountyRatio')
    # Work on a copy so the caller's dict does not end up holding InlineImage
    # objects that are bound to this particular template.
    render_context = dict(dContext)
    for chart_name in chart_names:
        render_context[chart_name] = InlineImage(
            tpl, os.path.join(pathImage, city + chart_name + '.png'), width=Mm(120))
    tpl.render(render_context)
    tpl.save(word_apth)
# Draw a bar chart
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart whose y axis is shown in percent and save it.

    Args:
        data: Bar heights; the y axis is fixed to the range 0..1.
        recipe: Bar labels, also used as the x tick positions.
        title: Chart title.
        fn: Output image path; nothing is saved when it is empty (same
            convention as ``drawAnnulus``).
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    def to_percent(temp, position):
        # Tick formatter: 0.25 -> '25%'
        return '%2.0f' % (100 * temp) + '%'

    fig = plt.figure(figsize=(6, 4))
    try:
        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))
        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        if len(fn) > 0:
            plt.savefig(fn)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
# Draw a ring (donut) chart
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring chart with a legend to the right of it and save it.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is saved when it is empty.
    """
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Long legends get a wider canvas, more legend columns and a smaller font.
    fig_width, legend_cols, legend_font = 8, 1, 'medium'
    if len(recipe) > 40:
        fig_width, legend_cols, legend_font = 16, 4, 'x-small'
    elif len(recipe) > 20:
        fig_width, legend_cols, legend_font = 16, 2, 'x-small'

    fig, ax = plt.subplots(figsize=(fig_width, 4), subplot_kw=dict(aspect="equal"))
    try:
        # wedgeprops' width turns the pie into a ring; wedges start at angle 0.
        ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)
        # The monitoring-result chart places its legend closer to the ring.
        legend_x = 1.0 if title == '政务新媒体监测结果' else 1.2
        plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(legend_x, 0.5),
                   borderaxespad=0., ncol=legend_cols, fontsize=legend_font)
        if len(title) > 0:
            ax.set_title(title, fontsize=16, fontweight='heavy')
        plt.tight_layout()
        if len(fn) > 0:
            plt.savefig(fn)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
# summaryCity(city, dfc, dfcw, dfcs, context, strfnTemplate, os.path.join(strPathVerified,'Reports', city+'.docx'), strPathVerified )
# 汇总市州数据,
# 市州名称, 监测数据, cbz数据 mgc数据 context(编号、名称) word模板文件名称 输出word文件名称 临时文件目录
# 需要传入模板文件,数据、错别字、敏感词,单位名称等
def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
dCityClient = {
'甘肃省': "甘肃省人民政府办公厅",
'省直部门': "甘肃省人民政府办公厅",
'白银市': "白银市人民政府办公室",
'定西市': "定西市人民政府办公室",
'临夏回族自治州': "临夏回族自治州人民政府办公室",
'平凉市': "中共平凉市委网络安全和信息化委员会办公室",
"庆阳市": "庆阳市电子政务与信息资源管理办公室",
'庆阳市华池县': "华池县人民政府办公室",
'庆阳市宁县': "宁县人民政府办公室",
"庆阳市镇原县": "镇原县人民政府办公室",
"酒泉市": "酒泉市人民政府办公室",
"天水市": "天水市人民政府办公室",
"武威市": "武威市人民政府办公室",
"金昌市": "金昌市人民政府办公室",
"嘉峪关市": "嘉峪关市人民政府办公室",
"兰州新区": "兰州新区管委会办公室",
"陇南市": "陇南市政务服务中心",
"张掖市": "张掖市政务服务中心",
"甘南藏族自治州": "甘南藏族自治州政务服务中心",
"兰州市": "兰州市政务服务中心",
"陇南市": "陇南市政务服务中心",
}
dHavingSubordinateUnits = {'甘肃省': True, '白银市': True, '定西市': True,
'临夏回族自治州': True, '平凉市': True, "庆阳市": True, "酒泉市": True, "天水市": True,
"陇南市": True, "张掖市": True, "甘南藏族自治州": True, "兰州市": True, "陇南市": True,
"武威市": True, "金昌市": True,
'省直部门': False, "兰州新区": False, '庆阳市华池县': False,
'庆阳市宁县': False, "庆阳市镇原县": False, "嘉峪关市": False}
print("----------------" + city + "----------------")
# 报告编号、委托单位
strID = "%02d" % (list(dCityClient).index(city))
# print(strID)
context = {
"city": city,
"client": dCityClient[city],
"reportid": strID + info['serialNum'],
'havingSubordinateUnits': dHavingSubordinateUnits[city],
'havingBelowStandard': True,
'havingUpStandard': True,
'havingCbz': True,
'havingMgc': True
}
context.update(info)
subordinate = '区县/地方部门'
subordinateName = '县区'
# 区县数据筛选
if "庆阳市" in city:
if "华池县" in city:
dfc = df.loc[(df['市/省局'] == '庆阳市')
& (df['区县/地方部门'] == '华池县')].copy()
elif "宁县" in city:
dfc = df.loc[(df['市/省局'] == '庆阳市')
& (df['区县/地方部门'] == '宁县')].copy()
elif "镇原县" in city:
dfc = df.loc[(df['市/省局'] == '庆阳市')
& (df['区县/地方部门'] == '镇原县')].copy()
else:
dfc = df.loc[(df['市/省局'] == '庆阳市')].copy()
# & (df['区县/地方部门']!='华池县')
# & (df['区县/地方部门']!='宁县')
# & (df['区县/地方部门']!='镇原县') ].copy()
dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
elif "甘肃" in city :
dfc = df.copy()
dfcw = dfW.copy()
dfcs = dfS.copy()
'''
cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
'嘉峪关市', '陇南市', '张掖市', '省直部门', '甘南藏族自治州', '金昌市'}
dfc = df.loc[ df['市/省局'].isin(cities) ].copy()
dfcw = dfW.loc[ dfW['市州'].isin(cities) ].copy()
dfcs = dfS.loc[ dfS['市州'].isin(cities) ].copy()
'''
subordinate = '市/省局'
subordinateName = '市州'
elif "省直部门" in city :
dfc = df.loc[df['市/省局'] == city].copy()
#dfcw = dfW.loc[dfW['市州'] == dictSC[city]].copy()
#dfcs = dfS.loc[dfS['市州'] == dictSC[city]].copy()
dfcw = dfW.loc[dfW['市州'] == city].copy()
dfcs = dfS.loc[dfS['市州'] == city].copy()
else:
dfc = df.loc[(df['市/省局'] == city)].copy()
dfcw = dfW.loc[dfW['市州'] == city].copy()
dfcs = dfS.loc[dfS['市州'] == city].copy()
# -----------------------
# 统计结果分析
dCity = {'1': '2'}
#
# 县区-监测结果 统计
#
# 透视表, 按县区统计各个监测结果账号数量
dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'], aggfunc='count',
fill_value='', margins=True)
dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)
# 准备模板中的表格
tt3_list = []
for index, row in dfCountyAccount.iterrows():
county = ''
if index == 'All':
county = '总 计'
else:
county = index
if not dHavingSubordinateUnits[city] and county=='市直部门':
county = city
hg = ''
u2w = ''
un = ''
count = ''
if '合格' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['合格'], str):
hg = int(row['合格'])
if '监测期间未更新' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['监测期间未更新'], str):
un = int(row['监测期间未更新'])
if '超过两周未更新' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['超过两周未更新'], str):
u2w = int(row['超过两周未更新'])
if 'All' in dfCountyAccount.columns.values.tolist():
if not isinstance(row['All'], str):
count = int(row['All'])
tt3_a = {'county': county, 'hg': hg, 'u2w': u2w, 'un': un, 'count': count}
tt3_list.append(tt3_a)
context['tt3_contents'] = tt3_list
# dfCountyAccount.to_excel(dirTask+strPathCity+'县区监测结果.xlsx')
# -----------------------
#
# 按媒体类型统计
#
# 透视表, 按账号类型统计账号数量
dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count', fill_value='', margins=True)
# 提取该市账号数量
dCity['nmCount'] = dfMedia.loc['All', '账号名称']
print(' 监测账号数:', dCity['nmCount'])
# 提取 账号类型-数量 拼成文本串
dfMedia = dfMedia.sort_values(by='账号名称', ascending=False)
lTableCs1 = []
strMedia = ''
i = 0
tt1_list = []
for m in dfMedia.index.tolist()[1:]: # 第一个是总数,不用取
strNum = str(dfMedia.iloc[:, 0].tolist()[1:][i])
strMedia = strMedia + m + strNum + '个,'
tt1_a = {'type': m, 'count': strNum}
tt1_list.append(tt1_a)
i = i + 1
dCity['sMediaCount'] = strMedia[:-1].rstrip('')
context.update({'tt1_contents': tt1_list})
# -----------------------
#
# 按县区-更新次数 统计
#
dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum', fill_value='',
margins=True)
dfCountyArticle = dfCountyArticle.sort_values(by='更新次数', ascending=False).copy()
dCity['cityArticleCount'] = "%d" % dfCountyArticle.iloc[0, 0]
dCity['countyMostArticle'] = dfCountyArticle.index.tolist()[1]
dCity['countyMostArticleCount'] = "%d" % dfCountyArticle.iloc[1, 0]
strCountyArticle = ''
iiii = 0
if len(dfCountyArticle.index)>2:
for cccc in dfCountyArticle.index.tolist()[1:]:
iiii = iiii + 1
strCountyArticle = strCountyArticle + cccc + "%d" % dfCountyArticle.iloc[iiii, 0] + "次,"
strCountyArticle = strCountyArticle.rstrip('')
dCity['sCountyArticles'] = ',按管理矩阵统计,' + strCountyArticle
# 市各县区监测结果按总数排序,
dfCountyAccount.loc[:, '合格'] = dfCountyAccount['合格'].astype('int')
dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
# 计算合格率
dfCountyAccount.eval('rate = 合格 / All ', inplace=True)
dfResult = dfCountyAccount.copy()
# 提取city合格率
dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
print(' 合格率:', dCity['cityRatio'])
# 导出文件
# dfCountyAccount.to_excel(dirIntermediate+sFileBase+'县区合格率.xlsx')
# dfMedia = dfMedia.drop(['All'])
# 提取县区名称,县区账号数, 县区合格率,转成字符串
dfCountyAccount = dfCountyAccount.drop(['All']) # 删除"All"行
counties = dfCountyAccount.index.tolist()
countyCounts = dfCountyAccount['All'].values.tolist()
countyHeges = dfCountyAccount['合格'].values.tolist()
# 按县区账号数量排序
strCountyCount = ''
strCounties = ''
i = 0
for c in counties:
strCounties = strCounties + c + ''
strCountyCount = strCountyCount + c + str(countyCounts[i]) + '个,'
i = i + 1
dCity['countyCount'] = "%d" % i
dCity['sCounties'] = strCounties.rstrip('')
dCity['sCountyCount'] = strCountyCount.rstrip('')
# 按合格率排序
dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
countieshege = dfCountyAccount.index.tolist()
countyRates = dfCountyAccount['rate']
strCountyRatio = ''
i = 0
tt2_list = []
for c in countieshege:
strRatio = "%.1f" % (100.0 * countyRates[i])
strCountyRatio = strCountyRatio + c + strRatio + '%'
tt2_a = {'county': c, 'ratio': strRatio + '%'}
tt2_list.append(tt2_a)
i = i + 1
dCity['sCountyRatio'] = strCountyRatio.rstrip('')
dCity['tt2_contents'] = tt2_list
# -----------------------
#
# 绘图
#
print(' 生成图片...')
drawAnnulus(dfMedia.iloc[:, 0].tolist()[1:], dfMedia.index.tolist()[1:],
'政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
drawAnnulus(countyCounts, counties,
subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
drawAnnulus(dfCountyArticle.iloc[:, 0].tolist()[1:], dfCountyArticle.index.tolist()[1:],
subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))
# {{resultNoUpdated}}个政务新媒体监测期间未更新,占监测总数的{{resultNoUpdatedRatio}}
# {{resultNoUpdated2W}}个政务新媒体连续未更新时间超过两周,占监测总数的{{resultNoUpdated2WRatio}}
# 政务新媒体监测结果
dfResult = dfResult.drop('All', axis=1)
dfResult = dfResult.drop('rate', axis=1)
# 合格数,合格率,不合格数
dCity['resultQualified'] = "%d" % (dfResult.loc['All', '合格'])
dCity['resultQualifiedRatio'] = "%.1f%%" % (dfResult.loc['All', '合格'] / dCity['nmCount'] * 100.0)
dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - dfResult.loc['All', '合格'])
#
# numNoupdated = 0
if '监测期间未更新' in dfResult.columns.values.tolist():
numNoupdated = dfResult.loc['All', '监测期间未更新']
dCity['stringResultNoUpdated'] = "%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
else:
dCity['stringResultNoUpdated'] = ''
dCity['stringNoUpdated'] = ""
# dCity['resultNoUpdated'] = "%d"%(numNoupdated)
# dCity['resultNoUpdatedRatio'] = "%.1f%%"%(numNoupdated/dCity['nmCount']*100.0)
# numNoupdated2W = 0
if '超过两周未更新' in dfResult.columns.values.tolist():
numNoupdated2W = dfResult.loc['All', '超过两周未更新']
dCity['stringResultNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
else:
dCity['stringResultNoUpdated2W'] = ''
dCity['stringNoUpdated2W'] = ''
# dCity['resultNoUpdated2W'] = "%d"%(numNoupdated2W)
# dCity['resultNoUpdated2WRatio'] = "%.1f%%"%(numNoupdated2W/dCity['nmCount']*100.0)
resultLabels = dfResult.columns.values.tolist()
resultCounts = dfResult.loc['All'].values.tolist()
drawAnnulus(resultCounts, resultLabels,
'政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
drawBar(countyRates, countieshege,
'政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))
# -----------------------
#
# 准备报告需要的数据
#
print(' 生成报告...')
dfCityUnqulified = dfc[dfc['监测结果'] != '合格']
dfCityUnqulified = dfCityUnqulified.sort_values(by="监测结果", ascending=True) # by指定按哪列排序。ascending表示是否升序=False
#################################################
dfCityQulified = dfc[dfc['监测结果'] == '合格']
dfCityQulified = dfCityQulified.sort_values(by=subordinate, ascending=True) # by指定按哪列排序。ascending表示是否升序=False
#
# 不合格账号列表
if len(dfCityUnqulified)<1:
context.update({'havingBelowStandard':False})
else:
tt4_list = []
for index, row in dfCityUnqulified.iterrows():
count = ''
if row['更新次数']:
count = "%d" % row['更新次数']
days = ''
if row['静默日数']:
days = "%d" % row['静默日数']
sD1 = ''
sD2 = ''
if row['静默开始日期']:
sD1 = toDate(str(row['静默开始日期']))
if row['静默结束日期']:
sD2 = toDate(str(row['静默结束日期']))
tt4_a = {'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
'county': row[subordinate], 'result': row['监测结果'], 'num': count,
'days': days, 'start': sD1, 'end': sD2, }
tt4_list.append(tt4_a)
tt4_results = {'tt4_contents': tt4_list}
context.update(tt4_results)
#
# 合格账号列表
if len(dfCityQulified)<1:
context.update({'havingUpStandard':False})
else:
tt5_list = []
for index, row in dfCityQulified.iterrows():
count = ''
if row['更新次数']:
count = "%d" % row['更新次数']
days = ''
if row['静默日数']:
days = "%d" % row['静默日数']
sD1 = ''
sD2 = ''
if row['静默开始日期']:
sD1 = toDate(str(row['静默开始日期']))
if row['静默结束日期']:
sD2 = toDate(str(row['静默结束日期']))
tt5_a = {'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
'county': row[subordinate], 'result': row['监测结果'], 'num': count,
'days': days, 'start': sD1, 'end': sD2, }
tt5_list.append(tt5_a)
tt5_results = {'tt5_contents': tt5_list}
context.update(tt5_results)
#
# 错别字表格
if dfcw.shape[0]<1:
context.update({'havingCbz':False})
else:
tCbz_list = []
dfcw.fillna('')
for index, row in dfcw.iterrows():
sTitle = ''
sDate = toDate(str(row['发文时间']))
if '标题' in dfcw.columns:
sTitle = row['标题']
# 去除引号等干扰表格模板输出的字符
r = "[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]"
if pd.isna(row['错误出现位置']):
s = ''
else:
s = re.sub(r, '', row['错误出现位置'])
a = {'error': row['错误'], 'tips': row['建议'], 'sentence': s, 'type': row['账号类型'], 'name': row['账号名称'],
'date': sDate, 'title': sTitle, }
tCbz_list.append(a)
if dfcw.shape[0] > 0:
dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
else:
dCity['stringCbzCount'] = '本次检测未发现错别字。'
tCbz_results = {'tCbz_contents': tCbz_list}
context.update(tCbz_results)
# 读取添加敏感词表格
if dfcs.shape[0]<1:
context.update({'havingMgc':False})
else:
tMgc_list = []
dfcs.fillna('')
for index, row in dfcs.iterrows():
sTitle = ''
sDate = toDate(str(row['发文时间']))
if '标题' in dfcs.columns:
sTitle = row['标题']
a = {'error': row['错误'], 'tips': row['建议'], 'sentence': row['错误出现位置'], 'type': row['账号类型'], 'name': row['账号名称'],
'date': sDate, 'title': sTitle, }
tMgc_list.append(a)
if dfcs.shape[0] > 0:
dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
else:
dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
tMgc_results = {'tMgc_contents': tMgc_list}
context.update(tMgc_results)
# table1
context.update(dCity)
# -----------------------
#
# 按模板生成报告
#
temp_word(fnTemplate,
fnReport,
context, dirTemp, city)
#更新目录并另存为pdf
print(' 更新目录转换为PDF...')
update_toc( fnReport )
#签章
print(' 签章...')
fnTmp = fnReport.replace('.docx', '_.pdf')
fnPDF = fnReport.replace('.docx', '.pdf')
if city in {'庆阳市', '平凉市', '临夏回族自治州'}:
addStamp(fnTmp,
'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' ,
fnPDF, 115)
else:
addStamp(fnTmp,'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf',fnPDF)
if True:
os.remove(fnTmp)
def createDir(dirP, dirS):
    """Ensure that sub-directory ``dirS`` exists inside ``dirP``.

    Args:
        dirP: Path of the parent directory; it must already exist.
        dirS: Name of the sub-directory to create under ``dirP``.

    Returns:
        The path of the sub-directory when it exists after the call.
        ``dirP`` unchanged when the sub-directory could not be created
        (missing parent, name taken by a file, OS error); a message naming
        the directory that failed is printed in that case.
    """
    target = os.path.join(dirP, dirS)
    if os.path.isdir(dirP):
        try:
            # exist_ok makes repeated calls a no-op instead of an error.
            os.makedirs(target, exist_ok=True)
        except OSError:
            # Fall through to the shared failure path below instead of
            # aborting the whole report run.
            pass
        if os.path.isdir(target):
            return target
    # Report the directory that actually failed, not the fallback parent.
    print('Directory ' + target + ' cannot be created.')
    return dirP
# def createDir(dirP, dirS):
def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
    """Clean the monitoring data and generate one report per selected city.

    Args:
        info: Report metadata dict, passed through to ``summaryCity``.
        strFnData: Path of the Excel workbook with the monitoring data.
        strFnW: Path of the Excel workbook with the typo findings.
        strFnS: Path of the Excel workbook with the sensitive-word findings.
        strfnTemplate: Path of the Word template, passed to ``summaryCity``.
        strPathOutput: Output path; the ``Reports`` and ``Intermediate``
            directories are created under its directory part.
    """
    # Load monitoring data, typo findings and sensitive-word findings.
    df = pd.read_excel(strFnData)
    dfW = pd.read_excel(strFnW)
    dfS = pd.read_excel(strFnS)

    # Unify the wording of the monitoring result.
    df.loc[df['监测结果'] == '连续两周未更新', '监测结果'] = '超过两周未更新'
    # Replace over-long county names with short forms (easier to plot).
    df.loc[df['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    df.loc[df['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'
    # Rows without a city / county are labelled as provincial / municipal
    # departments; for Linxia prefecture the label is the prefecture-level one.
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    df.loc[(df['市/省局'] == '临夏回族自治州') & (df['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
    # Data clean-up (applied to every column): drop whitespace runs and a
    # leading "其他+" prefix, then fill the remaining gaps.
    df.replace(r'\s+', '', regex=True, inplace=True)
    df.replace(r'^其他\+', '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Cities to report on. The earlier, wider city set was overwritten by this
    # one before ever being used, so only the effective selection is kept.
    cities = {'甘肃省', '庆阳市', '武威市', '临夏回族自治州'}

    # Create the report and intermediate-file directories next to the output.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')
    # Sorted so that the processing order (and the log) is reproducible.
    for city in sorted(cities):
        summaryCity(info, city, df, dfW, dfS, strfnTemplate,
                    os.path.join(dirReports, city + '.docx'), dirIntermediate)
# 合并错别字文件
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge the per-city finding workbooks into a single Excel file.

    Every ``*<keyword>*.xlsx`` file directly inside ``strPathCBZ`` is read,
    tagged with the city derived from its file name (column ``市州``) and
    concatenated; the result is written to ``strFnCbz``.

    Args:
        keyword: Substring that the workbook file names must contain.
        strPathCBZ: Directory that holds the per-city workbooks.
        strFnCbz: Path of the merged workbook to write.
    """
    # File-name fragment -> full city name. The first key (in this order)
    # contained in the file name wins, so '新区' / 'XQ' must stay ahead of
    # '兰州' / 'LZ'.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }
    frames = []
    total = 0
    # sorted(): glob order is unspecified; keep the merged rows reproducible.
    for fn in sorted(glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx'))):
        f = os.path.basename(fn)
        if f.startswith('~'):
            # Lock file left by an open Excel window; it is not a workbook.
            continue
        city = next((name for key, name in cityShorten.items() if key in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
#def mergeCMC
if __name__ == "__main__":
    # Before running: convert the date columns of the Excel files.
    # Report metadata handed to summaryCity.
    info = dict(
        year="2023",
        month="6",
        quarter="",
        datePub="二〇二三年六月",
        dateStart="2023年3月20日",
        dateEnd="2023年6月20日",
        days="92",
        serialNum="8",
    )

    # Data root directory and its working sub-directories.
    strPath = 'D:/Projects/POM/DATA/2023年6月/季度报告/'
    for subDir in ('全文', '转发', '报告', '汇总', '监测'):
        createDir(strPath, subDir)

    # Input files: monitoring data, Word template, merged typo findings and
    # merged sensitive-word findings.
    strFnMonitoring = strPath + '汇总/汇总数据_2023.6.xlsx'
    strPathTemplate = strPath + 'POM_ReportTemplateQuarterly.docx'
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    strFnMgc = strPath + '汇总/MGC.xlsx'
    # Build a merged workbook only when it does not exist yet.
    for keyword, strFnMerged in (("错别", strFnCbz), ("敏感", strFnMgc)):
        if os.path.exists(strFnMerged):
            continue
        mergeCMC(keyword, strPath + '监测/', strFnMerged)

    # Output goes below the data root.
    strPathOutput = strPath

    # Load monitoring data, typo findings and sensitive-word findings.
    df = pd.read_excel(strFnMonitoring)
    dfW = pd.read_excel(strFnCbz)
    dfS = pd.read_excel(strFnMgc)

    # Unify the result wording, then shorten over-long county names
    # (easier to plot).
    for colName, oldValue, newValue in (
            ('监测结果', '连续两周未更新', '超过两周未更新'),
            ('区县/地方部门', '积石山保安族东乡族撒拉族自治县', '积石山县'),
            ('区县/地方部门', '阿克塞哈萨克族自治县', '阿克塞自治县')):
        df.loc[df[colName] == oldValue, colName] = newValue

    # Provincial / municipal / prefecture-level department labels.
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    isLinxiaDirect = (df['市/省局'] == '临夏回族自治州') & (df['区县/地方部门'] == '市直部门')
    df.loc[isLinxiaDirect, '区县/地方部门'] = '州直部门'

    # Data clean-up: drop whitespace runs and a leading "其他+" prefix, then
    # fill the remaining gaps.
    df.replace(r'\s+', '', regex=True, inplace=True)
    df.replace(r'^其他\+', '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Cities covered by this run.
    cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
              '嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市', '张掖市', '甘肃省'}

    # Reports and intermediate files live in directories below strPathOutput.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')

    reportSuffix = '政务新媒体监测报告_{}年第{}季度.docx'.format(info['year'], info['quarter'])
    for city in cities:
        fnCityReport = os.path.join(dirReports, city + reportSuffix)
        summaryCity(info, city, df, dfW, dfS, strPathTemplate, fnCityReport, dirIntermediate)

View File

@ -78,7 +78,7 @@ def drawAnnulus(data, recipe, title='', fn=''):
else:
xxx = 16
nnncol = 2
fs = 'xmall'
fs = 'x-small'
fig, ax = plt.subplots(figsize=(xxx, yyy), subplot_kw=dict(aspect="equal"))
@ -132,6 +132,10 @@ def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
"嘉峪关市": "嘉峪关市人民政府办公室",
"兰州新区": "兰州新区管委会办公室",
"陇南市": "陇南市人民政府办公室",
"张掖市": "张掖市政务服务中心",
"甘南藏族自治州": "甘南藏族自治州政务服务中心",
"兰州市": "兰州市政务服务中心",
"陇南市": "陇南市政务服务中心",
}
print("----------------" + city + "----------------")
# 报告编号、委托单位
@ -534,7 +538,9 @@ def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
'嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县'}
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '陇南市', '张掖市', '甘肃省', '省直部门'}
#
cities = {'甘肃省'} # 只统计特定市州
cities = {'甘肃省','庆阳市','武威市','临夏回族自治州', '酒泉市'} # 只统计特定市州
cities = {'张掖市'}
# strPathOutput目录下生成报告目录和临时文件目录Reports 和 Intermediate
dirP = os.path.abspath(os.path.dirname(strPathOutput))
@ -574,13 +580,13 @@ if __name__ == "__main__":
# 运行之前先转换excel文件的日期列
info = {
"year": "2022",
"quarter": "",
"dateCN": "二〇二二年九",
"dateStart": "2022年7月1日",
"dateEnd": "2022年9月20日",
"days": "81",
"num": "11",
"year": "2023",
"quarter": "",
"dateCN": "二〇二三年四",
"dateStart": "2023年1月1日",
"dateEnd": "2023年3月20日",
"days": "79",
"num": "4",
}
# 数据根目录,
strPath = 'D:/Projects/POM/DATA/2023年S1/'

View File

@ -548,7 +548,7 @@ def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
'嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市'}
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '陇南市', '张掖市', '甘肃省', '省直部门'}
#
cities = {'甘肃省'} # 只统计特定市州
cities = {'甘肃省','庆阳市','武威市','临夏回族自治州'} # 只统计特定市州
# strPathOutput目录下生成报告目录和临时文件目录Reports 和 Intermediate
dirP = os.path.abspath(os.path.dirname(strPathOutput))

View File

@ -1,4 +1,5 @@
import pandas as pd
from openpyxl import Workbook
import numpy as np
import seaborn as sns
import datetime, time
@ -17,16 +18,37 @@ regSTR = '习近平总同志|习近同志|习近总书记|习平总书记|习近
'|建党七十三周年|共产党成立七十三周年' + \
'|大人代表|大人常委会|人大常委主任' + \
'|爱爱服务|抗议英雄|反炸中心'
paths = [
'D:/Projects/POM/DATA/2022年10月/9月报告/全文/',
'D:/Projects/POM/DATA/2022年9月/8月报告/全文/',
'D:/Projects/POM/DATA/2022年8月/7月报告/全文/',
'D:/Projects/POM/DATA/2022年7月/6月报告/全文/',
'D:/Projects/POM/DATA/2022年6月/5月报告/全文/',
'D:/Projects/POM/DATA/2022年5月/4月报告/全文/',
regSTR = '网络意识形态工作责任制实施细则|意识形态工作责任制实施办法'
regSTR = r'(?=.*西藏)(?=.*劳务)'
regSTRA = r'藏族|西藏'
regSTRB = r'劳务|用工|转移|输出|输转|就业|职业培训|技能培训|高校毕业生'
regSTRAB = r'藏族|西藏|劳务|用工|转移|输出|输转|就业|职业培训|技能培训|高校毕业生'
paths = [
'D:/Projects/POM/DATA/2023年6月/季度报告/全文/',
'D:/Projects/POM/DATA/2023年6月/5月报告/全文/',
'D:/Projects/POM/DATA/2023年5月/4月报告/全文/',
'D:/Projects/POM/DATA/2023年4月/3月报告/全文/',
'D:/Projects/POM/DATA/2023年3月/2月报告/全文/',
'D:/Projects/POM/DATA/2023年2月/1月报告/全文/',
'D:/Projects/POM/DATA/2023年1月/12月报告/全文/',
]
paths = ['D:/Projects/POM/DATA/2022年/2022年12月/11月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年11月/10月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年10月/9月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年9月/8月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年8月/7月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年7月/6月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年6月/5月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年5月/4月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年4月/3月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年3月/2月报告/全文/',
'D:/Projects/POM/DATA/2022年/2022年2月/1月报告/全文/',]
"""
'''
#'D:/Projects/POM/DATA/2022年11月/10月报告/全文/',
#'D:/Projects/POM/DATA/2022年12月/11月报告/全文/',
@ -201,17 +223,19 @@ def getWXData_Province(path, hasBody=False):
continue
if not os.path.isdir(scc):
#print(dirCC, dirCC[-5:] )
if dirCC[-5:]=='.xlsx' or dirCC[-4:]=='.xls':
if (dirCC[-5:]=='.xlsx' or dirCC[-4:]=='.xls') and not dirCC.startswith('~'):
files.append(scc)
dfcc = pd.read_excel(scc)
#print(scc)
dfcc = pd.read_excel(scc) # , engine='openpyxl'
dfcc['市州'] = strC
dfWX = dfWX.append(dfcc)
print(' ', dirCC, dfcc.shape[0])
else:
print('something error 01: ', dirCC)
else:
if dirC[-5:]=='.xlsx' or dirC[-4:]=='.xls':
if (dirC[-5:]=='.xlsx' or dirC[-4:]=='.xls') and not dirCC.startswith('~'):
files.append(sc)
#print(sc)
dfc = pd.read_excel(sc)
dfcc['市州'] = strC
dfWX = dfWX.append(dfc)
@ -333,16 +357,17 @@ if doWX:
print('WX data ', dfWX.shape)
# 查找关键词
dfwxd = dfWX[['市州', '公众号', '日期', '标题', '链接', '内容', '阅读数']][dfWX['内容'].str.contains(regSTR, regex=True, na=False)]
dfwxd = dfWX[['市州', '公众号', '日期', '标题', '链接', '内容', '阅读数']][dfWX['内容'].str.contains(regSTRA, regex=True, na=False) & dfWX['内容'].str.contains(regSTRB, regex=True, na=False)]
dfwxd['类型'] = '微信'
dfwxd['关键词']=''
dfwxd['上下文']=''
print("Found ", dfwxd.shape)
dfwxd = dfwxd.reset_index()
# 提取上下文
iiii=0
for i,r in dfwxd.iterrows():
string = str(r['内容'])
its = re.finditer(regSTR, string)
string = str(dfwxd.loc[iiii,'内容'])
its = re.finditer(regSTRAB, string)
sk = ''
sp = ''
for it in its:
@ -356,8 +381,9 @@ if doWX:
sk += it.group() + ';'
sp += string[s:e] + ';'
dfwxd.loc[i,'关键词'] = sk[:-1]
dfwxd.loc[i,'上下文'] = sp[:-1]
dfwxd.loc[iiii,'关键词'] = sk[:-1]
dfwxd.loc[iiii,'上下文'] = sp[:-1]
iiii = iiii+1
dfwxd.rename(columns={"阅读数": "阅读数/评论数", "公众号": "账号名称"},inplace=True)
dfwxd = dfwxd[['关键词', '上下文', '日期', '市州', '类型', '账号名称', '链接', '标题', '阅读数/评论数', '内容',]]
@ -375,16 +401,18 @@ if doWB:
# 查找关键词
dfwbd = dfWB[['市州', '账号名称', '标题', '日期', '评论数', '内容']][dfWB['内容'].str.contains(regSTR, regex=True, na=False)]
dfwbd = dfWB[['市州', '账号名称', '标题', '日期', '评论数', '内容']][dfWB['内容'].str.contains(regSTRA, regex=True, na=False) & dfWB['内容'].str.contains(regSTRB, regex=True, na=False)]
dfwbd['类型'] = '微博'
dfwbd['关键词'] = ''
dfwbd['上下文'] = ''
print("WB Found ", dfwbd.shape)
# 提取关键词上下文
dfwbd = dfwbd.reset_index()
iiii = 0
for i, r in dfwbd.iterrows():
string = str(r['内容'])
its = re.finditer(regSTR, string)
string = str(dfwbd.loc[iiii, '内容'])
its = re.finditer(regSTRAB, string)
sk = ''
sp = ''
for it in its:
@ -397,8 +425,9 @@ if doWB:
e = it.end() + d
sk += it.group() + ';'
sp += string[s:e] + ';'
dfwbd.loc[i, '关键词'] = sk
dfwbd.loc[i, '上下文'] = sp
dfwbd.loc[iiii, '关键词'] = sk
dfwbd.loc[iiii, '上下文'] = sp
iiii = iiii + 1
dfwbd.rename(columns={"评论数": "阅读数/评论数"},inplace=True)
dfwbd = dfwbd[['关键词', '上下文', '日期', '市州', '类型', '账号名称', '标题', '阅读数/评论数', '内容',]]
@ -433,16 +462,18 @@ if doTT:
#account date title nread ncomment content url origin city
# 查找关键词
dfttd = dfTT[['city', 'account', 'date', 'title', 'url', 'content', 'nread']][dfTT['content'].str.contains(regSTR, regex=True, na=False)]
dfttd = dfTT[['city', 'account', 'date', 'title', 'url', 'content', 'nread']][dfTT['content'].str.contains(regSTRA, regex=True, na=False) & dfTT['content'].str.contains(regSTRB, regex=True, na=False)]
dfttd['类型'] = '头条'
dfttd['关键词']=''
dfttd['上下文']=''
print("Found ", dfttd.shape)
# 提取上下文
dfttd = dfttd.reset_index()
iiii = 0
for i,r in dfttd.iterrows():
string = str(r['content'])
its = re.finditer(regSTR, string)
string = str(dfttd.loc[iiii, 'content'])
its = re.finditer(regSTRAB, string)
sk = ''
sp = ''
for it in its:
@ -456,8 +487,9 @@ if doTT:
sk += it.group() + ';'
sp += string[s:e] + ';'
dfttd.loc[i,'关键词'] = sk[:-1]
dfttd.loc[i,'上下文'] = sp[:-1]
dfttd.loc[iiii,'关键词'] = sk[:-1]
dfttd.loc[iiii,'上下文'] = sp[:-1]
iiii = iiii + 1
dfttd.rename(columns={'city': "市州", 'account': "账号名称", 'date': "日期", 'title': "标题", 'url':'链接', 'content': "内容", "nread": "阅读数/评论数"},inplace=True)
dfttd = dfttd[['关键词', '上下文', '日期', '市州', '类型', '账号名称', '链接', '标题', '阅读数/评论数', '内容',]]

View File

@ -228,7 +228,7 @@ if __name__ == "__main__":
#sendMessage(apikey)
# 逐市州发送月报告
sendReportMonthly(apikey, '2023', '1')
sendReportMonthly(apikey, '2023', '5')
# 逐市州发送预警信息
#sendForewarning(apikey)

1035
statForward.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -18,6 +18,22 @@ from docx.shared import Mm
import jieba
import jieba.posseg as pseg
def fetch_chinese(s):
    """Return ``s`` with every character outside U+4E00..U+9FA5 removed."""
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')
    return non_chinese.sub('', s)
def toDate(strDT):
    """Format a date-like value as ``MM-DD``.

    The value is parsed with ``pd.to_datetime(..., errors='coerce')``; an
    empty string is returned when the parse yields a missing value.
    """
    parsed = pd.to_datetime(strDT, errors='coerce')
    if pd.isna(parsed):
        return ''
    return parsed.strftime('%m-%d')
# 画柱状图
def drawBar(data, recipe, title='', fn=''):
plt.figure(figsize=(6, 4))
@ -257,9 +273,11 @@ def getTTData(path, cities, hasBody=False):
fileAs = os.path.join(path, dirC, dirCT, fn)
#print(' ', ttName, fileAs)
if len(fileAs) > 0:
dfdftt = pd.read_excel(fileAs)
dfTTC = dfTTC.append(dfdftt)
try:
dfdftt = pd.read_excel(fileAs)
dfTTC = dfTTC.append(dfdftt)
except:
print("read file failed. ", fileAs)
#dfdfwb = pd.read_csv(filename, sep=',', header=None, names=cols,
# index_col=None)#, engine='python', encoding='gbk'#utf-8
@ -290,7 +308,7 @@ def fetch_chinese(s):
if __name__ == "__main__":
starttime = datetime.datetime.now()
_RATIO = 0.7
_RATIO = 0.5
isDoWX = True
isDoWB = True
isDoTT = True
@ -309,7 +327,7 @@ if __name__ == "__main__":
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
'省直部门', # 共12市2州1新区
]
'''
cities = [
'临夏回族自治州',
'白银市',
@ -320,28 +338,29 @@ if __name__ == "__main__":
#'省直部门', # 共12市2州1新区
]
'''
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
#cities = ['白银市']
#cities = ['陇南市']
# 转发任务
#dfTask = pd.read_excel('D:/Projects/POM/DATA/2022年S2/S2/全省政务新媒体二季度转发信息条目.xls')
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/2023年2月份全省政务新媒体转发内容条目.xlsx')
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年4月/3月报告/2023年3月份全省政务新媒体转发内容条目.xlsx')
# 账号信息
strFnAccount = 'D:/Projects/POM/DATA/2023年3月/2月报告/全国报送系统表单_2023.2.28.xlsx'
strFnAccount = 'D:/Projects/POM/DATA/2023年4月/3月报告/全国报送系统表单_2023.3.31.xlsx'
dfAllAccount = pd.read_excel(strFnAccount)
# 省直部门账号部门简称
dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/省直部门账号名称简称.xlsx')
fnTemplate = 'D:/Projects/POM/DATA/2023年3月/2月报告/POM_ForewardTemplate.docx'
dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年4月/3月报告/省直部门账号名称简称.xlsx')
fnTemplate = 'D:/Projects/POM/DATA/2023年4月/3月报告/POM_ForewardTemplate.docx'
# 数据根目录,
strPath = ['D:/Projects/POM/DATA/2023年3月/2月报告/']
strOutputPath = 'D:/Projects/POM/DATA/2023年3月/2月报告/转发/'
strPath = ['D:/Projects/POM/DATA/2023年4月/3月报告/']
strOutputPath = 'D:/Projects/POM/DATA/2023年4月/3月报告/转发/'
context = {
"year": "2023",
"month": "2",
"pubMonth": "3",
"dateStart": "2023年2月1日",
"dateEnd": "2023年2月28"
"month": "3",
"pubMonth": "4",
"dateStart": "2023年3月1日",
"dateEnd": "2023年3月31"
}
dfAllAccount.loc[:, '转发数'] = 0
@ -409,10 +428,11 @@ if __name__ == "__main__":
for j in range(dataA.shape[0]):
str1 = str(dataA.iloc[j, dataA.columns.get_loc('title')]) # 文章标题
#
if len(rt) > len(str1):
if len(rt) > len(str1): # 任务标题过长,截取前半部分进行对比
strRT = rt[:len(str1)]
else:
else: #文章标题过长,只比较任务标题长度部分
strRT = rt
str1 = str1[:len(rt)]
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
if ratio > _RATIO:
forwarded += 1
@ -484,12 +504,12 @@ if __name__ == "__main__":
# 查看该账号的所有文章
for j in range(dataA.shape[0]):
str1 = str(dataA.iloc[j, dataA.columns.get_loc('标题')])
#
if len(rt) > len(str1):
if len(rt) > len(str1):# 任务标题过长,截取前半部分进行对比
strRT = rt[:len(str1)]
else:
else:#文章标题过长,只比较任务标题长度部分
strRT = rt
str1 = str1[:len(rt)]
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()

1035
statForward2023s2.py Normal file

File diff suppressed because it is too large Load Diff

965
statForward_LN.py Normal file
View File

@ -0,0 +1,965 @@
import datetime
import csv
import pandas as pd
import numpy as np
import glob, os, re, time
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from difflib import SequenceMatcher
from collections import Counter
import difflib
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
import jieba
import jieba.posseg as pseg
#---
#那我们的目标就是将字段列名的日期数据替换成标准的日期格式,具体的思路是:
#1、先用excel实验2018-11-02对应的日期时间戳是43406。
#2、我再用2018-11-02减43406看看是从那一年开始计算的所以得出结论是1899-12-30。
#3、那最后要达成目标就只需要时间戳+1899-12-30就等于对应的当前日
def ts2date(dates, sf='%Y-%m-%d'):
    """Convert a day count relative to 1899-12-30 into a formatted date string.

    Args:
        dates: Number of days to add to 1899-12-30.
        sf: ``strftime`` format of the returned string.

    Returns:
        The resulting date formatted with ``sf``.
    """
    # Base date that the day count is added to.
    base_day = datetime.datetime.strptime('1899-12-30', '%Y-%m-%d')
    shifted = base_day + datetime.timedelta(days=dates)
    return shifted.strftime(sf)
#---
def fetch_chinese(s):
    """Strip from ``s`` all characters that are not in U+4E00..U+9FA5."""
    return re.compile(r'[^\u4e00-\u9fa5]').sub('', s)
# 画柱状图
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart with a percentage y-axis and save it to ``fn``.

    Args:
        data: Bar heights; the y-axis is fixed to the range 0..1.
        recipe: Bar labels, also used as x tick positions and tick texts.
        title: Chart title.
        fn: Path of the image file to write.
    """
    def as_percent(value, position):
        # Tick formatter: 0.5 -> '50%'.
        return '%2.0f' % (100 * value) + '%'

    plt.figure(figsize=(6, 4))
    # Global matplotlib settings: SimHei font, unicode minus sign disabled.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))
    plt.gca().yaxis.set_major_formatter(FuncFormatter(as_percent))
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)

    # Release the figure so repeated calls do not pile up open figures.
    plt.cla()
    plt.clf()
    plt.close()
def getWBData(path, cities, hasBody=False):
    """Collect the Weibo CSV exports of the requested cities into one frame.

    Directory layout walked by this function:
    ``path/<city dir>/<period dir>/<account name dir>/<account id>.csv``.
    Hidden entries (leading ``.``) and directories whose name contains
    ``weixin`` or ``tt`` are ignored at the city and period levels.

    Args:
        path: Root directory that holds one sub-directory per city.
        cities: Full city names to include (e.g. ``'陇南市'``).
        hasBody: Accepted for interface compatibility; not used here.

    Returns:
        A DataFrame with the 13 export columns plus ``weiboID``
        (CSV file name without extension), ``weiboName`` (account directory
        name) and ``市州`` (full city name). Empty, with the same columns,
        when nothing matched.
    """
    # Directory name -> full city name.
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    # Column layout of the files written by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    def is_data_dir(parent, name):
        # Visible directory that does not belong to the WeChat / Toutiao data.
        lowered = name.lower()
        return (name[:1] != '.'
                and os.path.isdir(os.path.join(parent, name))
                and 'weixin' not in lowered
                and 'tt' not in lowered)

    frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if not is_data_dir(path, dirC):
            continue
        city = cityShorten.get(dirC)
        if city is None:
            # Previously a KeyError; an unrelated directory must not abort the run.
            print(' skip unknown city dir: ', dirC)
            continue
        if city not in cities:
            continue
        print(' city: ', city, dirC)
        cityCount += 1
        city_dir = os.path.join(path, dirC)
        city_frames = []
        for dirCT in os.listdir(city_dir):
            # Period directories, e.g. weibo, weibo_1.
            if not is_data_dir(city_dir, dirCT):
                continue
            print(' read WB... dir:', dirCT)
            period_dir = os.path.join(city_dir, dirCT)
            for dirA in os.listdir(period_dir):
                # One directory per account name, holding <account id>.csv.
                account_dir = os.path.join(period_dir, dirA)
                if dirA[:1] == '.' or not os.path.isdir(account_dir):
                    continue
                fileAs = os.listdir(account_dir)
                if len(fileAs) > 1:
                    print(" +=+= ", fileAs)
                # Pick the first visible CSV deterministically; the id and the
                # file that is read always refer to the same file.
                csv_files = sorted(f for f in fileAs
                                   if not f.startswith('.') and os.path.splitext(f)[-1] == '.csv')
                if not csv_files:
                    continue
                wbId = os.path.splitext(csv_files[0])[0]
                dfdfwb = pd.read_csv(os.path.join(account_dir, csv_files[0]), sep=',', header=None,
                                     names=cols, index_col=None)
                # The first row is the file's own header line; drop it.
                dfdfwb = dfdfwb.iloc[1:].copy()
                dfdfwb["weiboID"] = wbId
                dfdfwb["weiboName"] = dirA
                city_frames.append(dfdfwb)
        dfWBC = pd.concat(city_frames) if city_frames else pd.DataFrame(columns=cols)
        print(' ', dfWBC.shape)
        if city_frames:
            dfWBC['市州'] = city
            frames.append(dfWBC)
    # Single concat instead of repeated DataFrame.append (removed in pandas 2.0).
    if frames:
        dfWB = pd.concat(frames).reindex(columns=cs)
    else:
        dfWB = pd.DataFrame(columns=cs)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
# 从数据目录中读取xlsx文件拼接到一起
def getWXData(path, cities, hasBody=False):
    """Collect WeChat article spreadsheets found under *path* into one DataFrame.

    Expected layout: ``path/<batch>/`` where ``<batch>`` is a city short code
    (a key of ``cityShorten``). Inside a batch directory, every ``.xlsx``/``.xls``
    file is read, as is every such file inside a sub-directory whose name
    contains ``weixin``.

    Args:
        path: Root data directory.
        cities: Full city names to keep; batches mapping to other cities are ignored.
        hasBody: Unused; kept for call compatibility.

    Returns:
        DataFrame holding all rows read, with a '市州' column set to the full
        city name. The standard columns come first, followed by any extra
        columns found in the spreadsheets. Original row indexes are kept.
    """
    # Directory short code -> full city name.
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']
    excelExts = ('.xlsx', '.xls')
    frames = []      # one DataFrame per spreadsheet; concatenated once at the end
    countC = 0       # batches that contributed at least one file
    countFnC = 0     # total files read

    def readSheet(fullPath, cityname, dirBatch, fName):
        # Read one spreadsheet, tag it with its city and queue it for concatenation.
        dfSheet = pd.read_excel(fullPath)
        dfSheet['市州'] = cityname
        print(' read wx: ', cityname, dirBatch, fName, dfSheet.shape)
        frames.append(dfSheet)

    for dirBatch in os.listdir(path):
        batchPath = os.path.join(path, dirBatch)
        if not os.path.isdir(batchPath):
            continue  # directories only
        # None for an unknown directory name; it then never matches `cities`,
        # so unknown batches are skipped instead of raising KeyError.
        cityname = cityShorten.get(dirBatch)
        count = 0
        for fileC in os.listdir(batchPath):
            if fileC[:1] == '.':
                continue
            entryPath = os.path.join(batchPath, fileC)
            fName = os.path.splitext(fileC)[0]
            # A "weixin" sub-directory: read every spreadsheet inside it.
            if os.path.isdir(entryPath) and 'weixin' in fileC.lower():
                print(' ', entryPath)
                for f in os.listdir(entryPath):
                    if os.path.splitext(f)[-1] in excelExts and cityname in cities:
                        readSheet(os.path.join(entryPath, f), cityname, dirBatch, fName)
                        count += 1
            # A spreadsheet placed directly in the batch directory.
            if os.path.splitext(fileC)[-1] not in excelExts:
                continue
            if cityname in cities:
                readSheet(entryPath, cityname, dirBatch, fName)
                count += 1
        countFnC += count
        if count > 0:
            countC += 1

    if frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        dfWX = pd.concat(frames)
        extraCols = [c for c in dfWX.columns if c not in cols]
        dfWX = dfWX.reindex(columns=cols + extraCols)
    else:
        dfWX = pd.DataFrame(columns=cols)
    print(" Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
# 从数据目录中读取xlsx文件拼接到一起
def getTTData(path, cities, hasBody=False):
    """Collect Toutiao article spreadsheets found under *path* into one DataFrame.

    Expected layout: ``path/<city code>/<period dir containing "tt">/*.xlsx``.
    Directories whose name contains ``weixin`` or ``weibo`` are ignored, as are
    hidden entries and city directories whose name is not a known short code.

    Args:
        path: Root data directory.
        cities: Full city names to keep.
        hasBody: Unused; kept for call compatibility.

    Returns:
        DataFrame of all rows read, with a 'city' column set to the full city
        name. The standard columns come first, followed by any extra columns
        found in the spreadsheets. Files that cannot be read are reported on
        stdout and skipped.
    """
    # Directory short code -> full city name.
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    # account date title nread ncomment content url origin
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    def stack(frames):
        # Concatenate non-empty frames with the standard columns first.
        # DataFrame.append was removed in pandas 2.0; empty frames are left out
        # so they cannot influence the resulting dtypes.
        frames = [frame for frame in frames if not frame.empty]
        if not frames:
            return pd.DataFrame(columns=cs)
        stacked = pd.concat(frames)
        extraCols = [c for c in stacked.columns if c not in cs]
        return stacked.reindex(columns=cs + extraCols)

    cityFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityPath = os.path.join(path, dirC)
        if dirC[:1] == '.' or not os.path.isdir(cityPath):
            continue
        loweredC = dirC.lower()
        if 'weixin' in loweredC or 'weibo' in loweredC:
            continue
        # Unknown directory names map to None and are skipped rather than
        # aborting the whole run with KeyError.
        cityname = cityShorten.get(dirC)
        if cityname not in cities:
            continue
        print(' city: ', cityname, dirC)
        cityCount += 1

        fileFrames = []
        for dirCT in os.listdir(cityPath):
            periodPath = os.path.join(cityPath, dirCT)
            if dirCT[:1] == '.' or not os.path.isdir(periodPath):
                continue
            loweredCT = dirCT.lower()
            if 'weixin' in loweredCT or 'weibo' in loweredCT:
                continue
            if 'tt' not in loweredCT:
                continue
            print(' read TT... dir:', dirCT)
            for fn in os.listdir(periodPath):
                if fn[:1] == '.' or not fn.endswith('.xlsx'):
                    continue
                # e.g. D:\Projects\POM\DATA\2022年11月\10月报告\全文\LN\TT
                fileAs = os.path.join(periodPath, fn)
                try:
                    fileFrames.append(pd.read_excel(fileAs))
                except Exception as err:
                    # Best effort: one unreadable file must not stop the batch.
                    print("read file failed. ", fileAs, err)

        dfTTC = stack(fileFrames)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = cityname
        cityFrames.append(dfTTC)

    dfTT = stack(cityFrames)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
def fetch_chinese(s):
    """Return *s* reduced to its characters in the range U+4E00..U+9FA5, in order."""
    # Collect every run of in-range characters and glue the runs together;
    # this keeps exactly the characters that are not removed otherwise.
    kept_runs = re.findall(r'[\u4e00-\u9fa5]+', s)
    return ''.join(kept_runs)
def _exportWeiboPerAccount(dfPosts, dfAccount, sDir):
    """Write one CSV per registered account found in *dfPosts*.

    For every distinct ``user_id`` in *dfPosts* that appears in *dfAccount*
    (column '微信biz/oid/账号ID'), the account's posts are written to
    ``<sDir>/<账号名称>/<user_id>.csv``.

    Args:
        dfPosts: Posts with a 'user_id' column and the 13 standard Weibo columns.
        dfAccount: Account registry with '微信biz/oid/账号ID' and '账号名称'.
        sDir: Output root directory.

    Returns:
        Tuple ``(found, notFound)`` counting user ids with / without a registry entry.
    """
    # 微博id,微博正文,头条文章url,原始图片url,被转发微博原始图片url,是否为原创微博,微博视频url,
    # 发布位置,发布时间,发布工具,点赞数,转发数,评论数
    wantedCols = ['微博id', '微博正文', '头条文章url', '原始图片url',
                  '被转发微博原始图片url', '是否为原创微博',
                  '微博视频url', '发布位置', '发布时间', '发布工具',
                  '点赞数', '转发数', '评论数']
    found = 0
    notFound = 0
    for uid in dfPosts['user_id'].unique():
        matched = dfAccount[dfAccount['微信biz/oid/账号ID'] == uid]
        if matched.empty:
            notFound += 1
            continue
        found += 1
        sA = str(matched['账号名称'].iloc[0])
        accountDir = os.path.join(sDir, sA)
        # makedirs(exist_ok=True): re-running the export must not fail on
        # directories left over from a previous run.
        os.makedirs(accountDir, exist_ok=True)
        # reset_index() keeps the old row index as a leading 'index' column,
        # exactly as the previous export did.
        dfwba = dfPosts.loc[dfPosts['user_id'] == uid, wantedCols].reset_index()
        # quoting=1 is csv.QUOTE_ALL
        dfwba.to_csv(os.path.join(accountDir, str(uid) + '.csv'),
                     encoding='utf_8_sig', index=0, quoting=1)
    return found, notFound


def doWBData():
    """Split two downloaded Weibo dumps into per-account CSV files.

    Reads the account registry and the two dumps (``weibo1.csv`` and
    ``weibo2.csv``, which use different column names), renames their columns to
    the common Chinese names and writes one CSV per registered account under
    ``weibo_3/`` and ``weibo_4/`` respectively. All paths are hard-coded.
    """
    baseDir = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/'

    dfAccount = pd.read_excel(baseDir + '全国报送系统表单_2023.6.30.xlsx')
    # .copy() so the column assignment below does not write into a view.
    dfAccount = dfAccount[dfAccount['账号类型']=='新浪微博'].copy()
    dfAccount['微信biz/oid/账号ID'] = dfAccount['微信biz/oid/账号ID'].astype('int64')

    # First dump.
    dfwb1 = pd.read_csv(baseDir + 'weibo_2/weibo1.csv', sep=',', index_col=None)
    dfwb1 = dfwb1.fillna(0)
    dfwb1['user_id'] = dfwb1['user_id'].astype('int64')
    dfwb1.rename(columns={'id':'微博id', 'content':'微博正文', 'article_url':'头条文章url', 'original_pictures':'原始图片url',
                          'retweet_pictures':'被转发微博原始图片url', 'original':'是否为原创微博', 'video_url':'微博视频url',
                          'publish_place':'发布位置', 'publish_time':'发布时间', 'publish_tool':'发布工具',
                          'up_num':'点赞数', 'retweet_num':'转发数', 'comment_num':'评论数'}, inplace = True)
    print(dfAccount.shape)
    print(dfwb1.shape, dfwb1.dtypes)
    i, j = _exportWeiboPerAccount(dfwb1, dfAccount, baseDir + '全文/LN/weibo_3/')
    print('found ', i, '; nofound', j)

    # Second dump (different source column names).
    dfwb2 = pd.read_csv(baseDir + 'weibo_2/weibo2.csv', sep=',', index_col=None)
    dfwb2 = dfwb2.fillna(0)
    dfwb2['user_id'] = dfwb2['user_id'].astype('int64')
    dfwb2.rename(columns={'id':'微博id', 'text':'微博正文', 'article_url':'头条文章url', 'pics':'原始图片url',
                          'topics':'被转发微博原始图片url','source':'是否为原创微博','video_url':'微博视频url',
                          'location':'发布位置', 'created_at':'发布时间', 'bid':'发布工具',
                          'attitudes_count':'点赞数', 'reposts_count':'转发数', 'comments_count':'评论数'}, inplace = True)
    print(dfwb2.shape)
    i, j = _exportWeiboPerAccount(dfwb2, dfAccount, baseDir + '全文/LN/weibo_4/')
    print('found ', i, '; nofound', j)
def _firstForward(taskText, postTexts, threshold, symmetric=True):
    """Find the first post whose text is similar enough to a task title.

    Both inputs are expected to be already reduced to Chinese characters.

    Args:
        taskText: Cleaned task title.
        postTexts: Cleaned post titles/bodies of one account, in row order.
        threshold: A post matches when ``quick_ratio()`` is strictly greater.
        symmetric: When True the longer of the two strings is cut to the length
            of the shorter one before comparing (used for titles). When False
            only the post text is cut to the task length (used for post bodies).

    Returns:
        ``(position, text)`` of the first match, where *text* is the compared
        (possibly truncated) post text when *symmetric*, else the full post
        text; ``(None, '')`` when nothing matches.
    """
    # Local import keeps this block runnable even if the file header lacks it.
    import difflib

    # Two empty strings have a similarity of 1.0, so an empty task or an empty
    # post would otherwise be counted as a forward of everything.
    if not taskText:
        return None, ''
    for pos, postText in enumerate(postTexts):
        if not postText:
            continue
        if symmetric and len(taskText) > len(postText):
            left, right = taskText[:len(postText)], postText
        else:
            left, right = taskText, postText[:len(taskText)]
        if difflib.SequenceMatcher(None, left, right).quick_ratio() > threshold:
            return pos, (right if symmetric else postText)
    return None, ''


def _accountMask(dfAccounts, types, city, account=None):
    """Boolean mask of registry rows of the given account types in *city* (and named *account*)."""
    mask = dfAccounts['账号类型'].isin(types) & (dfAccounts['市/省局'] == city)
    if account is not None:
        mask = mask & (dfAccounts['账号名称'] == account)
    return mask


def _forwardRate(forwards, accounts, tasks):
    """Forwards per account per task; 0.0 when there are no accounts or no tasks."""
    if not accounts or not tasks:
        return 0.0
    return forwards / accounts / tasks


def _tallyPlatform(dfPosts, cfg, dfAllAccount, dfTask, cities, records, threshold):
    """Match one platform's posts against the task list, city by city.

    Updates *dfAllAccount* in place (one 0/1 column per task plus '转发数') and
    appends one dict per detected forward to *records*.

    *cfg* keys: platform, label, lookupTypes, cityTypes, cityCol, accountCol,
    textCol, dateCol, urlCol, readCol (or None), symmetric, rawTaskName,
    banner, preBanner (or None), summaryUnit.
    """
    for city in cities:
        if cfg['preBanner']:
            print(cfg['preBanner'])
        print(cfg['banner'], city, ' ----')
        # Posts of this city and the accounts they come from.
        dataC = dfPosts.loc[dfPosts[cfg['cityCol']] == city]
        accounts = dataC[cfg['accountCol']].unique()
        for account in accounts:
            dataA = dataC.loc[dataC[cfg['accountCol']] == account]
            mask = _accountMask(dfAllAccount, cfg['lookupTypes'], city, account)
            if not mask.any():
                print(' !!!! ' + cfg['label'], account, '', city, '无详细信息')
                continue
            # Clean every post once per account instead of once per task.
            postTexts = [fetch_chinese(str(value)) for value in dataA[cfg['textCol']]]
            count = 0
            for rn, title in zip(dfTask['序号'], dfTask['标题']):
                ssrt = str(title)
                rt = fetch_chinese(ssrt)  # Chinese characters only
                pos, compared = _firstForward(rt, postTexts, threshold, cfg['symmetric'])
                forwarded = 0 if pos is None else 1
                # Record whether this account forwarded this task.
                dfAllAccount.loc[mask, str(rn)] = forwarded
                count += forwarded
                if forwarded:
                    row = dataA.iloc[pos]
                    record = {'任务序号': rn,
                              '任务名称': ssrt if cfg['rawTaskName'] else rt,
                              '类型': cfg['platform'],
                              '公众号': account,
                              '日期': row[cfg['dateCol']],
                              '内容': compared,
                              '链接': row[cfg['urlCol']],
                              '市州': city,
                              }
                    if cfg['readCol']:
                        record['阅读数'] = int(row[cfg['readCol']])
                    records.append(record)
            # Total forwards of this account.
            dfAllAccount.loc[mask, '转发数'] = count
        # City totals over all registered accounts of this platform.
        maskCity = _accountMask(dfAllAccount, cfg['cityTypes'], city)
        accountNum = int(maskCity.sum())
        cc = dfAllAccount.loc[maskCity, '转发数'].sum()
        rcc = _forwardRate(cc, accountNum, dfTask.shape[0])
        print(' ', city, '共有', accountNum, cfg['summaryUnit'], len(accounts), '个。共转发', cc, '次,转发率{:.1f}%'.format(rcc*100))


def _rateTable(dfAccounts, indexCol, taskCount):
    """Pivot accounts by *indexCol* into account count, forward sum and rate.

    Rows are sorted by rate (descending) with the '总计' margin row kept last.
    """
    # The two aggregations are computed separately and merged afterwards: the
    # original author found pivot_table unreliable with a list of aggfuncs.
    dfCount = pd.pivot_table(dfAccounts, index=[indexCol], values=['账号名称'],
                             aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
    dfSum = pd.pivot_table(dfAccounts, index=[indexCol], values=['转发数'],
                           aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
    table = pd.concat([dfCount, dfSum], axis=1)
    # Flatten the MultiIndex columns into 'count_账号名称' / 'sum_转发数'.
    table.columns = ['_'.join(col) for col in table.columns.values]
    # Rate truncated (not rounded) to three decimals.
    table['rate'] = table.apply(
        lambda x: int(x['sum_转发数'] / x['count_账号名称'] / taskCount * 1000) / 1000.0, axis=1)
    ranked = table.iloc[:-1].sort_values(by='rate', ascending=False)
    return pd.concat([ranked, table.iloc[-1:]], axis=0)


if __name__ == "__main__":
    #doWBData()
    #exit(0)
    starttime = datetime.datetime.now()
    _RATIO = 0.5   # similarity threshold above which a post counts as a forward
    isDoWX = True
    isDoWB = True
    isDoTT = True
    # All reporting units: 12 cities, 2 prefectures, 1 new area, plus provincial departments.
    allCities = [
        '临夏回族自治州',
        '白银市',
        '定西市',
        '酒泉市',
        '嘉峪关市',
        '平凉市',
        '庆阳市',
        '天水市',
        '武威市',
        '兰州新区',
        '陇南市',
        '兰州市', '张掖市', '甘南藏族自治州', '金昌市',
        '省直部门',
    ]
    # Units covered by this run.
    cities = ['陇南市',]

    # Forwarding tasks
    dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/陇南7月上旬转发台账.xlsx')
    sTaskTitle = '标题'
    sTaskDate = '推送时间'
    # Drop rows without a title. The result must be assigned: dropna is not in-place.
    dfTask = dfTask.dropna(axis=0, subset=[sTaskTitle]).reset_index(drop=True)
    taskCount = int(dfTask[sTaskTitle].count())

    # Account registry
    strFnAccount = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全国报送系统表单_2023.6.30.xlsx'
    dfAllAccount = pd.read_excel(strFnAccount)
    # Add the total column and one zero-filled column per task.
    dfAllAccount.loc[:, '转发数'] = 0
    dfAllAccount = pd.concat([dfAllAccount, pd.DataFrame(np.zeros((dfAllAccount.shape[0], dfTask.shape[0])), columns=dfTask['序号'].astype(str).tolist())], axis=1)
    # Normalise organisation columns.
    dfAllAccount['市/省局'] = dfAllAccount['市/省局'].fillna('省直部门')
    dfAllAccount['区县/地方部门'] = dfAllAccount['区县/地方部门'].fillna('市直部门')
    dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['临夏回族自治州', '甘南藏族自治州'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
    dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['省直部门'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '省直部门'
    # Replace over-long names with short ones so charts stay readable.
    dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'

    fnTemplate = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/POM_ForewardTemplate.docx'
    # Data root directories
    strPath = ['D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全文/',
               ]
    strOutputPath = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/转发/'
    context = {
        "year": "2023",
        "month": "7",
        "pubMonth": "7",
        "dateStart": "2023年7月1日",
        "dateEnd": "2023年7月10日"
    }

    # One dict per detected forward (task, account, article); turned into a
    # DataFrame once at the end (DataFrame.append was removed in pandas 2.0).
    forwardRecords = []
    forwardCols = ['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州']

    # WX
    if isDoWX:
        print('=============================================================')
        print('---- WX ----')
        dfWX = pd.concat([getWXData(strP, cities) for strP in strPath])
        dfWX = dfWX.fillna(value=0)
        _tallyPlatform(dfWX, {
            'platform': '微信', 'label': '微信',
            'lookupTypes': ['小程序+微信', '微信服务号', '微信订阅号'],
            'cityTypes': ['微信服务号', '微信订阅号'],
            'cityCol': '市州', 'accountCol': '公众号', 'textCol': '标题',
            'dateCol': '日期', 'urlCol': '链接', 'readCol': '阅读数',
            'symmetric': True, 'rawTaskName': True,
            'banner': '---- WX title match', 'preBanner': None,
            'summaryUnit': '个微信号,获取数据',
        }, dfAllAccount, dfTask, cities, forwardRecords, _RATIO)

    # WB
    if isDoWB:
        print('=============================================================')
        print('---- WB data read ----')
        # Weibo posts come from a pre-merged workbook rather than getWBData.
        dfWB = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/微博全文.xlsx')
        print('----', dfWB.shape)
        _tallyPlatform(dfWB, {
            'platform': '新浪微博', 'label': '微博',
            'lookupTypes': ['新浪微博'],
            'cityTypes': ['新浪微博'],
            'cityCol': '市州', 'accountCol': 'weiboName', 'textCol': '微博正文',
            'dateCol': 'date', 'urlCol': '头条文章url', 'readCol': None,
            # Post bodies: compare only their first len(task) characters.
            'symmetric': False, 'rawTaskName': False,
            'banner': '---- WB match', 'preBanner': None,
            'summaryUnit': '个微博号,获取数据',
        }, dfAllAccount, dfTask, cities, forwardRecords, _RATIO)

    # TT
    if isDoTT:
        print('=============================================================')
        print('---- TT data read ----')
        dfTT = pd.concat([getTTData(strP, cities) for strP in strPath])
        _tallyPlatform(dfTT, {
            'platform': '今日头条', 'label': '头条',
            'lookupTypes': ['今日头条'],
            'cityTypes': ['今日头条'],
            'cityCol': 'city', 'accountCol': 'account', 'textCol': 'title',
            'dateCol': 'date', 'urlCol': 'url', 'readCol': None,
            'symmetric': True, 'rawTaskName': False,
            'banner': '---- TT title match',
            'preBanner': "++++++++++++++++++++++++++++++++++++++++++++++++++",
            'summaryUnit': '个头条号,获取数据',
        }, dfAllAccount, dfTask, cities, forwardRecords, _RATIO)

    if isDoWX or isDoWB or isDoTT:
        print('=============================================================')
        print('---- STATISTICS ----')
        print('=============================================================')
        dfO = pd.DataFrame(forwardRecords)
        dfO = dfO.reindex(columns=forwardCols + [c for c in dfO.columns if c not in forwardCols])
        dfAllAccount.to_excel(strOutputPath + '甘肃省_转发账号.xlsx')
        dfO.to_excel(strOutputPath + '甘肃省_转发文章.xlsx')
        print('---- 统计市州转发率 ----')
        for city in cities:
            print(" add up city", city)
            maskC = _accountMask(dfAllAccount, ['新浪微博', '微信服务号', '微信订阅号', '今日头条'], city)
            dfdfC = dfAllAccount.loc[maskC, :]
            dfdfC.to_excel(strOutputPath + city + '_转发账号.xlsx')
            # Per-city export must contain this city's articles only.
            dfOCity = dfO[dfO['市州'] == city]
            dfOCity.to_excel(strOutputPath + city + '_转发文章.xlsx')

            # Forwards by directly-affiliated department.
            dfdfCD = dfdfC.loc[dfdfC['区县/地方部门'].isin(['州直部门', '市直部门', '省直部门'])].copy()
            dfdfCD_AD = _rateTable(dfdfCD, '单位全称', taskCount)
            dfdfCD_AD.to_excel(strOutputPath + city + '部门转发统计表.xlsx')

            # Forwards of all accounts grouped by county.
            dfdfC_AD = _rateTable(dfdfC, '区县/地方部门', taskCount)
            dfdfC_AD.to_excel(strOutputPath + city + '转发统计表.xlsx')

            # Build the report.
            tpl = DocxTemplate(fnTemplate)
            # NOTE(review): both branches assign the same empty string as seen
            # here; confirm whether prefecture-level units need a different value.
            if city in ['临夏回族自治州', '甘南藏族自治州']:
                sL0 = ''
            else:
                sL0 = ''
            info = {
                "strL0":sL0,
                "strL1":"区县",
                "taskCount": taskCount,
                "aNum": int(dfdfC_AD.iloc[-1]['count_账号名称']),
                "fNum": int(dfdfC_AD.iloc[-1]['sum_转发数']),
                "r": '%.1f'%(dfdfC_AD.iloc[-1]['rate']*100.0),
                "dNum": int(dfdfCD_AD.iloc[-1]['count_账号名称']),   # department accounts
                "dFNum": int(dfdfCD_AD.iloc[-1]['sum_转发数']),       # department forwards
                "dr": '%.1f'%(dfdfCD_AD.iloc[-1]['rate']*100.0),      # department average rate
            }
            context.update(info)
            # County rate table ('总计' row excluded).
            context['t1_contents'] = [
                {'county': str(index), 'rate': '%.1f'%(row['rate']*100.0),
                 'account': int(row['count_账号名称']), 'fNum': int(row['sum_转发数'])}
                for index, row in dfdfC_AD.iterrows() if index != "总计"
            ]
            # Department rate table ('总计' row excluded).
            context['t2_contents'] = [
                {'name': str(index), 'rate': '%.1f'%(row['rate']*100.0),
                 'account': int(row['count_账号名称']), 'fNum': int(row['sum_转发数'])}
                for index, row in dfdfCD_AD.iterrows() if index != "总计"
            ]
            # Task list.
            context['t3_contents'] = [
                {'id': row['序号'], 'title': row[sTaskTitle],
                 'date': ts2date(row[sTaskDate], '%m月%d')}
                for index, row in dfTask.iterrows()
            ]
            # County rate chart.
            graphPath = os.path.join(strOutputPath, '_' + city + '_graphCounty.png')
            drawBar(dfdfC_AD['rate'][:-1], dfdfC_AD.index[:-1], '区县转发率', graphPath)
            context.update({
                'graphCounty': InlineImage(tpl, graphPath, width=Mm(120)),
            })
            tpl.render(context)
            tpl.save(strOutputPath+city+'转发统计报告_{}年{}月份.docx'.format(context['year'], context['month']))

    endtime = datetime.datetime.now()
    usedtime = endtime - starttime
    print("time: ", usedtime)

View File

@ -15,11 +15,11 @@ TEST = False # True为测试状态不发短信 False为正式状态
################
dDate = {
'dateStart': '3月23',
'dateEnd': '29'
'dateStart': '6月8',
'dateEnd': '14'
}
fn = 'D:/Projects/POM/DATA/2023年3月/3月31日预警/周预警_2023.3.29.xlsx'
outPath = 'D:/Projects/POM/DATA/2023年3月/3月31日预警/'
fn = 'D:/Projects/POM/DATA/2023年6月/6.16周预警/周预警_2023.6.15.xlsx'
outPath = 'D:/Projects/POM/DATA/2023年6月/6.16周预警/'
################
cities = {'白银市', '武威市',
@ -29,12 +29,16 @@ cities = {'白银市', '武威市',
'临夏回族自治州', '平凉市', '定西市', '定西市', '嘉峪关市',
'兰州新区','陇南市', '张掖市', '庆阳市宁县', '庆阳市镇原县', } #
cities = {'天水市', '平凉市', '定西市', '定西市', '嘉峪关市',
'兰州新区','陇南市', '张掖市', '庆阳市镇原县', } #
#cities = {'酒泉市'}
# 电话号码
contactsDWL = {
'szq': '13359446622',
'zyb': '13609346975'
'zyb': '13609346975',
'shx': '18089386522'
}
contacts = {
'天水市': {'王慧': '18706936366', '王肖肖': '17793816150'},