This commit is contained in:
parent
1368bf1f0f
commit
95575b137a
|
@ -0,0 +1,725 @@
|
|||
# 1. 打开监测任务表格
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os, glob, re
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import FuncFormatter
|
||||
import datetime
|
||||
#word toc
|
||||
import win32com
|
||||
import win32com.client as win32
|
||||
from win32com.client import constants
|
||||
#pdf
|
||||
from pikepdf import Pdf,Page,Rectangle
|
||||
#word
|
||||
from docxtpl import DocxTemplate
|
||||
from docxtpl import InlineImage
|
||||
from docx.shared import Mm
|
||||
|
||||
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
    """Overlay the seal and the signature onto a report PDF and save the result.

    Page 0 of the watermark PDF is overlaid on page 0 of the target, and
    page 1 of the watermark PDF is overlaid on page 2 of the target.

    Args:
        target_pdf_path: Path of the PDF that receives the overlays.
        watermark_pdf_path: Path of the PDF holding the two overlay pages.
        output_pdf_path: Path the stamped PDF is written to.
        sy: Lower y coordinate of the 115x115 seal rectangle on page 0.

    Raises:
        IndexError: If the target has fewer than three pages or the watermark
            PDF has fewer than two pages.
    """
    # Both documents are opened as context managers so their file handles are
    # released before returning; the caller deletes the target file afterwards.
    with Pdf.open(target_pdf_path) as target_pdf, \
            Pdf.open(watermark_pdf_path) as watermark_pdf:
        seal_page = watermark_pdf.pages[0]
        signature_page = watermark_pdf.pages[1]

        # Seal on the first page.
        x, y, w, h = 240, sy, 115, 115
        target_pdf.pages[0].add_overlay(seal_page, Rectangle(x, y, x + w, y + h))

        # Signature on the third page.
        x, y, w, h = 163, 573, 85, 50
        target_pdf.pages[2].add_overlay(signature_page, Rectangle(x, y, x + w, y + h))

        target_pdf.save(output_pdf_path)
|
||||
|
||||
|
||||
def update_toc(docx_file):
    """Refresh the table-of-contents page numbers of a .docx and export it to PDF.

    The document is opened through Word's COM automation interface. When it
    contains exactly one table of contents, that table's page numbers are
    updated. The document is then exported next to the original with the
    '.docx' suffix replaced by '_.pdf', and the .docx itself is saved on close.

    Args:
        docx_file: Path of the Word document to process.
    """
    word = win32com.client.DispatchEx("Word.Application")
    doc = None
    try:
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0
        doc = word.Documents.Open(docx_file)

        toc_count = doc.TablesOfContents.Count
        if toc_count == 0:
            # No table of contents is created here; the template must have one.
            print("无目录")
        elif toc_count == 1:
            # Only page numbers are refreshed, not the TOC entries themselves.
            doc.TablesOfContents(1).UpdatePageNumbers()

        # 17 is wdFormatPDF in Word's WdSaveFormat enumeration.
        doc.SaveAs(docx_file.replace('.docx', '_.pdf'), FileFormat=17)
        doc.Close(SaveChanges=True)
        doc = None
    finally:
        # Always shut Word down, otherwise a failure leaves a hidden
        # Word process behind.
        if doc is not None:
            doc.Close(SaveChanges=False)
        word.Quit()
|
||||
|
||||
def toDate(strDT):
    """Return *strDT* formatted as 'MM-DD', or '' when it cannot be parsed as a date."""
    parsed = pd.to_datetime(strDT, errors='coerce')
    return '' if pd.isna(parsed) else parsed.strftime('%m-%d')
|
||||
|
||||
# word模板替换
|
||||
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with *dContext* plus the five chart images.

    Args:
        tmep_path: Path of the docx template.
        word_apth: Path the rendered document is saved to.
        dContext: Template context; it is updated in place with the images.
        pathImage: Directory holding the chart PNG files.
        city: Prefix of every chart file name ('<city><key>.png').
    """
    template = DocxTemplate(tmep_path)
    chart_keys = (
        'annulusMediaCount',
        'annulusCountyCount',
        'annulusCountyArticle',
        'annulusResult',
        'barCountyRatio',
    )
    # Each context key doubles as the stem of its image file name.
    images = {
        key: InlineImage(template, os.path.join(pathImage, city + key + '.png'), width=Mm(120))
        for key in chart_keys
    }
    dContext.update(images)
    template.render(dContext)
    template.save(word_apth)
|
||||
|
||||
|
||||
# 画柱状图
|
||||
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios with a percentage y axis and optionally save it.

    Args:
        data: Bar heights; the y axis is fixed to the range 0..1.
        recipe: Bar labels, also used as x tick labels (rotated 35 degrees).
        title: Chart title.
        fn: Output image path. When empty nothing is written, matching
            drawAnnulus (previously the default '' was passed to savefig).
    """
    # SimHei is selected as the sans-serif font, presumably so that the
    # Chinese labels render; unicode_minus is turned off alongside it.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    def to_percent(value, _position):
        # Tick formatter: 0.25 -> '25%'.
        return '%2.0f' % (100 * value) + '%'

    plt.figure(figsize=(6, 4))
    try:
        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))
        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        if len(fn) > 0:
            plt.savefig(fn)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close()
|
||||
|
||||
|
||||
# 画环状图
|
||||
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend on the right and optionally save it.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is written when empty.
    """
    plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

    # Canvas width, legend column count and legend font size grow with the
    # number of labels.
    label_total = len(recipe)
    if label_total > 40:
        canvas_w, legend_cols, legend_font = 16, 4, 'small'
    elif label_total > 20:
        canvas_w, legend_cols, legend_font = 16, 2, 'small'
    else:
        canvas_w, legend_cols, legend_font = 8, 1, 'medium'
    canvas_h = 4

    fig, ax = plt.subplots(figsize=(canvas_w, canvas_h), subplot_kw={'aspect': 'equal'})

    # Ring of width 0.4, drawn from the default start angle.
    ax.pie(data, radius=1.1, wedgeprops={'width': 0.4}, startangle=0)

    # The monitoring-result chart places its legend slightly closer to the ring.
    legend_x = 1.0 if title == '政务新媒体监测结果' else 1.2
    plt.legend(labels=recipe, loc="center left", bbox_to_anchor=(legend_x, 0.5),
               borderaxespad=0., ncol=legend_cols, fontsize=legend_font)

    if len(title):
        ax.set_title(title, fontsize=16, fontweight='heavy')

    plt.tight_layout()
    if len(fn):
        plt.savefig(fn)

    plt.cla()
    plt.clf()
    plt.close()
|
||||
|
||||
# summaryCity(city, dfc, dfcw, dfcs, context, strfnTemplate, os.path.join(strPathVerified,'Reports', city+'.docx'), strPathVerified )
|
||||
|
||||
|
||||
# 汇总市州数据,
|
||||
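# Arguments, in order: report metadata dict (info), city name, monitoring data, typo (CBZ) data, sensitive-word (MGC) data, Word template file name, output Word file name, temporary directory for chart images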
|
||||
# 需要传入模板文件,数据、错别字、敏感词,单位名称等
|
||||
# Punctuation removed from typo excerpts because it disturbs the rendered table.
_SUMMARY_NOISE_RE = re.compile(r"[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]")


def _summary_blank_if_na(value):
    """Return '' for missing (NaN/None/NaT) cells, otherwise the value unchanged."""
    return '' if pd.isna(value) else value


def _summary_int_cell(row, column):
    """Return row[column] as int, or '' when the column is absent or the cell is blank."""
    if column not in row.index:
        return ''
    value = row[column]
    if isinstance(value, str) or pd.isna(value):
        return ''
    return int(value)


def _summary_fmt_int(value):
    """Format a non-zero numeric cell with '%d'; blank, zero or NaN cells become ''."""
    if not value or pd.isna(value):
        return ''
    return "%d" % value


def _summary_account_rows(frame, subordinate):
    """Build the template rows (one dict per account) for an account table."""
    rows = []
    for _, row in frame.iterrows():
        silent_from = row['静默开始日期']
        silent_to = row['静默结束日期']
        rows.append({
            'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
            'county': row[subordinate], 'result': row['监测结果'],
            'num': _summary_fmt_int(row['更新次数']),
            'days': _summary_fmt_int(row['静默日数']),
            'start': toDate(str(silent_from)) if silent_from else '',
            'end': toDate(str(silent_to)) if silent_to else '',
        })
    return rows


def _summary_finding_rows(frame, strip_noise=False):
    """Build the template rows for a typo / sensitive-word table."""
    has_title = '标题' in frame.columns
    rows = []
    for _, row in frame.iterrows():
        sentence = _summary_blank_if_na(row['错误出现位置'])
        if strip_noise:
            sentence = _SUMMARY_NOISE_RE.sub('', str(sentence))
        rows.append({
            'error': _summary_blank_if_na(row['错误']),
            'tips': _summary_blank_if_na(row['建议']),
            'sentence': sentence,
            'type': _summary_blank_if_na(row['账号类型']),
            'name': _summary_blank_if_na(row['账号名称']),
            'date': toDate(str(row['发文时间'])),
            'title': _summary_blank_if_na(row['标题']) if has_title else '',
        })
    return rows


def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
    """Build the monitoring report (docx and stamped PDF) for one city or unit.

    The function filters the three input tables down to *city*, computes the
    statistics used by the template, draws the five charts into *dirTemp*,
    renders the Word template, converts it to PDF and stamps the PDF.

    Args:
        info: Report metadata; 'serialNum' is read for the report id and the
            whole dict is merged into the template context.
        city: Key of the report subject; must be a key of the client table below.
        df: Monitoring data (one row per account).
        dfW: Typo findings, with a '市州' column.
        dfS: Sensitive-word findings, with a '市州' column.
        fnTemplate: Path of the Word template.
        fnReport: Path of the .docx report to write; the PDF is written next
            to it with the '.docx' suffix replaced by '.pdf'.
        dirTemp: Directory the chart images are written to.

    Raises:
        ValueError: If *city* is unknown or no monitoring rows match it.
    """
    # Commissioning organisation per report subject. The position of a key in
    # this dict is also used as the two-digit prefix of the report id.
    dCityClient = {
        '甘肃省': "甘肃省人民政府办公厅",
        '省直部门': "甘肃省人民政府办公厅",
        '白银市': "白银市人民政府办公室",
        '定西市': "定西市人民政府办公室",
        '临夏回族自治州': "临夏回族自治州人民政府办公室",
        '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
        "庆阳市": "庆阳市电子政务与信息资源管理办公室",
        '庆阳市华池县': "华池县人民政府办公室",
        '庆阳市宁县': "宁县人民政府办公室",
        "庆阳市镇原县": "镇原县人民政府办公室",
        "酒泉市": "酒泉市人民政府办公室",
        "天水市": "天水市人民政府办公室",
        "武威市": "武威市人民政府办公室",
        "金昌市": "金昌市人民政府办公室",
        "嘉峪关市": "嘉峪关市人民政府办公室",
        "兰州新区": "兰州新区管委会办公室",
        "陇南市": "陇南市政务服务中心",
        "张掖市": "张掖市政务服务中心",
        "甘南藏族自治州": "甘南藏族自治州政务服务中心",
        "兰州市": "兰州市政务服务中心",
    }
    # Whether the report subject has subordinate units of its own.
    dHavingSubordinateUnits = {
        '甘肃省': True, '白银市': True, '定西市': True,
        '临夏回族自治州': True, '平凉市': True, "庆阳市": True, "酒泉市": True, "天水市": True,
        "陇南市": True, "张掖市": True, "甘南藏族自治州": True, "兰州市": True,
        "武威市": True, "金昌市": True,
        '省直部门': False, "兰州新区": False, '庆阳市华池县': False,
        '庆阳市宁县': False, "庆阳市镇原县": False, "嘉峪关市": False,
    }
    print("----------------" + city + "----------------")

    # Report id and commissioning organisation.
    strID = "%02d" % (list(dCityClient).index(city))
    context = {
        "city": city,
        "client": dCityClient[city],
        "reportid": strID + info['serialNum'],
        'havingSubordinateUnits': dHavingSubordinateUnits[city],
        'havingBelowStandard': True,
        'havingUpStandard': True,
        'havingCbz': True,
        'havingMgc': True
    }
    context.update(info)

    # ------------------------------------------------------------------
    # Select the rows that belong to this report.
    subordinate = '区县/地方部门'
    subordinateName = '县区'
    if "庆阳市" in city:
        in_qingyang = df['市/省局'] == '庆阳市'
        # The three counties with their own report are matched in this order.
        qy_county = next((c for c in ('华池县', '宁县', '镇原县') if c in city), None)
        if qy_county is None:
            dfc = df.loc[in_qingyang].copy()
        else:
            dfc = df.loc[in_qingyang & (df['区县/地方部门'] == qy_county)].copy()
        # County reports use the findings of the whole city.
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city:
        # Province-wide report: aggregate by city instead of by county.
        cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
                  '嘉峪关市', '陇南市', '张掖市', '省直部门', '金昌市', '甘南藏族自治州'}
        dfc = df.loc[df['市/省局'].isin(cities)].copy()
        dfcw = dfW.loc[dfW['市州'].isin(cities)].copy()
        dfcs = dfS.loc[dfS['市州'].isin(cities)].copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    else:
        dfc = df.loc[df['市/省局'] == city].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()

    if dfc.empty:
        raise ValueError("no monitoring rows found for " + city)

    dCity = {}

    # ------------------------------------------------------------------
    # Accounts per subordinate unit and monitoring result.
    dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'],
                                     aggfunc='count', fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)

    tt3_list = []
    for index, row in dfCountyAccount.iterrows():
        county = '总 计' if index == 'All' else index
        # A unit without subordinate units is listed under its own name.
        if not dHavingSubordinateUnits[city] and county == '市直部门':
            county = city
        tt3_list.append({
            'county': county,
            'hg': _summary_int_cell(row, '合格'),
            'u2w': _summary_int_cell(row, '超过两周未更新'),
            'un': _summary_int_cell(row, '监测期间未更新'),
            'count': _summary_int_cell(row, 'All'),
        })
    context['tt3_contents'] = tt3_list

    # ------------------------------------------------------------------
    # Accounts per account type.
    dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count',
                             fill_value='', margins=True)
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    # The 'All' margin is removed by label rather than by assuming it sorts first.
    media_counts = dfMedia.drop(index='All')['账号名称'].sort_values(ascending=False)
    media_types = media_counts.index.tolist()
    media_values = media_counts.tolist()
    tt1_list = [{'type': m, 'count': str(n)} for m, n in zip(media_types, media_values)]
    dCity['sMediaCount'] = ','.join(m + str(n) + '个' for m, n in zip(media_types, media_values))
    context.update({'tt1_contents': tt1_list})

    # ------------------------------------------------------------------
    # Update counts per subordinate unit.
    dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum',
                                     fill_value='', margins=True)
    article_counts = dfCountyArticle.drop(index='All')['更新次数'].sort_values(ascending=False)
    article_units = article_counts.index.tolist()
    article_values = article_counts.tolist()
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.loc['All', '更新次数']
    dCity['countyMostArticle'] = article_units[0]
    dCity['countyMostArticleCount'] = "%d" % article_values[0]
    # NOTE(review): the per-unit sentence is only produced when there is more
    # than one unit; the original nesting was not recoverable, so the key is
    # set to '' otherwise — confirm against the template.
    dCity['sCountyArticles'] = ''
    if len(article_units) > 1:
        dCity['sCountyArticles'] = ',按管理矩阵统计,' + ','.join(
            u + "%d" % v + "次" for u, v in zip(article_units, article_values))

    # ------------------------------------------------------------------
    # Pass rate per subordinate unit.
    # Blank cells (units without any qualified account) count as 0, and the
    # column is created when no account at all is qualified.
    if '合格' not in dfCountyAccount.columns:
        dfCountyAccount['合格'] = 0
    dfCountyAccount['合格'] = (
        pd.to_numeric(dfCountyAccount['合格'], errors='coerce').fillna(0).astype(int))
    dfCountyAccount = dfCountyAccount.sort_values(by='All', ascending=False).copy()
    dfCountyAccount['rate'] = dfCountyAccount['合格'] / dfCountyAccount['All']
    dfResult = dfCountyAccount.copy()
    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])

    dfCountyAccount = dfCountyAccount.drop(['All'])  # drop the total row
    counties = dfCountyAccount.index.tolist()
    countyCounts = dfCountyAccount['All'].values.tolist()

    # Units ordered by number of accounts.
    dCity['countyCount'] = "%d" % len(counties)
    dCity['sCounties'] = ','.join(counties)
    dCity['sCountyCount'] = ','.join(c + str(n) + '个' for c, n in zip(counties, countyCounts))

    # Units ordered by pass rate.
    dfCountyAccount = dfCountyAccount.sort_values(by='rate', ascending=False)
    countieshege = dfCountyAccount.index.tolist()
    countyRates = dfCountyAccount['rate']
    tt2_list = []
    ratio_parts = []
    # The rates are paired with the labels via zip; the Series is indexed by
    # unit name, so integer subscripts must not be used on it.
    for c, rate in zip(countieshege, countyRates.tolist()):
        strRatio = "%.1f" % (100.0 * rate)
        ratio_parts.append(c + strRatio + '%')
        tt2_list.append({'county': c, 'ratio': strRatio + '%'})
    dCity['sCountyRatio'] = ','.join(ratio_parts)
    dCity['tt2_contents'] = tt2_list

    # ------------------------------------------------------------------
    # Charts.
    print(' 生成图片...')
    drawAnnulus(media_values, media_types,
                '政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
    drawAnnulus(countyCounts, counties,
                subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
    drawAnnulus(article_values, article_units,
                subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))

    # Overall monitoring result: only the per-result columns are kept.
    dfResult = dfResult.drop('All', axis=1)
    dfResult = dfResult.drop('rate', axis=1)
    qualified = dfResult.loc['All', '合格']
    dCity['resultQualified'] = "%d" % qualified
    dCity['resultQualifiedRatio'] = "%.1f%%" % (qualified / dCity['nmCount'] * 100.0)
    dCity['resultUnqualified'] = "%d" % (dCity['nmCount'] - qualified)

    if '监测期间未更新' in dfResult.columns:
        numNoupdated = dfResult.loc['All', '监测期间未更新']
        dCity['stringResultNoUpdated'] = ";%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""

    if '超过两周未更新' in dfResult.columns:
        numNoupdated2W = dfResult.loc['All', '超过两周未更新']
        dCity['stringResultNoUpdated2W'] = ";%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / dCity['nmCount'] * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''

    resultLabels = dfResult.columns.values.tolist()
    resultCounts = dfResult.loc['All'].values.tolist()
    drawAnnulus(resultCounts, resultLabels,
                '政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
    drawBar(countyRates, countieshege,
            '政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))

    # ------------------------------------------------------------------
    # Tables for the report.
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格'].sort_values(by="监测结果", ascending=True)
    dfCityQulified = dfc[dfc['监测结果'] == '合格'].sort_values(by=subordinate, ascending=True)

    # Accounts below standard.
    if len(dfCityUnqulified) < 1:
        context.update({'havingBelowStandard': False})
    else:
        context.update({'tt4_contents': _summary_account_rows(dfCityUnqulified, subordinate)})

    # Accounts meeting the standard.
    if len(dfCityQulified) < 1:
        context.update({'havingUpStandard': False})
    else:
        context.update({'tt5_contents': _summary_account_rows(dfCityQulified, subordinate)})

    # Typos. The summary sentence is set in both cases so the "nothing found"
    # wording is available to the template as well.
    if dfcw.shape[0] < 1:
        context.update({'havingCbz': False})
        dCity['stringCbzCount'] = '本次检测未发现错别字。'
    else:
        dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
        context.update({'tCbz_contents': _summary_finding_rows(dfcw, strip_noise=True)})

    # Sensitive words.
    if dfcs.shape[0] < 1:
        context.update({'havingMgc': False})
        dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
    else:
        dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
        context.update({'tMgc_contents': _summary_finding_rows(dfcs)})

    context.update(dCity)

    # ------------------------------------------------------------------
    # Render the template.
    temp_word(fnTemplate, fnReport, context, dirTemp, city)

    # Refresh the table of contents and export to '<report>_.pdf'.
    print(' 更新目录,转换为PDF...')
    update_toc(fnReport)

    # Stamp the intermediate PDF into the final one, then drop the intermediate.
    print(' 签章...')
    fnTmp = fnReport.replace('.docx', '_.pdf')
    fnPDF = fnReport.replace('.docx', '.pdf')
    stamp_pdf = 'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf'
    if city in {'庆阳市', '平凉市', '临夏回族自治州'}:
        # These reports place the seal lower on the cover page.
        addStamp(fnTmp, stamp_pdf, fnPDF, 115)
    else:
        addStamp(fnTmp, stamp_pdf, fnPDF)
    os.remove(fnTmp)
|
||||
|
||||
|
||||
|
||||
def createDir(dirP, dirS):
    """Ensure sub-directory *dirS* exists inside *dirP* and return its path.

    Args:
        dirP: Parent directory.
        dirS: Name of the sub-directory to create below *dirP*.

    Returns:
        The path of the sub-directory, or *dirP* unchanged when *dirP* is not
        a directory or the sub-directory cannot be created.
    """
    if not os.path.isdir(dirP):
        return dirP
    dirN = os.path.join(dirP, dirS)
    try:
        os.makedirs(dirN, exist_ok=True)
    except OSError:
        # Report the directory that actually failed, then fall back to the parent.
        print('Directory ' + dirN + ' cannot be created.')
        return dirP
    return dirN
|
||||
# def createDir(dirP, dirS):
|
||||
|
||||
# 合并错别字文件
|
||||
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge the per-city finding workbooks into one workbook.

    Every '*<keyword>*.xlsx' file in *strPathCBZ* is read, tagged with a
    '市州' column derived from its file name, and the concatenation of all
    files is written to *strFnCbz*.

    Args:
        keyword: Substring the workbook file names must contain.
        strPathCBZ: Directory that is searched for workbooks.
        strFnCbz: Path of the merged workbook to write.
    """
    # File-name fragment -> full city name. The first fragment (in this
    # order) found in the file name wins.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }

    frames = []
    total_rows = 0
    for fn in glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx')):
        f = os.path.basename(fn)
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total_rows += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total_rows)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # Keep the '市州' column so the merged file can still be filtered by city.
        df = pd.DataFrame(columns=['市州'])
    df.to_excel(strFnCbz)
|
||||
#def mergeCMC
|
||||
|
||||
if __name__ == "__main__":
    # NOTE: convert the date columns of the Excel workbooks before running.

    # Report metadata; it is passed to summaryCity for every report.
    info = dict(
        year="2023",
        month="6",
        datePub="二〇二三年七月",
        dateStart="2023年6月1日",
        dateEnd="2023年6月30日",
        days="30",
        serialNum="8",
    )

    # Root data directory and its working sub-folders.
    strPath = 'D:/Projects/POM/DATA/2023年7月/6月报告/'
    for sub_dir in ('全文', '转发', '报告', '汇总', '监测'):
        createDir(strPath, sub_dir)

    # Input files: monitoring data, Word template, merged typo and
    # sensitive-word workbooks.
    strFnMonitoring = strPath + '汇总/6月汇总数据_2023.6.xlsx'
    strPathTemplate = strPath + 'POM_ReportTemplate.docx'
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    strFnMgc = strPath + '汇总/MGC.xlsx'

    # Build a merged workbook only when it does not exist yet.
    for keyword, merged_fn in (("错别", strFnCbz), ("敏感", strFnMgc)):
        if not os.path.exists(merged_fn):
            mergeCMC(keyword, strPath + '监测/', merged_fn)

    strPathOutput = strPath

    # Load monitoring data, typos and sensitive words.
    df = pd.read_excel(strFnMonitoring)
    dfW = pd.read_excel(strFnCbz)
    dfS = pd.read_excel(strFnMgc)

    result_col = '监测结果'
    county_col = '区县/地方部门'
    city_col = '市/省局'

    # Use one wording for the "not updated for two weeks" result.
    df.loc[df[result_col] == '连续两周未更新', result_col] = '超过两周未更新'

    # Shorten over-long county names so they fit into the charts.
    for long_name, short_name in (('积石山保安族东乡族撒拉族自治县', '积石山县'),
                                  ('阿克塞哈萨克族自治县', '阿克塞自治县')):
        df.loc[df[county_col] == long_name, county_col] = short_name

    # Rows without a city / county belong to the provincial / municipal
    # departments; the prefecture uses its own wording.
    df[city_col] = df[city_col].fillna('省直部门')
    df[county_col] = df[county_col].fillna('市直部门')
    linxia_direct = (df[city_col] == '临夏回族自治州') & (df[county_col] == '市直部门')
    df.loc[linxia_direct, county_col] = '州直部门'

    # Clean-up: remove whitespace inside text cells, remove a leading
    # '其他+' prefix, then blank out the remaining missing values.
    df = df.replace(r'\s+', '', regex=True)
    df = df.replace(r'^其他\+', '', regex=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Full list of report subjects ...
    cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
              '嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市', '张掖市', '甘肃省'}
    # ... currently overridden to build a single report.
    cities = {'甘肃省'}

    # Create the report and intermediate directories below the output directory.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')

    report_suffix = '政务新媒体监测报告_{}年{}月.docx'.format(info['year'], info['month'])
    for city in cities:
        fn_report = os.path.join(dirReports, city + report_suffix)
        summaryCity(info, city, df, dfW, dfS, strPathTemplate, fn_report, dirIntermediate)
|
|
@ -16,7 +16,7 @@ from docxtpl import DocxTemplate
|
|||
from docxtpl import InlineImage
|
||||
from docx.shared import Mm
|
||||
|
||||
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path):
|
||||
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
|
||||
#选择需要添加水印的pdf文件
|
||||
target_pdf = Pdf.open(target_pdf_path)
|
||||
#读取水印pdf文件并提取水印
|
||||
|
@ -25,7 +25,7 @@ def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path):
|
|||
watermark_page_wyt = watermark_pdf.pages[1]
|
||||
|
||||
#加公章
|
||||
x=240; y=110; w=115; h=115
|
||||
x=240; y=sy; w=115; h=115
|
||||
target_pdf.pages[0].add_overlay(watermark_page_seal, Rectangle(x,y, x+w, y+h))
|
||||
|
||||
#加签字
|
||||
|
@ -580,10 +580,19 @@ def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
|
|||
context, dirTemp, city)
|
||||
|
||||
#更新目录并另存为pdf
|
||||
print(' 更新目录,转换为PDF...')
|
||||
update_toc( fnReport )
|
||||
|
||||
#签章
|
||||
addStamp(fnReport.replace('.docx', '.pdf'),'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' , fnReport.replace('.docx', '_Stamp.pdf'))
|
||||
print(' 签章...')
|
||||
if city in {'庆阳市', '平凉市', '临夏回族自治州'}:
|
||||
addStamp(fnReport.replace('.docx', '.pdf'),
|
||||
'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' ,
|
||||
fnReport.replace('.docx', '_Stamp.pdf'), 115)
|
||||
else:
|
||||
addStamp(fnReport.replace('.docx', '.pdf'),
|
||||
'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf' ,
|
||||
fnReport.replace('.docx', '_Stamp.pdf'))
|
||||
|
||||
|
||||
|
||||
|
@ -639,7 +648,7 @@ def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
|
|||
#cities = cities | {'陇南市'}#, '兰州市'}, '省直部门'}
|
||||
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '张掖市', '甘肃省', '省直部门'}
|
||||
#
|
||||
#cities = {'白银市','甘肃省'} # 只统计特定市州
|
||||
cities = {'庆阳市', '庆阳市宁县', '甘肃省'} # 只统计特定市州
|
||||
|
||||
# strPathOutput目录下生成报告目录和临时文件目录:Reports 和 Intermediate
|
||||
dirP = os.path.abspath(os.path.dirname(strPathOutput))
|
||||
|
@ -749,7 +758,7 @@ if __name__ == "__main__":
|
|||
#cities = cities | {'陇南市'}#, '兰州市'}, '省直部门'}
|
||||
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '张掖市', '甘肃省', '省直部门'}
|
||||
#
|
||||
#cities = {'兰州新区','白银市','庆阳市'} # 只统计特定市州
|
||||
#cities = {'定西市'} # 只统计特定市州
|
||||
|
||||
# strPathOutput目录下生成报告目录和临时文件目录:Reports 和 Intermediate
|
||||
dirP = os.path.abspath(os.path.dirname(strPathOutput))
|
||||
|
|
|
@ -0,0 +1,773 @@
|
|||
# 1. 打开监测任务表格
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os, glob, re
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import FuncFormatter
|
||||
import datetime
|
||||
#word toc
|
||||
import win32com
|
||||
import win32com.client as win32
|
||||
from win32com.client import constants
|
||||
#pdf
|
||||
from pikepdf import Pdf,Page,Rectangle
|
||||
#word
|
||||
from docxtpl import DocxTemplate
|
||||
from docxtpl import InlineImage
|
||||
from docx.shared import Mm
|
||||
|
||||
def addStamp(target_pdf_path, watermark_pdf_path, output_pdf_path, sy=140):
    """Overlay the seal and the signature onto a report PDF and save the result.

    Page 0 of the watermark PDF is overlaid on page 0 of the target, and
    page 1 of the watermark PDF is overlaid on page 2 of the target.

    Args:
        target_pdf_path: Path of the PDF that receives the overlays.
        watermark_pdf_path: Path of the PDF holding the two overlay pages.
        output_pdf_path: Path the stamped PDF is written to.
        sy: Lower y coordinate of the 115x115 seal rectangle on page 0.

    Raises:
        IndexError: If the target has fewer than three pages or the watermark
            PDF has fewer than two pages.
    """
    # Both documents are opened as context managers so their file handles are
    # released before returning.
    with Pdf.open(target_pdf_path) as target_pdf, \
            Pdf.open(watermark_pdf_path) as watermark_pdf:
        seal_page = watermark_pdf.pages[0]
        signature_page = watermark_pdf.pages[1]

        # Seal on the first page.
        x, y, w, h = 240, sy, 115, 115
        target_pdf.pages[0].add_overlay(seal_page, Rectangle(x, y, x + w, y + h))

        # Signature on the third page.
        x, y, w, h = 163, 573, 85, 50
        target_pdf.pages[2].add_overlay(signature_page, Rectangle(x, y, x + w, y + h))

        target_pdf.save(output_pdf_path)
|
||||
|
||||
|
||||
def update_toc(docx_file):
    """Refresh the table-of-contents page numbers of a .docx and export it to PDF.

    The document is opened through Word's COM automation interface. When it
    contains exactly one table of contents, that table's page numbers are
    updated. The document is then exported next to the original with the
    '.docx' suffix replaced by '_.pdf', and the .docx itself is saved on close.

    Args:
        docx_file: Path of the Word document to process.
    """
    word = win32com.client.DispatchEx("Word.Application")
    doc = None
    try:
        word.Visible = 0  # keep the Word window hidden
        word.DisplayAlerts = 0
        doc = word.Documents.Open(docx_file)

        toc_count = doc.TablesOfContents.Count
        if toc_count == 0:
            # No table of contents is created here; the template must have one.
            print("无目录")
        elif toc_count == 1:
            # Only page numbers are refreshed, not the TOC entries themselves.
            doc.TablesOfContents(1).UpdatePageNumbers()

        # 17 is wdFormatPDF in Word's WdSaveFormat enumeration.
        doc.SaveAs(docx_file.replace('.docx', '_.pdf'), FileFormat=17)
        doc.Close(SaveChanges=True)
        doc = None
    finally:
        # Always shut Word down, otherwise a failure leaves a hidden
        # Word process behind.
        if doc is not None:
            doc.Close(SaveChanges=False)
        word.Quit()
|
||||
|
||||
def toDate(strDT):
    """Return *strDT* formatted as 'MM-DD', or '' when it cannot be parsed as a date."""
    parsed = pd.to_datetime(strDT, errors='coerce')
    return '' if pd.isna(parsed) else parsed.strftime('%m-%d')
|
||||
|
||||
# word模板替换
|
||||
def temp_word(tmep_path, word_apth, dContext, pathImage, city):
    """Render the Word template with *dContext* plus the five chart images.

    Args:
        tmep_path: Path of the docx template.
        word_apth: Path the rendered document is saved to.
        dContext: Template context; it is updated in place with the images.
        pathImage: Directory holding the chart PNG files.
        city: Prefix of every chart file name ('<city><key>.png').
    """
    template = DocxTemplate(tmep_path)
    chart_keys = (
        'annulusMediaCount',
        'annulusCountyCount',
        'annulusCountyArticle',
        'annulusResult',
        'barCountyRatio',
    )
    # Each context key doubles as the stem of its image file name.
    images = {
        key: InlineImage(template, os.path.join(pathImage, city + key + '.png'), width=Mm(120))
        for key in chart_keys
    }
    dContext.update(images)
    template.render(dContext)
    template.save(word_apth)
|
||||
|
||||
|
||||
# 画柱状图
|
||||
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios (0..1) with a percentage y-axis.

    Args:
        data: Bar heights, one per entry of ``recipe``.
        recipe: Category labels shown on the x-axis.
        title: Chart title.
        fn: Output image path; nothing is written when it is empty.
    """
    # SimHei is needed so that the Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    def to_percent(temp, position):
        return '%2.0f' % (100 * temp) + '%'

    fig = plt.figure(figsize=(6, 4))
    try:
        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))
        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        # Skip saving for the default empty file name, as drawAnnulus does.
        if fn:
            plt.savefig(fn)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
|
||||
|
||||
|
||||
# 画环状图
|
||||
def drawAnnulus(data, recipe, title='', fn=''):
    """Draw a ring (donut) chart with a legend to the right of the ring.

    Args:
        data: Wedge sizes.
        recipe: Legend labels, one per wedge.
        title: Chart title; omitted when empty.
        fn: Output image path; nothing is written when it is empty.
    """
    # SimHei is needed so that the Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Canvas width, legend column count and legend font size grow with the
    # number of legend entries so that long legends still fit.
    count = len(recipe)
    if count > 40:
        width, legend_cols, font_size = 16, 4, 'x-small'
    elif count > 20:
        width, legend_cols, font_size = 16, 2, 'x-small'
    else:
        width, legend_cols, font_size = 8, 1, 'medium'
    height = 4

    fig, ax = plt.subplots(figsize=(width, height), subplot_kw=dict(aspect="equal"))
    try:
        # wedgeprops width=0.4 turns the pie into a ring; wedges are drawn
        # counter-clockwise starting at the positive x-axis (startangle=0).
        wedges, texts = ax.pie(data, radius=1.1, wedgeprops=dict(width=0.4), startangle=0)

        # The monitoring-result chart keeps its legend closer to the ring.
        anchor_x = 1.0 if title == '政务新媒体监测结果' else 1.2
        ax.legend(wedges, recipe, loc="center left", bbox_to_anchor=(anchor_x, 0.5),
                  borderaxespad=0., ncol=legend_cols, fontsize=font_size)
        if title:
            ax.set_title(title, fontsize=16, fontweight='heavy')

        plt.tight_layout()
        if fn:
            plt.savefig(fn)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
|
||||
|
||||
# summaryCity(city, dfc, dfcw, dfcs, context, strfnTemplate, os.path.join(strPathVerified,'Reports', city+'.docx'), strPathVerified )
|
||||
|
||||
|
||||
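# Summarise the data of one city/prefecture and generate its report.
# Arguments, in signature order: report metadata (info), city name,
# monitoring data, typo (CBZ) data, sensitive-word (MGC) data,
# Word template file name, output Word file name, temporary file directory.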
|
||||
def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
    """Generate the monitoring report (docx and stamped PDF) for one city.

    The function selects the rows belonging to ``city``, computes the
    statistics and tables the Word template expects, draws the charts into
    ``dirTemp``, renders the template to ``fnReport``, converts it to PDF via
    ``update_toc`` and finally stamps the PDF with ``addStamp``.

    Args:
        info: Report metadata; merged into the template context and must
            contain ``'serialNum'``.
        city: Key of the report subject; must be one of the names in the
            client table below (a ``ValueError`` is raised otherwise).
        df: Monitoring data (one row per account).
        dfW: Typo findings, with a ``'市州'`` column.
        dfS: Sensitive-word findings, with a ``'市州'`` column.
        fnTemplate: Path of the Word template.
        fnReport: Path of the docx to write; the PDF is written next to it.
        dirTemp: Directory for the generated chart images.
    """
    # Client (commissioning unit) per report subject. The position of a key
    # in this dict is also used as the report-number prefix, so the order of
    # the entries must not change.
    dCityClient = {
        '甘肃省': "甘肃省人民政府办公厅",
        '省直部门': "甘肃省人民政府办公厅",
        '白银市': "白银市人民政府办公室",
        '定西市': "定西市人民政府办公室",
        '临夏回族自治州': "临夏回族自治州人民政府办公室",
        '平凉市': "中共平凉市委网络安全和信息化委员会办公室",
        "庆阳市": "庆阳市电子政务与信息资源管理办公室",
        '庆阳市华池县': "华池县人民政府办公室",
        '庆阳市宁县': "宁县人民政府办公室",
        "庆阳市镇原县": "镇原县人民政府办公室",
        "酒泉市": "酒泉市人民政府办公室",
        "天水市": "天水市人民政府办公室",
        "武威市": "武威市人民政府办公室",
        "金昌市": "金昌市人民政府办公室",
        "嘉峪关市": "嘉峪关市人民政府办公室",
        "兰州新区": "兰州新区管委会办公室",
        "陇南市": "陇南市政务服务中心",
        "张掖市": "张掖市政务服务中心",
        "甘南藏族自治州": "甘南藏族自治州政务服务中心",
        "兰州市": "兰州市政务服务中心",
    }
    dHavingSubordinateUnits = {
        '甘肃省': True, '白银市': True, '定西市': True,
        '临夏回族自治州': True, '平凉市': True, "庆阳市": True, "酒泉市": True, "天水市": True,
        "陇南市": True, "张掖市": True, "甘南藏族自治州": True, "兰州市": True,
        "武威市": True, "金昌市": True,
        '省直部门': False, "兰州新区": False, '庆阳市华池县': False,
        '庆阳市宁县': False, "庆阳市镇原县": False, "嘉峪关市": False,
    }
    print("----------------" + city + "----------------")

    # Report number and client.
    strID = "%02d" % (list(dCityClient).index(city))
    context = {
        "city": city,
        "client": dCityClient[city],
        "reportid": strID + info['serialNum'],
        'havingSubordinateUnits': dHavingSubordinateUnits[city],
        'havingBelowStandard': True,
        'havingUpStandard': True,
        'havingCbz': True,
        'havingMgc': True,
    }
    context.update(info)

    # -----------------------
    # Select the rows of this report subject.
    subordinate = '区县/地方部门'
    subordinateName = '县区'
    if "庆阳市" in city:
        # Three Qingyang counties get their own report; typo and
        # sensitive-word findings are still taken for the whole city.
        qyCounty = next((c for c in ('华池县', '宁县', '镇原县') if c in city), None)
        mask = df['市/省局'] == '庆阳市'
        if qyCounty is not None:
            mask = mask & (df['区县/地方部门'] == qyCounty)
        dfc = df.loc[mask].copy()
        dfcw = dfW.loc[dfW['市州'] == '庆阳市'].copy()
        dfcs = dfS.loc[dfS['市州'] == '庆阳市'].copy()
    elif "甘肃" in city:
        # Province-wide report: all rows, grouped by city instead of county.
        dfc = df.copy()
        dfcw = dfW.copy()
        dfcs = dfS.copy()
        subordinate = '市/省局'
        subordinateName = '市州'
    else:
        dfc = df.loc[df['市/省局'] == city].copy()
        dfcw = dfW.loc[dfW['市州'] == city].copy()
        dfcs = dfS.loc[dfS['市州'] == city].copy()

    # Values collected here are merged into the template context at the end.
    dCity = {}

    # -----------------------
    # Accounts per subordinate unit and monitoring result.
    dfCountyAccount = pd.pivot_table(dfc, index=[subordinate], columns=['监测结果'], values=['账号名称'],
                                     aggfunc='count', fill_value='', margins=True)
    dfCountyAccount.columns = dfCountyAccount.columns.droplevel(0)

    tt3_list = []
    for index, row in dfCountyAccount.iterrows():
        if index == 'All':
            county = '总 计'
        elif not dHavingSubordinateUnits[city] and index == '市直部门':
            county = city
        else:
            county = index
        tt3_list.append({
            'county': county,
            'hg': _count_cell(row, '合格'),
            'u2w': _count_cell(row, '超过两周未更新'),
            'un': _count_cell(row, '监测期间未更新'),
            'count': _count_cell(row, 'All'),
        })
    context['tt3_contents'] = tt3_list

    # -----------------------
    # Accounts per media type.
    dfMedia = pd.pivot_table(dfc, index=['账号类型'], values=['账号名称'], aggfunc='count',
                             fill_value='', margins=True)
    dCity['nmCount'] = dfMedia.loc['All', '账号名称']
    print(' 监测账号数:', dCity['nmCount'])
    # Drop the margin row by label rather than assuming it sorts first: with
    # a single category the total ties with that category.
    dfMediaTypes = dfMedia.drop('All').sort_values(by='账号名称', ascending=False)
    mediaTypes = dfMediaTypes.index.tolist()
    mediaCounts = dfMediaTypes['账号名称'].tolist()
    dCity['sMediaCount'] = ','.join(m + str(n) + '个' for m, n in zip(mediaTypes, mediaCounts))
    context.update({'tt1_contents': [{'type': m, 'count': str(n)}
                                     for m, n in zip(mediaTypes, mediaCounts)]})

    # -----------------------
    # Update counts per subordinate unit.
    dfCountyArticle = pd.pivot_table(dfc, index=[subordinate], values=['更新次数'], aggfunc='sum',
                                     fill_value='', margins=True)
    dCity['cityArticleCount'] = "%d" % dfCountyArticle.loc['All', '更新次数']
    dfUnitArticle = dfCountyArticle.drop('All').sort_values(by='更新次数', ascending=False)
    articleUnits = dfUnitArticle.index.tolist()
    articleCounts = dfUnitArticle['更新次数'].tolist()
    dCity['countyMostArticle'] = articleUnits[0]
    dCity['countyMostArticleCount'] = "%d" % articleCounts[0]
    # The per-unit breakdown is only produced when there is more than one
    # unit. NOTE(review): the extracted source lost its indentation; this
    # assumes the breakdown text was assigned inside that condition.
    dCity['sCountyArticles'] = ''
    if len(articleUnits) > 1:
        dCity['sCountyArticles'] = ',按管理矩阵统计,' + ','.join(
            u + "%d" % n + "次" for u, n in zip(articleUnits, articleCounts))

    # -----------------------
    # Qualification rate per subordinate unit.
    # Blank filler cells (units without any qualified account) count as 0.
    if '合格' not in dfCountyAccount.columns:
        dfCountyAccount['合格'] = 0
    dfCountyAccount['合格'] = pd.to_numeric(dfCountyAccount['合格'], errors='coerce').fillna(0).astype(int)
    dfCountyAccount['rate'] = (dfCountyAccount['合格'] / pd.to_numeric(dfCountyAccount['All'])).astype(float)
    dCity['cityRatio'] = "{:.1%}".format(dfCountyAccount.loc['All', 'rate'])
    print(' 合格率:', dCity['cityRatio'])

    # Totals per monitoring result (the margin row without helper columns).
    resultTotals = dfCountyAccount.loc['All'].drop(['All', 'rate'])

    # Units ordered by number of accounts.
    dfUnits = dfCountyAccount.drop(['All']).sort_values(by='All', ascending=False)
    counties = dfUnits.index.tolist()
    countyCounts = dfUnits['All'].values.tolist()
    dCity['countyCount'] = "%d" % len(counties)
    dCity['sCounties'] = ','.join(counties)
    dCity['sCountyCount'] = ','.join(c + str(n) + '个' for c, n in zip(counties, countyCounts))

    # Units ordered by qualification rate.
    dfByRate = dfUnits.sort_values(by='rate', ascending=False)
    countieshege = dfByRate.index.tolist()
    countyRates = dfByRate['rate'].tolist()
    ratioTexts = ["%.1f" % (100.0 * r) + '%' for r in countyRates]
    dCity['sCountyRatio'] = ','.join(c + t for c, t in zip(countieshege, ratioTexts))
    dCity['tt2_contents'] = [{'county': c, 'ratio': t} for c, t in zip(countieshege, ratioTexts)]

    # -----------------------
    # Charts.
    print(' 生成图片...')
    drawAnnulus(mediaCounts, mediaTypes,
                '政务新媒体账号类型', os.path.join(dirTemp, city + 'annulusMediaCount.png'))
    drawAnnulus(countyCounts, counties,
                subordinateName + '政务新媒体账号数量', os.path.join(dirTemp, city + 'annulusCountyCount.png'))
    drawAnnulus(articleCounts, articleUnits,
                subordinateName + '政务新媒体累计更新次数', os.path.join(dirTemp, city + 'annulusCountyArticle.png'))

    # Qualified / unqualified totals.
    nmCount = dCity['nmCount']
    numQualified = resultTotals['合格']
    dCity['resultQualified'] = "%d" % numQualified
    dCity['resultQualifiedRatio'] = "%.1f%%" % (numQualified / nmCount * 100.0)
    dCity['resultUnqualified'] = "%d" % (nmCount - numQualified)

    if '监测期间未更新' in resultTotals.index:
        numNoupdated = resultTotals['监测期间未更新']
        dCity['stringResultNoUpdated'] = ";%d个政务新媒体监测期间未更新,占监测总数的%.1f%%" % (
            numNoupdated, numNoupdated / nmCount * 100.0)
        dCity['stringNoUpdated'] = "%d个政务新媒体监测期间未更新。" % (numNoupdated)
    else:
        dCity['stringResultNoUpdated'] = ''
        dCity['stringNoUpdated'] = ""

    if '超过两周未更新' in resultTotals.index:
        numNoupdated2W = resultTotals['超过两周未更新']
        dCity['stringResultNoUpdated2W'] = ";%d个政务新媒体连续未更新时间超过两周,占监测总数的%.1f%%" % (
            numNoupdated2W, numNoupdated2W / nmCount * 100.0)
        dCity['stringNoUpdated2W'] = "%d个政务新媒体连续未更新时间超过两周。" % (numNoupdated2W)
    else:
        dCity['stringResultNoUpdated2W'] = ''
        dCity['stringNoUpdated2W'] = ''

    drawAnnulus(resultTotals.values.tolist(), resultTotals.index.tolist(),
                '政务新媒体监测结果', os.path.join(dirTemp, city + 'annulusResult.png'))
    drawBar(countyRates, countieshege,
            '政务新媒体管理矩阵发布时效性合格率榜单', os.path.join(dirTemp, city + 'barCountyRatio.png'))

    # -----------------------
    # Tables for the report.
    print(' 生成报告...')
    dfCityUnqulified = dfc[dfc['监测结果'] != '合格'].sort_values(by="监测结果", ascending=True)
    dfCityQulified = dfc[dfc['监测结果'] == '合格'].sort_values(by=subordinate, ascending=True)

    # Unqualified accounts.
    if len(dfCityUnqulified) < 1:
        context.update({'havingBelowStandard': False})
    else:
        context.update({'tt4_contents': _account_rows(dfCityUnqulified, subordinate)})

    # Qualified accounts.
    if len(dfCityQulified) < 1:
        context.update({'havingUpStandard': False})
    else:
        context.update({'tt5_contents': _account_rows(dfCityQulified, subordinate)})

    # Typo findings. The "nothing found" sentence is set for the empty case,
    # which the original nesting could never reach.
    if dfcw.shape[0] < 1:
        context.update({'havingCbz': False})
        dCity['stringCbzCount'] = '本次检测未发现错别字。'
    else:
        dCity['stringCbzCount'] = '本次检测发现错别字%d处,详细情况见附表政务新媒体发布内容错别字统计表。' % (dfcw.shape[0])
        context.update({'tCbz_contents': _finding_rows(dfcw, strip_noise=True)})

    # Sensitive-word findings.
    if dfcs.shape[0] < 1:
        context.update({'havingMgc': False})
        dCity['stringMgcCount'] = '本次检测未发现涉敏内容。'
    else:
        dCity['stringMgcCount'] = '本次检测发现敏感信息%d处,详细情况见附表政务新媒体发布内容敏感信息统计表。' % (dfcs.shape[0])
        context.update({'tMgc_contents': _finding_rows(dfcs, strip_noise=False)})

    context.update(dCity)

    # -----------------------
    # Render the report from the template.
    temp_word(fnTemplate, fnReport, context, dirTemp, city)

    # Refresh the table of contents and save as PDF (update_toc writes the
    # PDF next to the docx with a '_.pdf' suffix).
    print(' 更新目录,转换为PDF...')
    update_toc(fnReport)

    # Stamp the PDF, then remove the unstamped intermediate file.
    print(' 签章...')
    fnTmp = fnReport.replace('.docx', '_.pdf')
    fnPDF = fnReport.replace('.docx', '.pdf')
    fnStamps = 'D:/Projects/POM/DEV/SCRIPTS/stamps_dwl.pdf'
    if city in {'庆阳市', '平凉市', '临夏回族自治州'}:
        # These reports pass a lower vertical stamp position.
        addStamp(fnTmp, fnStamps, fnPDF, 115)
    else:
        addStamp(fnTmp, fnStamps, fnPDF)
    os.remove(fnTmp)


def _count_cell(row, column):
    """Return a pivot cell as int, or '' when the column is absent or blank."""
    if column not in row.index:
        return ''
    value = row[column]
    # Missing combinations are filled with '' by the pivot table.
    return '' if isinstance(value, str) else int(value)


def _blank_if_na(value):
    """Return '' for missing (NaN/None/NaT) scalars, the value otherwise."""
    return '' if pd.isna(value) else value


def _account_rows(dfAccounts, subordinate):
    """Build the template rows of an account list (qualified or unqualified)."""
    rows = []
    for _, row in dfAccounts.iterrows():
        rows.append({
            'name': row['账号名称'], 'type': row['账号类型'], 'unit': row['开设主体'],
            'county': row[subordinate], 'result': row['监测结果'],
            # Empty or zero cells are rendered as blanks.
            'num': "%d" % row['更新次数'] if row['更新次数'] else '',
            'days': "%d" % row['静默日数'] if row['静默日数'] else '',
            'start': toDate(str(row['静默开始日期'])) if row['静默开始日期'] else '',
            'end': toDate(str(row['静默结束日期'])) if row['静默结束日期'] else '',
        })
    return rows


def _finding_rows(dfFindings, strip_noise=False):
    """Build the template rows of a typo / sensitive-word findings table.

    With ``strip_noise`` the quoted sentence is stripped of quotes, brackets
    and similar punctuation that disturbs the table template output.
    """
    noise = re.compile(r"""[——,$%^,。?、~@#¥%……&*《》<>「」{}【】()/\\\[\]'"]""")
    hasTitle = '标题' in dfFindings.columns
    rows = []
    for _, row in dfFindings.iterrows():
        sentence = _blank_if_na(row['错误出现位置'])
        if strip_noise and sentence != '':
            sentence = noise.sub('', sentence)
        rows.append({
            'error': _blank_if_na(row['错误']), 'tips': _blank_if_na(row['建议']),
            'sentence': sentence,
            'type': _blank_if_na(row['账号类型']), 'name': _blank_if_na(row['账号名称']),
            'date': toDate(str(row['发文时间'])),
            'title': _blank_if_na(row['标题']) if hasTitle else '',
        })
    return rows
|
||||
|
||||
|
||||
|
||||
def createDir(dirP, dirS):
    """Ensure the sub-directory ``dirS`` exists below ``dirP``.

    Args:
        dirP: Parent directory.
        dirS: Name of the sub-directory to create.

    Returns:
        The path of the sub-directory, or ``dirP`` unchanged when ``dirP`` is
        not a directory or the sub-directory could not be created.
    """
    if not os.path.isdir(dirP):
        return dirP

    dirN = os.path.join(dirP, dirS)
    try:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(dirN, exist_ok=True)
    except OSError:
        # Reported below; e.g. a regular file already has that name.
        pass

    if os.path.isdir(dirN):
        return dirN
    # Name the directory that failed, not the parent we fall back to.
    print('Directory ' + dirN + ' cannot be created.')
    return dirP
|
||||
# def createDir(dirP, dirS):
|
||||
|
||||
def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
    """Load and clean the monitoring data, then build one report per city.

    Args:
        info: Report metadata passed on to ``summaryCity``.
        strFnData: Excel file with the monitoring data.
        strFnW: Excel file with the typo findings.
        strFnS: Excel file with the sensitive-word findings.
        strfnTemplate: Path of the Word template.
        strPathOutput: Output location; the ``Reports`` and ``Intermediate``
            directories are created below ``os.path.dirname`` of this path.
    """
    # Monitoring data, typo findings, sensitive-word findings.
    df = pd.read_excel(strFnData)
    dfW = pd.read_excel(strFnW)
    dfS = pd.read_excel(strFnS)

    # Use one wording for the "not updated for two weeks" result.
    df.loc[df['监测结果'] == '连续两周未更新', '监测结果'] = '超过两周未更新'

    # Shorten very long county names so that they fit into the charts.
    df.loc[df['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    df.loc[df['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'

    # Rows without a city belong to provincial departments, rows without a
    # county to city-level departments (prefecture-level for Linxia).
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    df.loc[(df['市/省局'] == '临夏回族自治州') & (df['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'

    # Clean-up: remove all whitespace (spaces, newlines, tabs) from string
    # cells, then strip a leading "其他+" prefix.
    df = df.replace(r'\s+', '', regex=True)
    df = df.replace(r'^其他\+', '', regex=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Only these report subjects are processed (a tuple keeps the processing
    # order stable between runs).
    cities = ('甘肃省', '庆阳市', '武威市', '临夏回族自治州')

    # Report directory and temporary-file directory.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')
    for city in cities:
        summaryCity(info, city, df, dfW, dfS, strfnTemplate,
                    os.path.join(dirReports, city + '.docx'), dirIntermediate)
|
||||
|
||||
# 合并错别字文件
|
||||
def mergeCMC(keyword, strPathCBZ, strFnCbz):
    """Merge the per-city finding workbooks of a directory into one file.

    Every ``*<keyword>*.xlsx`` file in ``strPathCBZ`` is read, tagged with the
    city recognised from its file name (column ``'市州'``) and appended to the
    result, which is written to ``strFnCbz``.

    Args:
        keyword: Text that must occur in the file names to merge.
        strPathCBZ: Directory containing the per-city workbooks.
        strFnCbz: Path of the merged Excel file to write.
    """
    # File-name fragment -> city name. The first fragment found in the file
    # name wins, so longer fragments must precede the ones they contain.
    cityShorten = {'白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州新区': '兰州新区', '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   'BY': '白银市', 'DX': '定西市', 'JQ': '酒泉市', 'JYG': '嘉峪关市', 'LN': '陇南市',
                   'LX': '临夏回族自治州', 'PL': '平凉市', 'QY': '庆阳市', 'TS': '天水市', 'WW': '武威市', 'XQ': '兰州新区',
                   'LZXQ': '兰州新区', 'LZ': '兰州市', 'ZY': '张掖市', 'GN': '甘南藏族自治州', 'SZ': '省直部门', 'JC': '金昌市', }

    frames = []
    total = 0
    for fn in glob.glob(os.path.join(strPathCBZ, '*' + keyword + '*.xlsx')):
        f = os.path.basename(fn)
        # '~$...' files are lock files of workbooks open in Excel, not data.
        if f.startswith('~$'):
            continue
        city = next((full for short, full in cityShorten.items() if short in f), '')
        if not city:
            print("!!!!! City Name not matched ( ", f, " )")
        dfn = pd.read_excel(fn)
        dfn['市州'] = city
        frames.append(dfn)
        total += dfn.shape[0]
        print(city, f, dfn.shape[0], '/', total)

    # DataFrame.append no longer exists in pandas 2; concatenate once.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    df.to_excel(strFnCbz)
|
||||
#def mergeCMC
|
||||
|
||||
if __name__ == "__main__":

    # NOTE: convert the date columns of the Excel files before running.

    # Report metadata; merged into the template context by summaryCity.
    info = {
        "year": "2023",
        "month": "6",
        "quarter": "二",
        "datePub": "二〇二三年六月",
        "dateStart": "2023年3月20日",
        "dateEnd": "2023年6月20日",
        "days": "92",
        "serialNum": "8",
    }
    # Data root directory and its working sub-directories.
    strPath = 'D:/Projects/POM/DATA/2023年6月/季度报告/'
    for strSub in ('全文', '转发', '报告', '汇总', '监测'):
        createDir(strPath, strSub)
    # Monitoring data.
    strFnMonitoring = strPath + '汇总/汇总数据_2023.6.xlsx'
    # Word template.
    strPathTemplate = strPath + 'POM_ReportTemplateQuarterly.docx'
    # Typo findings; merged from the per-city files when not yet present.
    strFnCbz = strPath + '汇总/CBZ.xlsx'
    if not os.path.exists(strFnCbz):
        strPathCBZ = strPath + '监测/'
        mergeCMC("错别", strPathCBZ, strFnCbz)
    # Sensitive-word findings; merged the same way.
    strFnMgc = strPath + '汇总/MGC.xlsx'
    if not os.path.exists(strFnMgc):
        strPathMGC = strPath + '监测/'
        mergeCMC("敏感", strPathMGC, strFnMgc)
    # Output location.
    strPathOutput = strPath

    # Monitoring data, typo findings, sensitive-word findings.
    df = pd.read_excel(strFnMonitoring)
    dfW = pd.read_excel(strFnCbz)
    dfS = pd.read_excel(strFnMgc)

    # Use one wording for the "not updated for two weeks" result.
    df.loc[df['监测结果'] == '连续两周未更新', '监测结果'] = '超过两周未更新'

    # Shorten very long county names so that they fit into the charts.
    df.loc[df['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
    df.loc[df['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'

    # Rows without a city belong to provincial departments, rows without a
    # county to city-level departments (prefecture-level for Linxia).
    df['市/省局'] = df['市/省局'].fillna('省直部门')
    df['区县/地方部门'] = df['区县/地方部门'].fillna('市直部门')
    df.loc[(df['市/省局'] == '临夏回族自治州') & (df['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'

    # Clean-up: remove all whitespace (spaces, newlines, tabs) from string
    # cells, then strip a leading "其他+" prefix.
    df.replace(r'\s+', '', regex=True, inplace=True)
    df.replace(r'^其他\+', '', regex=True, inplace=True)
    df['更新次数'] = df['更新次数'].fillna(0)
    df = df.fillna(value='')

    # Report subjects to process.
    cities = {'白银市', '武威市', '庆阳市', '酒泉市', '天水市', '临夏回族自治州', '平凉市', '定西市', '兰州新区',
              '嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市', '张掖市', '甘肃省'}

    # Report directory and temporary-file directory.
    dirP = os.path.abspath(os.path.dirname(strPathOutput))
    dirReports = createDir(dirP, 'Reports')
    dirIntermediate = createDir(dirP, 'Intermediate')
    for city in cities:
        fnReport = os.path.join(
            dirReports,
            city + '政务新媒体监测报告_{}年第{}季度.docx'.format(info['year'], info['quarter']))
        summaryCity(info, city, df, dfW, dfS, strPathTemplate, fnReport, dirIntermediate)
|
|
@ -78,7 +78,7 @@ def drawAnnulus(data, recipe, title='', fn=''):
|
|||
else:
|
||||
xxx = 16
|
||||
nnncol = 2
|
||||
fs = 'xmall'
|
||||
fs = 'x-small'
|
||||
|
||||
fig, ax = plt.subplots(figsize=(xxx, yyy), subplot_kw=dict(aspect="equal"))
|
||||
|
||||
|
@ -132,6 +132,10 @@ def summaryCity(info, city, df, dfW, dfS, fnTemplate, fnReport, dirTemp):
|
|||
"嘉峪关市": "嘉峪关市人民政府办公室",
|
||||
"兰州新区": "兰州新区管委会办公室",
|
||||
"陇南市": "陇南市人民政府办公室",
|
||||
"张掖市": "张掖市政务服务中心",
|
||||
"甘南藏族自治州": "甘南藏族自治州政务服务中心",
|
||||
"兰州市": "兰州市政务服务中心",
|
||||
"陇南市": "陇南市政务服务中心",
|
||||
}
|
||||
print("----------------" + city + "----------------")
|
||||
# 报告编号、委托单位
|
||||
|
@ -534,7 +538,9 @@ def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
|
|||
'嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县'}
|
||||
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '陇南市', '张掖市', '甘肃省', '省直部门'}
|
||||
#
|
||||
cities = {'甘肃省'} # 只统计特定市州
|
||||
cities = {'甘肃省','庆阳市','武威市','临夏回族自治州', '酒泉市'} # 只统计特定市州
|
||||
|
||||
cities = {'张掖市'}
|
||||
|
||||
# strPathOutput目录下生成报告目录和临时文件目录:Reports 和 Intermediate
|
||||
dirP = os.path.abspath(os.path.dirname(strPathOutput))
|
||||
|
@ -574,13 +580,13 @@ if __name__ == "__main__":
|
|||
# 运行之前先转换excel文件的日期列
|
||||
|
||||
info = {
|
||||
"year": "2022",
|
||||
"quarter": "三",
|
||||
"dateCN": "二〇二二年九月",
|
||||
"dateStart": "2022年7月1日",
|
||||
"dateEnd": "2022年9月20日",
|
||||
"days": "81",
|
||||
"num": "11",
|
||||
"year": "2023",
|
||||
"quarter": "一",
|
||||
"dateCN": "二〇二三年四月",
|
||||
"dateStart": "2023年1月1日",
|
||||
"dateEnd": "2023年3月20日",
|
||||
"days": "79",
|
||||
"num": "4",
|
||||
}
|
||||
# 数据根目录,
|
||||
strPath = 'D:/Projects/POM/DATA/2023年S1/'
|
||||
|
|
|
@ -548,7 +548,7 @@ def summary(info, strFnData, strFnW, strFnS, strfnTemplate, strPathOutput):
|
|||
'嘉峪关市', '庆阳市华池县', '庆阳市镇原县', '庆阳市宁县', '陇南市'}
|
||||
#cities = cities | {'甘南藏族自治州', '金昌市', '兰州市', '陇南市', '张掖市', '甘肃省', '省直部门'}
|
||||
#
|
||||
cities = {'甘肃省'} # 只统计特定市州
|
||||
cities = {'甘肃省','庆阳市','武威市','临夏回族自治州'} # 只统计特定市州
|
||||
|
||||
# strPathOutput目录下生成报告目录和临时文件目录:Reports 和 Intermediate
|
||||
dirP = os.path.abspath(os.path.dirname(strPathOutput))
|
||||
|
|
86
searchALL.py
86
searchALL.py
|
@ -1,4 +1,5 @@
|
|||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import datetime, time
|
||||
|
@ -17,16 +18,37 @@ regSTR = '习近平总同志|习近同志|习近总书记|习平总书记|习近
|
|||
'|建党七十三周年|共产党成立七十三周年' + \
|
||||
'|大人代表|大人常委会|人大常委主任' + \
|
||||
'|爱爱服务|抗议英雄|反炸中心'
|
||||
|
||||
paths = [
|
||||
'D:/Projects/POM/DATA/2022年10月/9月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年9月/8月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年8月/7月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年7月/6月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年6月/5月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年5月/4月报告/全文/',
|
||||
regSTR = '网络意识形态工作责任制实施细则|意识形态工作责任制实施办法'
|
||||
regSTR = r'(?=.*西藏)(?=.*劳务)'
|
||||
regSTRA = r'藏族|西藏'
|
||||
regSTRB = r'劳务|用工|转移|输出|输转|就业|职业培训|技能培训|高校毕业生'
|
||||
regSTRAB = r'藏族|西藏|劳务|用工|转移|输出|输转|就业|职业培训|技能培训|高校毕业生'
|
||||
paths = [
|
||||
'D:/Projects/POM/DATA/2023年6月/季度报告/全文/',
|
||||
'D:/Projects/POM/DATA/2023年6月/5月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2023年5月/4月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2023年4月/3月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2023年3月/2月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2023年2月/1月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2023年1月/12月报告/全文/',
|
||||
]
|
||||
paths = ['D:/Projects/POM/DATA/2022年/2022年12月/11月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年11月/10月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年10月/9月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年9月/8月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年8月/7月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年7月/6月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年6月/5月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年5月/4月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年4月/3月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年3月/2月报告/全文/',
|
||||
'D:/Projects/POM/DATA/2022年/2022年2月/1月报告/全文/',]
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
'''
|
||||
#'D:/Projects/POM/DATA/2022年11月/10月报告/全文/',
|
||||
#'D:/Projects/POM/DATA/2022年12月/11月报告/全文/',
|
||||
|
@ -201,17 +223,19 @@ def getWXData_Province(path, hasBody=False):
|
|||
continue
|
||||
if not os.path.isdir(scc):
|
||||
#print(dirCC, dirCC[-5:] )
|
||||
if dirCC[-5:]=='.xlsx' or dirCC[-4:]=='.xls':
|
||||
if (dirCC[-5:]=='.xlsx' or dirCC[-4:]=='.xls') and not dirCC.startswith('~'):
|
||||
files.append(scc)
|
||||
dfcc = pd.read_excel(scc)
|
||||
#print(scc)
|
||||
dfcc = pd.read_excel(scc) # , engine='openpyxl'
|
||||
dfcc['市州'] = strC
|
||||
dfWX = dfWX.append(dfcc)
|
||||
print(' ', dirCC, dfcc.shape[0])
|
||||
else:
|
||||
print('something error 01: ', dirCC)
|
||||
else:
|
||||
if dirC[-5:]=='.xlsx' or dirC[-4:]=='.xls':
|
||||
if (dirC[-5:]=='.xlsx' or dirC[-4:]=='.xls') and not dirCC.startswith('~'):
|
||||
files.append(sc)
|
||||
#print(sc)
|
||||
dfc = pd.read_excel(sc)
|
||||
dfcc['市州'] = strC
|
||||
dfWX = dfWX.append(dfc)
|
||||
|
@ -333,16 +357,17 @@ if doWX:
|
|||
print('WX data ', dfWX.shape)
|
||||
|
||||
# 查找关键词
|
||||
dfwxd = dfWX[['市州', '公众号', '日期', '标题', '链接', '内容', '阅读数']][dfWX['内容'].str.contains(regSTR, regex=True, na=False)]
|
||||
dfwxd = dfWX[['市州', '公众号', '日期', '标题', '链接', '内容', '阅读数']][dfWX['内容'].str.contains(regSTRA, regex=True, na=False) & dfWX['内容'].str.contains(regSTRB, regex=True, na=False)]
|
||||
dfwxd['类型'] = '微信'
|
||||
dfwxd['关键词']=''
|
||||
dfwxd['上下文']=''
|
||||
print("Found ", dfwxd.shape)
|
||||
|
||||
dfwxd = dfwxd.reset_index()
|
||||
# 提取上下文
|
||||
iiii=0
|
||||
for i,r in dfwxd.iterrows():
|
||||
string = str(r['内容'])
|
||||
its = re.finditer(regSTR, string)
|
||||
string = str(dfwxd.loc[iiii,'内容'])
|
||||
its = re.finditer(regSTRAB, string)
|
||||
sk = ''
|
||||
sp = ''
|
||||
for it in its:
|
||||
|
@ -356,8 +381,9 @@ if doWX:
|
|||
sk += it.group() + ';'
|
||||
sp += string[s:e] + ';'
|
||||
|
||||
dfwxd.loc[i,'关键词'] = sk[:-1]
|
||||
dfwxd.loc[i,'上下文'] = sp[:-1]
|
||||
dfwxd.loc[iiii,'关键词'] = sk[:-1]
|
||||
dfwxd.loc[iiii,'上下文'] = sp[:-1]
|
||||
iiii = iiii+1
|
||||
|
||||
dfwxd.rename(columns={"阅读数": "阅读数/评论数", "公众号": "账号名称"},inplace=True)
|
||||
dfwxd = dfwxd[['关键词', '上下文', '日期', '市州', '类型', '账号名称', '链接', '标题', '阅读数/评论数', '内容',]]
|
||||
|
@ -375,16 +401,18 @@ if doWB:
|
|||
|
||||
|
||||
# 查找关键词
|
||||
dfwbd = dfWB[['市州', '账号名称', '标题', '日期', '评论数', '内容']][dfWB['内容'].str.contains(regSTR, regex=True, na=False)]
|
||||
dfwbd = dfWB[['市州', '账号名称', '标题', '日期', '评论数', '内容']][dfWB['内容'].str.contains(regSTRA, regex=True, na=False) & dfWB['内容'].str.contains(regSTRB, regex=True, na=False)]
|
||||
dfwbd['类型'] = '微博'
|
||||
dfwbd['关键词'] = ''
|
||||
dfwbd['上下文'] = ''
|
||||
print("WB Found ", dfwbd.shape)
|
||||
|
||||
# 提取关键词上下文
|
||||
dfwbd = dfwbd.reset_index()
|
||||
iiii = 0
|
||||
for i, r in dfwbd.iterrows():
|
||||
string = str(r['内容'])
|
||||
its = re.finditer(regSTR, string)
|
||||
string = str(dfwbd.loc[iiii, '内容'])
|
||||
its = re.finditer(regSTRAB, string)
|
||||
sk = ''
|
||||
sp = ''
|
||||
for it in its:
|
||||
|
@ -397,8 +425,9 @@ if doWB:
|
|||
e = it.end() + d
|
||||
sk += it.group() + ';'
|
||||
sp += string[s:e] + ';'
|
||||
dfwbd.loc[i, '关键词'] = sk
|
||||
dfwbd.loc[i, '上下文'] = sp
|
||||
dfwbd.loc[iiii, '关键词'] = sk
|
||||
dfwbd.loc[iiii, '上下文'] = sp
|
||||
iiii = iiii + 1
|
||||
|
||||
dfwbd.rename(columns={"评论数": "阅读数/评论数"},inplace=True)
|
||||
dfwbd = dfwbd[['关键词', '上下文', '日期', '市州', '类型', '账号名称', '标题', '阅读数/评论数', '内容',]]
|
||||
|
@ -433,16 +462,18 @@ if doTT:
|
|||
#account date title nread ncomment content url origin city
|
||||
|
||||
# 查找关键词
|
||||
dfttd = dfTT[['city', 'account', 'date', 'title', 'url', 'content', 'nread']][dfTT['content'].str.contains(regSTR, regex=True, na=False)]
|
||||
dfttd = dfTT[['city', 'account', 'date', 'title', 'url', 'content', 'nread']][dfTT['content'].str.contains(regSTRA, regex=True, na=False) & dfTT['content'].str.contains(regSTRB, regex=True, na=False)]
|
||||
dfttd['类型'] = '头条'
|
||||
dfttd['关键词']=''
|
||||
dfttd['上下文']=''
|
||||
print("Found ", dfttd.shape)
|
||||
|
||||
# 提取上下文
|
||||
dfttd = dfttd.reset_index()
|
||||
iiii = 0
|
||||
for i,r in dfttd.iterrows():
|
||||
string = str(r['content'])
|
||||
its = re.finditer(regSTR, string)
|
||||
string = str(dfttd.loc[iiii, 'content'])
|
||||
its = re.finditer(regSTRAB, string)
|
||||
sk = ''
|
||||
sp = ''
|
||||
for it in its:
|
||||
|
@ -456,8 +487,9 @@ if doTT:
|
|||
sk += it.group() + ';'
|
||||
sp += string[s:e] + ';'
|
||||
|
||||
dfttd.loc[i,'关键词'] = sk[:-1]
|
||||
dfttd.loc[i,'上下文'] = sp[:-1]
|
||||
dfttd.loc[iiii,'关键词'] = sk[:-1]
|
||||
dfttd.loc[iiii,'上下文'] = sp[:-1]
|
||||
iiii = iiii + 1
|
||||
|
||||
dfttd.rename(columns={'city': "市州", 'account': "账号名称", 'date': "日期", 'title': "标题", 'url':'链接', 'content': "内容", "nread": "阅读数/评论数"},inplace=True)
|
||||
dfttd = dfttd[['关键词', '上下文', '日期', '市州', '类型', '账号名称', '链接', '标题', '阅读数/评论数', '内容',]]
|
||||
|
|
|
@ -228,7 +228,7 @@ if __name__ == "__main__":
|
|||
#sendMessage(apikey)
|
||||
|
||||
# 逐市州发送月报告
|
||||
sendReportMonthly(apikey, '2023', '1')
|
||||
sendReportMonthly(apikey, '2023', '5')
|
||||
|
||||
# 逐市州发送预警信息
|
||||
#sendForewarning(apikey)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -18,6 +18,22 @@ from docx.shared import Mm
|
|||
import jieba
|
||||
import jieba.posseg as pseg
|
||||
|
||||
|
||||
|
||||
def fetch_chinese(s):
    """Return only the Chinese characters contained in ``s``.

    Every character outside the range U+4E00..U+9FA5 (ASCII letters, digits,
    punctuation, whitespace, full-width symbols, ...) is removed.

    Args:
        s: Text to filter. Non-string values (``None``, ``NaN``, numbers)
            are converted with ``str()`` first instead of raising
            ``TypeError``.

    Returns:
        str: The characters of ``s`` that fall inside U+4E00..U+9FA5, in
        their original order.
    """
    if not isinstance(s, str):
        s = str(s)
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)
|
||||
|
||||
def toDate(strDT):
    """Format a date-like value as ``'MM-DD'``.

    Args:
        strDT: Anything ``pandas.to_datetime`` can parse as a single
            timestamp (string, datetime, ...).

    Returns:
        str: The month and day as ``'%m-%d'``, or an empty string when the
        value cannot be parsed (``errors='coerce'`` turns it into ``NaT``).
    """
    dt = pd.to_datetime(strDT, errors='coerce')
    if pd.isna(dt):
        return ''
    return dt.strftime('%m-%d')
|
||||
|
||||
|
||||
# 画柱状图
|
||||
def drawBar(data, recipe, title='', fn=''):
|
||||
plt.figure(figsize=(6, 4))
|
||||
|
@ -257,9 +273,11 @@ def getTTData(path, cities, hasBody=False):
|
|||
fileAs = os.path.join(path, dirC, dirCT, fn)
|
||||
#print(' ', ttName, fileAs)
|
||||
if len(fileAs) > 0:
|
||||
dfdftt = pd.read_excel(fileAs)
|
||||
dfTTC = dfTTC.append(dfdftt)
|
||||
|
||||
try:
|
||||
dfdftt = pd.read_excel(fileAs)
|
||||
dfTTC = dfTTC.append(dfdftt)
|
||||
except:
|
||||
print("read file failed. ", fileAs)
|
||||
|
||||
#dfdfwb = pd.read_csv(filename, sep=',', header=None, names=cols,
|
||||
# index_col=None)#, engine='python', encoding='gbk'#utf-8
|
||||
|
@ -290,7 +308,7 @@ def fetch_chinese(s):
|
|||
|
||||
if __name__ == "__main__":
|
||||
starttime = datetime.datetime.now()
|
||||
_RATIO = 0.7
|
||||
_RATIO = 0.5
|
||||
isDoWX = True
|
||||
isDoWB = True
|
||||
isDoTT = True
|
||||
|
@ -309,7 +327,7 @@ if __name__ == "__main__":
|
|||
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
|
||||
'省直部门', # 共12市2州1新区
|
||||
]
|
||||
|
||||
'''
|
||||
cities = [
|
||||
'临夏回族自治州',
|
||||
'白银市',
|
||||
|
@ -320,28 +338,29 @@ if __name__ == "__main__":
|
|||
|
||||
#'省直部门', # 共12市2州1新区
|
||||
]
|
||||
'''
|
||||
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
|
||||
#cities = ['白银市']
|
||||
#cities = ['陇南市']
|
||||
# 转发任务
|
||||
#dfTask = pd.read_excel('D:/Projects/POM/DATA/2022年S2/S2/全省政务新媒体二季度转发信息条目.xls')
|
||||
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/2023年2月份全省政务新媒体转发内容条目.xlsx')
|
||||
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年4月/3月报告/2023年3月份全省政务新媒体转发内容条目.xlsx')
|
||||
# 账号信息
|
||||
strFnAccount = 'D:/Projects/POM/DATA/2023年3月/2月报告/全国报送系统表单_2023.2.28.xlsx'
|
||||
strFnAccount = 'D:/Projects/POM/DATA/2023年4月/3月报告/全国报送系统表单_2023.3.31.xlsx'
|
||||
dfAllAccount = pd.read_excel(strFnAccount)
|
||||
# 省直部门账号部门简称
|
||||
dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/省直部门账号名称简称.xlsx')
|
||||
fnTemplate = 'D:/Projects/POM/DATA/2023年3月/2月报告/POM_ForewardTemplate.docx'
|
||||
dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年4月/3月报告/省直部门账号名称简称.xlsx')
|
||||
fnTemplate = 'D:/Projects/POM/DATA/2023年4月/3月报告/POM_ForewardTemplate.docx'
|
||||
|
||||
# 数据根目录,
|
||||
strPath = ['D:/Projects/POM/DATA/2023年3月/2月报告/']
|
||||
strOutputPath = 'D:/Projects/POM/DATA/2023年3月/2月报告/转发/'
|
||||
strPath = ['D:/Projects/POM/DATA/2023年4月/3月报告/']
|
||||
strOutputPath = 'D:/Projects/POM/DATA/2023年4月/3月报告/转发/'
|
||||
|
||||
context = {
|
||||
"year": "2023",
|
||||
"month": "2",
|
||||
"pubMonth": "3",
|
||||
"dateStart": "2023年2月1日",
|
||||
"dateEnd": "2023年2月28日"
|
||||
"month": "3",
|
||||
"pubMonth": "4",
|
||||
"dateStart": "2023年3月1日",
|
||||
"dateEnd": "2023年3月31日"
|
||||
}
|
||||
|
||||
dfAllAccount.loc[:, '转发数'] = 0
|
||||
|
@ -409,10 +428,11 @@ if __name__ == "__main__":
|
|||
for j in range(dataA.shape[0]):
|
||||
str1 = str(dataA.iloc[j, dataA.columns.get_loc('title')]) # 文章标题
|
||||
#
|
||||
if len(rt) > len(str1):
|
||||
if len(rt) > len(str1): # 任务标题过长,截取前半部分进行对比
|
||||
strRT = rt[:len(str1)]
|
||||
else:
|
||||
else: #文章标题过长,只比较任务标题长度部分
|
||||
strRT = rt
|
||||
str1 = str1[:len(rt)]
|
||||
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
||||
if ratio > _RATIO:
|
||||
forwarded += 1
|
||||
|
@ -484,12 +504,12 @@ if __name__ == "__main__":
|
|||
# 查看该账号的所有文章
|
||||
for j in range(dataA.shape[0]):
|
||||
str1 = str(dataA.iloc[j, dataA.columns.get_loc('标题')])
|
||||
|
||||
#
|
||||
if len(rt) > len(str1):
|
||||
if len(rt) > len(str1):# 任务标题过长,截取前半部分进行对比
|
||||
strRT = rt[:len(str1)]
|
||||
else:
|
||||
else:#文章标题过长,只比较任务标题长度部分
|
||||
strRT = rt
|
||||
str1 = str1[:len(rt)]
|
||||
|
||||
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,965 @@
|
|||
import datetime
|
||||
import csv
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import glob, os, re, time
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.ticker import FuncFormatter
|
||||
from difflib import SequenceMatcher
|
||||
from collections import Counter
|
||||
import difflib
|
||||
|
||||
|
||||
from docxtpl import DocxTemplate
|
||||
from docxtpl import InlineImage
|
||||
from docx.shared import Mm
|
||||
|
||||
import jieba
|
||||
import jieba.posseg as pseg
|
||||
|
||||
#---
|
||||
#那我们的目标就是将字段列名的日期数据替换成标准的日期格式,具体的思路是:
|
||||
#1、先用excel实验2018-11-02对应的日期时间戳是43406。
|
||||
#2、我再用2018-11-02减43406看看是从那一年开始计算的,所以得出结论是1899-12-30。
|
||||
#3、那最后要达成目标就只需要时间戳+1899-12-30就等于对应的当前日
|
||||
def ts2date(dates, sf='%Y-%m-%d'):
    """Convert an Excel serial date number into a formatted date string.

    The serial number is interpreted as a day count starting at 1899-12-30,
    so for example 43406 maps to 2018-11-02.

    Args:
        dates: Day count since 1899-12-30. May be an ``int``, ``float`` or a
            numpy scalar; a fractional part is the time of day.
        sf: ``strftime`` format used for the result.

    Returns:
        str: The date formatted with ``sf``.
    """
    # datetime.timedelta rejects numpy integer scalars (common when the value
    # comes out of a pandas column), so normalise to a plain float first.
    offset = datetime.timedelta(days=float(dates))
    return (datetime.datetime(1899, 12, 30) + offset).strftime(sf)
|
||||
#---
|
||||
|
||||
def fetch_chinese(s):
    """Return only the Chinese characters contained in ``s``.

    Every character outside the range U+4E00..U+9FA5 (ASCII letters, digits,
    punctuation, whitespace, full-width symbols, ...) is removed.

    Args:
        s: Text to filter. Non-string values (``None``, ``NaN``, numbers)
            are converted with ``str()`` first instead of raising
            ``TypeError``.

    Returns:
        str: The characters of ``s`` that fall inside U+4E00..U+9FA5, in
        their original order.
    """
    if not isinstance(s, str):
        s = str(s)
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)
|
||||
|
||||
# 画柱状图
|
||||
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratios and save it as an image file.

    The y axis is fixed to 0..1 and labelled as percentages.

    Args:
        data: Bar heights (ratios).
        recipe: Category labels, one per bar; also used as x tick labels.
        title: Chart title.
        fn: Output path handed to ``plt.savefig``.
    """
    def to_percent(temp, position):
        # Tick formatter: 0.25 -> '25%'.
        return '%2.0f' % (100 * temp) + '%'

    plt.figure(figsize=(6, 4))
    try:
        # SimHei so that Chinese labels render; keep the minus sign drawable.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False

        plt.bar(recipe, data, width=0.5)
        plt.xticks(recipe, recipe, rotation=35)
        plt.ylim((0, 1))
        plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
        plt.title(title, fontsize=16)
        plt.tight_layout()
        plt.savefig(fn)
    finally:
        # Always release the figure, even when drawing or saving fails,
        # so repeated calls do not accumulate open figures.
        plt.close()
|
||||
|
||||
def getWBData(path, cities, hasBody=False):
    """Read the per-account Weibo CSV exports below ``path`` into one frame.

    Directory layout walked by this function::

        path/<city dir>/<period dir>/<account name>/<account id>.csv

    City and period directories whose name starts with '.' or contains
    'weixin' or 'tt' (case-insensitive) are skipped, as are city directories
    whose name is not a known abbreviation or whose city is not in
    ``cities``. In every account directory the first non-hidden ``.csv``
    file is read; its first row (the header written by the download tool)
    is dropped.

    Args:
        path: Root data directory.
        cities: Full city names to load (e.g. '陇南市').
        hasBody: Unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame: The 13 download-tool columns plus 'weiboID'
        (CSV file name without extension), 'weiboName' (account directory
        name) and '市州' (full city name). Original row indices are kept.
    """
    # Directory name -> full city name.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市', 'LN': '陇南市', 'JYG': '嘉峪关市',
        'TS': '天水市', 'GN': '甘南藏族自治州', 'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',

        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
        'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',

        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # Column layout written by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    def _stack(parts, baseCols):
        # Concatenate frames once (DataFrame.append no longer exists in
        # pandas 2.x); baseCols come first, missing ones are added as NaN.
        if not parts:
            return pd.DataFrame(columns=baseCols)
        merged = pd.concat(parts)
        extra = [c for c in merged.columns if c not in baseCols]
        return merged.reindex(columns=list(baseCols) + extra)

    def _skipDir(name, parent):
        # Hidden entries, plain files and WeChat/Toutiao folders are not Weibo data.
        if name[:1] == '.':
            return True
        if not os.path.isdir(os.path.join(parent, name)):
            return True
        lowered = name.lower()
        return 'weixin' in lowered or 'tt' in lowered

    allFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if _skipDir(dirC, path):
            continue
        # .get(): an unrelated folder in the data root must not abort the run.
        cityName = cityShorten.get(dirC)
        if cityName is None or cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityDir = os.path.join(path, dirC)
        cityFrames = []
        for dirCT in os.listdir(cityDir):
            # Period folders such as weibo, weibo_1.
            if _skipDir(dirCT, cityDir):
                continue
            print(' read WB... dir:', dirCT)
            periodDir = os.path.join(cityDir, dirCT)
            for dirA in os.listdir(periodDir):
                if dirA[:1] == '.':
                    continue
                accountDir = os.path.join(periodDir, dirA)
                if not os.path.isdir(accountDir):
                    continue
                wbName = dirA  # account name == directory name
                fileAs = os.listdir(accountDir)
                # Read the same file the ID is taken from, ignoring hidden
                # files (e.g. '._123.csv').
                csvFiles = [f for f in fileAs
                            if f[:1] != '.' and os.path.splitext(f)[-1] == '.csv']
                if csvFiles:
                    wbId = csvFiles[0][:-4]
                    dfdfwb = pd.read_csv(os.path.join(accountDir, csvFiles[0]), sep=',',
                                         header=None, names=cols, index_col=None)
                    dfdfwb = dfdfwb[1:]  # first row is the file's own header
                    dfdfwb["weiboID"] = wbId
                    dfdfwb["weiboName"] = wbName
                    cityFrames.append(dfdfwb)

                if len(fileAs) > 1:
                    print(" +=+= ", fileAs)

        dfWBC = _stack(cityFrames, cols)
        print(' ', dfWBC.shape)
        dfWBC['市州'] = cityName
        if not dfWBC.empty:
            allFrames.append(dfWBC)

    dfWB = _stack(allFrames, cs)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
|
||||
|
||||
# 从数据目录中读取xlsx文件,拼接到一起
|
||||
def getWXData(path, cities, hasBody=False):
    """Read the WeChat Excel exports below ``path`` into one frame.

    Every sub-directory of ``path`` is a city batch named by a city
    abbreviation. Inside a batch, Excel workbooks (``.xlsx`` / ``.xls``) are
    read from two places: directly in the batch directory, and inside any
    sub-directory whose name contains 'weixin' (case-insensitive). Batches
    whose name is not a known abbreviation, or whose city is not in
    ``cities``, are skipped entirely.

    Args:
        path: Root data directory.
        cities: Full city names to load (e.g. '陇南市').
        hasBody: Unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame: All workbook rows with '市州' set to the full city
        name. The columns '公众号', '链接', '日期', '标题', '内容', '头条',
        '市州', '阅读数' always exist and come first. Original row indices
        are kept.
    """
    # Directory name -> full city name.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市', 'LN': '陇南市', 'JYG': '嘉峪关市',
        'TS': '天水市', 'GN': '甘南藏族自治州', 'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
        'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']

    def _isExcel(name):
        return os.path.splitext(name)[-1] in ('.xlsx', '.xls')

    frames = []
    countC = 0    # batches that contributed at least one workbook
    countFnC = 0  # workbooks read in total

    def _load(filePath, cityname, dirBatch, fName):
        # Read one workbook, tag it with its city and queue it for merging.
        dfdfwxc = pd.read_excel(filePath)
        dfdfwxc['市州'] = cityname
        print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
        frames.append(dfdfwxc)

    # One directory per monitoring batch / city.
    for dirBatch in os.listdir(path):
        batchDir = os.path.join(path, dirBatch)
        if not os.path.isdir(batchDir):
            continue
        # .get(): an unrelated folder in the data root must not abort the run.
        cityname = cityShorten.get(dirBatch)
        if cityname is None or cityname not in cities:
            continue

        count = 0
        for fileC in os.listdir(batchDir):
            if fileC[:1] == '.':
                continue
            entry = os.path.join(batchDir, fileC)
            fName = os.path.splitext(fileC)[0]
            # Workbooks stored one level down in a 'weixin*' folder.
            if os.path.isdir(entry) and 'weixin' in fileC.lower():
                print(' ', entry)
                for f in os.listdir(entry):
                    if _isExcel(f):
                        _load(os.path.join(entry, f), cityname, dirBatch, fName)
                        count = count + 1
            # Workbooks stored directly in the batch directory.
            if not _isExcel(fileC):
                continue
            _load(entry, cityname, dirBatch, fName)
            count = count + 1

        countFnC += count
        if count > 0:
            countC += 1

    # Single concat instead of repeated DataFrame.append (removed in pandas 2.x).
    if frames:
        dfWX = pd.concat(frames)
        extra = [c for c in dfWX.columns if c not in cols]
        dfWX = dfWX.reindex(columns=cols + extra)
    else:
        dfWX = pd.DataFrame(columns=cols)

    print(" Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
|
||||
|
||||
# 从数据目录中读取xlsx文件,拼接到一起
|
||||
def getTTData(path, cities, hasBody=False):
    """Read the Toutiao Excel exports below ``path`` into one frame.

    Directory layout walked by this function::

        path/<city dir>/<dir whose name contains 'tt'>/<file>.xlsx

    City and period directories whose name starts with '.' or contains
    'weixin' or 'weibo' (case-insensitive) are skipped, as are city
    directories whose name is not a known abbreviation or whose city is not
    in ``cities``. A workbook that cannot be read is reported on stdout and
    skipped; it does not abort the run.

    Args:
        path: Root data directory.
        cities: Full city names to load (e.g. '陇南市').
        hasBody: Unused; kept for interface compatibility.

    Returns:
        pandas.DataFrame: All workbook rows with 'city' set to the full city
        name. The columns 'account', 'date', 'title', 'nread', 'ncomment',
        'content', 'url', 'origin', 'city' always exist and come first.
        Original row indices are kept.
    """
    # Directory name -> full city name.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市', 'LN': '陇南市', 'JYG': '嘉峪关市',
        'TS': '天水市', 'GN': '甘南藏族自治州', 'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',

        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
        'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',

        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # account date title nread ncomment content url origin city
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    def _stack(parts, baseCols):
        # Concatenate frames once (DataFrame.append no longer exists in
        # pandas 2.x); baseCols come first, missing ones are added as NaN.
        if not parts:
            return pd.DataFrame(columns=baseCols)
        merged = pd.concat(parts)
        extra = [c for c in merged.columns if c not in baseCols]
        return merged.reindex(columns=list(baseCols) + extra)

    def _skipDir(name, parent):
        # Hidden entries, plain files and WeChat/Weibo folders are not Toutiao data.
        if name[:1] == '.':
            return True
        if not os.path.isdir(os.path.join(parent, name)):
            return True
        lowered = name.lower()
        return 'weixin' in lowered or 'weibo' in lowered

    allFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if _skipDir(dirC, path):
            continue
        # .get(): an unrelated folder in the data root must not abort the run.
        cityName = cityShorten.get(dirC)
        if cityName is None or cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityDir = os.path.join(path, dirC)
        cityFrames = []
        for dirCT in os.listdir(cityDir):
            if _skipDir(dirCT, cityDir):
                continue
            if 'tt' not in dirCT.lower():
                continue
            print(' read TT... dir:', dirCT)
            ttDir = os.path.join(cityDir, dirCT)
            for fn in os.listdir(ttDir):
                if fn[:1] == '.':
                    continue
                if not fn[-5:] == '.xlsx':
                    continue
                fileAs = os.path.join(ttDir, fn)
                # Best-effort read: report the broken workbook and carry on.
                # Only the read is guarded, so programming errors are no
                # longer silently reported as "read file failed".
                try:
                    dfdftt = pd.read_excel(fileAs)
                except Exception as err:
                    print("read file failed. ", fileAs, err)
                    continue
                cityFrames.append(dfdftt)

        dfTTC = _stack(cityFrames, cs)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = cityName
        if not dfTTC.empty:
            allFrames.append(dfTTC)

    dfTT = _stack(allFrames, cs)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
|
||||
|
||||
|
||||
def fetch_chinese(s):
    """Return only the Chinese characters contained in ``s``.

    Every character outside the range U+4E00..U+9FA5 (ASCII letters, digits,
    punctuation, whitespace, full-width symbols, ...) is removed.

    Args:
        s: Text to filter. Non-string values (``None``, ``NaN``, numbers)
            are converted with ``str()`` first instead of raising
            ``TypeError``.

    Returns:
        str: The characters of ``s`` that fall inside U+4E00..U+9FA5, in
        their original order.
    """
    if not isinstance(s, str):
        s = str(s)
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)
|
||||
|
||||
def _exportWeiboByAccount(dfPosts, dfAccount, outDir, columns):
    """Write one CSV per registered Weibo account found in ``dfPosts``.

    For every distinct ``user_id`` in ``dfPosts`` that matches an account ID
    in ``dfAccount``, the account's posts are written to
    ``<outDir><account name>/<user_id>.csv``.

    Args:
        dfPosts: Posts with a 'user_id' column and the columns in ``columns``.
        dfAccount: Account registry with '微信biz/oid/账号ID' and '账号名称'.
        outDir: Output directory prefix (expected to end with '/').
        columns: Columns to export, in order.

    Returns:
        tuple[int, int]: (user ids found in the registry, user ids not found).
    """
    found = 0
    missing = 0
    for uid in dfPosts['user_id'].unique():
        matched = dfAccount[dfAccount['微信biz/oid/账号ID'] == uid].reset_index()
        if matched.shape[0] == 0:
            missing += 1
            continue
        found += 1
        sA = str(matched.loc[0, '账号名称'])
        # makedirs: tolerate re-runs and a missing parent directory.
        os.makedirs(outDir + sA, exist_ok=True)
        dfwba = dfPosts.loc[dfPosts['user_id'] == uid][columns]
        # reset_index() keeps the old index as a leading 'index' column in the CSV.
        dfwba = dfwba.reset_index()
        dfwba.to_csv(outDir + sA + '/' + str(uid) + '.csv', encoding='utf_8_sig', index=0, quoting=1)
    return found, missing


def doWBData():
    """Split two bulk Weibo CSV dumps into per-account CSV files.

    Reads the account registry workbook and the dumps ``weibo1.csv`` and
    ``weibo2.csv`` from hard-coded paths, renames the dump columns to the
    Chinese download-tool column names, and writes one CSV per registered
    account below ``全文/LN/weibo_3/`` and ``全文/LN/weibo_4/`` respectively.
    Prints how many user ids were / were not found in the registry.
    """
    baseDir = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/'

    dfAccount = pd.read_excel(baseDir + '全国报送系统表单_2023.6.30.xlsx')
    # .copy(): the frame is modified below, so detach it from the original.
    dfAccount = dfAccount[dfAccount['账号类型'] == '新浪微博'].copy()
    dfAccount['微信biz/oid/账号ID'] = dfAccount['微信biz/oid/账号ID'].astype('int64')

    # ---- dump 1 -> weibo_3 ----
    dfwb1 = pd.read_csv(baseDir + 'weibo_2/weibo1.csv', sep=',', index_col=None)
    dfwb1 = dfwb1.fillna(0)
    dfwb1['user_id'] = dfwb1['user_id'].astype('int64')
    dfwb1.rename(columns={'id': '微博id', 'content': '微博正文', 'article_url': '头条文章url', 'original_pictures': '原始图片url',
                          'retweet_pictures': '被转发微博原始图片url', 'original': '是否为原创微博', 'video_url': '微博视频url',
                          'publish_place': '发布位置', 'publish_time': '发布时间', 'publish_tool': '发布工具',
                          'up_num': '点赞数', 'retweet_num': '转发数', 'comment_num': '评论数'}, inplace=True)

    print(dfAccount.shape)
    print(dfwb1.shape, dfwb1.dtypes)

    # NOTE(review): '微博视频url' is listed twice here, so this export has one
    # more column than the weibo_4 export below. It looks accidental, but it
    # is kept as-is because it changes the file layout - confirm before removing.
    columns1 = ['微博id', '微博正文', '头条文章url', '原始图片url',
                '被转发微博原始图片url', '是否为原创微博',
                '微博视频url', '微博视频url', '发布位置', '发布时间', '发布工具',
                '点赞数', '转发数', '评论数']
    i, j = _exportWeiboByAccount(dfwb1, dfAccount, baseDir + '全文/LN/weibo_3/', columns1)
    print('found ', i, '; nofound', j)

    # ---- dump 2 -> weibo_4 ----
    dfwb2 = pd.read_csv(baseDir + 'weibo_2/weibo2.csv', sep=',', index_col=None)
    dfwb2 = dfwb2.fillna(0)
    dfwb2['user_id'] = dfwb2['user_id'].astype('int64')
    dfwb2.rename(columns={'id': '微博id', 'text': '微博正文', 'article_url': '头条文章url', 'pics': '原始图片url',
                          'topics': '被转发微博原始图片url', 'source': '是否为原创微博', 'video_url': '微博视频url',
                          'location': '发布位置', 'created_at': '发布时间', 'bid': '发布工具',
                          'attitudes_count': '点赞数', 'reposts_count': '转发数', 'comments_count': '评论数'}, inplace=True)
    print(dfwb2.shape)

    columns2 = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博',
                '微博视频url', '发布位置', '发布时间', '发布工具',
                '点赞数', '转发数', '评论数']
    i, j = _exportWeiboByAccount(dfwb2, dfAccount, baseDir + '全文/LN/weibo_4/', columns2)
    print('found ', i, '; nofound', j)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
#doWBData()
|
||||
#exit(0)
|
||||
starttime = datetime.datetime.now()
|
||||
_RATIO = 0.5
|
||||
isDoWX = True
|
||||
isDoWB = True
|
||||
isDoTT = True
|
||||
cities = [
|
||||
'临夏回族自治州',
|
||||
'白银市',
|
||||
'定西市',
|
||||
'酒泉市',
|
||||
'嘉峪关市',
|
||||
'平凉市',
|
||||
'庆阳市',
|
||||
'天水市',
|
||||
'武威市',
|
||||
'兰州新区',
|
||||
'陇南市',
|
||||
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
|
||||
'省直部门', # 共12市2州1新区
|
||||
]
|
||||
'''
|
||||
cities = [
|
||||
'临夏回族自治州',
|
||||
'白银市',
|
||||
'定西市',
|
||||
'酒泉市',
|
||||
'天水市',
|
||||
'陇南市',
|
||||
|
||||
#'省直部门', # 共12市2州1新区
|
||||
]
|
||||
'''
|
||||
cities = ['陇南市',]
|
||||
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
|
||||
#cities = ['陇南市']
|
||||
# 转发任务
|
||||
#dfTask = pd.read_excel('D:/Projects/POM/DATA/2022年S2/S2/全省政务新媒体二季度转发信息条目.xls')
|
||||
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/陇南7月上旬转发台账.xlsx')
|
||||
sTaskTitle = '标题'
|
||||
sTaskDate = '推送时间'
|
||||
# 删除标题列为空的行
|
||||
dfTask.dropna(axis=0,subset = ["标题"])
|
||||
yT0 = dfTask.columns.get_loc('序号')
|
||||
yT1 = dfTask.columns.get_loc('标题')
|
||||
|
||||
# 账号信息
|
||||
strFnAccount = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全国报送系统表单_2023.6.30.xlsx'
|
||||
dfAllAccount = pd.read_excel(strFnAccount)
|
||||
# 添加列
|
||||
dfAllAccount.loc[:, '转发数'] = 0
|
||||
#dfAllAccount.loc[:, '阅读数'] = 0
|
||||
dfAllAccount = pd.concat([dfAllAccount, pd.DataFrame(np.zeros((dfAllAccount.shape[0], dfTask.shape[0])), columns=dfTask['序号'].astype(str).tolist())], axis=1)
|
||||
# 整理数据
|
||||
dfAllAccount['市/省局'] = dfAllAccount['市/省局'].fillna('省直部门')
|
||||
dfAllAccount['区县/地方部门'] = dfAllAccount['区县/地方部门'].fillna('市直部门')
|
||||
dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['临夏回族自治州', '甘南藏族自治州'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
|
||||
dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['省直部门'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '省直部门'
|
||||
# 过长名称替换为简称,便于绘图
|
||||
dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
|
||||
dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'
|
||||
yAccountName = dfAllAccount.columns.get_loc('账号名称')
|
||||
yAccountCity = dfAllAccount.columns.get_loc('市/省局')
|
||||
yAccountCounty = dfAllAccount.columns.get_loc('区县/地方部门')
|
||||
yAccountUnit = dfAllAccount.columns.get_loc('单位全称')
|
||||
|
||||
# 省直部门账号部门简称
|
||||
fnTemplate = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/POM_ForewardTemplate.docx'
|
||||
|
||||
# 数据根目录,
|
||||
strPath = ['D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全文/',
|
||||
]
|
||||
strOutputPath = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/转发/'
|
||||
|
||||
context = {
|
||||
"year": "2023",
|
||||
"month": "7",
|
||||
"pubMonth": "7",
|
||||
"dateStart": "2023年7月1日",
|
||||
"dateEnd": "2023年7月10日"
|
||||
}
|
||||
|
||||
################################################
|
||||
# 创建存储矩阵
|
||||
# 按照转发任务创建统计矩阵
|
||||
colRR = ['市州', '类型', '账号名称', '单位名称', '省直部门', '区县', '转发数', '阅读数']
|
||||
for ididid in dfTask['序号'][0:dfTask['标题'].count()].tolist():
|
||||
#for ididid in range(1, dfTask['标题'].count()):
|
||||
colRR.append(str(ididid))
|
||||
# 用于保存每一条转发任务的账号和文章
|
||||
dfO = pd.DataFrame(columns=['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州'] )
|
||||
|
||||
|
||||
|
||||
# WX
|
||||
if isDoWX:
|
||||
print('=============================================================')
|
||||
print('---- WX ----')
|
||||
dfWX = pd.DataFrame()
|
||||
for strP in strPath:
|
||||
ddff = getWXData(strP, cities)
|
||||
dfWX = dfWX.append(ddff)
|
||||
|
||||
dfWX = dfWX.fillna(value=0)
|
||||
yWXtitle = dfWX.columns.get_loc('标题')
|
||||
yWXnread = dfWX.columns.get_loc('阅读数')
|
||||
yWXdate = dfWX.columns.get_loc('日期')
|
||||
yWXurl = dfWX.columns.get_loc('链接')
|
||||
|
||||
# 公众号 链接 日期 标题 内容 头条 city
|
||||
## 逐个市州统计每个账号的转发情况
|
||||
#cities = dfWX['市州'].unique()
|
||||
for city in cities:
|
||||
print('---- WX title match', city, ' ----' )
|
||||
# 本市微信数据
|
||||
dataC = dfWX.loc[dfWX['市州'] == city].copy()
|
||||
# 获取微信账号数
|
||||
accounts = dataC['公众号'].unique()
|
||||
|
||||
# 所有微信账号数
|
||||
maskCWX = ( (dfAllAccount['账号类型'] == '微信服务号')|(dfAllAccount['账号类型'] == '微信订阅号') ) & (dfAllAccount['市/省局'] == city)
|
||||
accountNumCWX = maskCWX.tolist().count(True)
|
||||
|
||||
# 按获取得微信账号遍历
|
||||
for account in accounts:
|
||||
#print(account)
|
||||
# 该账号的所有文章
|
||||
dataA = dataC.loc[dataC['公众号'] == account].copy() # 一个公众号的所有文章
|
||||
sR = pd.Series(dtype='object')
|
||||
sR['类型'] = '微信'
|
||||
sR['市州'] = city
|
||||
sR['账号名称'] = account
|
||||
count = 0
|
||||
arn = 0
|
||||
|
||||
# 从账号信息中匹配该账号详细信息
|
||||
mask = ( (dfAllAccount['账号类型'] == '小程序+微信')
|
||||
| (dfAllAccount['账号类型'] == '微信服务号')
|
||||
| (dfAllAccount['账号类型'] == '微信订阅号') ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
||||
if mask.any():
|
||||
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
||||
if sxq.lower() !='nan':
|
||||
sR['区县'] = sxq
|
||||
sdwmc = str(dfAllAccount.loc[mask, '单位全称'].values[0])
|
||||
if sdwmc.lower() != 'nan':
|
||||
sR['单位名称'] = sdwmc
|
||||
else:
|
||||
print(' !!!! 微信', account, '在', city, '无详细信息' )
|
||||
continue
|
||||
|
||||
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
||||
for i in range(dfTask['标题'].count()):
|
||||
# 对于每一篇任务文章
|
||||
rn = dfTask.iloc[i, yT0] # 序号
|
||||
ssrt = str(dfTask.iloc[i, yT1]) # 标题/内容
|
||||
rt = fetch_chinese(ssrt) # 只取汉字
|
||||
forwarded = 0 # 转发数
|
||||
readNum = 0 # 阅读数
|
||||
# 查看该账号的所有文章
|
||||
for j in range(dataA.shape[0]):
|
||||
str1 = fetch_chinese(str(dataA.iloc[j, yWXtitle])) # 只取汉字
|
||||
|
||||
# 任务标题过长,截取前半部分进行对比
|
||||
if len(rt) > len(str1):
|
||||
strRT = rt[:len(str1)]
|
||||
else:#文章标题过长,只比较任务标题长度部分
|
||||
strRT = rt
|
||||
str1 = str1[:len(rt)]
|
||||
|
||||
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
||||
|
||||
# 遇到相似的,认为已转发,即跳出不再查找
|
||||
if ratio > _RATIO:
|
||||
forwarded += 1
|
||||
readNum += int(dataA.iloc[j, yWXnread])
|
||||
if forwarded > 0:
|
||||
break
|
||||
sR[str(rn)] = forwarded # 记录该篇文章的转发数
|
||||
|
||||
count += forwarded # 累加该篇文章的转发数
|
||||
arn += readNum # 累加该篇文章的阅读数
|
||||
|
||||
# 记录该篇任务转发情况加入
|
||||
if forwarded > 0:
|
||||
dfO = dfO.append([{'任务序号': rn, '任务名称': ssrt,
|
||||
'类型': '微信',
|
||||
'公众号': account,
|
||||
'日期': dataA.iloc[j, yWXdate],
|
||||
'内容': str1,
|
||||
'链接': dataA.iloc[j, yWXurl],
|
||||
'市州': city,
|
||||
'阅读数': readNum,
|
||||
}], ignore_index=True)
|
||||
#记录该任务的转发情况
|
||||
dfAllAccount.loc[mask, str(rn)] = forwarded
|
||||
#记录该账号的总转发数
|
||||
dfAllAccount.loc[mask, '转发数'] = count
|
||||
sR['转发数'] = count
|
||||
sR['阅读数'] = arn
|
||||
# 全市总转发文章篇数
|
||||
ccwx = dfAllAccount.loc[maskCWX, '转发数'].sum()
|
||||
# 全市总转发率
|
||||
rcc = ccwx/accountNumCWX/dfTask.shape[0]
|
||||
print(' ', city, '共有', accountNumCWX, '个微信号,获取数据', len(accounts), '个。共转发', ccwx, '次,转发率{:.1f}%'.format(rcc*100) )
|
||||
#countWxForewards = dfRR.shape[0]
|
||||
#print(' 获取 WX 账号数', len(dfWX['公众号'].unique()),'参与转发账号数', countWxForewards)
|
||||
|
||||
# WB
|
||||
if isDoWB:
|
||||
print('=============================================================')
|
||||
print('---- WB data read ----')
|
||||
#获取微博数据
|
||||
dfWB = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/微博全文.xlsx')
|
||||
#for strP in strPath:
|
||||
## ddff = getWBData(strP, cities)
|
||||
# dfWB = dfWB.append(ddff)
|
||||
print('----', dfWB.shape)
|
||||
#===========================================================================================
|
||||
|
||||
#===========================================================================================
|
||||
yWBcontent = dfWB.columns.get_loc('微博正文')
|
||||
yWBdate = dfWB.columns.get_loc('date')
|
||||
yWBurl = dfWB.columns.get_loc('头条文章url')
|
||||
#dfWB.to_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/转发/微博全文.xlsx')
|
||||
################################################
|
||||
# WB
|
||||
# 微博id 微博正文 头条文章url 原始图片url 被转发微博原始图片url 是否为原创微博 微博视频url 发布位置 date
|
||||
# 发布工具 点赞数 转发数 评论数 weiboID weiboName city
|
||||
#cities = dfWB['市州'].unique()
|
||||
for city in cities:
|
||||
print('---- WB match', city, ' ----' )
|
||||
# 本市微博数据
|
||||
dataC = dfWB.loc[dfWB['市州'] == city].copy()
|
||||
# 获取数据的微博账号
|
||||
accounts = dataC['weiboName'].unique()
|
||||
|
||||
# 本市所有微博账号
|
||||
maskCWB = (dfAllAccount['账号类型'] == '新浪微博') & (dfAllAccount['市/省局'] == city)
|
||||
accountNumCWB = maskCWB.tolist().count(True)
|
||||
|
||||
# 按获取的微博账号遍历
|
||||
for account in accounts:
|
||||
# print(account)
|
||||
# 该公众号的所有文章
|
||||
dataA = dataC.loc[dataC['weiboName'] == account].copy()
|
||||
sR = pd.Series(dtype='object')
|
||||
sR['类型'] = '新浪微博'
|
||||
sR['市州'] = city
|
||||
sR['账号名称'] = account
|
||||
count = 0
|
||||
|
||||
# 为转发账号匹配单位全称和所属县区
|
||||
mask = ( dfAllAccount['账号类型'] == '新浪微博' ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
||||
if mask.any():
|
||||
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
||||
if sxq.lower() !='nan':
|
||||
sR['区县'] = sxq
|
||||
sdwmc = str(dfAllAccount.loc[mask, '单位全称'].values[0])
|
||||
if sdwmc.lower() != 'nan':
|
||||
sR['单位名称'] = sdwmc
|
||||
else:
|
||||
print(' !!!! 微博', account, '在', city, '无详细信息' )
|
||||
continue
|
||||
|
||||
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
||||
for i in range(dfTask['标题'].count()):
|
||||
rn = dfTask.iloc[i, yT0] # 任务序号
|
||||
ssrt = str(dfTask.iloc[i, yT1]) # 任务标题
|
||||
rt = fetch_chinese(ssrt) # 只取中文
|
||||
forwarded = 0
|
||||
# 对该账号的所有文章
|
||||
for j in range(dataA.shape[0]):
|
||||
str0 = str(dataA.iloc[j, yWBcontent])
|
||||
str1 = fetch_chinese(str0)
|
||||
str2 = str1[:len(rt)] # 取任务标题相同汉字数进行比较
|
||||
|
||||
ratio = difflib.SequenceMatcher(None, rt, str2).quick_ratio()
|
||||
|
||||
if ratio > _RATIO:
|
||||
forwarded += 1
|
||||
if forwarded > 0:
|
||||
break
|
||||
#记记录该任务的转发情况
|
||||
dfAllAccount.loc[mask, str(rn)] = forwarded
|
||||
sR[str(rn)] = forwarded
|
||||
# 转发数累加到本账号里
|
||||
count += forwarded
|
||||
|
||||
# 记录该篇任务转发情况加入
|
||||
if forwarded > 0:
|
||||
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
||||
'类型': '新浪微博',
|
||||
'公众号': account,
|
||||
'日期': dataA.iloc[j, yWBdate],
|
||||
'内容': str1,
|
||||
'链接': dataA.iloc[j, yWBurl],
|
||||
'市州': city,
|
||||
}], ignore_index=True)
|
||||
# 记录该账号的总转发数
|
||||
dfAllAccount.loc[mask, '转发数'] = count
|
||||
sR['转发数'] = count
|
||||
|
||||
# 全市总转发文章篇数
|
||||
ccwb = dfAllAccount.loc[maskCWB, '转发数'].sum()
|
||||
# 全市总转发率
|
||||
rcc = ccwb/accountNumCWB/dfTask.shape[0]
|
||||
print(' ', city, '共有', accountNumCWB, '个微博号,获取数据', len(accounts), '个。共转发', ccwb, '次,转发率{:.1f}%'.format(rcc*100) )
|
||||
|
||||
#countWbForewards = dfRR.shape[0] - countWxForewards
|
||||
#print(' 获取 WB 账号数', len(dfWB['weiboName'].unique()), '参与转发账号数', countWbForewards)
|
||||
|
||||
|
||||
# TT (Toutiao / 今日头条): match every monitored account's articles against
# the task titles and record which tasks each account forwarded.
if isDoTT:
    print('=============================================================')
    print('---- TT data read ----')
    # Column layout noted by the original author:
    #   id userId source city tid cellType title
    #   time-stamp date url commentCount readNum likeNum showNum
    # The code below additionally relies on an 'account' column.

    # Load the Toutiao data from every input path and stack it once.
    # (DataFrame.append was removed in pandas 2.0; pd.concat works everywhere.)
    tt_frames = [getTTData(strP, cities) for strP in strPath]
    dfTT = pd.concat(tt_frames) if tt_frames else pd.DataFrame()

    yTTtitle = dfTT.columns.get_loc('title')
    yTTdate = dfTT.columns.get_loc('date')
    yTTurl = dfTT.columns.get_loc('url')

    # Task number and Chinese-only task title; identical for every account,
    # so it is normalised once instead of once per account.
    # NOTE(review): assumes fetch_chinese is a pure function - confirm.
    tt_tasks = [(dfTask.iloc[i, yT0], fetch_chinese(str(dfTask.iloc[i, yT1])))
                for i in range(dfTask['标题'].count())]

    # Matched articles are collected here and added to dfO in one go.
    tt_rows = []

    # Per-city statistics.
    for city in cities:
        print("++++++++++++++++++++++++++++++++++++++++++++++++++")
        print('---- TT title match', city, ' ----')
        # Toutiao articles of this city and the accounts that produced them.
        dataC = dfTT.loc[dfTT['city'] == city].copy()
        accounts = dataC['account'].unique()

        # All registered Toutiao accounts of this city.
        maskCTT = (dfAllAccount['账号类型'] == '今日头条') & (dfAllAccount['市/省局'] == city)
        accountNumCTT = int(maskCTT.sum())

        for account in accounts:
            # Rows of the account register describing this account.
            mask = maskCTT & (dfAllAccount['账号名称'] == account)
            if not mask.any():
                print(' !!!! 头条', account, '在', city, '无详细信息')
                continue

            # All articles of this account, titles reduced to Chinese text once.
            dataA = dataC[dataC['account'] == account]
            titles = [fetch_chinese(str(t)) for t in dataA.iloc[:, yTTtitle]]

            count = 0
            for rn, rt in tt_tasks:
                forwarded = 0
                hit = None
                matched_text = ''
                for j, title in enumerate(titles):
                    # Compare only the common prefix length of both titles.
                    n = min(len(rt), len(title))
                    if n == 0:
                        # An empty title would compare '' with '' and
                        # quick_ratio() returns 1.0 for that, i.e. a bogus match.
                        continue
                    ratio = difflib.SequenceMatcher(None, rt[:n], title[:n]).quick_ratio()
                    if ratio > _RATIO:
                        forwarded = 1
                        hit = j
                        matched_text = title[:n]
                        break  # first matching article is enough

                # Record whether this task was forwarded by the account.
                dfAllAccount.loc[mask, str(rn)] = forwarded
                count += forwarded

                if hit is not None:
                    tt_rows.append({'任务序号': rn, '任务名称': rt,
                                    '类型': '今日头条',
                                    '公众号': account,
                                    '日期': dataA.iloc[hit, yTTdate],
                                    '内容': matched_text,
                                    '链接': dataA.iloc[hit, yTTurl],
                                    '市州': city,
                                    })

            # Total number of forwarded tasks of this account.
            dfAllAccount.loc[mask, '转发数'] = count

        # City-wide number of forwards and forwarding rate.
        cctt = dfAllAccount.loc[maskCTT, '转发数'].sum()
        n_tasks_total = dfTask.shape[0]
        # Guard against cities without registered accounts / an empty task list.
        rcc = cctt / accountNumCTT / n_tasks_total if accountNumCTT and n_tasks_total else 0.0
        print(' ', city, '共有', accountNumCTT, '个头条号,获取数据', len(accounts), '个。共转发', cctt, '次,转发率{:.1f}%'.format(rcc*100))

    # Append all matched Toutiao articles to the global article table.
    if tt_rows:
        dfO = pd.concat([dfO, pd.DataFrame(tt_rows)], ignore_index=True)
|
||||
|
||||
def _forward_rate_table(accounts_df, index_col, task_count):
    """Build the forwarding-rate table of *accounts_df* grouped by *index_col*.

    Returns a DataFrame with the columns 'count_账号名称', 'sum_转发数' and
    'rate', sorted by rate (descending) with the '总计' margin row kept last.
    """
    # The two aggregations are pivoted separately and merged afterwards: the
    # original author observed inaccurate values when passing several
    # aggregations to a single pivot_table call.
    n_accounts = pd.pivot_table(accounts_df, index=[index_col], values=['账号名称'],
                                aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
    n_forwards = pd.pivot_table(accounts_df, index=[index_col], values=['转发数'],
                                aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
    table = pd.concat([n_accounts, n_forwards], axis=1)

    # Flatten the (aggfunc, column) MultiIndex into 'aggfunc_column' names.
    table.columns = ['_'.join(col) for col in table.columns.values]

    # Forwarding rate, truncated (not rounded) to three decimals.
    table['rate'] = table.apply(
        lambda x: int(x['sum_转发数'] / x['count_账号名称'] / task_count * 1000) / 1000.0, axis=1)

    # Sort everything except the trailing total row, then re-attach the total.
    ranked = table.iloc[:-1].sort_values(by='rate', ascending=False)
    return pd.concat([ranked, table.iloc[-1:]], axis=0)


# NOTE(review): the original indentation was lost; the whole statistics stage
# is assumed to be guarded by this flag check - confirm against the source file.
if isDoWX or isDoWB or isDoTT:
    print('=============================================================')
    print('---- STATISTICS ----')
    print('=============================================================')

    # Province-wide exports.
    dfAllAccount.to_excel(strOutputPath + '甘肃省_转发账号.xlsx')
    dfO.to_excel(strOutputPath + '甘肃省_转发文章.xlsx')

    task_count = dfTask['标题'].count()

    # Task list shown in every report; independent of the city.
    t3_list = [{'id': row['序号'],
                'title': row['标题'],
                'date': ts2date(row[sTaskDate], '%m月%d日')}
               for _, row in dfTask.iterrows()]

    print('---- 统计市州转发率 ----')
    for city in cities:
        print(" add up city", city)

        # All monitored accounts (Weibo, WeChat, Toutiao) of this city.
        maskC = (dfAllAccount['账号类型'].isin(['新浪微博', '微信服务号', '微信订阅号', '今日头条'])
                 & (dfAllAccount['市/省局'] == city))
        dfdfC = dfAllAccount.loc[maskC, :]
        dfdfC.to_excel(strOutputPath + city + '_转发账号.xlsx')

        # Forwarded articles of this city only (previously the unfiltered
        # dfO was written into every city's file).
        dfOCity = dfO[dfO['市州'] == city]
        dfOCity.to_excel(strOutputPath + city + '_转发文章.xlsx')

        # Forwarding statistics of the city/prefecture/province-level departments.
        dfdfCD = dfdfC.loc[dfdfC['区县/地方部门'].isin(['州直部门', '市直部门', '省直部门'])].copy()
        dfdfCD_AD = _forward_rate_table(dfdfCD, '单位全称', task_count)
        dfdfCD_AD.to_excel(strOutputPath + city + '部门转发统计表.xlsx')

        # Forwarding statistics of all accounts grouped by county.
        dfdfC_AD = _forward_rate_table(dfdfC, '区县/地方部门', task_count)
        dfdfC_AD.to_excel(strOutputPath + city + '转发统计表.xlsx')

        # ---- Generate the Word report ----
        tpl = DocxTemplate(fnTemplate)
        sL0 = '州' if city in ['临夏回族自治州', '甘南藏族自治州'] else '市'

        county_total = dfdfC_AD.iloc[-1]   # '总计' row of the county table
        dept_total = dfdfCD_AD.iloc[-1]    # '总计' row of the department table
        info = {
            "strL0": sL0,
            "strL1": "区县",
            "taskCount": task_count,
            "aNum": int(county_total['count_账号名称']),
            "fNum": int(county_total['sum_转发数']),
            "r": '%.1f' % (county_total['rate']*100.0),
            "dNum": int(dept_total['count_账号名称']),   # total department accounts
            "dFNum": int(dept_total['sum_转发数']),      # total department forwards
            "dr": '%.1f' % (dept_total['rate']*100.0),   # average department rate
        }
        context.update(info)

        # County forwarding-rate table (total row excluded).
        context['t1_contents'] = [
            {'county': str(index), 'rate': '%.1f' % (row['rate']*100.0),
             'account': int(row['count_账号名称']), 'fNum': int(row['sum_转发数'])}
            for index, row in dfdfC_AD.iterrows() if index != "总计"]

        # Department forwarding-rate table (total row excluded).
        context['t2_contents'] = [
            {'name': str(index),
             'rate': '%.1f' % (row['rate']*100.0),
             'account': int(row['count_账号名称']),
             'fNum': int(row['sum_转发数'])}
            for index, row in dfdfCD_AD.iterrows() if index != "总计"]

        # Task list.
        context['t3_contents'] = t3_list

        # Bar chart of the county forwarding rates, embedded into the report.
        chart_path = os.path.join(strOutputPath, '_' + city + '_graphCounty.png')
        drawBar(dfdfC_AD['rate'][:-1], dfdfC_AD.index[:-1], '区县转发率', chart_path)
        context.update({'graphCounty': InlineImage(tpl, chart_path, width=Mm(120))})

        tpl.render(context)
        # NOTE(review): the year in the file name is hard-coded.
        tpl.save(strOutputPath+city+'转发统计报告_2023年{}月份.docx'.format(context['month']))
|
||||
|
||||
# Report the elapsed time since `starttime` was taken.
endtime = datetime.datetime.now()
usedtime = endtime - starttime
print("time: ", usedtime)
|
|
@ -15,11 +15,11 @@ TEST = False # True为测试状态,不发短信; False为正式状态,发
|
|||
################
|
||||
|
||||
dDate = {
|
||||
'dateStart': '3月23日',
|
||||
'dateEnd': '29日'
|
||||
'dateStart': '6月8日',
|
||||
'dateEnd': '14日'
|
||||
}
|
||||
fn = 'D:/Projects/POM/DATA/2023年3月/3月31日预警/周预警_2023.3.29.xlsx'
|
||||
outPath = 'D:/Projects/POM/DATA/2023年3月/3月31日预警/'
|
||||
fn = 'D:/Projects/POM/DATA/2023年6月/6.16周预警/周预警_2023.6.15.xlsx'
|
||||
outPath = 'D:/Projects/POM/DATA/2023年6月/6.16周预警/'
|
||||
################
|
||||
|
||||
cities = {'白银市', '武威市',
|
||||
|
@ -29,12 +29,16 @@ cities = {'白银市', '武威市',
|
|||
'临夏回族自治州', '平凉市', '定西市', '定西市', '嘉峪关市',
|
||||
'兰州新区','陇南市', '张掖市', '庆阳市宁县', '庆阳市镇原县', } #
|
||||
|
||||
cities = {'天水市', '平凉市', '定西市', '定西市', '嘉峪关市',
|
||||
'兰州新区','陇南市', '张掖市', '庆阳市镇原县', } #
|
||||
|
||||
#cities = {'酒泉市'}
|
||||
|
||||
# 电话号码
|
||||
contactsDWL = {
|
||||
'szq': '13359446622',
|
||||
'zyb': '13609346975'
|
||||
'zyb': '13609346975',
|
||||
'shx': '18089386522'
|
||||
}
|
||||
contacts = {
|
||||
'天水市': {'王慧': '18706936366', '王肖肖': '17793816150'},
|
||||
|
|
Loading…
Reference in New Issue