# Source listing metadata: 1092 lines, 52 KiB, Python
import datetime
|
||
import csv
|
||
import pandas as pd
|
||
import numpy as np
|
||
import glob, os, re, time
|
||
|
||
import matplotlib.pyplot as plt
|
||
from matplotlib.ticker import FuncFormatter
|
||
from difflib import SequenceMatcher
|
||
from collections import Counter
|
||
import difflib
|
||
|
||
|
||
from docxtpl import DocxTemplate
|
||
from docxtpl import InlineImage
|
||
from docx.shared import Mm
|
||
|
||
import jieba
|
||
import jieba.posseg as pseg
|
||
|
||
|
||
|
||
def fetch_chinese(s):
    """Return *s* with every character outside U+4E00..U+9FA5 removed.

    Args:
        s: Input string.

    Returns:
        The characters of *s* that fall inside the range, in their
        original order ('' when none do).
    """
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)
|
||
|
||
def toDate(strDT):
    """Format a date-like value as ``MM-DD``.

    Args:
        strDT: Value handed to ``pd.to_datetime`` with ``errors='coerce'``.

    Returns:
        The ``'%m-%d'`` rendering of the parsed value, or '' when the
        parsed result is missing (``pd.isna``).
    """
    parsed = pd.to_datetime(strDT, errors='coerce')
    return '' if pd.isna(parsed) else parsed.strftime('%m-%d')
|
||
|
||
|
||
# 画柱状图
|
||
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of ratio values and save it to *fn*.

    Args:
        data: Bar heights; the y axis is fixed to the range 0..1 and its
            tick labels are rendered as percentages.
        recipe: Bar positions, also used as the x tick labels.
        title: Chart title.
        fn: Output path passed to ``plt.savefig``.

    The figure is cleared and closed after saving.
    """

    def to_percent(temp, position):
        # Tick formatter: 0.25 -> '25%'. *position* is required by
        # FuncFormatter's callback signature but is not used.
        return '%2.0f' % (100 * temp) + '%'

    plt.figure(figsize=(6, 4))
    # Presumably SimHei is selected so CJK labels render -- confirm the font
    # is installed on the target machine.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))

    axes = plt.gca()
    axes.yaxis.set_major_formatter(FuncFormatter(to_percent))
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)

    # Release the figure so repeated calls do not accumulate open figures.
    plt.cla()
    plt.clf()
    plt.close()
|
||
|
||
def getWBData(path, cities, hasBody=False):
    """Collect the Weibo CSV exports found under *path* into one DataFrame.

    Directory layout walked by this function::

        path/<city dir>/<period dir>/<account name dir>/<weibo id>.csv

    City directory names are translated to full city names through the
    ``cityShorten`` table. Hidden entries and directories whose name
    contains ``weixin`` or ``tt`` are ignored at the city and period levels.

    Args:
        path: Root directory holding one sub-directory per city.
        cities: Full city names to include; other cities are skipped.
        hasBody: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with the exporter's 13 columns plus ``weiboID``,
        ``weiboName`` and ``市州``. It has those columns and no rows when
        nothing was read. Row labels are not reset, so they may repeat.
    """
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',

                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',

                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    # Column layout written by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    cityFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityDir = os.path.join(path, dirC)
        if dirC.startswith('.') or not os.path.isdir(cityDir):
            continue
        loweredC = dirC.lower()
        if 'weixin' in loweredC or 'tt' in loweredC:
            continue
        # .get(): an unrecognised directory is skipped instead of raising KeyError.
        cityName = cityShorten.get(dirC)
        if cityName is None or cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        accountFrames = []
        for dirCT in os.listdir(cityDir):
            periodDir = os.path.join(cityDir, dirCT)
            if dirCT.startswith('.') or not os.path.isdir(periodDir):
                continue
            loweredCT = dirCT.lower()
            if 'weixin' in loweredCT or 'tt' in loweredCT:
                continue
            print(' read WB... dir:', dirCT)
            for dirA in os.listdir(periodDir):
                # Each account-name directory holds "<account id>.csv".
                accountDir = os.path.join(periodDir, dirA)
                if dirA.startswith('.') or not os.path.isdir(accountDir):
                    continue
                fileAs = os.listdir(accountDir)
                # Pick the first visible CSV deterministically; os.listdir
                # order is arbitrary and hidden files must never be read.
                csvFiles = sorted(f for f in fileAs
                                  if not f.startswith('.')
                                  and os.path.splitext(f)[-1] == '.csv')
                if csvFiles:
                    dfdfwb = pd.read_csv(os.path.join(accountDir, csvFiles[0]), sep=',',
                                         header=None, names=cols, index_col=None)
                    # The file's own header line was read as data; drop it.
                    dfdfwb = dfdfwb.iloc[1:].copy()
                    dfdfwb["weiboID"] = os.path.splitext(csvFiles[0])[0]
                    dfdfwb["weiboName"] = dirA
                    accountFrames.append(dfdfwb)

                if len(fileAs) > 1:
                    print(" +=+= ", fileAs)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        if accountFrames:
            dfWBC = pd.concat(accountFrames)
        else:
            dfWBC = pd.DataFrame(columns=cols)
        print(' ', dfWBC.shape)
        dfWBC['市州'] = cityName
        if len(dfWBC) > 0:
            cityFrames.append(dfWBC)

    if cityFrames:
        dfWB = pd.concat(cityFrames).reindex(columns=cs)
    else:
        dfWB = pd.DataFrame(columns=cs)

    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
|
||
|
||
# 从数据目录中读取xlsx文件,拼接到一起
|
||
def getWXData(path, cities, hasBody=False):
    """Read the WeChat Excel exports found under *path* into one DataFrame.

    For every sub-directory of *path* (its name is mapped to a full city
    name through ``cityShorten``), workbooks are read from two places:

    * ``.xlsx`` / ``.xls`` files directly inside the directory, and
    * ``.xlsx`` / ``.xls`` files inside any child directory whose name
      contains ``weixin``.

    Args:
        path: Root directory holding one sub-directory per city.
        cities: Full city names to include; other cities are skipped.
        hasBody: Unused; kept so existing callers keep working.

    Returns:
        DataFrame whose leading columns are ``公众号, 链接, 日期, 标题, 内容,
        头条, 市州, 阅读数``, followed by any extra columns found in the
        workbooks. It has only those eight columns and no rows when nothing
        was read. Row labels are not reset, so they may repeat.
    """
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']
    excelExts = ('.xlsx', '.xls')

    frames = []
    countC = 0    # directories that contributed at least one workbook
    countFnC = 0  # workbooks read in total
    for dirBatch in os.listdir(path):
        batchDir = os.path.join(path, dirBatch)
        if not os.path.isdir(batchDir):
            continue  # directories only

        # .get(): an unrecognised directory is skipped instead of raising KeyError.
        cityname = cityShorten.get(dirBatch)
        count = 0
        for fileC in os.listdir(batchDir):
            if fileC.startswith('.'):
                continue
            entry = os.path.join(batchDir, fileC)
            if os.path.isdir(entry) and 'weixin' in fileC:
                # A "weixin" folder: every workbook inside is a candidate.
                print(' ', entry)
                candidates = [os.path.join(entry, f) for f in os.listdir(entry)]
            else:
                # Otherwise the entry itself may be a workbook.
                candidates = [entry]

            fName = os.path.splitext(fileC)[0]
            for excelPath in candidates:
                base = os.path.basename(excelPath)
                # Skip hidden files and Excel "~$" lock files, which are not
                # readable workbooks.
                if base.startswith(('.', '~$')):
                    continue
                if os.path.splitext(base)[-1] not in excelExts:
                    continue
                if not os.path.isfile(excelPath):
                    continue
                if cityname is None or cityname not in cities:
                    continue
                dfdfwxc = pd.read_excel(excelPath)
                dfdfwxc['市州'] = cityname
                print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
                frames.append(dfdfwxc)
                count += 1

        countFnC += count
        if count > 0:
            countC += 1

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    if frames:
        dfWX = pd.concat(frames)
        # Keep the documented columns first, then whatever else the files had.
        ordered = cols + [c for c in dfWX.columns if c not in cols]
        dfWX = dfWX.reindex(columns=ordered)
    else:
        dfWX = pd.DataFrame(columns=cols)

    print("Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
|
||
|
||
# 从数据目录中读取xlsx文件,拼接到一起
|
||
def getTTData(path, cities, hasBody=False):
    """Read the Toutiao Excel exports found under *path* into one DataFrame.

    Directory layout walked by this function::

        path/<city dir>/<dir whose name contains "tt">/<anything>.xlsx

    City directory names are translated to full city names through
    ``cityShorten``. Hidden entries and directories whose name contains
    ``weixin`` or ``weibo`` are ignored.

    Args:
        path: Root directory holding one sub-directory per city.
        cities: Full city names to include; other cities are skipped.
        hasBody: Unused; kept so existing callers keep working.

    Returns:
        DataFrame whose leading columns are ``account, date, title, nread,
        ncomment, content, url, origin, city``, followed by any extra
        columns found in the workbooks. It has only those nine columns and
        no rows when nothing was read. Row labels are not reset, so they
        may repeat.
    """
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',

                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',

                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    cityFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityDir = os.path.join(path, dirC)
        if dirC.startswith('.') or not os.path.isdir(cityDir):
            continue
        loweredC = dirC.lower()
        if 'weixin' in loweredC or 'weibo' in loweredC:
            continue
        # .get(): an unrecognised directory is skipped instead of raising KeyError.
        cityName = cityShorten.get(dirC)
        if cityName is None or cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        fileFrames = []
        for dirCT in os.listdir(cityDir):
            ttDir = os.path.join(cityDir, dirCT)
            if dirCT.startswith('.') or not os.path.isdir(ttDir):
                continue
            loweredCT = dirCT.lower()
            if 'weixin' in loweredCT or 'weibo' in loweredCT:
                continue
            if 'tt' not in loweredCT:
                continue
            print(' read TT... dir:', dirCT)
            for fn in os.listdir(ttDir):
                if fn.startswith('.') or fn[-5:] != '.xlsx':
                    continue
                fileAs = os.path.join(ttDir, fn)
                # Best effort: an unreadable workbook is reported and skipped.
                # The account name is no longer parsed out of the file name;
                # the value was never used and the parse raised ValueError
                # for names without two underscores.
                try:
                    dfdftt = pd.read_excel(fileAs)
                except Exception as err:
                    print("read file failed. ", fileAs, err)
                    continue
                fileFrames.append(dfdftt)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        if fileFrames:
            dfTTC = pd.concat(fileFrames)
            ordered = cs + [c for c in dfTTC.columns if c not in cs]
            dfTTC = dfTTC.reindex(columns=ordered)
        else:
            dfTTC = pd.DataFrame(columns=cs)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = cityName
        if len(dfTTC) > 0:
            cityFrames.append(dfTTC)

    if cityFrames:
        dfTT = pd.concat(cityFrames)
        ordered = cs + [c for c in dfTT.columns if c not in cs]
        dfTT = dfTT.reindex(columns=ordered)
    else:
        dfTT = pd.DataFrame(columns=cs)

    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
|
||
|
||
|
||
def fetch_chinese(s):
    """Return *s* with every character outside U+4E00..U+9FA5 removed.

    NOTE(review): this is a second definition of ``fetch_chinese`` in this
    file; it rebinds the name declared earlier with the same behaviour.

    Args:
        s: Input string.

    Returns:
        The characters of *s* that fall inside the range, in their
        original order ('' when none do).
    """
    outside_range = re.compile(r'[^\u4e00-\u9fa5]')
    return outside_range.sub('', s)
|
||
|
||
if __name__ == "__main__":
|
||
starttime = datetime.datetime.now()
|
||
_RATIO = 0.5
|
||
isDoWX = True
|
||
isDoWB = True
|
||
isDoTT = True
|
||
cities = [
|
||
'临夏回族自治州',
|
||
'白银市',
|
||
'定西市',
|
||
'酒泉市',
|
||
'嘉峪关市',
|
||
'平凉市',
|
||
'庆阳市',
|
||
'天水市',
|
||
'武威市',
|
||
'兰州新区',
|
||
'陇南市',
|
||
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
|
||
'省直部门', # 共12市2州1新区
|
||
]
|
||
'''
|
||
cities = [
|
||
'临夏回族自治州',
|
||
'白银市',
|
||
'定西市',
|
||
'酒泉市',
|
||
'天水市',
|
||
'陇南市',
|
||
|
||
#'省直部门', # 共12市2州1新区
|
||
]
|
||
'''
|
||
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
|
||
#cities = ['陇南市']
|
||
# 转发任务
|
||
#dfTask = pd.read_excel('D:/Projects/POM/DATA/2022年S2/S2/全省政务新媒体二季度转发信息条目.xls')
|
||
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年4月/3月报告/2023年3月份全省政务新媒体转发内容条目.xlsx')
|
||
# 账号信息
|
||
strFnAccount = 'D:/Projects/POM/DATA/2023年4月/3月报告/全国报送系统表单_2023.3.31.xlsx'
|
||
dfAllAccount = pd.read_excel(strFnAccount)
|
||
# 省直部门账号部门简称
|
||
dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年4月/3月报告/省直部门账号名称简称.xlsx')
|
||
fnTemplate = 'D:/Projects/POM/DATA/2023年4月/3月报告/POM_ForewardTemplate.docx'
|
||
|
||
# 数据根目录,
|
||
strPath = ['D:/Projects/POM/DATA/2023年4月/3月报告/']
|
||
strOutputPath = 'D:/Projects/POM/DATA/2023年4月/3月报告/转发/'
|
||
|
||
context = {
|
||
"year": "2023",
|
||
"month": "3",
|
||
"pubMonth": "4",
|
||
"dateStart": "2023年3月1日",
|
||
"dateEnd": "2023年3月31日"
|
||
}
|
||
|
||
dfAllAccount.loc[:, '转发数'] = 0
|
||
dfAllAccount.loc[:, '阅读数'] = 0
|
||
|
||
|
||
################################################
|
||
# 创建存储矩阵
|
||
# 按照转发任务创建统计矩阵
|
||
colRR = ['市州', '类型', '账号名称', '单位名称', '省直部门', '区县', '转发数', '阅读数']
|
||
for ididid in dfTask['序号'][0:dfTask['内容'].count()].tolist():
|
||
#for ididid in range(1, dfTask['内容'].count()):
|
||
colRR.append(str(ididid))
|
||
dfRR = pd.DataFrame(columns=colRR) # 每列一个转发任务,每行一个账号
|
||
# 用于保存每一条转发任务的账号和文章
|
||
dfO = pd.DataFrame(columns=['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州'] )
|
||
|
||
|
||
################################################
|
||
#
|
||
countWxForewards = 0
|
||
countWbForewards = 0
|
||
countTtForewards = 0
|
||
|
||
|
||
# TT
|
||
if isDoTT:
|
||
print('=============================================================')
|
||
print('---- TT ----')
|
||
# id userId source city tid cellType title
|
||
# time-stamp date url commentCount readNum likeNum showNum
|
||
#dfTT = getTTData(strFnTT, strFnAccount, cities) #附加市州信息, cities未使用
|
||
dfTT = pd.DataFrame()
|
||
for strP in strPath:
|
||
ddff = getTTData(strP + '全文/', cities)
|
||
print(" read TT data", ddff.shape)
|
||
dfTT = dfTT.append(ddff)
|
||
|
||
print("tt data ready", dfTT.shape)
|
||
|
||
# cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']
|
||
#cities = dfTT['city'].unique()
|
||
for city in cities:
|
||
dataC = dfTT.loc[dfTT['city'] == city].copy()
|
||
accounts = dataC['account'].unique()
|
||
|
||
dfdfCityTT = dfAllAccount.loc[(dfAllAccount['账号类型'] == '今日头条')
|
||
& (dfAllAccount['市/省局'] == city)].copy()
|
||
|
||
print(' count TT, city:', city, '读入账号数:', len(accounts), '任务账号数:', dfdfCityTT.shape[0])
|
||
print(' ', dataC.shape)
|
||
for account in accounts:
|
||
#一个公众号的所有文章
|
||
#print(account)
|
||
dataA = dataC[dataC['account']==account]
|
||
sR = pd.Series([], dtype=pd.StringDtype())
|
||
sR['类型'] = '今日头条'
|
||
sR['市州'] = city
|
||
sR['账号名称'] = account
|
||
count = 0
|
||
for i in range(dfTask['内容'].count()):
|
||
rn = dfTask.iloc[i, dfTask.columns.get_loc('序号')]
|
||
rt = str(dfTask.iloc[i, dfTask.columns.get_loc('内容')]) #任务标题
|
||
forwarded = 0
|
||
for j in range(dataA.shape[0]):
|
||
str1 = str(dataA.iloc[j, dataA.columns.get_loc('title')]) # 文章标题
|
||
#
|
||
if len(rt) > len(str1): # 任务标题过长,截取前半部分进行对比
|
||
strRT = rt[:len(str1)]
|
||
else: #文章标题过长,只比较任务标题长度部分
|
||
strRT = rt
|
||
str1 = str1[:len(rt)]
|
||
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
||
if ratio > _RATIO:
|
||
forwarded += 1
|
||
if forwarded > 0:
|
||
break
|
||
sR[str(rn)] = forwarded
|
||
count += forwarded
|
||
if forwarded > 0:
|
||
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
||
'类型': '今日头条',
|
||
'公众号': account, #dfTT.iloc[j, dfTT.columns.get_loc('account')],
|
||
'日期': dataA.iloc[j, dataA.columns.get_loc('date')],
|
||
'内容': str1,
|
||
'链接': dataA.iloc[j, dataA.columns.get_loc('url')],
|
||
'市州': city,
|
||
}], ignore_index=True)
|
||
|
||
#checknames = ['白银高新区管委会', '健康白银', '白银市卫生健康委', '白银市卫生健康委员会', '白银发改委', '白银市场监管', '白银农业农村', '白银应急', '白银退役军人', '白银政务服务', '白银政务服务', '白银文旅', '白银科技', '白银林草', '白银市平川区商务局', '平川区长征街道', '平川区红会路街道', '和谐复兴', '兴平路街道办', '平川金融办']
|
||
#if account in checknames:
|
||
# print(' -- ', account, count)
|
||
sR['转发数'] = count
|
||
dfRR = dfRR.append(sR, ignore_index=True, sort=False)
|
||
#dfRR.to_excel('D:/Projects/POM/2021年6月/二季度/转发统计__'+city+'.xlsx')
|
||
countTtForewards = dfRR.shape[0] - countWxForewards - countWbForewards
|
||
print('TT forewards', countTtForewards)
|
||
|
||
# WX
|
||
if isDoWX:
|
||
print('=============================================================')
|
||
print('---- WX ----')
|
||
dfWX = pd.DataFrame()
|
||
for strP in strPath:
|
||
ddff = getWXData(strP + '全文/', cities)
|
||
print(' read WX data', ddff.shape)
|
||
dfWX = dfWX.append(ddff)
|
||
print('WX data ready', dfWX.shape)
|
||
|
||
dfWX = dfWX.fillna(value=0)
|
||
|
||
# 公众号 链接 日期 标题 内容 头条 city
|
||
## 逐个市州统计每个账号的转发情况
|
||
#cities = dfWX['市州'].unique()
|
||
for city in cities:
|
||
print(" count WX, city:", city)
|
||
dataC = dfWX.loc[dfWX['市州'] == city].copy()
|
||
accounts = dataC['公众号'].unique()
|
||
|
||
dfdfCityWX = dfAllAccount.loc[((dfAllAccount['账号类型'] == '小程序+微信')
|
||
| (dfAllAccount['账号类型'] == '微信服务号')
|
||
| (dfAllAccount['账号类型'] == '微信订阅号'))
|
||
& (dfAllAccount['市/省局'] == city)].copy()
|
||
|
||
print(" count WX, city:", city, '账号数:', len(accounts), '任务账号数:', dfdfCityWX.shape[0])
|
||
for account in accounts:
|
||
# print(account)
|
||
dataA = dataC.loc[dataC['公众号'] == account].copy() # 一个公众号的所有文章
|
||
sR = pd.Series(dtype='object')
|
||
sR['类型'] = '微信'
|
||
sR['市州'] = city
|
||
sR['账号名称'] = account
|
||
count = 0
|
||
arn = 0
|
||
for i in range(dfTask['内容'].count()):
|
||
# 对于每一篇任务文章
|
||
rn = dfTask.iloc[i, dfTask.columns.get_loc('序号')]
|
||
rt = str(dfTask.iloc[i, dfTask.columns.get_loc('内容')])
|
||
forwarded = 0
|
||
readNum = 0
|
||
# 查看该账号的所有文章
|
||
for j in range(dataA.shape[0]):
|
||
str1 = str(dataA.iloc[j, dataA.columns.get_loc('标题')])
|
||
#
|
||
if len(rt) > len(str1):# 任务标题过长,截取前半部分进行对比
|
||
strRT = rt[:len(str1)]
|
||
else:#文章标题过长,只比较任务标题长度部分
|
||
strRT = rt
|
||
str1 = str1[:len(rt)]
|
||
|
||
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
||
|
||
tRN = int(dataA.iloc[j, dataA.columns.get_loc('阅读数')])
|
||
# 遇到相似的即跳出
|
||
if ratio > _RATIO:
|
||
forwarded += 1
|
||
readNum += tRN
|
||
if forwarded > 0:
|
||
break
|
||
sR[str(rn)] = forwarded
|
||
count += forwarded
|
||
arn += readNum
|
||
if forwarded > 0:
|
||
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
||
'类型': '微信',
|
||
'公众号': account,
|
||
'日期': dataA.iloc[j, dataA.columns.get_loc('日期')],
|
||
'内容': str1,
|
||
'链接': dataA.iloc[j, dataA.columns.get_loc('链接')],
|
||
'市州': city,
|
||
'阅读数': readNum,
|
||
}], ignore_index=True)
|
||
sR['转发数'] = count
|
||
sR['阅读数'] = arn
|
||
dfRR = dfRR.append(sR, ignore_index=True, sort=False)
|
||
countWxForewards = dfRR.shape[0]
|
||
print('WX forwards', countWxForewards)
|
||
|
||
# WB
|
||
if isDoWB:
|
||
print('=============================================================')
|
||
print('---- WB ----')
|
||
|
||
dfWB = pd.DataFrame()
|
||
for strP in strPath:
|
||
ddff = getWBData(strP + '全文/', cities)
|
||
print(' read WB data', ddff.shape)
|
||
dfWB = dfWB.append(ddff)
|
||
print('WB data ready', dfWB.shape)
|
||
|
||
################################################
|
||
# WB
|
||
# 微博id 微博正文 头条文章url 原始图片url 被转发微博原始图片url 是否为原创微博 微博视频url 发布位置 date
|
||
# 发布工具 点赞数 转发数 评论数 weiboID weiboName city
|
||
#cities = dfWB['市州'].unique()
|
||
for city in cities:
|
||
print(' count WB, city:', city)
|
||
dataC = dfWB.loc[dfWB['市州'] == city].copy()
|
||
accounts = dataC['weiboName'].unique()
|
||
|
||
|
||
dfdfCityWB = dfAllAccount.loc[(dfAllAccount['账号类型'] == '新浪微博')
|
||
& (dfAllAccount['市/省局'] == city)].copy()
|
||
|
||
print(' count WB, city:', city, "读入账号数:", len(accounts), '任务账号数:', dfdfCityWB.shape[0])
|
||
|
||
for account in accounts:
|
||
# 一个公众号的所有文章
|
||
# print(account)
|
||
dataA = dataC.loc[dataC['weiboName'] == account].copy()
|
||
sR = pd.Series(dtype='object')
|
||
sR['类型'] = '新浪微博'
|
||
sR['市州'] = city
|
||
sR['账号名称'] = account
|
||
count = 0
|
||
# 对一个账号,用任务标题从它全部发文里比对
|
||
# 若找到, 该任务标记为已转发
|
||
for i in range(dfTask['内容'].count()):
|
||
rn = dfTask.iloc[i, dfTask.columns.get_loc('序号')]
|
||
rt = str(dfTask.iloc[i, dfTask.columns.get_loc('内容')])
|
||
forwarded = 0
|
||
for j in range(dataA.shape[0]):
|
||
str1 = str(dataA.iloc[j, dataA.columns.get_loc('微博正文')])
|
||
str2 = str1[:len(rt)]
|
||
#if rt in str1:
|
||
ratio = difflib.SequenceMatcher(None, fetch_chinese(rt), fetch_chinese(str2)).quick_ratio()
|
||
# if account=='陇南公积金' and ratio > 0.5:
|
||
# print('----', ratio)
|
||
# print(rt)
|
||
# print(fetch_chinese(rt))
|
||
# print(str2)
|
||
# print(fetch_chinese(str2))
|
||
if ratio > _RATIO:
|
||
forwarded += 1
|
||
if forwarded > 0:
|
||
break
|
||
sR[str(rn)] = forwarded
|
||
count += forwarded
|
||
if forwarded > 0:
|
||
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
||
'类型': '新浪微博',
|
||
'公众号': account,
|
||
'日期': dataA.iloc[j, dataA.columns.get_loc('date')],
|
||
'内容': str1,
|
||
'链接': dataA.iloc[j, dataA.columns.get_loc('头条文章url')],
|
||
'市州': city,
|
||
}], ignore_index=True)
|
||
sR['转发数'] = count
|
||
##print(account, count)
|
||
dfRR = dfRR.append(sR, ignore_index=True, sort=False)
|
||
####if account=='陇南公积金' :
|
||
####print(account, dataA.shape[0], count)
|
||
# dfRR.to_excel('D:/Projects/POM/2021年6月/二季度/转发统计__'+city+'.xlsx')
|
||
|
||
countWbForewards = dfRR.shape[0] - countWxForewards
|
||
print('WB forwards', countWbForewards)
|
||
|
||
|
||
#dfRR.to_excel(strOutputPath + '转发统计_GS_ALL.xlsx')
|
||
#dfO.to_excel(strOutputPath + '转发统计_GS_ALLDATA.xlsx')
|
||
|
||
if isDoWX or isDoWB or isDoTT:
|
||
print('=============================================================')
|
||
print('---- STATISTICS ----')
|
||
print('=============================================================')
|
||
print('ALL forewards account num:', dfRR.shape[0], 'task num:', dfRR.shape[1])
|
||
|
||
################################################
|
||
# 统计每一个账号的累计转发数和阅读数
|
||
# ~~ 匹配到账号所有信息的表格中
|
||
#
|
||
dfAAWX = dfAllAccount.loc[(dfAllAccount['账号类型'] == '小程序+微信')
|
||
| (dfAllAccount['账号类型'] == '微信服务号')
|
||
| (dfAllAccount['账号类型'] == '微信订阅号') ].copy()
|
||
dfRRWX = dfRR.loc[dfRR['类型'] == '微信'].copy()
|
||
dfRRWX.rename(columns={'账号名称':'account'}, inplace=True)
|
||
## 遍历账号详情表
|
||
for i in range(dfAAWX.shape[0]):
|
||
strName = str(dfAAWX.iloc[i, dfAAWX.columns.get_loc('账号名称')])
|
||
dfRRRRWX = dfRRWX[dfRRWX.account == strName]
|
||
|
||
aName = re.sub('\s+','', str(dfAAWX.iloc[i, dfAAWX.columns.get_loc('账号名称')]))
|
||
matchedRow = -1 # 转发统计表中的行数
|
||
|
||
# 对每一个微信账号,从转发统计表中匹配账号名称
|
||
for j in range(dfRRWX.shape[0]):
|
||
name = re.sub('\s+', '', str(dfRRWX.iloc[j, dfRRWX.columns.get_loc('account')]))
|
||
if name == aName:
|
||
matchedRow = j
|
||
break
|
||
if matchedRow > -1 :
|
||
r = int(dfRRWX.iloc[matchedRow, dfRRWX.columns.get_loc('转发数')])
|
||
readNum = int(dfRRWX.iloc[matchedRow, dfRRWX.columns.get_loc('阅读数')])
|
||
dfAAWX.iloc[i, dfAAWX.columns.get_loc('转发数')] = r
|
||
dfAAWX.iloc[i, dfAAWX.columns.get_loc('阅读数')] = readNum
|
||
|
||
if dfRRRRWX.shape[0]>0 and matchedRow<0:
|
||
print(' -', aName, strName, dfRRRRWX.shape[0], matchedRow)
|
||
######
|
||
|
||
if dfRRRRWX.shape[0]<1 and matchedRow>-1:
|
||
print(' =', aName, strName, dfRRRRWX.shape[0], matchedRow)
|
||
######
|
||
#dfAAWX.to_excel('D:/Projects/POM/2021年7月/月报告/6月份全文数据/转发统计___List.xlsx')
|
||
print('总微信账号数:', dfAAWX.shape)
|
||
|
||
dfAAWB = dfAllAccount.loc[dfAllAccount['账号类型'] == '新浪微博'].copy()
|
||
dfRRWB = dfRR.loc[dfRR['类型'] == '新浪微博'].copy()
|
||
for i in range(dfAAWB.shape[0]):
|
||
aName = re.sub('\s+','', str(dfAAWB.iloc[i, dfAAWB.columns.get_loc('账号名称')]))
|
||
matchedRow = -1
|
||
# 对每一个账号,从转发统计表中匹配账号名称
|
||
for j in range(dfRRWB.shape[0]):
|
||
name = re.sub('\s+', '', str(dfRRWB.iloc[j, dfRRWB.columns.get_loc('账号名称')]))
|
||
if name == aName:
|
||
matchedRow = j
|
||
break
|
||
if matchedRow > -1 :
|
||
r = int(dfRRWB.iloc[matchedRow, dfRRWB.columns.get_loc('转发数')])
|
||
dfAAWB.iloc[i, dfAAWB.columns.get_loc('转发数')] = r
|
||
print('总微博账号数:', dfAAWB.shape)
|
||
|
||
|
||
dfAATT = dfAllAccount.loc[dfAllAccount['账号类型'] == '今日头条'].copy()
|
||
dfRRTT = dfRR.loc[dfRR['类型'] == '今日头条'].copy() #'市州', '类型', '账号名称', '转发数'
|
||
|
||
|
||
dfRRTT.rename(columns={'账号名称':'account'}, inplace=True)
|
||
for i in range(dfAATT.shape[0]):
|
||
aName = re.sub('\s+','', str(dfAATT.iloc[i, dfAATT.columns.get_loc('账号名称')]))
|
||
|
||
strName = str(dfAATT.iloc[i, dfAATT.columns.get_loc('账号名称')])
|
||
dfRRRRTT = dfRRTT[dfRRTT.account == strName]
|
||
|
||
matchedRow = -1
|
||
#dftmp = dfRRTT[dfRRTT['账号名称']==aName]
|
||
for j in range(dfRRTT.shape[0]):
|
||
name = re.sub('\s+', '', str(dfRRTT.iloc[j, dfRRTT.columns.get_loc('account')]))
|
||
if name == aName:
|
||
matchedRow = j
|
||
break
|
||
if matchedRow > -1 :
|
||
r = int(dfRRTT.iloc[matchedRow, dfRRTT.columns.get_loc('转发数')])
|
||
######
|
||
######
|
||
#dfAATT.iloc[i, dfAATT.columns.get_loc('转发数')] = r
|
||
dfAATT.loc[i, dfAATT.columns.get_loc('转发数')] = r
|
||
|
||
|
||
if dfRRRRTT.shape[0]>0 and matchedRow<0:
|
||
print(' -', aName, strName, dfRRRRTT.shape[0], matchedRow)
|
||
######
|
||
|
||
if dfRRRRTT.shape[0]<1 and matchedRow>-1:
|
||
print(' =', aName, strName, dfRRRRTT.shape[0], matchedRow)
|
||
######
|
||
print('总头条账号数:', dfAATT.shape)
|
||
|
||
dfAAA = dfAAWX.append(dfAAWB, ignore_index=True, sort=False)
|
||
dfAAA = dfAAA.append(dfAATT, ignore_index=True, sort=False)
|
||
|
||
|
||
|
||
print('所有统计账号数:', dfAAA.shape)
|
||
|
||
################################################
|
||
# 为转发账号匹配单位全称和所属县区
|
||
#
|
||
for i in range(dfRR.shape[0]):
|
||
aName = re.sub('\s+','', str(dfRR.iloc[i, dfRR.columns.get_loc('账号名称')]))
|
||
matchedRow = -1
|
||
# 区县/地方部门
|
||
# 对每一个账号,从表中匹配账号名称
|
||
for j in range(dfAllAccount.shape[0]):
|
||
name = re.sub('\s+', '', str(dfAllAccount.iloc[j, dfAllAccount.columns.get_loc('账号名称')]))
|
||
if name == aName:
|
||
matchedRow = j
|
||
break
|
||
if matchedRow > -1 :
|
||
cc = str(dfAllAccount.iloc[matchedRow, dfAllAccount.columns.get_loc('区县/地方部门')])
|
||
if cc != 'nan':
|
||
dfRR.iloc[i, dfRR.columns.get_loc('区县')] = cc
|
||
bn = str(dfAllAccount.iloc[matchedRow, dfAllAccount.columns.get_loc('单位全称')])
|
||
if bn != 'nan':
|
||
dfRR.iloc[i, dfRR.columns.get_loc('单位名称')] = bn
|
||
|
||
######
|
||
# 为省级部门匹配简称
|
||
if '省直部门' in cities:
|
||
city = '省直部门'
|
||
dfAAACity = dfAAA[dfAAA['市/省局'] == city]
|
||
|
||
|
||
|
||
dfAAACity.to_excel(strOutputPath + '账号转发量_' + city + '.xlsx')
|
||
|
||
dfOCity = dfO[dfO['市州'] == city]
|
||
dfO.to_excel(strOutputPath + '转发文章_' + city + '.xlsx')
|
||
|
||
dfRRCity = dfRR[dfRR['市州'] == city]
|
||
if city in ['临夏回族自治州', '甘南藏族自治州']:
|
||
dfRRCity['区县'].fillna('州直部门', inplace=True)
|
||
else:
|
||
dfRRCity['区县'].fillna('市直部门', inplace=True)
|
||
dfRRCity.to_excel(strOutputPath + '转发账号_' + city + '.xlsx')
|
||
|
||
|
||
dfRR.to_excel(strOutputPath + '转发账号.xlsx')
|
||
dfAAA.to_excel(strOutputPath + '账号转发量.xlsx')
|
||
dfO.to_excel(strOutputPath + '转发文章.xlsx')
|
||
|
||
|
||
# dfAAA = pd.read_excel(strOutputPath + '账号转发量.xlsx')
|
||
# dfRR = pd.read_excel(strOutputPath + '转发账号.xlsx')
|
||
# dfO = pd.read_excel(strOutputPath + '转发文章.xlsx')
|
||
|
||
# 过长名称替换为简称,便于绘图
|
||
dfRR.loc[dfRR['区县'] == '积石山保安族东乡族撒拉族自治县', '区县'] = '积石山县'
|
||
dfRR.loc[dfRR['区县'] == '阿克塞哈萨克族自治县', '区县'] = '阿克塞自治县'
|
||
for city in cities:
|
||
if city in ['兰州新区', '省直部门']:
|
||
continue
|
||
print(" add up city", city)
|
||
######
|
||
# 匹配省级部门的简称
|
||
if city == '省直部门':
|
||
for i in range(dfRR.shape[0]):
|
||
aName = re.sub('\s+', '', str(dfRR.iloc[i, dfRR.columns.get_loc('账号名称')]))
|
||
matchedRow = -1
|
||
# 对每一个账号,从表中匹配账号名称
|
||
for j in range(dfProvincial.shape[0]):
|
||
name = re.sub('\s+', '', str(dfProvincial.iloc[j, dfProvincial.columns.get_loc('账号名称')]))
|
||
if name == aName:
|
||
matchedRow = j
|
||
break
|
||
if matchedRow > -1:
|
||
bmjc = str(dfProvincial.iloc[matchedRow, dfProvincial.columns.get_loc('简称')])
|
||
if bmjc != 'nan':
|
||
dfRR.iloc[i, dfRR.columns.get_loc('省直部门')] = bmjc
|
||
|
||
dfAAACity = dfAAA.loc[dfAAA['市/省局'] == city].copy()
|
||
dfAAACity.to_excel(strOutputPath + '账号转发量_' + city + '.xlsx')
|
||
|
||
dfOCity = dfO[dfO['市州'] == city]
|
||
dfO.to_excel(strOutputPath + '转发文章_' + city + '.xlsx')
|
||
|
||
dfRRCity = dfRR.loc[dfRR['市州'] == city].copy()
|
||
if city in ['临夏回族自治州', '甘南藏族自治州']:
|
||
dfRRCity['区县'].fillna('州直部门', inplace=True)
|
||
dfRRCityD = dfRRCity.loc[dfRRCity['区县'] == '州直部门'].copy()
|
||
else:
|
||
dfRRCity['区县'].fillna('市直部门', inplace=True)
|
||
dfRRCityD = dfRRCity.loc[dfRRCity['区县'] == '市直部门'].copy()
|
||
dfRRCity.to_excel(strOutputPath + '转发账号_' + city + '.xlsx')
|
||
####
|
||
# 统计市直部门
|
||
#dfRRCityD = dfRRCity[dfRRCity['区县'] == '州直部门']
|
||
dfRRCD1 = pd.pivot_table(dfRRCityD, index=['单位名称'], values=['账号名称'],
|
||
aggfunc = ['count'], fill_value='', margins=True, margins_name='总计')
|
||
dfRRCD2 = pd.pivot_table(dfRRCityD, index=['单位名称'], values=['转发数'],
|
||
aggfunc = ['sum'], fill_value='', margins=True, margins_name='总计')
|
||
dfRRCD3 = pd.pivot_table(dfRRCityD, index=['单位名称'], values=['阅读数'],
|
||
aggfunc = ['sum'], fill_value='', margins=True, margins_name='总计')
|
||
dfRRCD = pd.concat([dfRRCD1, dfRRCD2, dfRRCD3], axis=1)
|
||
|
||
# 计算转发率
|
||
dfRRCD.columns = ['_'.join(col) for col in dfRRCD.columns.values]
|
||
#dfRRCD.reset_index(inplace=True)
|
||
|
||
dfRRCD['rate'] = dfRRCD.apply(lambda x: int(x['sum_转发数'] / x['count_账号名称'] / dfTask['内容'].count() * 1000)/1000.0, axis=1)
|
||
# 排序
|
||
dfRRCD = dfRRCD[0:dfRRCD.shape[0]-1].sort_values(by='rate', ascending=False)
|
||
dfRRCD = pd.concat([dfRRCD, dfRRCD[dfRRCD.shape[0]-1:dfRRCD.shape[0]] ], axis=0)
|
||
|
||
dfRRCD.to_excel(strOutputPath + '市州直部门转发_' + city + '.xlsx')
|
||
|
||
|
||
#######
|
||
# 统计县区
|
||
# 发现目前版本pivot_table函数aggfunc用列表时,前几列计算值不准确
|
||
# 所以,暂时单列计算,再合并
|
||
dfCountyA = pd.pivot_table(dfRRCity, index=['区县'], values=['账号名称'],
|
||
aggfunc = ['count'], fill_value='', margins=True, margins_name='总计')
|
||
dfCountyC = pd.pivot_table(dfRRCity, index=['区县'], values=['转发数'],
|
||
aggfunc = ['sum'], fill_value='', margins=True, margins_name='总计')
|
||
dfCountyR = pd.pivot_table(dfRRCity, index=['区县'], values=['阅读数'],
|
||
aggfunc = ['sum'], fill_value='', margins=True, margins_name='总计')
|
||
dfCounty = pd.concat([dfCountyA, dfCountyC, dfCountyR], axis=1)
|
||
|
||
# 计算转发率
|
||
|
||
dfCounty.columns = ['_'.join(col) for col in dfCounty.columns.values]
|
||
#dfCounty.reset_index(inplace=True)
|
||
|
||
dfCounty['rate'] = dfCounty.apply(lambda x: int(x['sum_转发数'] / x['count_账号名称'] / dfTask['内容'].count() * 1000)/1000.0, axis=1)
|
||
|
||
# 排序
|
||
dfCC = dfCounty[0:dfCounty.shape[0]-1].sort_values(by='rate', ascending=False)
|
||
dfCC = pd.concat([dfCC, dfCounty[dfCounty.shape[0]-1:dfCounty.shape[0]] ], axis=0)
|
||
dfCC.to_excel(strOutputPath + '县区转发_' + city + '.xlsx')
|
||
|
||
# Department-level statistics for the current city: only the rows whose 区县
# value is '州直部门' or '市直部门'.
dfRRD = dfRRCity[(dfRRCity['区县'] == '州直部门') | (dfRRCity['区县'] == '市直部门')]

# Number of tasks (non-null 内容 entries). It is the same for every row, so it
# is computed once here rather than inside the per-row rate function.
nTask = dfTask['内容'].count()

# dfDD stays an empty frame when the city has no department rows. Without this
# guard the report code below would fail on dfDD.iloc[-1] for such a city.
dfDD = pd.DataFrame()
if dfRRD.shape[0] > 0:
    # One pivot per metric, joined column-wise afterwards.
    dfDA = pd.pivot_table(dfRRD, index=['单位名称'], values=['账号名称'],
                          aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
    dfDC = pd.pivot_table(dfRRD, index=['单位名称'], values=['转发数'],
                          aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
    dfDR = pd.pivot_table(dfRRD, index=['单位名称'], values=['阅读数'],
                          aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
    dfD = pd.concat([dfDA, dfDC, dfDR], axis=1)

    # Flatten the MultiIndex header into names like 'sum_转发数'.
    dfD.columns = ['_'.join(col) for col in dfD.columns.values]

    # Department repost rate = reposts / accounts / tasks, truncated to
    # three decimal places.
    dfD['rate'] = dfD.apply(
        lambda x: int(x['sum_转发数'] / x['count_账号名称'] / nTask * 1000) / 1000.0, axis=1)

    # Sort by rate (highest first), keeping the trailing '总计' row last.
    dfDD = pd.concat(
        [dfD.iloc[:-1].sort_values(by='rate', ascending=False), dfD.iloc[-1:]],
        axis=0)
    dfDD.to_excel(strOutputPath + '部门转发_' + city + '.xlsx')

#########################################################
#
# Build the report
tpl = DocxTemplate(fnTemplate)
info = {
    "taskCount": nTask,
    # City-wide totals come from the last ('总计') row of dfCC.
    "aNum": int(dfCC.iloc[-1]['count_账号名称']),
    "fNum": int(dfCC.iloc[-1]['sum_转发数']),
    "readNum": int(dfCC.iloc[-1]['sum_阅读数']),
    "r": '%.1f' % (dfCC.iloc[-1]['rate'] * 100.0),
}
if dfDD.empty:
    # No department rows for this city: report zeros.
    info.update({
        "dNum": 0,              # total number of department accounts
        "dFNum": 0,             # total department reposts
        "dReadNum": 0,          # total department reads
        "dr": '%.1f' % (0),     # average department repost rate
    })
else:
    info.update({
        "dNum": int(dfDD.iloc[-1]['count_账号名称']),      # total number of department accounts
        "dFNum": int(dfDD.iloc[-1]['sum_转发数']),         # total department reposts
        "dReadNum": int(dfDD.iloc[-1]['sum_阅读数']),      # total department reads
        "dr": '%.1f' % (dfDD.iloc[-1]['rate'] * 100.0),   # average department repost rate
    })
context.update(info)
|
||
|
||
# Rows of the county repost-rate table in the report: one dict per row of
# dfCC, skipping the '总计' summary row. Rates are shown as percentages with
# one decimal place.
t1_list = [
    {
        'county': str(countyName),
        'rate': '%.1f' % (countyRow['rate'] * 100.0),
        'account': int(countyRow['count_账号名称']),
        'fNum': int(countyRow['sum_转发数']),
        'readNum': int(countyRow['sum_阅读数']),
    }
    for countyName, countyRow in dfCC.iterrows()
    if countyName != "总计"
]
context['t1_contents'] = t1_list
|
||
|
||
# Rows of the department repost-rate table in the report: one dict per row of
# dfDD, skipping the '总计' summary row.
t2_list = [
    {
        'name': str(deptName),
        'rate': '%.1f' % (deptRow['rate'] * 100.0),
        'account': int(deptRow['count_账号名称']),
        'fNum': int(deptRow['sum_转发数']),
        'readNum': int(deptRow['sum_阅读数']),
    }
    for deptName, deptRow in dfDD.iterrows()
    if deptName != "总计"
]
context['t2_contents'] = t2_list
|
||
|
||
# List of repost tasks shown in the report: sequence number, content and the
# task date formatted as YYYY-MM-DD.
t3_list = [
    {
        'id': task['序号'],
        'title': task['内容'],
        'date': task['时间'].strftime('%Y-%m-%d'),
    }
    for _, task in dfTask.iterrows()
]
context['t3_contents'] = t3_list
|
||
|
||
# Bar chart of the county repost rates (the trailing total row is excluded),
# written to a PNG file that is then embedded in the report at 120 mm width.
graphCountyFile = os.path.join(strOutputPath, city + '_graphCounty.png')
drawBar(dfCC['rate'][:-1], dfCC.index[:-1], '区县转发率', graphCountyFile)
context.update(graphCounty=InlineImage(tpl, graphCountyFile, width=Mm(120)))

# Fill the template with the collected context and save this city's report.
tpl.render(context)
tpl.save(strOutputPath+city+'.docx')
|
||
|
||
|
||
######
|
||
####
|
||
#######
|
||
######
|
||
####
|
||
#######
|
||
######
|
||
####
|
||
#######
|
||
# Per-city (市州) statistics over dfRR.
# Passing a list of several aggregations to a single pivot_table call was
# observed to give inaccurate values in the leading columns with the pandas
# version in use, so for now each metric is pivoted on its own and the three
# results are joined column-wise.
cityPivotOpts = dict(index=['市州'], fill_value='', margins=True, margins_name='总计')
dfCountyA = pd.pivot_table(dfRR, values=['账号名称'], aggfunc=['count'], **cityPivotOpts)
dfCountyC = pd.pivot_table(dfRR, values=['转发数'], aggfunc=['sum'], **cityPivotOpts)
dfCountyR = pd.pivot_table(dfRR, values=['阅读数'], aggfunc=['sum'], **cityPivotOpts)
dfCounty = pd.concat([dfCountyA, dfCountyC, dfCountyR], axis=1)

# Flatten the (aggfunc, column) header pairs into names like 'sum_转发数'.
dfCounty.columns = list(map('_'.join, dfCounty.columns.values))

# Repost rate = reposts / accounts / number of tasks, truncated (not
# rounded) to three decimal places.
dfCounty['rate'] = dfCounty.apply(
    lambda rec: int(
        rec['sum_转发数'] / rec['count_账号名称'] / dfTask['内容'].count() * 1000
    ) / 1000.0,
    axis=1,
)

# Rank the cities by rate, highest first. The last row (the '总计' margin row
# added by pivot_table) is held out of the sort and re-attached at the end.
dfCC = pd.concat(
    [
        dfCounty.iloc[:-1].sort_values(by='rate', ascending=False),
        dfCounty.iloc[-1:],
    ],
    axis=0,
)
dfCC.to_excel(strOutputPath + '市州转发_ALL.xlsx')
|
||
|
||
|
||
# Department statistics for the rows whose 市州 value is '省直部门'.
dfRRD = dfRR[dfRR['市州'] == '省直部门']

# dfDD remains an empty frame when there are no such rows.
dfDD = pd.DataFrame()
if len(dfRRD) > 0:
    # One pivot per metric, joined column-wise afterwards.
    deptPivotOpts = dict(index=['单位名称'], fill_value='', margins=True, margins_name='总计')
    dfDA = pd.pivot_table(dfRRD, values=['账号名称'], aggfunc=['count'], **deptPivotOpts)
    dfDC = pd.pivot_table(dfRRD, values=['转发数'], aggfunc=['sum'], **deptPivotOpts)
    dfDR = pd.pivot_table(dfRRD, values=['阅读数'], aggfunc=['sum'], **deptPivotOpts)
    dfD = pd.concat([dfDA, dfDC, dfDR], axis=1)

    # Show the column header before and after the MultiIndex is flattened
    # into names like 'sum_转发数'.
    print('---', dfD.columns)
    dfD.columns = list(map('_'.join, dfD.columns.values))
    print('---', dfD.columns)

    # Department repost rate = reposts / accounts / tasks, truncated to
    # three decimal places.
    dfD['rate'] = dfD.apply(
        lambda rec: int(
            rec['sum_转发数'] / rec['count_账号名称'] / dfTask['内容'].count() * 1000
        ) / 1000.0,
        axis=1,
    )

    # Sort by rate (highest first), keeping the trailing '总计' row last.
    dfDD = pd.concat(
        [dfD.iloc[:-1].sort_values(by='rate', ascending=False), dfD.iloc[-1:]],
        axis=0,
    )
    # NOTE(review): the export is assumed to belong inside this guard --
    # confirm against the original indentation.
    dfDD.to_excel(strOutputPath + '部门转发_ALL.xlsx')
|
||
|
||
#########################################################
|
||
#
|
||
# Build the report: load the template and collect the headline figures.
tpl = DocxTemplate(fnTemplate)

# Overall totals come from the last ('总计') row of dfCC.
overallRow = dfCC.iloc[-1]
info = {
    "taskCount": dfTask['内容'].count(),
    "aNum": int(overallRow['count_账号名称']),
    "fNum": int(overallRow['sum_转发数']),
    "readNum": int(overallRow['sum_阅读数']),
    "r": '%.1f' % (overallRow['rate'] * 100.0),
}

if not dfDD.empty:
    deptTotalRow = dfDD.iloc[-1]
    info.update(
        dNum=int(deptTotalRow['count_账号名称']),        # total number of department accounts
        dFNum=int(deptTotalRow['sum_转发数']),           # total department reposts
        dReadNum=int(deptTotalRow['sum_阅读数']),        # total department reads
        dr='%.1f' % (deptTotalRow['rate'] * 100.0),     # average department repost rate
    )
else:
    # No department statistics were produced: report zeros.
    info.update(
        dNum=0,             # total number of department accounts
        dFNum=0,            # total department reposts
        dReadNum=0,         # total department reads
        dr='%.1f' % (0),    # average department repost rate
    )

context.update(info)
|
||
|
||
# Rows of the first repost-rate table in the report: one dict per row of dfCC
# (indexed by 市州 here), skipping the '总计' summary row. Rates are shown as
# percentages with one decimal place.
t1_list = [
    {
        'county': str(regionName),
        'rate': '%.1f' % (regionRow['rate'] * 100.0),
        'account': int(regionRow['count_账号名称']),
        'fNum': int(regionRow['sum_转发数']),
        'readNum': int(regionRow['sum_阅读数']),
    }
    for regionName, regionRow in dfCC.iterrows()
    if regionName != "总计"
]
context['t1_contents'] = t1_list
|
||
|
||
# Rows of the department repost-rate table in the report. The list stays empty
# when dfDD is empty; otherwise there is one dict per row of dfDD, skipping
# the '总计' summary row.
if dfDD.empty:
    t2_list = []
else:
    t2_list = [
        {
            'name': str(deptName),
            'rate': '%.1f' % (deptRow['rate'] * 100.0),
            'account': int(deptRow['count_账号名称']),
            'fNum': int(deptRow['sum_转发数']),
            'readNum': int(deptRow['sum_阅读数']),
        }
        for deptName, deptRow in dfDD.iterrows()
        if deptName != "总计"
    ]
context['t2_contents'] = t2_list
|
||
|
||
# List of repost tasks shown in the report: sequence number, content and the
# task date formatted as YYYY-MM-DD.
t3_list = [
    {
        'id': task['序号'],
        'title': task['内容'],
        'date': task['时间'].strftime('%Y-%m-%d'),
    }
    for _, task in dfTask.iterrows()
]
context['t3_contents'] = t3_list
|
||
|
||
# Bar chart of the per-city repost rates (the trailing total row is excluded),
# written to a PNG file that is then embedded in the report at 120 mm width.
graphAllFile = os.path.join(strOutputPath, 'ALL_graphCounty.png')
drawBar(dfCC['rate'][:-1], dfCC.index[:-1], '市州转发率', graphAllFile)
context.update(graphCounty=InlineImage(tpl, graphAllFile, width=Mm(120)))

# Fill the template with the collected context and save the overall report.
tpl.render(context)
tpl.save(strOutputPath+'ALL.docx')
|
||
|
||
|
||
# Print the wall-clock time elapsed since starttime.
usedtime = datetime.datetime.now() - starttime
print("time: ", usedtime)