1072 lines
51 KiB
Python
1072 lines
51 KiB
Python
|
import datetime
|
|||
|
import csv
|
|||
|
import pandas as pd
|
|||
|
import numpy as np
|
|||
|
import glob, os, re, time
|
|||
|
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
from matplotlib.ticker import FuncFormatter
|
|||
|
from difflib import SequenceMatcher
|
|||
|
from collections import Counter
|
|||
|
import difflib
|
|||
|
|
|||
|
|
|||
|
from docxtpl import DocxTemplate
|
|||
|
from docxtpl import InlineImage
|
|||
|
from docx.shared import Mm
|
|||
|
|
|||
|
import jieba
|
|||
|
import jieba.posseg as pseg
|
|||
|
|
|||
|
# 画柱状图
|
|||
|
def drawBar(data, recipe, title='', fn=''):
    """Render a percentage bar chart and save it to the file *fn*.

    Args:
        data: Bar heights, one per entry of *recipe*. The y-axis is fixed to
            the range 0..1 and labelled as percentages.
        recipe: Category labels; used both as bar positions and tick labels.
        title: Chart title.
        fn: Output path handed to ``plt.savefig``.
    """
    plt.figure(figsize=(6, 4))
    # SimHei is selected so that CJK labels can be rendered; with a CJK font
    # active the unicode minus glyph is disabled.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    labels = recipe
    heights = data
    plt.bar(labels, heights, width=0.5)
    plt.xticks(labels, labels, rotation=35)
    plt.ylim((0, 1))

    # Show a fraction such as 0.25 as "25%" on the y-axis.
    percent_formatter = FuncFormatter(
        lambda value, _position: '%2.0f' % (100 * value) + '%')
    plt.gca().yaxis.set_major_formatter(percent_formatter)

    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)

    # Clear the axes and figure, then close it, once the image is written.
    plt.cla()
    plt.clf()
    plt.close()
|
|||
|
|
|||
|
def getWBData(path, cities, hasBody=False):
    """Collect the per-account Weibo CSV exports found under *path*.

    The directory layout walked here is
    ``path/<city dir>/<period dir>/<account name>/<account id>.csv``.
    City and period directories whose names contain ``weixin`` or ``tt``
    are ignored, as are hidden entries (names starting with ``.``).

    Args:
        path: Root data directory.
        cities: Full city names to include; a city directory is read only
            when its mapped name is in this collection.
        hasBody: Unused; kept for call compatibility.

    Returns:
        A DataFrame holding the 13 export columns plus ``weiboID`` (CSV file
        name without extension), ``weiboName`` (account directory name) and
        ``市州`` (full city name). Empty, with those 16 columns, when nothing
        was read.
    """
    # Directory name (upper/lower-case abbreviation or short Chinese name)
    # -> full city name.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
        'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
        'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区',
        'LZXQ': '兰州新区',

        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市',
        'ln': '陇南市', 'jyg': '嘉峪关市', 'ts': '天水市', 'gn': '甘南藏族自治州',
        'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区',
        'lzxq': '兰州新区',

        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
        '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
        '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
        '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # Column layout of the CSV files written by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url',
            '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityDir = os.path.join(path, dirC)
        if dirC[:1] == '.' or not os.path.isdir(cityDir):
            continue
        if 'weixin' in dirC.lower() or 'tt' in dirC.lower():
            continue
        cityName = cityShorten.get(dirC)
        if cityName is None:
            # An unmapped directory used to abort the whole run with KeyError.
            print(' skip unknown dir: ', dirC)
            continue
        if cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityFrames = []
        for dirCT in os.listdir(cityDir):
            # Period directories, e.g. "weibo", "weibo_1".
            periodDir = os.path.join(cityDir, dirCT)
            if dirCT[:1] == '.' or not os.path.isdir(periodDir):
                continue
            if 'weixin' in dirCT.lower() or 'tt' in dirCT.lower():
                continue
            print(' read WB... dir:', dirCT)
            for dirA in os.listdir(periodDir):
                # Each account has a directory named after the account that
                # holds "<account id>.csv".
                accountDir = os.path.join(periodDir, dirA)
                if dirA[:1] == '.' or not os.path.isdir(accountDir):
                    continue
                wbName = dirA
                fileAs = os.listdir(accountDir)
                # os.listdir order is arbitrary, so pick the CSV explicitly
                # instead of trusting fileAs[0]; hidden files are never data.
                csvFiles = sorted(
                    f for f in fileAs
                    if not f.startswith('.')
                    and os.path.splitext(f)[-1] == '.csv')
                if csvFiles:
                    wbId = csvFiles[0][:-4]
                    filename = os.path.join(accountDir, csvFiles[0])
                    dfdfwb = pd.read_csv(filename, sep=',', header=None,
                                         names=cols, index_col=None)
                    # The first row is the file's own header line; copy so
                    # the column assignments below act on an independent frame.
                    dfdfwb = dfdfwb.iloc[1:].copy()
                    dfdfwb["weiboID"] = wbId
                    dfdfwb["weiboName"] = wbName
                    cityFrames.append(dfdfwb)
                if len(fileAs) > 1:
                    # Diagnostic: the account directory holds extra entries.
                    print(" +=+= ", fileAs)

        dfWBC = pd.concat(cityFrames) if cityFrames else pd.DataFrame(columns=cols)
        print(' ', dfWBC.shape)
        dfWBC['市州'] = cityName
        if not dfWBC.empty:
            frames.append(dfWBC)

    if frames:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once
        # and pin the column order to the declared layout.
        dfWB = pd.concat(frames).reindex(columns=cs)
    else:
        dfWB = pd.DataFrame(columns=cs)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
|
|||
|
|
|||
|
# 从数据目录中读取xlsx文件,拼接到一起
|
|||
|
def getWXData(path, cities, hasBody=False):
    """Read the WeChat Excel exports under *path* and stack them together.

    Every sub-directory of *path* is treated as one batch whose name maps to
    a city. Inside a batch, Excel workbooks (``.xlsx``/``.xls``) are read
    both directly and from any sub-directory whose name contains ``weixin``.

    Args:
        path: Root data directory.
        cities: Full city names to include; workbooks of a batch are read
            only when the batch's mapped city name is in this collection.
        hasBody: Unused; kept for call compatibility.

    Returns:
        A DataFrame with the rows of all workbooks read plus a ``市州``
        column holding the full city name. The eight expected columns come
        first, followed by any additional columns found in the workbooks.
    """
    # Directory name (upper/lower-case abbreviation or short Chinese name)
    # -> full city name.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
        'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
        'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区',
        'LZXQ': '兰州新区',
        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市',
        'ln': '陇南市', 'jyg': '嘉峪关市', 'ts': '天水市', 'gn': '甘南藏族自治州',
        'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区',
        'lzxq': '兰州新区',
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
        '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
        '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
        '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']

    def isExcel(name):
        # Hidden files and "~$" lock files left by an open workbook are not
        # data and would make read_excel fail.
        if name.startswith(('.', '~$')):
            return False
        return os.path.splitext(name)[-1] in ('.xlsx', '.xls')

    frames = []
    countC = 0      # batches (cities) that contributed at least one workbook
    countFnC = 0    # workbooks read in total
    for dirBatch in os.listdir(path):
        batchDir = os.path.join(path, dirBatch)
        if not os.path.isdir(batchDir):
            continue  # directories only
        # .get(): an unmapped batch name yields None, which is never in
        # *cities*, so its workbooks are skipped instead of raising KeyError.
        cityname = cityShorten.get(dirBatch)
        count = 0
        for fileC in os.listdir(batchDir):
            if fileC[:1] == '.':
                continue
            entry = os.path.join(batchDir, fileC)
            if os.path.isdir(entry):
                if 'weixin' not in fileC:
                    continue
                print(' ', entry)
                excelPaths = [os.path.join(entry, f)
                              for f in os.listdir(entry) if isExcel(f)]
            elif isExcel(fileC):
                excelPaths = [entry]
            else:
                continue  # only Excel workbooks are read

            if cityname not in cities:
                continue
            fName = os.path.splitext(fileC)[0]
            for excelPath in excelPaths:
                dfdfwxc = pd.read_excel(excelPath)
                dfdfwxc['市州'] = cityname
                print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
                if not dfdfwxc.empty:
                    frames.append(dfdfwxc)
                count = count + 1
        countFnC += count
        if count > 0:
            countC += 1

    if frames:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once
        # and keep the expected columns in front of any extra ones.
        dfWX = pd.concat(frames)
        extras = [c for c in dfWX.columns if c not in cols]
        dfWX = dfWX.reindex(columns=cols + extras)
    else:
        dfWX = pd.DataFrame(columns=cols)
    print("Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
|
|||
|
|
|||
|
# 从数据目录中读取xlsx文件,拼接到一起
|
|||
|
def getTTData(path, cities, hasBody=False):
    """Read the Toutiao ``.xlsx`` exports under *path* and stack them together.

    The directory layout walked here is
    ``path/<city dir>/<dir whose name contains "tt">/<file>.xlsx``.
    City and platform directories whose names contain ``weixin`` or
    ``weibo`` are ignored, as are hidden entries (names starting with ``.``).

    Args:
        path: Root data directory.
        cities: Full city names to include; a city directory is read only
            when its mapped name is in this collection.
        hasBody: Unused; kept for call compatibility.

    Returns:
        A DataFrame with the rows of all workbooks read plus a ``city``
        column holding the full city name. The nine expected columns come
        first, followed by any additional columns found in the workbooks.
    """
    # Directory name (upper/lower-case abbreviation or short Chinese name)
    # -> full city name.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
        'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
        'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区',
        'LZXQ': '兰州新区',

        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市',
        'ln': '陇南市', 'jyg': '嘉峪关市', 'ts': '天水市', 'gn': '甘南藏族自治州',
        'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区',
        'lzxq': '兰州新区',

        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
        '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
        '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
        '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityDir = os.path.join(path, dirC)
        if dirC[:1] == '.' or not os.path.isdir(cityDir):
            continue
        if 'weixin' in dirC.lower() or 'weibo' in dirC.lower():
            continue
        cityName = cityShorten.get(dirC)
        if cityName is None:
            # An unmapped directory used to abort the whole run with KeyError.
            print(' skip unknown dir: ', dirC)
            continue
        if cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityFrames = []
        for dirCT in os.listdir(cityDir):
            platformDir = os.path.join(cityDir, dirCT)
            if dirCT[:1] == '.' or not os.path.isdir(platformDir):
                continue
            lowered = dirCT.lower()
            if 'weixin' in lowered or 'weibo' in lowered:
                continue
            if 'tt' not in lowered:
                continue
            print(' read TT... dir:', dirCT)
            for fn in os.listdir(platformDir):
                # Skip hidden files and "~$" lock files left by an open
                # workbook; neither is readable data.
                if fn[:1] == '.' or fn.startswith('~$'):
                    continue
                if not fn.endswith('.xlsx'):
                    continue
                # The file name is not parsed: the account name it encodes
                # was never used, and parsing it raised ValueError for names
                # without two underscores.
                dfdftt = pd.read_excel(os.path.join(platformDir, fn))
                if not dfdftt.empty:
                    cityFrames.append(dfdftt)

        dfTTC = pd.concat(cityFrames) if cityFrames else pd.DataFrame(columns=cs)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = cityName
        if not dfTTC.empty:
            frames.append(dfTTC)

    if frames:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once
        # and keep the expected columns in front of any extra ones.
        dfTT = pd.concat(frames)
        extras = [c for c in dfTT.columns if c not in cs]
        dfTT = dfTT.reindex(columns=cs + extras)
    else:
        dfTT = pd.DataFrame(columns=cs)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
|
|||
|
|
|||
|
|
|||
|
def fetch_chinese(s):
    """Return *s* reduced to its characters in the range U+4E00..U+9FA5.

    Every other character (Latin letters, digits, punctuation, whitespace)
    is dropped; the remaining characters keep their original order.
    """
    kept = re.findall(r'[\u4e00-\u9fa5]', s)
    return ''.join(kept)
|
|||
|
|
|||
|
if __name__ == "__main__":
|
|||
|
starttime = datetime.datetime.now()
|
|||
|
_RATIO = 0.7
|
|||
|
isDoWX = True
|
|||
|
isDoWB = True
|
|||
|
isDoTT = True
|
|||
|
cities = [
|
|||
|
'临夏回族自治州',
|
|||
|
'白银市',
|
|||
|
'定西市',
|
|||
|
'酒泉市',
|
|||
|
'嘉峪关市',
|
|||
|
'平凉市',
|
|||
|
'庆阳市',
|
|||
|
'天水市',
|
|||
|
'武威市',
|
|||
|
'兰州新区',
|
|||
|
'陇南市',
|
|||
|
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
|
|||
|
'省直部门', # 共12市2州1新区
|
|||
|
]
|
|||
|
|
|||
|
cities = [
|
|||
|
'临夏回族自治州',
|
|||
|
'白银市',
|
|||
|
'定西市',
|
|||
|
'酒泉市',
|
|||
|
'天水市',
|
|||
|
'陇南市',
|
|||
|
|
|||
|
#'省直部门', # 共12市2州1新区
|
|||
|
]
|
|||
|
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
|
|||
|
#cities = ['白银市']
|
|||
|
# 转发任务
|
|||
|
#dfTask = pd.read_excel('D:/Projects/POM/DATA/2022年S2/S2/全省政务新媒体二季度转发信息条目.xls')
|
|||
|
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/2023年2月份全省政务新媒体转发内容条目.xlsx')
|
|||
|
# 账号信息
|
|||
|
strFnAccount = 'D:/Projects/POM/DATA/2023年3月/2月报告/全国报送系统表单_2023.2.28.xlsx'
|
|||
|
dfAllAccount = pd.read_excel(strFnAccount)
|
|||
|
# 省直部门账号部门简称
|
|||
|
dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/省直部门账号名称简称.xlsx')
|
|||
|
fnTemplate = 'D:/Projects/POM/DATA/2023年3月/2月报告/POM_ForewardTemplate.docx'
|
|||
|
|
|||
|
# 数据根目录,
|
|||
|
strPath = ['D:/Projects/POM/DATA/2023年3月/2月报告/']
|
|||
|
strOutputPath = 'D:/Projects/POM/DATA/2023年3月/2月报告/转发/'
|
|||
|
|
|||
|
context = {
|
|||
|
"year": "2023",
|
|||
|
"month": "2",
|
|||
|
"pubMonth": "3",
|
|||
|
"dateStart": "2023年2月1日",
|
|||
|
"dateEnd": "2023年2月28日"
|
|||
|
}
|
|||
|
|
|||
|
dfAllAccount.loc[:, '转发数'] = 0
|
|||
|
dfAllAccount.loc[:, '阅读数'] = 0
|
|||
|
|
|||
|
|
|||
|
################################################
|
|||
|
# 创建存储矩阵
|
|||
|
# 按照转发任务创建统计矩阵
|
|||
|
colRR = ['市州', '类型', '账号名称', '单位名称', '省直部门', '区县', '转发数', '阅读数']
|
|||
|
for ididid in dfTask['序号'][0:dfTask['内容'].count()].tolist():
|
|||
|
#for ididid in range(1, dfTask['内容'].count()):
|
|||
|
colRR.append(str(ididid))
|
|||
|
dfRR = pd.DataFrame(columns=colRR) # 每列一个转发任务,每行一个账号
|
|||
|
# 用于保存每一条转发任务的账号和文章
|
|||
|
dfO = pd.DataFrame(columns=['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州'] )
|
|||
|
|
|||
|
|
|||
|
################################################
|
|||
|
#
|
|||
|
countWxForewards = 0
|
|||
|
countWbForewards = 0
|
|||
|
countTtForewards = 0
|
|||
|
|
|||
|
|
|||
|
# TT
|
|||
|
if isDoTT:
|
|||
|
print('=============================================================')
|
|||
|
print('---- TT ----')
|
|||
|
# id userId source city tid cellType title
|
|||
|
# time-stamp date url commentCount readNum likeNum showNum
|
|||
|
#dfTT = getTTData(strFnTT, strFnAccount, cities) #附加市州信息, cities未使用
|
|||
|
dfTT = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getTTData(strP + '全文/', cities)
|
|||
|
print(" read TT data", ddff.shape)
|
|||
|
dfTT = dfTT.append(ddff)
|
|||
|
|
|||
|
print("tt data ready", dfTT.shape)
|
|||
|
|
|||
|
# cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']
|
|||
|
#cities = dfTT['city'].unique()
|
|||
|
for city in cities:
|
|||
|
dataC = dfTT.loc[dfTT['city'] == city].copy()
|
|||
|
accounts = dataC['account'].unique()
|
|||
|
|
|||
|
dfdfCityTT = dfAllAccount.loc[(dfAllAccount['账号类型'] == '今日头条')
|
|||
|
& (dfAllAccount['市/省局'] == city)].copy()
|
|||
|
|
|||
|
print(' count TT, city:', city, '读入账号数:', len(accounts), '任务账号数:', dfdfCityTT.shape[0])
|
|||
|
print(' ', dataC.shape)
|
|||
|
for account in accounts:
|
|||
|
#一个公众号的所有文章
|
|||
|
#print(account)
|
|||
|
dataA = dataC[dataC['account']==account]
|
|||
|
sR = pd.Series([], dtype=pd.StringDtype())
|
|||
|
sR['类型'] = '今日头条'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
for i in range(dfTask['内容'].count()):
|
|||
|
rn = dfTask.iloc[i, dfTask.columns.get_loc('序号')]
|
|||
|
rt = str(dfTask.iloc[i, dfTask.columns.get_loc('内容')]) #任务标题
|
|||
|
forwarded = 0
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str1 = str(dataA.iloc[j, dataA.columns.get_loc('title')]) # 文章标题
|
|||
|
#
|
|||
|
if len(rt) > len(str1):
|
|||
|
strRT = rt[:len(str1)]
|
|||
|
else:
|
|||
|
strRT = rt
|
|||
|
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
count += forwarded
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '今日头条',
|
|||
|
'公众号': account, #dfTT.iloc[j, dfTT.columns.get_loc('account')],
|
|||
|
'日期': dataA.iloc[j, dataA.columns.get_loc('date')],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, dataA.columns.get_loc('url')],
|
|||
|
'市州': city,
|
|||
|
}], ignore_index=True)
|
|||
|
|
|||
|
#checknames = ['白银高新区管委会', '健康白银', '白银市卫生健康委', '白银市卫生健康委员会', '白银发改委', '白银市场监管', '白银农业农村', '白银应急', '白银退役军人', '白银政务服务', '白银政务服务', '白银文旅', '白银科技', '白银林草', '白银市平川区商务局', '平川区长征街道', '平川区红会路街道', '和谐复兴', '兴平路街道办', '平川金融办']
|
|||
|
#if account in checknames:
|
|||
|
# print(' -- ', account, count)
|
|||
|
sR['转发数'] = count
|
|||
|
dfRR = dfRR.append(sR, ignore_index=True, sort=False)
|
|||
|
#dfRR.to_excel('D:/Projects/POM/2021年6月/二季度/转发统计__'+city+'.xlsx')
|
|||
|
countTtForewards = dfRR.shape[0] - countWxForewards - countWbForewards
|
|||
|
print('TT forewards', countTtForewards)
|
|||
|
|
|||
|
# WX
|
|||
|
if isDoWX:
|
|||
|
print('=============================================================')
|
|||
|
print('---- WX ----')
|
|||
|
dfWX = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getWXData(strP + '全文/', cities)
|
|||
|
print(' read WX data', ddff.shape)
|
|||
|
dfWX = dfWX.append(ddff)
|
|||
|
print('WX data ready', dfWX.shape)
|
|||
|
|
|||
|
dfWX = dfWX.fillna(value=0)
|
|||
|
|
|||
|
# 公众号 链接 日期 标题 内容 头条 city
|
|||
|
## 逐个市州统计每个账号的转发情况
|
|||
|
#cities = dfWX['市州'].unique()
|
|||
|
for city in cities:
|
|||
|
print(" count WX, city:", city)
|
|||
|
dataC = dfWX.loc[dfWX['市州'] == city].copy()
|
|||
|
accounts = dataC['公众号'].unique()
|
|||
|
|
|||
|
dfdfCityWX = dfAllAccount.loc[((dfAllAccount['账号类型'] == '小程序+微信')
|
|||
|
| (dfAllAccount['账号类型'] == '微信服务号')
|
|||
|
| (dfAllAccount['账号类型'] == '微信订阅号'))
|
|||
|
& (dfAllAccount['市/省局'] == city)].copy()
|
|||
|
|
|||
|
print(" count WX, city:", city, '账号数:', len(accounts), '任务账号数:', dfdfCityWX.shape[0])
|
|||
|
for account in accounts:
|
|||
|
# print(account)
|
|||
|
dataA = dataC.loc[dataC['公众号'] == account].copy() # 一个公众号的所有文章
|
|||
|
sR = pd.Series(dtype='object')
|
|||
|
sR['类型'] = '微信'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
arn = 0
|
|||
|
for i in range(dfTask['内容'].count()):
|
|||
|
# 对于每一篇任务文章
|
|||
|
rn = dfTask.iloc[i, dfTask.columns.get_loc('序号')]
|
|||
|
rt = str(dfTask.iloc[i, dfTask.columns.get_loc('内容')])
|
|||
|
forwarded = 0
|
|||
|
readNum = 0
|
|||
|
# 查看该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str1 = str(dataA.iloc[j, dataA.columns.get_loc('标题')])
|
|||
|
|
|||
|
#
|
|||
|
if len(rt) > len(str1):
|
|||
|
strRT = rt[:len(str1)]
|
|||
|
else:
|
|||
|
strRT = rt
|
|||
|
|
|||
|
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
|||
|
|
|||
|
tRN = int(dataA.iloc[j, dataA.columns.get_loc('阅读数')])
|
|||
|
# 遇到相似的即跳出
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
readNum += tRN
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
count += forwarded
|
|||
|
arn += readNum
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '微信',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, dataA.columns.get_loc('日期')],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, dataA.columns.get_loc('链接')],
|
|||
|
'市州': city,
|
|||
|
'阅读数': readNum,
|
|||
|
}], ignore_index=True)
|
|||
|
sR['转发数'] = count
|
|||
|
sR['阅读数'] = arn
|
|||
|
dfRR = dfRR.append(sR, ignore_index=True, sort=False)
|
|||
|
countWxForewards = dfRR.shape[0]
|
|||
|
print('WX forwards', countWxForewards)
|
|||
|
|
|||
|
# WB
|
|||
|
if isDoWB:
|
|||
|
print('=============================================================')
|
|||
|
print('---- WB ----')
|
|||
|
|
|||
|
dfWB = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getWBData(strP + '全文/', cities)
|
|||
|
print(' read WB data', ddff.shape)
|
|||
|
dfWB = dfWB.append(ddff)
|
|||
|
print('WB data ready', dfWB.shape)
|
|||
|
|
|||
|
################################################
|
|||
|
# WB
|
|||
|
# 微博id 微博正文 头条文章url 原始图片url 被转发微博原始图片url 是否为原创微博 微博视频url 发布位置 date
|
|||
|
# 发布工具 点赞数 转发数 评论数 weiboID weiboName city
|
|||
|
#cities = dfWB['市州'].unique()
|
|||
|
for city in cities:
|
|||
|
print(' count WB, city:', city)
|
|||
|
dataC = dfWB.loc[dfWB['市州'] == city].copy()
|
|||
|
accounts = dataC['weiboName'].unique()
|
|||
|
|
|||
|
|
|||
|
dfdfCityWB = dfAllAccount.loc[(dfAllAccount['账号类型'] == '新浪微博')
|
|||
|
& (dfAllAccount['市/省局'] == city)].copy()
|
|||
|
|
|||
|
print(' count WB, city:', city, "读入账号数:", len(accounts), '任务账号数:', dfdfCityWB.shape[0])
|
|||
|
|
|||
|
for account in accounts:
|
|||
|
# 一个公众号的所有文章
|
|||
|
# print(account)
|
|||
|
dataA = dataC.loc[dataC['weiboName'] == account].copy()
|
|||
|
sR = pd.Series(dtype='object')
|
|||
|
sR['类型'] = '新浪微博'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
# 对一个账号,用任务标题从它全部发文里比对
|
|||
|
# 若找到, 该任务标记为已转发
|
|||
|
for i in range(dfTask['内容'].count()):
|
|||
|
rn = dfTask.iloc[i, dfTask.columns.get_loc('序号')]
|
|||
|
rt = str(dfTask.iloc[i, dfTask.columns.get_loc('内容')])
|
|||
|
forwarded = 0
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str1 = str(dataA.iloc[j, dataA.columns.get_loc('微博正文')])
|
|||
|
str2 = str1[:len(rt)]
|
|||
|
#if rt in str1:
|
|||
|
ratio = difflib.SequenceMatcher(None, fetch_chinese(rt), fetch_chinese(str2)).quick_ratio()
|
|||
|
# if account=='陇南公积金' and ratio > 0.5:
|
|||
|
# print('----', ratio)
|
|||
|
# print(rt)
|
|||
|
# print(fetch_chinese(rt))
|
|||
|
# print(str2)
|
|||
|
# print(fetch_chinese(str2))
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
count += forwarded
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '新浪微博',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, dataA.columns.get_loc('date')],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, dataA.columns.get_loc('头条文章url')],
|
|||
|
'市州': city,
|
|||
|
}], ignore_index=True)
|
|||
|
sR['转发数'] = count
|
|||
|
##print(account, count)
|
|||
|
dfRR = dfRR.append(sR, ignore_index=True, sort=False)
|
|||
|
####if account=='陇南公积金' :
|
|||
|
####print(account, dataA.shape[0], count)
|
|||
|
# dfRR.to_excel('D:/Projects/POM/2021年6月/二季度/转发统计__'+city+'.xlsx')
|
|||
|
|
|||
|
countWbForewards = dfRR.shape[0] - countWxForewards
|
|||
|
print('WB forwards', countWbForewards)
|
|||
|
|
|||
|
|
|||
|
#dfRR.to_excel(strOutputPath + '转发统计_GS_ALL.xlsx')
|
|||
|
#dfO.to_excel(strOutputPath + '转发统计_GS_ALLDATA.xlsx')
|
|||
|
|
|||
|
if isDoWX or isDoWB or isDoTT:
|
|||
|
print('=============================================================')
|
|||
|
print('---- STATISTICS ----')
|
|||
|
print('=============================================================')
|
|||
|
print('ALL forewards account num:', dfRR.shape[0], 'task num:', dfRR.shape[1])
|
|||
|
|
|||
|
################################################
|
|||
|
# 统计每一个账号的累计转发数和阅读数
|
|||
|
# ~~ 匹配到账号所有信息的表格中
|
|||
|
#
|
|||
|
dfAAWX = dfAllAccount.loc[(dfAllAccount['账号类型'] == '小程序+微信')
|
|||
|
| (dfAllAccount['账号类型'] == '微信服务号')
|
|||
|
| (dfAllAccount['账号类型'] == '微信订阅号') ].copy()
|
|||
|
dfRRWX = dfRR.loc[dfRR['类型'] == '微信'].copy()
|
|||
|
dfRRWX.rename(columns={'账号名称':'account'}, inplace=True)
|
|||
|
## 遍历账号详情表
|
|||
|
for i in range(dfAAWX.shape[0]):
|
|||
|
strName = str(dfAAWX.iloc[i, dfAAWX.columns.get_loc('账号名称')])
|
|||
|
dfRRRRWX = dfRRWX[dfRRWX.account == strName]
|
|||
|
|
|||
|
aName = re.sub('\s+','', str(dfAAWX.iloc[i, dfAAWX.columns.get_loc('账号名称')]))
|
|||
|
matchedRow = -1 # 转发统计表中的行数
|
|||
|
|
|||
|
# 对每一个微信账号,从转发统计表中匹配账号名称
|
|||
|
for j in range(dfRRWX.shape[0]):
|
|||
|
name = re.sub('\s+', '', str(dfRRWX.iloc[j, dfRRWX.columns.get_loc('account')]))
|
|||
|
if name == aName:
|
|||
|
matchedRow = j
|
|||
|
break
|
|||
|
if matchedRow > -1 :
|
|||
|
r = int(dfRRWX.iloc[matchedRow, dfRRWX.columns.get_loc('转发数')])
|
|||
|
readNum = int(dfRRWX.iloc[matchedRow, dfRRWX.columns.get_loc('阅读数')])
|
|||
|
dfAAWX.iloc[i, dfAAWX.columns.get_loc('转发数')] = r
|
|||
|
dfAAWX.iloc[i, dfAAWX.columns.get_loc('阅读数')] = readNum
|
|||
|
|
|||
|
if dfRRRRWX.shape[0]>0 and matchedRow<0:
|
|||
|
print(' -', aName, strName, dfRRRRWX.shape[0], matchedRow)
|
|||
|
######
|
|||
|
|
|||
|
if dfRRRRWX.shape[0]<1 and matchedRow>-1:
|
|||
|
print(' =', aName, strName, dfRRRRWX.shape[0], matchedRow)
|
|||
|
######
|
|||
|
#dfAAWX.to_excel('D:/Projects/POM/2021年7月/月报告/6月份全文数据/转发统计___List.xlsx')
|
|||
|
print('总微信账号数:', dfAAWX.shape)
|
|||
|
|
|||
|
dfAAWB = dfAllAccount.loc[dfAllAccount['账号类型'] == '新浪微博'].copy()
|
|||
|
dfRRWB = dfRR.loc[dfRR['类型'] == '新浪微博'].copy()
|
|||
|
for i in range(dfAAWB.shape[0]):
|
|||
|
aName = re.sub('\s+','', str(dfAAWB.iloc[i, dfAAWB.columns.get_loc('账号名称')]))
|
|||
|
matchedRow = -1
|
|||
|
# 对每一个账号,从转发统计表中匹配账号名称
|
|||
|
for j in range(dfRRWB.shape[0]):
|
|||
|
name = re.sub('\s+', '', str(dfRRWB.iloc[j, dfRRWB.columns.get_loc('账号名称')]))
|
|||
|
if name == aName:
|
|||
|
matchedRow = j
|
|||
|
break
|
|||
|
if matchedRow > -1 :
|
|||
|
r = int(dfRRWB.iloc[matchedRow, dfRRWB.columns.get_loc('转发数')])
|
|||
|
dfAAWB.iloc[i, dfAAWB.columns.get_loc('转发数')] = r
|
|||
|
print('总微博账号数:', dfAAWB.shape)
|
|||
|
|
|||
|
|
|||
|
dfAATT = dfAllAccount.loc[dfAllAccount['账号类型'] == '今日头条'].copy()
|
|||
|
dfRRTT = dfRR.loc[dfRR['类型'] == '今日头条'].copy() #'市州', '类型', '账号名称', '转发数'
|
|||
|
|
|||
|
|
|||
|
dfRRTT.rename(columns={'账号名称':'account'}, inplace=True)
|
|||
|
for i in range(dfAATT.shape[0]):
|
|||
|
aName = re.sub('\s+','', str(dfAATT.iloc[i, dfAATT.columns.get_loc('账号名称')]))
|
|||
|
|
|||
|
strName = str(dfAATT.iloc[i, dfAATT.columns.get_loc('账号名称')])
|
|||
|
dfRRRRTT = dfRRTT[dfRRTT.account == strName]
|
|||
|
|
|||
|
matchedRow = -1
|
|||
|
#dftmp = dfRRTT[dfRRTT['账号名称']==aName]
|
|||
|
for j in range(dfRRTT.shape[0]):
|
|||
|
name = re.sub('\s+', '', str(dfRRTT.iloc[j, dfRRTT.columns.get_loc('account')]))
|
|||
|
if name == aName:
|
|||
|
matchedRow = j
|
|||
|
break
|
|||
|
if matchedRow > -1 :
|
|||
|
r = int(dfRRTT.iloc[matchedRow, dfRRTT.columns.get_loc('转发数')])
|
|||
|
######
|
|||
|
######
|
|||
|
#dfAATT.iloc[i, dfAATT.columns.get_loc('转发数')] = r
|
|||
|
dfAATT.loc[i, dfAATT.columns.get_loc('转发数')] = r
|
|||
|
|
|||
|
|
|||
|
if dfRRRRTT.shape[0]>0 and matchedRow<0:
|
|||
|
print(' -', aName, strName, dfRRRRTT.shape[0], matchedRow)
|
|||
|
######
|
|||
|
|
|||
|
if dfRRRRTT.shape[0]<1 and matchedRow>-1:
|
|||
|
print(' =', aName, strName, dfRRRRTT.shape[0], matchedRow)
|
|||
|
######
|
|||
|
print('总头条账号数:', dfAATT.shape)
|
|||
|
|
|||
|
dfAAA = dfAAWX.append(dfAAWB, ignore_index=True, sort=False)
|
|||
|
dfAAA = dfAAA.append(dfAATT, ignore_index=True, sort=False)
|
|||
|
|
|||
|
|
|||
|
|
|||
|
print('所有统计账号数:', dfAAA.shape)
|
|||
|
|
|||
|
################################################
|
|||
|
# 为转发账号匹配单位全称和所属县区
|
|||
|
#
|
|||
|
for i in range(dfRR.shape[0]):
|
|||
|
aName = re.sub('\s+','', str(dfRR.iloc[i, dfRR.columns.get_loc('账号名称')]))
|
|||
|
matchedRow = -1
|
|||
|
# 区县/地方部门
|
|||
|
# 对每一个账号,从表中匹配账号名称
|
|||
|
for j in range(dfAllAccount.shape[0]):
|
|||
|
name = re.sub('\s+', '', str(dfAllAccount.iloc[j, dfAllAccount.columns.get_loc('账号名称')]))
|
|||
|
if name == aName:
|
|||
|
matchedRow = j
|
|||
|
break
|
|||
|
if matchedRow > -1 :
|
|||
|
cc = str(dfAllAccount.iloc[matchedRow, dfAllAccount.columns.get_loc('区县/地方部门')])
|
|||
|
if cc != 'nan':
|
|||
|
dfRR.iloc[i, dfRR.columns.get_loc('区县')] = cc
|
|||
|
bn = str(dfAllAccount.iloc[matchedRow, dfAllAccount.columns.get_loc('单位全称')])
|
|||
|
if bn != 'nan':
|
|||
|
dfRR.iloc[i, dfRR.columns.get_loc('单位名称')] = bn
|
|||
|
|
|||
|
######
|
|||
|
# 为省级部门匹配简称
|
|||
|
if '省直部门' in cities:
|
|||
|
city = '省直部门'
|
|||
|
dfAAACity = dfAAA[dfAAA['市/省局'] == city]
|
|||
|
|
|||
|
|
|||
|
|
|||
|
dfAAACity.to_excel(strOutputPath + '账号转发量_' + city + '.xlsx')
|
|||
|
|
|||
|
dfOCity = dfO[dfO['市州'] == city]
|
|||
|
dfO.to_excel(strOutputPath + '转发文章_' + city + '.xlsx')
|
|||
|
|
|||
|
dfRRCity = dfRR[dfRR['市州'] == city]
|
|||
|
if city in ['临夏回族自治州', '甘南藏族自治州']:
|
|||
|
dfRRCity['区县'].fillna('州直部门', inplace=True)
|
|||
|
else:
|
|||
|
dfRRCity['区县'].fillna('市直部门', inplace=True)
|
|||
|
dfRRCity.to_excel(strOutputPath + '转发账号_' + city + '.xlsx')
|
|||
|
|
|||
|
|
|||
|
dfRR.to_excel(strOutputPath + '转发账号.xlsx')
|
|||
|
dfAAA.to_excel(strOutputPath + '账号转发量.xlsx')
|
|||
|
dfO.to_excel(strOutputPath + '转发文章.xlsx')
|
|||
|
|
|||
|
|
|||
|
# dfAAA = pd.read_excel(strOutputPath + '账号转发量.xlsx')
|
|||
|
# dfRR = pd.read_excel(strOutputPath + '转发账号.xlsx')
|
|||
|
# dfO = pd.read_excel(strOutputPath + '转发文章.xlsx')
|
|||
|
|
|||
|
# 过长名称替换为简称,便于绘图
|
|||
|
dfRR.loc[dfRR['区县'] == '积石山保安族东乡族撒拉族自治县', '区县'] = '积石山县'
|
|||
|
dfRR.loc[dfRR['区县'] == '阿克塞哈萨克族自治县', '区县'] = '阿克塞自治县'
|
|||
|
for city in cities:
|
|||
|
if city in ['兰州新区', '省直部门']:
|
|||
|
continue
|
|||
|
print(" add up city", city)
|
|||
|
######
|
|||
|
# 匹配省级部门的简称
|
|||
|
if city == '省直部门':
|
|||
|
for i in range(dfRR.shape[0]):
|
|||
|
aName = re.sub('\s+', '', str(dfRR.iloc[i, dfRR.columns.get_loc('账号名称')]))
|
|||
|
matchedRow = -1
|
|||
|
# 对每一个账号,从表中匹配账号名称
|
|||
|
for j in range(dfProvincial.shape[0]):
|
|||
|
name = re.sub('\s+', '', str(dfProvincial.iloc[j, dfProvincial.columns.get_loc('账号名称')]))
|
|||
|
if name == aName:
|
|||
|
matchedRow = j
|
|||
|
break
|
|||
|
if matchedRow > -1:
|
|||
|
bmjc = str(dfProvincial.iloc[matchedRow, dfProvincial.columns.get_loc('简称')])
|
|||
|
if bmjc != 'nan':
|
|||
|
dfRR.iloc[i, dfRR.columns.get_loc('省直部门')] = bmjc
|
|||
|
|
|||
|
dfAAACity = dfAAA.loc[dfAAA['市/省局'] == city].copy()
|
|||
|
dfAAACity.to_excel(strOutputPath + '账号转发量_' + city + '.xlsx')
|
|||
|
|
|||
|
dfOCity = dfO[dfO['市州'] == city]
|
|||
|
dfO.to_excel(strOutputPath + '转发文章_' + city + '.xlsx')
|
|||
|
|
|||
|
dfRRCity = dfRR.loc[dfRR['市州'] == city].copy()
|
|||
|
if city in ['临夏回族自治州', '甘南藏族自治州']:
|
|||
|
dfRRCity['区县'].fillna('州直部门', inplace=True)
|
|||
|
dfRRCityD = dfRRCity.loc[dfRRCity['区县'] == '州直部门'].copy()
|
|||
|
else:
|
|||
|
dfRRCity['区县'].fillna('市直部门', inplace=True)
|
|||
|
dfRRCityD = dfRRCity.loc[dfRRCity['区县'] == '市直部门'].copy()
|
|||
|
dfRRCity.to_excel(strOutputPath + '转发账号_' + city + '.xlsx')
|
|||
|
####
|
|||
|
# 统计市直部门
|
|||
|
#dfRRCityD = dfRRCity[dfRRCity['区县'] == '州直部门']
|
|||
|
dfRRCD1 = pd.pivot_table(dfRRCityD, index=['单位名称'], values=['账号名称'],
|
|||
|
aggfunc = ['count'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfRRCD2 = pd.pivot_table(dfRRCityD, index=['单位名称'], values=['转发数'],
|
|||
|
aggfunc = ['sum'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfRRCD3 = pd.pivot_table(dfRRCityD, index=['单位名称'], values=['阅读数'],
|
|||
|
aggfunc = ['sum'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfRRCD = pd.concat([dfRRCD1, dfRRCD2, dfRRCD3], axis=1)
|
|||
|
|
|||
|
# 计算转发率
|
|||
|
dfRRCD.columns = ['_'.join(col) for col in dfRRCD.columns.values]
|
|||
|
#dfRRCD.reset_index(inplace=True)
|
|||
|
|
|||
|
dfRRCD['rate'] = dfRRCD.apply(lambda x: int(x['sum_转发数'] / x['count_账号名称'] / dfTask['内容'].count() * 1000)/1000.0, axis=1)
|
|||
|
# 排序
|
|||
|
dfRRCD = dfRRCD[0:dfRRCD.shape[0]-1].sort_values(by='rate', ascending=False)
|
|||
|
dfRRCD = pd.concat([dfRRCD, dfRRCD[dfRRCD.shape[0]-1:dfRRCD.shape[0]] ], axis=0)
|
|||
|
|
|||
|
dfRRCD.to_excel(strOutputPath + '市州直部门转发_' + city + '.xlsx')
|
|||
|
|
|||
|
|
|||
|
#######
|
|||
|
# County-level statistics for the current city.
# With the pandas version in use, pivot_table was observed to produce
# inaccurate leading columns when aggfunc lists several functions, so every
# measure is pivoted separately and the results are joined column-wise.
countyMeasures = (('账号名称', 'count'), ('转发数', 'sum'), ('阅读数', 'sum'))
dfCounty = pd.concat(
    [
        pd.pivot_table(dfRRCity, index=['区县'], values=[measure], aggfunc=[func],
                       fill_value='', margins=True, margins_name='总计')
        for measure, func in countyMeasures
    ],
    axis=1,
)

# Collapse the (aggfunc, column) MultiIndex into flat names like 'sum_转发数'.
dfCounty.columns = ['_'.join(levels) for levels in dfCounty.columns]

# Forwarding rate: forwards per account per task, truncated to 3 decimals.
countyTaskTotal = dfTask['内容'].count()
dfCounty['rate'] = dfCounty.apply(
    lambda rec: int(rec['sum_转发数'] / rec['count_账号名称'] / countyTaskTotal * 1000) / 1000.0,
    axis=1,
)

# Rank the counties by rate (highest first); the '总计' margin row, which is
# the last row of the pivot, is re-attached at the bottom afterwards.
dfCC = pd.concat(
    [dfCounty.iloc[:-1].sort_values(by='rate', ascending=False), dfCounty.iloc[-1:]],
    axis=0,
)
dfCC.to_excel(strOutputPath + '县区转发_' + city + '.xlsx')
|
|||
|
|
|||
|
# Forwarding statistics for the city-level / prefecture-level departments.
dfRRD = dfRRCity[dfRRCity['区县'].isin(['州直部门', '市直部门'])]

# One pivot per measure (account count, forwards, reads), joined column-wise;
# margins=True adds the '总计' grand-total row at the end of each pivot.
deptMeasures = (('账号名称', 'count'), ('转发数', 'sum'), ('阅读数', 'sum'))
dfD = pd.concat(
    [
        pd.pivot_table(dfRRD, index=['单位名称'], values=[measure], aggfunc=[func],
                       fill_value='', margins=True, margins_name='总计')
        for measure, func in deptMeasures
    ],
    axis=1,
)

# Merge the two MultiIndex column levels into single names ('count_账号名称', ...).
dfD.columns = ['_'.join(levels) for levels in dfD.columns]

# Department forwarding rate: forwards per account per task, truncated to
# three decimals.
deptTaskTotal = dfTask['内容'].count()
dfD['rate'] = dfD.apply(
    lambda rec: int(rec['sum_转发数'] / rec['count_账号名称'] / deptTaskTotal * 1000) / 1000.0,
    axis=1,
)

# Order the departments by rate (descending) and keep the total row last.
dfDD = pd.concat(
    [dfD.iloc[:-1].sort_values(by='rate', ascending=False), dfD.iloc[-1:]],
    axis=0,
)
dfDD.to_excel(strOutputPath + '部门转发_' + city + '.xlsx')
|
|||
|
|
|||
|
#########################################################
# Render the Word report for the current city.
tpl = DocxTemplate(fnTemplate)

# The last row of each ranked table is the '总计' (grand total) row.
countyTotalRow = dfCC.iloc[-1]
deptTotalRow = dfDD.iloc[-1]
info = {
    "taskCount": dfTask['内容'].count(),
    "aNum": int(countyTotalRow['count_账号名称']),
    "fNum": int(countyTotalRow['sum_转发数']),
    "readNum": int(countyTotalRow['sum_阅读数']),
    "r": '%.1f' % (countyTotalRow['rate'] * 100.0),
    "dNum": int(deptTotalRow['count_账号名称']),      # total department accounts
    "dFNum": int(deptTotalRow['sum_转发数']),         # total department forwards
    "dReadNum": int(deptTotalRow['sum_阅读数']),      # total department reads
    "dr": '%.1f' % (deptTotalRow['rate'] * 100.0),    # overall department rate (%)
}
context.update(info)

# County table: one entry per county, the total row is left out.
t1_list = [
    {
        'county': str(label),
        'rate': '%.1f' % (rec['rate'] * 100.0),
        'account': int(rec['count_账号名称']),
        'fNum': int(rec['sum_转发数']),
        'readNum': int(rec['sum_阅读数']),
    }
    for label, rec in dfCC.iterrows()
    if label != "总计"
]
context['t1_contents'] = t1_list

# Department table: one entry per department, the total row is left out.
t2_list = [
    {
        'name': str(label),
        'rate': '%.1f' % (rec['rate'] * 100.0),
        'account': int(rec['count_账号名称']),
        'fNum': int(rec['sum_转发数']),
        'readNum': int(rec['sum_阅读数']),
    }
    for label, rec in dfDD.iterrows()
    if label != "总计"
]
context['t2_contents'] = t2_list

# List of forwarding tasks shown in the report.
t3_list = [
    {
        'id': task['序号'],
        'title': task['内容'],
        'date': task['时间'].strftime('%Y-%m-%d'),
    }
    for _, task in dfTask.iterrows()
]
context['t3_contents'] = t3_list

# Bar chart of the county forwarding rates (total row excluded), saved as a
# PNG and then embedded into the template.
countyGraphFile = os.path.join(strOutputPath, city + '_graphCounty.png')
drawBar(dfCC['rate'][:-1], dfCC.index[:-1], '区县转发率', countyGraphFile)
context.update({
    'graphCounty': InlineImage(tpl, countyGraphFile, width=Mm(120)),
})

tpl.render(context)
tpl.save(strOutputPath+city+'.docx')
|
|||
|
|
|||
|
|
|||
|
######
|
|||
|
####
|
|||
|
#######
|
|||
|
######
|
|||
|
####
|
|||
|
#######
|
|||
|
######
|
|||
|
####
|
|||
|
#######
|
|||
|
# Province-wide statistics, grouped by city/prefecture ('市州').
# With the pandas version in use, pivot_table was observed to produce
# inaccurate leading columns when aggfunc lists several functions, so every
# measure is pivoted separately and the results are joined column-wise.
cityMeasures = (('账号名称', 'count'), ('转发数', 'sum'), ('阅读数', 'sum'))
dfCounty = pd.concat(
    [
        pd.pivot_table(dfRR, index=['市州'], values=[measure], aggfunc=[func],
                       fill_value='', margins=True, margins_name='总计')
        for measure, func in cityMeasures
    ],
    axis=1,
)

# Collapse the (aggfunc, column) MultiIndex into flat names like 'sum_转发数'.
dfCounty.columns = ['_'.join(levels) for levels in dfCounty.columns]

# Forwarding rate: forwards per account per task, truncated to 3 decimals.
cityTaskTotal = dfTask['内容'].count()
dfCounty['rate'] = dfCounty.apply(
    lambda rec: int(rec['sum_转发数'] / rec['count_账号名称'] / cityTaskTotal * 1000) / 1000.0,
    axis=1,
)

# Rank by rate (highest first); the '总计' margin row, which is the last row
# of the pivot, is re-attached at the bottom afterwards.
dfCC = pd.concat(
    [dfCounty.iloc[:-1].sort_values(by='rate', ascending=False), dfCounty.iloc[-1:]],
    axis=0,
)
dfCC.to_excel(strOutputPath + '市州转发_ALL.xlsx')
|
|||
|
|
|||
|
|
|||
|
# Forwarding statistics for the provincial departments ('省直部门').
dfRRD = dfRR[dfRR['市州'] == '省直部门']

# dfDD stays an empty frame when no provincial-department rows exist.
dfDD = pd.DataFrame()
if len(dfRRD) > 0:
    # One pivot per measure (account count, forwards, reads), joined
    # column-wise; margins=True adds the '总计' grand-total row at the end.
    provMeasures = (('账号名称', 'count'), ('转发数', 'sum'), ('阅读数', 'sum'))
    dfD = pd.concat(
        [
            pd.pivot_table(dfRRD, index=['单位名称'], values=[measure], aggfunc=[func],
                           fill_value='', margins=True, margins_name='总计')
            for measure, func in provMeasures
        ],
        axis=1,
    )

    # Column labels are printed before and after flattening the MultiIndex.
    print('---', dfD.columns)
    dfD.columns = ['_'.join(levels) for levels in dfD.columns]
    print('---', dfD.columns)

    # Department forwarding rate: forwards per account per task, truncated
    # to three decimals.
    provTaskTotal = dfTask['内容'].count()
    dfD['rate'] = dfD.apply(
        lambda rec: int(rec['sum_转发数'] / rec['count_账号名称'] / provTaskTotal * 1000) / 1000.0,
        axis=1,
    )

    # Order the departments by rate (descending), total row kept last.
    dfDD = pd.concat(
        [dfD.iloc[:-1].sort_values(by='rate', ascending=False), dfD.iloc[-1:]],
        axis=0,
    )
    # NOTE(review): the source lost its indentation; the export is assumed to
    # belong inside this guard (no file when there is no data) — confirm.
    dfDD.to_excel(strOutputPath + '部门转发_ALL.xlsx')
|
|||
|
|
|||
|
#########################################################
# Render the province-wide ("ALL") Word report.
tpl = DocxTemplate(fnTemplate)

# The last row of the ranked city table is the '总计' (grand total) row.
cityTotalRow = dfCC.iloc[-1]
info = {
    "taskCount": dfTask['内容'].count(),
    "aNum": int(cityTotalRow['count_账号名称']),
    "fNum": int(cityTotalRow['sum_转发数']),
    "readNum": int(cityTotalRow['sum_阅读数']),
    "r": '%.1f' % (cityTotalRow['rate'] * 100.0),
}

# Department totals: read from the total row when department data exists,
# otherwise report zeros.
if not dfDD.empty:
    provTotalRow = dfDD.iloc[-1]
    info.update({
        "dNum": int(provTotalRow['count_账号名称']),      # total department accounts
        "dFNum": int(provTotalRow['sum_转发数']),         # total department forwards
        "dReadNum": int(provTotalRow['sum_阅读数']),      # total department reads
        "dr": '%.1f' % (provTotalRow['rate'] * 100.0),    # overall department rate (%)
    })
else:
    info.update({
        "dNum": 0,            # total department accounts
        "dFNum": 0,           # total department forwards
        "dReadNum": 0,        # total department reads
        "dr": '%.1f'%(0),     # overall department rate (%)
    })

context.update(info)

# City table: one entry per city/prefecture, the total row is left out.
t1_list = [
    {
        'county': str(label),
        'rate': '%.1f' % (rec['rate'] * 100.0),
        'account': int(rec['count_账号名称']),
        'fNum': int(rec['sum_转发数']),
        'readNum': int(rec['sum_阅读数']),
    }
    for label, rec in dfCC.iterrows()
    if label != "总计"
]
context['t1_contents'] = t1_list

# Department table: stays empty when there is no department data; otherwise
# one entry per department, the total row is left out.
t2_list = [] if dfDD.empty else [
    {
        'name': str(label),
        'rate': '%.1f' % (rec['rate'] * 100.0),
        'account': int(rec['count_账号名称']),
        'fNum': int(rec['sum_转发数']),
        'readNum': int(rec['sum_阅读数']),
    }
    for label, rec in dfDD.iterrows()
    if label != "总计"
]
context['t2_contents'] = t2_list

# List of forwarding tasks shown in the report.
t3_list = [
    {
        'id': task['序号'],
        'title': task['内容'],
        'date': task['时间'].strftime('%Y-%m-%d'),
    }
    for _, task in dfTask.iterrows()
]
context['t3_contents'] = t3_list

# Bar chart of the per-city forwarding rates (total row excluded), saved as
# a PNG and then embedded into the template.
allGraphFile = os.path.join(strOutputPath, 'ALL_graphCounty.png')
drawBar(dfCC['rate'][:-1], dfCC.index[:-1], '市州转发率', allGraphFile)
context.update({
    'graphCounty': InlineImage(tpl, allGraphFile, width=Mm(120)),
})

tpl.render(context)
tpl.save(strOutputPath+'ALL.docx')
|
|||
|
|
|||
|
|
|||
|
# Print the elapsed time since starttime was recorded.
usedtime = datetime.datetime.now() - starttime
print("time: ", usedtime)
|