# pomscripts/statForward202303.py
#
# Repository web-view metadata captured with this snapshot:
#   1072 lines, 51 KiB, Python; last change 2023-04-04 04:15:34 +00:00
import datetime
import csv
import pandas as pd
import numpy as np
import glob, os, re, time
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from difflib import SequenceMatcher
from collections import Counter
import difflib
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from docx.shared import Mm
import jieba
import jieba.posseg as pseg
# Draw a bar chart.
def drawBar(data, recipe, title='', fn=''):
    """Render a percentage bar chart and save it to *fn*.

    data   -- bar heights, plotted on a fixed 0..1 axis and labelled as percentages.
    recipe -- category labels, used both as bar positions and as tick labels.
    title  -- chart title.
    fn     -- output path handed to ``plt.savefig``.
    """
    def _as_percent(value, _position):
        # Tick formatter: 0.25 -> '25%'.
        return '%2.0f' % (100 * value) + '%'

    plt.figure(figsize=(6, 4))
    # SimHei is selected so the Chinese labels render; the minus-sign setting
    # avoids the Unicode minus glyph that this font setup cannot draw.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    labels, heights = recipe, data
    plt.bar(labels, heights, width=0.5)
    plt.xticks(labels, labels, rotation=35)
    plt.ylim((0, 1))

    axes = plt.gca()
    axes.yaxis.set_major_formatter(FuncFormatter(_as_percent))

    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)

    # Release the axes and the figure so repeated calls do not accumulate state.
    plt.cla()
    plt.clf()
    plt.close()
def getWBData(path, cities, hasBody=False):
    """Read every Weibo CSV export below *path* into one DataFrame.

    Directory layout walked by this function:
        path/<city code>/<period dir>/<account name>/<weibo id>.csv

    path    -- root directory holding one sub-directory per city code.
    cities  -- full city names to include; other city directories are skipped.
    hasBody -- unused; kept so existing callers keep working.

    Returns a DataFrame with the 13 export columns plus 'weiboID' (CSV file
    name without extension), 'weiboName' (account directory name) and '市州'
    (full city name).  The frame is empty, with those 16 columns, when nothing
    was read.

    Directories that are hidden, contain 'weixin' or 'tt' in their name, or
    are not a known city code are ignored instead of raising KeyError.
    """
    # Directory name (code or short name) -> full city name.
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    # Column layout written by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    def _skip_dir(parent, name):
        # Hidden entries, plain files and the WeChat / Toutiao folders are not Weibo data.
        lowered = name.lower()
        return (name[:1] == '.'
                or not os.path.isdir(os.path.join(parent, name))
                or 'weixin' in lowered
                or 'tt' in lowered)

    cityFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if _skip_dir(path, dirC):
            continue
        cityName = cityShorten.get(dirC)
        if cityName is None or cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityDir = os.path.join(path, dirC)
        accountFrames = []
        for dirCT in os.listdir(cityDir):
            # Period directories, e.g. weibo, weibo_1.
            if _skip_dir(cityDir, dirCT):
                continue
            print(' read WB... dir:',dirCT)
            periodDir = os.path.join(cityDir, dirCT)
            for dirA in os.listdir(periodDir):
                # Each account directory is named after the account and holds <account id>.csv.
                accountDir = os.path.join(periodDir, dirA)
                if dirA[:1] == '.' or not os.path.isdir(accountDir):
                    continue
                fileAs = os.listdir(accountDir)
                # Choose the data file explicitly: the id and the content must come
                # from the same, non-hidden CSV (listdir order is arbitrary and
                # '._*.csv' sidecar files are not real exports).
                csvFiles = sorted(f for f in fileAs
                                  if not f.startswith('.') and os.path.splitext(f)[-1] == '.csv')
                if not csvFiles:
                    continue
                if len(fileAs) > 1:
                    print(" +=+= ", fileAs)
                dfdfwb = pd.read_csv(os.path.join(accountDir, csvFiles[0]), sep=',', header=None,
                                     names=cols, index_col=None)
                # header=None turns the file's own header line into row 0; drop it.
                dfdfwb = dfdfwb[1:].copy()
                dfdfwb["weiboID"] = csvFiles[0][:-4]
                dfdfwb["weiboName"] = dirA
                accountFrames.append(dfdfwb)

        if accountFrames:
            dfWBC = pd.concat(accountFrames, sort=False)
        else:
            dfWBC = pd.DataFrame(columns=cols)
        print(' ', dfWBC.shape)
        dfWBC['市州'] = cityName
        if not dfWBC.empty:
            cityFrames.append(dfWBC)

    if cityFrames:
        dfWB = pd.concat(cityFrames, sort=False).reindex(columns=cs)
    else:
        dfWB = pd.DataFrame(columns=cs)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
# Read the xlsx files found under the data directory and concatenate them.
def getWXData(path, cities, hasBody=False):
    """Read every WeChat workbook below *path* into one DataFrame.

    For each city directory ``path/<city code>/`` the function reads
      * workbooks (.xlsx / .xls) stored directly in the city directory, and
      * workbooks stored inside sub-directories whose name contains 'weixin'.

    path    -- root directory holding one sub-directory per city code.
    cities  -- full city names to include.
    hasBody -- unused; kept so existing callers keep working.

    Returns the concatenated rows with a '市州' column set to the full city
    name; an empty frame with the eight expected columns when nothing is read.

    Unknown directories are skipped instead of raising KeyError, and hidden
    files or Office lock files ('~$...') are never passed to ``read_excel``.
    """
    # Directory name (code or short name) -> full city name.
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']

    def _is_workbook(name):
        # Real Excel files only: no dot-files, no '~$' lock files left by Office.
        if name.startswith(('.', '~$')):
            return False
        return os.path.splitext(name)[-1] in ('.xlsx', '.xls')

    frames = []
    countC = 0      # cities that contributed at least one workbook
    countFnC = 0    # workbooks read in total
    for dirBatch in os.listdir(path):
        batchDir = os.path.join(path, dirBatch)
        if not os.path.isdir(batchDir):
            continue  # directories only
        cityname = cityShorten.get(dirBatch)
        if cityname is None or cityname not in cities:
            continue

        # Collect (label, path) for every workbook of this city, in listing order.
        workbooks = []
        for fileC in os.listdir(batchDir):
            if fileC[:1] == '.':
                continue
            entry = os.path.join(batchDir, fileC)
            if os.path.isdir(entry):
                # Same case-insensitive 'weixin' test the Weibo/Toutiao loaders use.
                if 'weixin' in fileC.lower():
                    print(' ', entry)
                    label = os.path.splitext(fileC)[0]
                    workbooks.extend((label, os.path.join(entry, f))
                                     for f in os.listdir(entry) if _is_workbook(f))
            elif _is_workbook(fileC):
                workbooks.append((os.path.splitext(fileC)[0], entry))

        for fName, workbookPath in workbooks:
            dfdfwxc = pd.read_excel(workbookPath)
            dfdfwxc['市州'] = cityname
            print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
            frames.append(dfdfwxc)
        countFnC += len(workbooks)
        if workbooks:
            countC += 1

    if frames:
        merged = pd.concat(frames, sort=False)
        # Expected columns first, any extra workbook columns after them.
        dfWX = merged.reindex(columns=cols + [c for c in merged.columns if c not in cols])
    else:
        dfWX = pd.DataFrame(columns=cols)
    print("Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
# Read the xlsx files found under the data directory and concatenate them.
def getTTData(path, cities, hasBody=False):
    """Read every Toutiao workbook below *path* into one DataFrame.

    Directory layout walked by this function:
        path/<city code>/<dir whose name contains 'tt'>/<anything>.xlsx

    path    -- root directory holding one sub-directory per city code.
    cities  -- full city names to include.
    hasBody -- unused; kept so existing callers keep working.

    Returns the concatenated rows with a 'city' column set to the full city
    name; an empty frame with the nine expected columns when nothing is read.

    Unknown directories are skipped instead of raising KeyError.  File names
    are no longer parsed for an account name: the parsed value was never used
    and the parse raised ValueError on names with fewer than two underscores.
    """
    # Directory name (code or short name) -> full city name.
    cityShorten = {'LZ':'兰州市', 'LX':'临夏回族自治州', 'JC':'金昌市', 'ZY':'张掖市', 'LN':'陇南市', 'JYG':'嘉峪关市',
                   'TS':'天水市', 'GN':'甘南藏族自治州', 'BY':'白银市', 'JQ':'酒泉市', 'QY':'庆阳市', 'PL':'平凉市',
                   'DX':'定西市', 'WW':'武威市', 'SZ':'省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
                   'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
                   'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
                   'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
                   '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
                   '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
                   '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
                   }
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    def _skip_dir(parent, name):
        # Hidden entries, plain files and the WeChat / Weibo folders are not Toutiao data.
        lowered = name.lower()
        return (name[:1] == '.'
                or not os.path.isdir(os.path.join(parent, name))
                or 'weixin' in lowered
                or 'weibo' in lowered)

    cityFrames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if _skip_dir(path, dirC):
            continue
        cityName = cityShorten.get(dirC)
        if cityName is None or cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityDir = os.path.join(path, dirC)
        fileFrames = []
        for dirCT in os.listdir(cityDir):
            if _skip_dir(cityDir, dirCT) or 'tt' not in dirCT.lower():
                continue
            print(' read TT... dir:',dirCT)
            ttDir = os.path.join(cityDir, dirCT)
            for fn in os.listdir(ttDir):
                # Skip dot-files and the '~$' lock files Office leaves next to open workbooks.
                if fn.startswith(('.', '~$')) or fn[-5:] != '.xlsx':
                    continue
                fileFrames.append(pd.read_excel(os.path.join(ttDir, fn)))

        if fileFrames:
            dfTTC = pd.concat(fileFrames, sort=False)
        else:
            dfTTC = pd.DataFrame(columns=cs)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = cityName
        if not dfTTC.empty:
            cityFrames.append(dfTTC)

    if cityFrames:
        merged = pd.concat(cityFrames, sort=False)
        # Expected columns first, any extra workbook columns after them.
        dfTT = merged.reindex(columns=cs + [c for c in merged.columns if c not in cs])
    else:
        dfTT = pd.DataFrame(columns=cs)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
def fetch_chinese(s):
    """Return *s* reduced to its characters in the range U+4E00..U+9FA5, order preserved."""
    kept = re.findall(r'[\u4e00-\u9fa5]', s)
    return ''.join(kept)
def _strip_ws(value):
    """Return ``str(value)`` with all whitespace removed; used to compare account names."""
    return re.sub(r'\s+', '', str(value))


def _title_similarity(taskTitle, articleTitle):
    """quick_ratio of an article title against the task title clipped to the article title's length."""
    return difflib.SequenceMatcher(None, taskTitle[:len(articleTitle)], articleTitle).quick_ratio()


def _weibo_similarity(taskTitle, postText):
    """quick_ratio of the Chinese characters of the task title against those of the post's leading text."""
    head = postText[:len(taskTitle)]
    return difflib.SequenceMatcher(None, fetch_chinese(taskTitle), fetch_chinese(head)).quick_ratio()


def _count_forwards(dfPosts, spec, cities, tasks, threshold, dfAllAccount):
    """Match every account's posts of one platform against the forwarding tasks.

    For each account and each task the first post whose similarity exceeds
    *threshold* counts as the forward of that task.

    Returns ``(accountRows, articleRows)``: one dict per account (0/1 flag per
    task number, '转发数' total, and '阅读数' when the platform has read counts)
    and one dict per matched post.
    """
    accountRows = []
    articleRows = []
    for city in cities:
        cityPosts = dfPosts.loc[dfPosts[spec['cityCol']] == city]
        accounts = cityPosts[spec['accountCol']].unique()
        registered = dfAllAccount.loc[dfAllAccount['账号类型'].isin(spec['accountTypes'])
                                      & (dfAllAccount['市/省局'] == city)]
        print(' count %s, city:' % spec['tag'], city, spec['countLabel'], len(accounts),
              '任务账号数:', registered.shape[0])
        for account in accounts:
            # All posts of one account.
            posts = cityPosts.loc[cityPosts[spec['accountCol']] == account]
            texts = [str(text) for text in posts[spec['textCol']]]
            row = {'类型': spec['platform'], '市州': city, '账号名称': account}
            forwards = 0
            reads = 0
            for taskNo, taskTitle in tasks:
                hit = next((j for j, text in enumerate(texts)
                            if spec['similarity'](taskTitle, text) > threshold), None)
                row[str(taskNo)] = 0 if hit is None else 1
                if hit is None:
                    continue
                forwards += 1
                article = {'任务序号': taskNo, '任务名称': taskTitle,
                           '类型': spec['platform'],
                           '公众号': account,
                           '日期': posts[spec['dateCol']].iloc[hit],
                           '内容': texts[hit],
                           '链接': posts[spec['urlCol']].iloc[hit],
                           '市州': city,
                           }
                if spec['readCol']:
                    readNum = int(posts[spec['readCol']].iloc[hit])
                    reads += readNum
                    article['阅读数'] = readNum
                articleRows.append(article)
            row['转发数'] = forwards
            if spec['readCol']:
                row['阅读数'] = reads
            accountRows.append(row)
    return accountRows, articleRows


def _attach_forward_counts(dfAccounts, dfForwardRows, withReads=False, verbose=False):
    """Copy per-account totals from *dfForwardRows* into the '转发数' / '阅读数' columns of *dfAccounts*.

    Names are compared with whitespace removed and the first matching forward
    row wins.  With *verbose*, accounts that match only after whitespace
    removal are reported.  *dfAccounts* is modified in place and returned.
    """
    firstHit = {}
    for pos, (name, fwd, reads) in enumerate(zip(dfForwardRows['账号名称'],
                                                 dfForwardRows['转发数'],
                                                 dfForwardRows['阅读数'])):
        firstHit.setdefault(_strip_ws(name), (pos, fwd, reads))
    exactNames = set(dfForwardRows['账号名称'])
    colFwd = dfAccounts.columns.get_loc('转发数')
    colRead = dfAccounts.columns.get_loc('阅读数')
    for i, rawName in enumerate(dfAccounts['账号名称']):
        aName = _strip_ws(rawName)
        hit = firstHit.get(aName)
        if hit is None:
            continue
        pos, fwd, reads = hit
        dfAccounts.iloc[i, colFwd] = int(fwd)
        if withReads:
            dfAccounts.iloc[i, colRead] = int(reads)
        if verbose and str(rawName) not in exactNames:
            print(' =', aName, str(rawName), 0, pos)
    return dfAccounts


def _forward_summary(dfRows, groupCol, taskCount):
    """Aggregate account count, forwards and reads per *groupCol* and add a forwarding rate.

    rate = forwards / accounts / taskCount, truncated to three decimals.
    Groups are sorted by rate (descending) and the '总计' row stays last.
    Returns an empty DataFrame when *dfRows* has no rows.
    """
    if dfRows.empty:
        return pd.DataFrame()
    data = dfRows.copy()
    # Platforms without read counts leave '阅读数' blank; count those as 0 when summing.
    for col in ('转发数', '阅读数'):
        data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)
    # One pivot per measure: per the original author's note, a single pivot with a
    # list of aggfuncs produced inaccurate leading columns in the pandas version used.
    parts = [
        pd.pivot_table(data, index=[groupCol], values=[valueCol],
                       aggfunc=[func], fill_value='', margins=True, margins_name='总计')
        for valueCol, func in (('账号名称', 'count'), ('转发数', 'sum'), ('阅读数', 'sum'))
    ]
    summary = pd.concat(parts, axis=1)
    # Flatten the (aggfunc, column) MultiIndex, e.g. 'sum_转发数'.
    summary.columns = ['_'.join(col) for col in summary.columns.values]
    summary['rate'] = summary.apply(
        lambda x: int(x['sum_转发数'] / x['count_账号名称'] / taskCount * 1000) / 1000.0, axis=1)
    ranked = summary.iloc[:-1].sort_values(by='rate', ascending=False)
    return pd.concat([ranked, summary.iloc[-1:]], axis=0)


def _render_report(fnTemplate, baseContext, dfTask, dfRegion, dfDept, chartTitle, chartFile, docFile):
    """Fill the Word template from two summaries, draw the rate chart and save the document.

    dfRegion -- summary whose last row is the total (counties of a city, or cities).
    dfDept   -- department summary in the same shape; may be empty, in which
                case the department totals are reported as zero.
    """
    tpl = DocxTemplate(fnTemplate)
    ctx = dict(baseContext)
    regionTotal = dfRegion.iloc[-1]
    ctx.update({
        "taskCount": dfTask['内容'].count(),
        "aNum": int(regionTotal['count_账号名称']),
        "fNum": int(regionTotal['sum_转发数']),
        "readNum": int(regionTotal['sum_阅读数']),
        "r": '%.1f'%(regionTotal['rate']*100.0),
    })
    if dfDept.empty:
        ctx.update({
            "dNum": 0,  # total department accounts
            "dFNum": 0,  # total department forwards
            "dReadNum": 0,  # total department reads
            "dr": '%.1f'%(0),  # average department forwarding rate
        })
    else:
        deptTotal = dfDept.iloc[-1]
        ctx.update({
            "dNum": int(deptTotal['count_账号名称']),
            "dFNum": int(deptTotal['sum_转发数']),
            "dReadNum": int(deptTotal['sum_阅读数']),
            "dr": '%.1f'%(deptTotal['rate']*100.0),
        })
    # Region (county / city) rate table.
    ctx['t1_contents'] = [
        {'county': str(index), 'rate': '%.1f'%(row['rate']*100.0),
         'account': int(row['count_账号名称']), 'fNum': int(row['sum_转发数']),
         'readNum': int(row['sum_阅读数'])}
        for index, row in dfRegion.iterrows() if index != "总计"
    ]
    # Department rate table.
    ctx['t2_contents'] = [
        {'name': str(index), 'rate': '%.1f'%(row['rate']*100.0),
         'account': int(row['count_账号名称']), 'fNum': int(row['sum_转发数']),
         'readNum': int(row['sum_阅读数'])}
        for index, row in dfDept.iterrows() if index != "总计"
    ]
    # Forwarding task list.
    ctx['t3_contents'] = [
        {'id': row['序号'], 'title': row['内容'], 'date': row['时间'].strftime('%Y-%m-%d')}
        for _, row in dfTask.iterrows()
    ]
    # Rate chart, without the total row.
    drawBar(dfRegion['rate'][:-1], dfRegion.index[:-1], chartTitle, chartFile)
    ctx['graphCounty'] = InlineImage(tpl, chartFile, width=Mm(120))
    tpl.render(ctx)
    tpl.save(docFile)


if __name__ == "__main__":
    starttime = datetime.datetime.now()
    _RATIO = 0.7  # similarity above which a post counts as a forward of a task
    isDoWX = True
    isDoWB = True
    isDoTT = True
    # Full roster: 12 cities, 2 prefectures, 1 new area, plus the provincial departments.
    allCities = [
        '临夏回族自治州',
        '白银市',
        '定西市',
        '酒泉市',
        '嘉峪关市',
        '平凉市',
        '庆阳市',
        '天水市',
        '武威市',
        '兰州新区',
        '陇南市',
        '兰州市', '张掖市', '甘南藏族自治州', '金昌市',
        '省直部门',
    ]
    # Cities covered by this run; use ``cities = allCities`` for a province-wide report.
    cities = [
        '临夏回族自治州',
        '白银市',
        '定西市',
        '酒泉市',
        '天水市',
        '陇南市',
    ]
    # Forwarding tasks.
    dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/2023年2月份全省政务新媒体转发内容条目.xlsx')
    # Account registry.
    strFnAccount = 'D:/Projects/POM/DATA/2023年3月/2月报告/全国报送系统表单_2023.2.28.xlsx'
    dfAllAccount = pd.read_excel(strFnAccount)
    # Short names of the provincial departments' accounts.
    dfProvincial = pd.read_excel('D:/Projects/POM/DATA/2023年3月/2月报告/省直部门账号名称简称.xlsx')
    fnTemplate = 'D:/Projects/POM/DATA/2023年3月/2月报告/POM_ForewardTemplate.docx'
    # Data root directories.
    strPath = ['D:/Projects/POM/DATA/2023年3月/2月报告/']
    strOutputPath = 'D:/Projects/POM/DATA/2023年3月/2月报告/转发/'
    context = {
        "year": "2023",
        "month": "2",
        "pubMonth": "3",
        "dateStart": "2023年2月1日",
        "dateEnd": "2023年2月28日"
    }
    dfAllAccount.loc[:, '转发数'] = 0
    dfAllAccount.loc[:, '阅读数'] = 0

    ################################################
    # Task list and result layout: one row per account, one column per task.
    taskCount = dfTask['内容'].count()
    taskNos = dfTask['序号'].iloc[:taskCount].tolist()
    tasks = list(zip(taskNos, [str(title) for title in dfTask['内容'].iloc[:taskCount]]))
    colRR = ['市州', '类型', '账号名称', '单位名称', '省直部门', '区县', '转发数', '阅读数']
    colRR += [str(taskNo) for taskNo in taskNos]
    colO = ['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州']

    # Per-platform column names, loaders and matchers, in processing order TT, WX, WB.
    platformSpecs = []
    if isDoTT:
        platformSpecs.append({
            'tag': 'TT', 'platform': '今日头条', 'loader': getTTData,
            'readMsg': " read TT data", 'readyMsg': "tt data ready", 'doneMsg': 'TT forewards',
            'cityCol': 'city', 'accountCol': 'account', 'textCol': 'title',
            'dateCol': 'date', 'urlCol': 'url', 'readCol': None,
            'similarity': _title_similarity,
            'accountTypes': ['今日头条'], 'countLabel': '读入账号数:',
        })
    if isDoWX:
        platformSpecs.append({
            'tag': 'WX', 'platform': '微信', 'loader': getWXData,
            'readMsg': ' read WX data', 'readyMsg': 'WX data ready', 'doneMsg': 'WX forwards',
            'cityCol': '市州', 'accountCol': '公众号', 'textCol': '标题',
            'dateCol': '日期', 'urlCol': '链接', 'readCol': '阅读数',
            'similarity': _title_similarity,
            'accountTypes': ['小程序+微信', '微信服务号', '微信订阅号'], 'countLabel': '账号数:',
        })
    if isDoWB:
        platformSpecs.append({
            'tag': 'WB', 'platform': '新浪微博', 'loader': getWBData,
            'readMsg': ' read WB data', 'readyMsg': 'WB data ready', 'doneMsg': 'WB forwards',
            'cityCol': '市州', 'accountCol': 'weiboName', 'textCol': '微博正文',
            'dateCol': 'date', 'urlCol': '头条文章url', 'readCol': None,
            'similarity': _weibo_similarity,
            'accountTypes': ['新浪微博'], 'countLabel': "读入账号数:",
        })

    accountRows = []
    articleRows = []
    for spec in platformSpecs:
        print('=============================================================')
        print('---- %s ----' % spec['tag'])
        frames = []
        for strP in strPath:
            ddff = spec['loader'](strP + '全文/', cities)
            print(spec['readMsg'], ddff.shape)
            frames.append(ddff)
        dfPosts = pd.concat(frames, sort=False)
        print(spec['readyMsg'], dfPosts.shape)
        if spec['readCol']:
            # Only WeChat carries read counts; blanks become 0 so int() conversion works.
            dfPosts = dfPosts.fillna(value=0)
        rows, articles = _count_forwards(dfPosts, spec, cities, tasks, _RATIO, dfAllAccount)
        accountRows.extend(rows)
        articleRows.extend(articles)
        # Each platform reports its own account count.
        print(spec['doneMsg'], len(rows))

    dfRR = pd.DataFrame(accountRows, columns=colRR)
    # Text columns filled in later must be object dtype, not float NaN columns.
    for col in ('单位名称', '省直部门', '区县'):
        dfRR[col] = dfRR[col].astype(object)
    hasReads = any('阅读数' in article for article in articleRows)
    dfO = pd.DataFrame(articleRows, columns=colO + (['阅读数'] if hasReads else []))

    if dfRR.empty:
        raise SystemExit('No account data was read for the selected cities; nothing to report.')

    print('=============================================================')
    print('---- STATISTICS ----')
    print('=============================================================')
    print('ALL forewards account num:', dfRR.shape[0], 'task num:', dfRR.shape[1])

    ################################################
    # Write each registered account's forward / read totals into the registry.
    wxTypes = ['小程序+微信', '微信服务号', '微信订阅号']
    dfAAWX = _attach_forward_counts(dfAllAccount.loc[dfAllAccount['账号类型'].isin(wxTypes)].copy(),
                                    dfRR.loc[dfRR['类型'] == '微信'], withReads=True, verbose=True)
    print('总微信账号数:', dfAAWX.shape)
    dfAAWB = _attach_forward_counts(dfAllAccount.loc[dfAllAccount['账号类型'] == '新浪微博'].copy(),
                                    dfRR.loc[dfRR['类型'] == '新浪微博'])
    print('总微博账号数:', dfAAWB.shape)
    # Toutiao totals are written positionally like the other two platforms
    # (the previous .loc[i, <int>] call created a stray column instead).
    dfAATT = _attach_forward_counts(dfAllAccount.loc[dfAllAccount['账号类型'] == '今日头条'].copy(),
                                    dfRR.loc[dfRR['类型'] == '今日头条'], verbose=True)
    print('总头条账号数:', dfAATT.shape)
    dfAAA = pd.concat([dfAAWX, dfAAWB, dfAATT], ignore_index=True, sort=False)
    print('所有统计账号数:', dfAAA.shape)

    ################################################
    # Attach the unit's full name and county to every forwarding account.
    registry = {}
    for name, county, unit in zip(dfAllAccount['账号名称'], dfAllAccount['区县/地方部门'],
                                  dfAllAccount['单位全称']):
        registry.setdefault(_strip_ws(name), (str(county), str(unit)))
    colCounty = dfRR.columns.get_loc('区县')
    colUnit = dfRR.columns.get_loc('单位名称')
    for i, name in enumerate(dfRR['账号名称']):
        hit = registry.get(_strip_ws(name))
        if hit is None:
            continue
        county, unit = hit
        if county != 'nan':
            dfRR.iloc[i, colCounty] = county
        if unit != 'nan':
            dfRR.iloc[i, colUnit] = unit

    # Provincial departments: attach short names, then export.
    if '省直部门' in cities:
        city = '省直部门'
        # This matching used to sit in the per-city loop behind a `continue`
        # that made it unreachable.
        shortNames = {}
        for name, shortName in zip(dfProvincial['账号名称'], dfProvincial['简称']):
            shortNames.setdefault(_strip_ws(name), str(shortName))
        colShort = dfRR.columns.get_loc('省直部门')
        for i, name in enumerate(dfRR['账号名称']):
            bmjc = shortNames.get(_strip_ws(name))
            if bmjc is not None and bmjc != 'nan':
                dfRR.iloc[i, colShort] = bmjc
        dfAAA[dfAAA['市/省局'] == city].to_excel(strOutputPath + '账号转发量_' + city + '.xlsx')
        dfO[dfO['市州'] == city].to_excel(strOutputPath + '转发文章_' + city + '.xlsx')
        dfRRCity = dfRR.loc[dfRR['市州'] == city].copy()
        dfRRCity['区县'] = dfRRCity['区县'].fillna('市直部门')
        dfRRCity.to_excel(strOutputPath + '转发账号_' + city + '.xlsx')

    dfRR.to_excel(strOutputPath + '转发账号.xlsx')
    dfAAA.to_excel(strOutputPath + '账号转发量.xlsx')
    dfO.to_excel(strOutputPath + '转发文章.xlsx')

    # Replace over-long county names with short forms so the charts stay readable.
    dfRR.loc[dfRR['区县'] == '积石山保安族东乡族撒拉族自治县', '区县'] = '积石山县'
    dfRR.loc[dfRR['区县'] == '阿克塞哈萨克族自治县', '区县'] = '阿克塞自治县'

    #########################################################
    # Per-city exports and reports.
    for city in cities:
        if city in ['兰州新区', '省直部门']:
            continue
        print(" add up city", city)
        dfAAA.loc[dfAAA['市/省局'] == city].to_excel(strOutputPath + '账号转发量_' + city + '.xlsx')
        # Export this city's matched articles only, not the whole table.
        dfO[dfO['市州'] == city].to_excel(strOutputPath + '转发文章_' + city + '.xlsx')
        dfRRCity = dfRR.loc[dfRR['市州'] == city].copy()
        if dfRRCity.empty:
            print(' no forwarding rows for', city, '- report skipped')
            continue
        # Accounts without a county belong to the prefecture- / city-level departments.
        directLabel = '州直部门' if city in ['临夏回族自治州', '甘南藏族自治州'] else '市直部门'
        dfRRCity['区县'] = dfRRCity['区县'].fillna(directLabel)
        dfRRCity.to_excel(strOutputPath + '转发账号_' + city + '.xlsx')

        # Direct departments of this city, grouped by unit.
        dfRRCD = _forward_summary(dfRRCity.loc[dfRRCity['区县'] == directLabel], '单位名称', taskCount)
        if not dfRRCD.empty:
            dfRRCD.to_excel(strOutputPath + '市州直部门转发_' + city + '.xlsx')

        # Counties.
        dfCC = _forward_summary(dfRRCity, '区县', taskCount)
        dfCC.to_excel(strOutputPath + '县区转发_' + city + '.xlsx')

        # Prefecture- / city-level departments, grouped by unit.
        dfRRD = dfRRCity[(dfRRCity['区县'] == '州直部门') | (dfRRCity['区县'] == '市直部门')]
        dfDD = _forward_summary(dfRRD, '单位名称', taskCount)
        if not dfDD.empty:
            dfDD.to_excel(strOutputPath + '部门转发_' + city + '.xlsx')

        _render_report(fnTemplate, context, dfTask, dfCC, dfDD,
                       '区县转发率', os.path.join(strOutputPath, city + '_graphCounty.png'),
                       strOutputPath+city+'.docx')

    #########################################################
    # Province-wide report: cities instead of counties, provincial departments by unit.
    dfCC = _forward_summary(dfRR, '市州', taskCount)
    dfCC.to_excel(strOutputPath + '市州转发_ALL.xlsx')
    dfDD = _forward_summary(dfRR[dfRR['市州'] == '省直部门'], '单位名称', taskCount)
    if not dfDD.empty:
        dfDD.to_excel(strOutputPath + '部门转发_ALL.xlsx')
    _render_report(fnTemplate, context, dfTask, dfCC, dfDD,
                   '市州转发率', os.path.join(strOutputPath, 'ALL_graphCounty.png'),
                   strOutputPath+'ALL.docx')

    endtime = datetime.datetime.now()
    usedtime = endtime - starttime
    print("time: ", usedtime)