1035 lines
54 KiB
Python
1035 lines
54 KiB
Python
|
import datetime
|
|||
|
import csv
|
|||
|
import pandas as pd
|
|||
|
import numpy as np
|
|||
|
import glob, os, re, time
|
|||
|
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
from matplotlib.ticker import FuncFormatter
|
|||
|
from difflib import SequenceMatcher
|
|||
|
from collections import Counter
|
|||
|
import difflib
|
|||
|
|
|||
|
|
|||
|
from docxtpl import DocxTemplate
|
|||
|
from docxtpl import InlineImage
|
|||
|
from docx.shared import Mm
|
|||
|
|
|||
|
import jieba
|
|||
|
import jieba.posseg as pseg
|
|||
|
|
|||
|
|
|||
|
|
|||
|
def fetch_chinese(s):
    """Return *s* reduced to its Chinese characters.

    Every character outside the range U+4E00..U+9FA5 (letters, digits,
    punctuation, whitespace, ...) is removed; the remaining characters keep
    their original order.
    """
    # Negated character class: matches anything that is NOT in the CJK range.
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')
    return non_chinese.sub('', s)
|
|||
|
|
|||
|
#---
|
|||
|
# Goal: turn Excel date serial numbers (e.g. dates used as column names) into
# standard date strings.  The reasoning behind the conversion:
# 1. In Excel, the date 2018-11-02 corresponds to the serial number 43406.
# 2. Subtracting 43406 days from 2018-11-02 shows that counting starts at
#    1899-12-30.
# 3. So the wanted date is simply 1899-12-30 plus the serial number in days.
def ts2date(dates, sf='%Y-%m-%d'):
    """Convert an Excel date serial number into a formatted date string.

    Args:
        dates: Number of days since 1899-12-30 (the Excel serial number).
        sf: ``strftime`` format used for the returned string.

    Returns:
        The date ``1899-12-30 + dates`` days, rendered with *sf*.
    """
    # Parse the epoch into a datetime so that day offsets can be added to it.
    epoch = datetime.datetime.strptime('1899-12-30', '%Y-%m-%d')
    moment = epoch + datetime.timedelta(days=dates)
    # Render in the requested output format.
    return moment.strftime(sf)
|
|||
|
#---
|
|||
|
|
|||
|
|
|||
|
# Draw a bar chart
def drawBar(data, recipe, title='', fn=''):
    """Render a percentage bar chart and save it to the file *fn*.

    Args:
        data: Bar heights; the y axis is fixed to the range 0..1 and labelled
            as percentages.
        recipe: Bar labels, used both as x positions and as tick labels.
        title: Chart title.
        fn: Path of the image file passed to ``plt.savefig``.
    """

    def as_percent(value, _tick_position):
        # Tick formatter: a fraction such as 0.25 is shown as '25%'.
        return '%2.0f' % (100 * value) + '%'

    plt.figure(figsize=(6, 4))
    # Use the SimHei font for labels and keep the minus sign renderable.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))

    y_axis = plt.gca().yaxis
    y_axis.set_major_formatter(FuncFormatter(as_percent))

    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)
    # plt.show()

    # Clear the axes and figure, then close it.
    plt.cla()
    plt.clf()
    plt.close()
|
|||
|
|
|||
|
def getWBData(path, cities, hasBody=False):
    """Collect the Weibo CSV exports found under *path* into one DataFrame.

    The directory tree that is walked looks like::

        path/<city dir>/<period dir>/<account name>/<account id>.csv

    * ``<city dir>`` is a city code or short name (see ``cityShorten``);
      hidden entries, plain files, and names containing ``weixin`` or ``tt``
      are ignored, as are directories that do not map to a known city.
    * ``<period dir>`` entries are filtered the same way (hidden, non-dir,
      ``weixin``, ``tt``).
    * In every account directory the first non-hidden ``.csv`` file is read.
      Its first row is treated as a header row and dropped; the file name
      without extension becomes ``weiboID`` and the directory name becomes
      ``weiboName``.

    Args:
        path: Root data directory.
        cities: Collection of full city names to include.
        hasBody: Accepted for interface compatibility; not used here.

    Returns:
        A DataFrame with the 13 download-tool columns plus ``weiboID``,
        ``weiboName`` and ``市州``.  Row indices are those of the individual
        CSV files (not renumbered).  Empty (columns only) when nothing was
        read.
    """
    # Directory name -> full city name.  Codes are stored upper-case and
    # looked up through ``str.upper()`` so that 'LZ' and 'lz' both resolve;
    # Chinese short names are unaffected by ``upper()``.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
        'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
        'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门',
        'XQ': '兰州新区', 'LZXQ': '兰州新区',
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
        '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
        '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
        '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # Column layout produced by the Weibo download tool.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    def is_skipped(name, full_path):
        # Shared filter for city and period directories.
        lowered = name.lower()
        return (name.startswith('.')
                or not os.path.isdir(full_path)
                or 'weixin' in lowered
                or 'tt' in lowered)

    frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityDir = os.path.join(path, dirC)
        if is_skipped(dirC, cityDir):
            continue
        cityName = cityShorten.get(dirC.upper())
        if cityName is None:
            # Previously an unknown directory aborted the run with KeyError.
            print(' skip unknown directory: ', dirC)
            continue
        if cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityFrames = []
        for dirCT in os.listdir(cityDir):
            # Period directories, e.g. weibo, weibo_1
            periodDir = os.path.join(cityDir, dirCT)
            if is_skipped(dirCT, periodDir):
                continue
            print(' read WB... dir:', dirCT)
            for dirA in os.listdir(periodDir):
                # Each account has a directory named after the account,
                # holding a file named <account id>.csv.
                accountDir = os.path.join(periodDir, dirA)
                if dirA.startswith('.') or not os.path.isdir(accountDir):
                    continue
                fileAs = os.listdir(accountDir)
                # Pick the first real CSV.  Hidden files are never read, and
                # the choice no longer depends on the CSV being listed first.
                csvFiles = [f for f in fileAs
                            if not f.startswith('.')
                            and os.path.splitext(f)[-1] == '.csv']
                if csvFiles:
                    csvName = csvFiles[0]
                    dfdfwb = pd.read_csv(os.path.join(accountDir, csvName), sep=',',
                                         header=None, names=cols, index_col=None)
                    # The first row is the file's own header line.
                    dfdfwb = dfdfwb.iloc[1:].copy()
                    dfdfwb["weiboID"] = os.path.splitext(csvName)[0]
                    dfdfwb["weiboName"] = dirA
                    if not dfdfwb.empty:
                        cityFrames.append(dfdfwb)
                if len(fileAs) > 1:
                    # More than one entry in an account directory is unexpected.
                    print(" +=+= ", fileAs)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        if cityFrames:
            dfWBC = pd.concat(cityFrames)
        else:
            dfWBC = pd.DataFrame(columns=cols)
        print(' ', dfWBC.shape)
        if not dfWBC.empty:
            dfWBC['市州'] = cityName
            frames.append(dfWBC)

    if frames:
        dfWB = pd.concat(frames)[cs]
    else:
        dfWB = pd.DataFrame(columns=cs)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
|
|||
|
|
|||
|
# Read the xlsx files found in the data directory and concatenate them
def getWXData(path, cities, hasBody=False):
    """Collect the WeChat Excel exports found under *path* into one DataFrame.

    Every sub-directory of *path* is a batch directory named after a city
    code or short name (see ``cityShorten``).  For a selected city, Excel
    files (``.xlsx`` / ``.xls``) are read from two places:

    * directly inside the batch directory, and
    * inside any sub-directory whose name contains ``weixin``.

    Hidden files (leading ``.``) and Excel lock files (leading ``~$``) are
    skipped.  Batch directories that do not map to a known city are skipped
    with a message instead of raising ``KeyError``.

    Args:
        path: Root data directory.
        cities: Collection of full city names to include.
        hasBody: Accepted for interface compatibility; not used here.

    Returns:
        A DataFrame whose first columns are ``公众号, 链接, 日期, 标题, 内容,
        头条, 市州, 阅读数``, followed by any extra columns found in the files.
        ``市州`` holds the full city name.  Empty (columns only) when nothing
        was read.
    """
    # Directory name -> full city name.  Codes are stored upper-case and
    # looked up through ``str.upper()`` so that 'LZ' and 'lz' both resolve;
    # Chinese short names are unaffected by ``upper()``.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
        'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
        'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门',
        'XQ': '兰州新区', 'LZXQ': '兰州新区',
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
        '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
        '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
        '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']

    def is_excel(name):
        # Real workbook files only: right extension, not hidden, not a lock file.
        return (os.path.splitext(name)[-1] in ('.xlsx', '.xls')
                and not name.startswith(('.', '~$')))

    frames = []
    countC = 0      # number of cities that contributed at least one file
    countFnC = 0    # number of files read in total
    # Batch directories
    for dirBatch in os.listdir(path):
        batchDir = os.path.join(path, dirBatch)
        if dirBatch.startswith('.') or not os.path.isdir(batchDir):
            continue  # directories only
        cityname = cityShorten.get(dirBatch.upper())
        if cityname is None:
            print(' skip unknown directory: ', dirBatch)
            continue
        if cityname not in cities:
            continue

        # Gather the workbook paths of this city first, then read them.
        workbooks = []
        for fileC in os.listdir(batchDir):
            if fileC[:1] == '.':
                continue
            entry = os.path.join(batchDir, fileC)
            if os.path.isdir(entry):
                # Sub-directory: only 'weixin' folders hold WeChat workbooks.
                if 'weixin' in fileC.lower():
                    print(' ', entry)
                    workbooks.extend(os.path.join(entry, f)
                                     for f in os.listdir(entry) if is_excel(f))
                continue
            if is_excel(fileC):
                workbooks.append(entry)

        count = 0
        for workbook in workbooks:
            dfdfwxc = pd.read_excel(workbook)
            dfdfwxc['市州'] = cityname
            fName = os.path.splitext(os.path.basename(workbook))[0]
            print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
            if not dfdfwxc.empty:
                frames.append(dfdfwxc)
            count = count + 1
        countFnC += count
        if count > 0:
            countC += 1

    if frames:
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        dfWX = pd.concat(frames, sort=False)
        # Keep the standard columns first (creating missing ones as NaN),
        # followed by any extra columns in order of first appearance.
        extras = [c for c in dfWX.columns if c not in cols]
        dfWX = dfWX.reindex(columns=cols + extras)
    else:
        dfWX = pd.DataFrame(columns=cols)
    print(" Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
|
|||
|
|
|||
|
# Read the xlsx files found in the data directory and concatenate them
def getTTData(path, cities, hasBody=False):
    """Collect the Toutiao Excel exports found under *path* into one DataFrame.

    The directory tree that is walked looks like::

        path/<city dir>/<dir whose name contains 'tt'>/<file>.xlsx

    * ``<city dir>`` is a city code or short name (see ``cityShorten``);
      hidden entries, plain files, and names containing ``weixin`` or
      ``weibo`` are ignored, as are directories that do not map to a known
      city.
    * Inside a city directory only sub-directories whose name contains
      ``tt`` (and neither ``weixin`` nor ``weibo``) are read.
    * Every non-hidden ``.xlsx`` file in such a directory is loaded.  A file
      that cannot be read is reported together with the error and skipped,
      so one bad file does not abort the run.

    Args:
        path: Root data directory.
        cities: Collection of full city names to include.
        hasBody: Accepted for interface compatibility; not used here.

    Returns:
        A DataFrame whose first columns are ``account, date, title, nread,
        ncomment, content, url, origin, city``, followed by any extra columns
        found in the files.  ``city`` holds the full city name.  Empty
        (columns only) when nothing was read.
    """
    # Directory name -> full city name.  Codes are stored upper-case and
    # looked up through ``str.upper()`` so that 'LZ' and 'lz' both resolve;
    # Chinese short names are unaffected by ``upper()``.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
        'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
        'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门',
        'XQ': '兰州新区', 'LZXQ': '兰州新区',
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
        '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
        '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
        '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # account date title nread ncomment content url origin
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    def is_skipped(name, full_path):
        # Shared filter for city directories and their sub-directories.
        lowered = name.lower()
        return (name.startswith('.')
                or not os.path.isdir(full_path)
                or 'weixin' in lowered
                or 'weibo' in lowered)

    def with_standard_columns(frame):
        # Standard columns first (missing ones become NaN), extras afterwards.
        extras = [c for c in frame.columns if c not in cs]
        return frame.reindex(columns=cs + extras)

    frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        cityDir = os.path.join(path, dirC)
        if is_skipped(dirC, cityDir):
            continue
        cityName = cityShorten.get(dirC.upper())
        if cityName is None:
            # Previously an unknown directory aborted the run with KeyError.
            print(' skip unknown directory: ', dirC)
            continue
        if cityName not in cities:
            continue
        print(' city: ', cityName, dirC)
        cityCount += 1

        cityFrames = []
        for dirCT in os.listdir(cityDir):
            ttDir = os.path.join(cityDir, dirCT)
            if is_skipped(dirCT, ttDir) or 'tt' not in dirCT.lower():
                continue
            print(' read TT... dir:', dirCT)
            for fn in os.listdir(ttDir):
                if fn[:1] == '.' or not fn[-5:] == '.xlsx':
                    continue
                fileAs = os.path.join(ttDir, fn)
                # Best effort per file, but report why a file was rejected.
                try:
                    dfdftt = pd.read_excel(fileAs)
                except Exception as exc:
                    print("read file failed. ", fileAs, exc)
                    continue
                if not dfdftt.empty:
                    cityFrames.append(dfdftt)

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        if cityFrames:
            dfTTC = with_standard_columns(pd.concat(cityFrames, sort=False))
        else:
            dfTTC = pd.DataFrame(columns=cs)
        print(' 读入头条数据行数', dfTTC.shape)
        if not dfTTC.empty:
            dfTTC['city'] = cityName
            frames.append(dfTTC)

    if frames:
        dfTT = with_standard_columns(pd.concat(frames, sort=False))
    else:
        dfTT = pd.DataFrame(columns=cs)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
|
|||
|
|
|||
|
|
|||
|
def fetch_chinese(s):
    """Strip everything except Chinese characters (U+4E00..U+9FA5) from *s*.

    NOTE(review): this re-defines the identically named function that appears
    earlier in this file; being the later definition, it is the one in effect
    at run time.
    """
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)
|
|||
|
|
|||
|
if __name__ == "__main__":
|
|||
|
starttime = datetime.datetime.now()
|
|||
|
_RATIO = 0.5
|
|||
|
isDoWX = True
|
|||
|
isDoWB = True
|
|||
|
isDoTT = True
|
|||
|
cities = [
|
|||
|
'临夏回族自治州',
|
|||
|
'白银市',
|
|||
|
'定西市',
|
|||
|
'酒泉市',
|
|||
|
'嘉峪关市',
|
|||
|
'平凉市',
|
|||
|
'庆阳市',
|
|||
|
'天水市',
|
|||
|
'武威市',
|
|||
|
'兰州新区',
|
|||
|
'陇南市',
|
|||
|
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
|
|||
|
'省直部门', # 共12市2州1新区
|
|||
|
]
|
|||
|
'''
|
|||
|
cities = [
|
|||
|
'临夏回族自治州',
|
|||
|
'省直部门', # 共12市2州1新区
|
|||
|
]
|
|||
|
'''
|
|||
|
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
|
|||
|
cities = ['酒泉市']
|
|||
|
# 转发任务
|
|||
|
sTaskTitle = '内容'
|
|||
|
sTaskDate = '时间'
|
|||
|
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年6月/季度报告/转发台账第二季度.xlsx', parse_dates=[sTaskDate])
|
|||
|
dfTask.dropna(axis=0,subset = [sTaskTitle])
|
|||
|
yT0 = dfTask.columns.get_loc('序号')
|
|||
|
yT1 = dfTask.columns.get_loc(sTaskTitle)
|
|||
|
#dfTask[sTaskDate] = pd.to_datetime(dfTask[sTaskDate]).dt.date
|
|||
|
|
|||
|
# 账号信息
|
|||
|
strFnAccount = 'D:/Projects/POM/DATA/2023年7月/6月报告/全国报送系统表单_2023.6.30.xlsx'
|
|||
|
dfAllAccount = pd.read_excel(strFnAccount)
|
|||
|
|
|||
|
# 增加列
|
|||
|
dfAllAccount.loc[:, '转发数'] = 0
|
|||
|
dfAllAccount.loc[:, '阅读数'] = 0
|
|||
|
dfAllAccount = pd.concat([dfAllAccount, pd.DataFrame(np.zeros((dfAllAccount.shape[0], dfTask.shape[0])), columns=dfTask['序号'].astype(str).tolist())], axis=1)
|
|||
|
|
|||
|
# 整理数据
|
|||
|
dfAllAccount['市/省局'] = dfAllAccount['市/省局'].fillna('省直部门')
|
|||
|
dfAllAccount['区县/地方部门'] = dfAllAccount['区县/地方部门'].fillna('市直部门')
|
|||
|
dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['临夏回族自治州', '甘南藏族自治州'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
|
|||
|
dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['省直部门'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '省直部门'
|
|||
|
|
|||
|
# 过长名称替换为简称,便于绘图
|
|||
|
dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
|
|||
|
dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'
|
|||
|
|
|||
|
sUnitName = '单位全称' #单位全称
|
|||
|
#sUnitSubjectName = '开设主体' #开设主体
|
|||
|
#yAccountUnit = dfAllAccount.columns.get_loc(sUnitName)
|
|||
|
#yAccountUnitSubject = dfAllAccount.columns.get_loc(sUnitSubjectName)
|
|||
|
yAccountName = dfAllAccount.columns.get_loc('账号名称')
|
|||
|
yAccountCity = dfAllAccount.columns.get_loc('市/省局')
|
|||
|
yAccountCounty = dfAllAccount.columns.get_loc('区县/地方部门')
|
|||
|
|
|||
|
|
|||
|
dfAllAccount.loc[dfAllAccount['市/省局'].isin(['白银市',]) , '单位全称'] = dfAllAccount.loc[dfAllAccount['市/省局'].isin(['白银市',]) , '开设主体']
|
|||
|
newNames = {"白银市公安局交通警察支队车辆管理所":"白银市公安局","白银市公安局交通警察支队":"白银市公安局",
|
|||
|
"白银市公安局交通警察支队铜城高速公路大队":"白银市公安局","白银市公安局交通警察支队响泉高速公路大队":"白银市公安局",
|
|||
|
"白银市公安局交通警察支队会师高速公路大队":"白银市公安局","白银市公安局交通警察支队条山高速公路大队":"白银市公安局",
|
|||
|
"白银市公安局出入境管理科":"白银市公安局","白银市禁毒委员会办公室":"白银市公安局",
|
|||
|
"白银市公安局交通警察支队喜泉高速公路大队":"白银市公安局","白银市卫生计生综合监督执法局":"白银市卫生健康委员会",}
|
|||
|
dfAllAccount[sUnitName].replace(newNames, inplace=True)
|
|||
|
|
|||
|
|
|||
|
fnTemplate = 'D:/Projects/POM/DATA/2023年7月/6月报告/POM_ForewardTemplate.docx'
|
|||
|
|
|||
|
# 数据根目录,
|
|||
|
strPath = ['D:/Projects/POM/DATA/2023年6月/季度报告/全文/']
|
|||
|
strOutputPath = 'D:/Projects/POM/DATA/2023年6月/季度报告/转发/'
|
|||
|
|
|||
|
context = {
|
|||
|
"year": "2023",
|
|||
|
"month": "6",
|
|||
|
"pubMonth": "7",
|
|||
|
"dateStart": "2023年4月1日",
|
|||
|
"dateEnd": "2023年6月30日"
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
################################################
|
|||
|
# 创建存储矩阵
|
|||
|
dfO = pd.DataFrame(columns=['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州'] )
|
|||
|
|
|||
|
|
|||
|
################################################
|
|||
|
|
|||
|
# WX
|
|||
|
if isDoWX:
|
|||
|
print('=============================================================')
|
|||
|
print('---- WX ----')
|
|||
|
dfWX = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getWXData(strP, cities)
|
|||
|
dfWX = dfWX.append(ddff)
|
|||
|
|
|||
|
dfWX = dfWX.fillna(value=0)
|
|||
|
yWXtitle = dfWX.columns.get_loc('标题')
|
|||
|
yWXnread = dfWX.columns.get_loc('阅读数')
|
|||
|
yWXdate = dfWX.columns.get_loc('日期')
|
|||
|
yWXurl = dfWX.columns.get_loc('链接')
|
|||
|
|
|||
|
# 公众号 链接 日期 标题 内容 头条 city
|
|||
|
## 逐个市州统计每个账号的转发情况
|
|||
|
#cities = dfWX['市州'].unique()
|
|||
|
for city in cities:
|
|||
|
print('---- WX title match', city, ' ----' )
|
|||
|
# 本市微信数据
|
|||
|
dataC = dfWX.loc[dfWX['市州'] == city].copy()
|
|||
|
# 获取微信账号数
|
|||
|
accounts = dataC['公众号'].unique()
|
|||
|
|
|||
|
# 所有微信账号数
|
|||
|
maskCWX = ( (dfAllAccount['账号类型'] == '微信服务号')|(dfAllAccount['账号类型'] == '微信订阅号') ) & (dfAllAccount['市/省局'] == city)
|
|||
|
accountNumCWX = maskCWX.tolist().count(True)
|
|||
|
|
|||
|
# 按获取得微信账号遍历
|
|||
|
for account in accounts:
|
|||
|
#print(account)
|
|||
|
# 该账号的所有文章
|
|||
|
dataA = dataC.loc[dataC['公众号'] == account].copy() # 一个公众号的所有文章
|
|||
|
sR = pd.Series(dtype='object')
|
|||
|
sR['类型'] = '微信'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
arn = 0
|
|||
|
|
|||
|
# 从账号信息中匹配该账号详细信息
|
|||
|
mask = ( (dfAllAccount['账号类型'] == '小程序+微信')
|
|||
|
| (dfAllAccount['账号类型'] == '微信服务号')
|
|||
|
| (dfAllAccount['账号类型'] == '微信订阅号') ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
|||
|
if mask.any():
|
|||
|
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
|||
|
if sxq.lower() !='nan':
|
|||
|
sR['区县'] = sxq
|
|||
|
sdwmc = str(dfAllAccount.loc[mask, sUnitName].values[0])
|
|||
|
if sdwmc.lower() != 'nan':
|
|||
|
sR['单位名称'] = sdwmc
|
|||
|
else:
|
|||
|
print(' !!!! 微信', account, '在', city, '无详细信息' )
|
|||
|
continue
|
|||
|
|
|||
|
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
|||
|
for i in range(dfTask[sTaskTitle].count()):
|
|||
|
# 对于每一篇任务文章
|
|||
|
rn = dfTask.iloc[i, yT0] # 序号
|
|||
|
ssrt = str(dfTask.iloc[i, yT1]) # 标题/内容
|
|||
|
rt = fetch_chinese(ssrt) # 只取汉字
|
|||
|
forwarded = 0 # 转发数
|
|||
|
readNum = 0 # 阅读数
|
|||
|
# 查看该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str1 = fetch_chinese(str(dataA.iloc[j, yWXtitle])) # 只取汉字
|
|||
|
|
|||
|
# 任务标题过长,截取前半部分进行对比
|
|||
|
if len(rt) > len(str1):
|
|||
|
strRT = rt[:len(str1)]
|
|||
|
else:#文章标题过长,只比较任务标题长度部分
|
|||
|
strRT = rt
|
|||
|
str1 = str1[:len(rt)]
|
|||
|
|
|||
|
|
|||
|
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
|||
|
'''
|
|||
|
if (i==4 or i==5 or i==6 ) and account=='陇南市工业和信息化局': # and ratio<0.7 and ratio > 0.3 :
|
|||
|
if ratio > 0.5:
|
|||
|
print('-----------------')
|
|||
|
print(ratio)
|
|||
|
print(strRT)
|
|||
|
print(str1)
|
|||
|
'''
|
|||
|
|
|||
|
# 遇到相似的,认为已转发,即跳出不再查找
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
readNum += int(dataA.iloc[j, yWXnread])
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
sR[str(rn)] = forwarded # 记录该篇文章的转发数
|
|||
|
|
|||
|
count += forwarded # 累加该篇文章的转发数
|
|||
|
arn += readNum # 累加该篇文章的阅读数
|
|||
|
|
|||
|
# 记录该篇任务转发情况加入
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': ssrt,
|
|||
|
'类型': '微信',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, yWXdate],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, yWXurl],
|
|||
|
'市州': city,
|
|||
|
'阅读数': readNum,
|
|||
|
}], ignore_index=True)
|
|||
|
#记录该任务的转发情况
|
|||
|
dfAllAccount.loc[mask, str(rn)] = forwarded
|
|||
|
#记录该账号的总转发数
|
|||
|
dfAllAccount.loc[mask, '转发数'] = count
|
|||
|
sR['转发数'] = count
|
|||
|
sR['阅读数'] = arn
|
|||
|
# 全市总转发文章篇数
|
|||
|
ccwx = dfAllAccount.loc[maskCWX, '转发数'].sum()
|
|||
|
# 全市总转发率
|
|||
|
rcc = ccwx/accountNumCWX/dfTask.shape[0]
|
|||
|
print(' ', city, '共有', accountNumCWX, '个微信号,获取数据', len(accounts), '个。共转发', ccwx, '次,转发率{:.1f}%'.format(rcc*100) )
|
|||
|
#countWxForewards = dfRR.shape[0]
|
|||
|
#print(' 获取 WX 账号数', len(dfWX['公众号'].unique()),'参与转发账号数', countWxForewards)
|
|||
|
|
|||
|
# WB
|
|||
|
if isDoWB:
|
|||
|
print('=============================================================')
|
|||
|
print('---- WB data read ----')
|
|||
|
#获取微博数据
|
|||
|
dfWB = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getWBData(strP, cities)
|
|||
|
dfWB = dfWB.append(ddff)
|
|||
|
yWBcontent = dfWB.columns.get_loc('微博正文')
|
|||
|
yWBdate = dfWB.columns.get_loc('date')
|
|||
|
yWBurl = dfWB.columns.get_loc('头条文章url')
|
|||
|
################################################
|
|||
|
# WB
|
|||
|
# 微博id 微博正文 头条文章url 原始图片url 被转发微博原始图片url 是否为原创微博 微博视频url 发布位置 date
|
|||
|
# 发布工具 点赞数 转发数 评论数 weiboID weiboName city
|
|||
|
#cities = dfWB['市州'].unique()
|
|||
|
for city in cities:
|
|||
|
print('---- WB match', city, ' ----' )
|
|||
|
# 本市微博数据
|
|||
|
dataC = dfWB.loc[dfWB['市州'] == city].copy()
|
|||
|
# 获取数据的微博账号
|
|||
|
accounts = dataC['weiboName'].unique()
|
|||
|
|
|||
|
# 本市所有微博账号
|
|||
|
maskCWB = (dfAllAccount['账号类型'] == '新浪微博') & (dfAllAccount['市/省局'] == city)
|
|||
|
accountNumCWB = maskCWB.tolist().count(True)
|
|||
|
|
|||
|
# 按获取的微博账号遍历
|
|||
|
for account in accounts:
|
|||
|
# print(account)
|
|||
|
# 该公众号的所有文章
|
|||
|
dataA = dataC.loc[dataC['weiboName'] == account].copy()
|
|||
|
sR = pd.Series(dtype='object')
|
|||
|
sR['类型'] = '新浪微博'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
|
|||
|
# 为转发账号匹配单位全称和所属县区
|
|||
|
mask = ( dfAllAccount['账号类型'] == '新浪微博' ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
|||
|
if mask.any():
|
|||
|
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
|||
|
if sxq.lower() !='nan':
|
|||
|
sR['区县'] = sxq
|
|||
|
sdwmc = str(dfAllAccount.loc[mask, sUnitName].values[0])
|
|||
|
if sdwmc.lower() != 'nan':
|
|||
|
sR['单位名称'] = sdwmc
|
|||
|
else:
|
|||
|
print(' !!!! 微博', account, '在', city, '无详细信息' )
|
|||
|
continue
|
|||
|
|
|||
|
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
|||
|
for i in range(dfTask[sTaskTitle].count()):
|
|||
|
rn = dfTask.iloc[i, yT0] # 任务序号
|
|||
|
ssrt = str(dfTask.iloc[i, yT1]) # 任务标题
|
|||
|
rt = fetch_chinese(ssrt) # 只取中文
|
|||
|
forwarded = 0
|
|||
|
# 对该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str0 = str(dataA.iloc[j, yWBcontent])
|
|||
|
str1 = fetch_chinese(str0)
|
|||
|
str2 = str1[:len(rt)] # 取任务标题相同汉字数进行比较
|
|||
|
|
|||
|
ratio = difflib.SequenceMatcher(None, rt, str2).quick_ratio()
|
|||
|
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
#记记录该任务的转发情况
|
|||
|
dfAllAccount.loc[mask, str(rn)] = forwarded
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
# 转发数累加到本账号里
|
|||
|
count += forwarded
|
|||
|
|
|||
|
# 记录该篇任务转发情况加入
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '新浪微博',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, yWBdate],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, yWBurl],
|
|||
|
'市州': city,
|
|||
|
}], ignore_index=True)
|
|||
|
# 记录该账号的总转发数
|
|||
|
dfAllAccount.loc[mask, '转发数'] = count
|
|||
|
sR['转发数'] = count
|
|||
|
|
|||
|
# 全市总转发文章篇数
|
|||
|
ccwb = dfAllAccount.loc[maskCWB, '转发数'].sum()
|
|||
|
# 全市总转发率
|
|||
|
rcc = ccwb/accountNumCWB/dfTask.shape[0]
|
|||
|
print(' ', city, '共有', accountNumCWB, '个微博号,获取数据', len(accounts), '个。共转发', ccwb, '次,转发率{:.1f}%'.format(rcc*100) )
|
|||
|
|
|||
|
#countWbForewards = dfRR.shape[0] - countWxForewards
|
|||
|
#print(' 获取 WB 账号数', len(dfWB['weiboName'].unique()), '参与转发账号数', countWbForewards)
|
|||
|
|
|||
|
|
|||
|
# TT
|
|||
|
if isDoTT:
|
|||
|
print('=============================================================')
|
|||
|
print('---- TT data read ----')
|
|||
|
# id userId source city tid cellType title
|
|||
|
# time-stamp date url commentCount readNum likeNum showNum
|
|||
|
# 获取头条数据
|
|||
|
dfTT = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getTTData(strP, cities)
|
|||
|
dfTT = dfTT.append(ddff)
|
|||
|
|
|||
|
yTTtitle = dfTT.columns.get_loc('title')
|
|||
|
yTTdate = dfTT.columns.get_loc('date')
|
|||
|
yTTurl = dfTT.columns.get_loc('url')
|
|||
|
|
|||
|
# 逐个市州统计账号转发情况
|
|||
|
for city in cities:
|
|||
|
print("++++++++++++++++++++++++++++++++++++++++++++++++++")
|
|||
|
print('---- TT title match', city, ' ----' )
|
|||
|
# 本市头条数据
|
|||
|
dataC = dfTT.loc[dfTT['city'] == city].copy()
|
|||
|
# 获取数据的头条账号
|
|||
|
accounts = dataC['account'].unique()
|
|||
|
|
|||
|
# 本市所有头条账号信息
|
|||
|
maskCTT = (dfAllAccount['账号类型'] == '今日头条') & (dfAllAccount['市/省局'] == city)
|
|||
|
accountNumCTT = maskCTT.tolist().count(True)
|
|||
|
|
|||
|
# 按头条数据的账号遍历
|
|||
|
for account in accounts:
|
|||
|
#print(account)
|
|||
|
# 该账号的所有文章
|
|||
|
dataA = dataC[dataC['account']==account]
|
|||
|
sR = pd.Series([], dtype=pd.StringDtype())
|
|||
|
sR['类型'] = '今日头条'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
|
|||
|
# 为转发账号匹配单位全称和所属县区
|
|||
|
mask = ( dfAllAccount['账号类型'] == '今日头条' ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
|||
|
if mask.any():
|
|||
|
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
|||
|
if sxq.lower() !='nan':
|
|||
|
sR['区县'] = sxq
|
|||
|
sdwmc = str(dfAllAccount.loc[mask, sUnitName].values[0])
|
|||
|
if sdwmc.lower() != 'nan':
|
|||
|
sR['单位名称'] = sdwmc
|
|||
|
else:
|
|||
|
print(' !!!! 头条', account, '在', city, '无详细信息' )
|
|||
|
continue
|
|||
|
|
|||
|
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
|||
|
for i in range(dfTask[sTaskTitle].count()):
|
|||
|
# 对于每一篇任务文章
|
|||
|
rn = dfTask.iloc[i, yT0] # 任务序号
|
|||
|
ssrt = str(dfTask.iloc[i, yT1]) # 任务标题
|
|||
|
rt = fetch_chinese(ssrt) # 只取中文
|
|||
|
forwarded = 0
|
|||
|
|
|||
|
# 查看该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str0 = str(dataA.iloc[j, yTTtitle])
|
|||
|
str1 = fetch_chinese(str0)
|
|||
|
#
|
|||
|
if len(rt) > len(str1): # 若任务标题过长,截取前半部分进行对比
|
|||
|
strRT = rt[:len(str1)]
|
|||
|
else: #若文章标题过长,只比较任务标题长度部分
|
|||
|
strRT = rt
|
|||
|
str1 = str1[:len(rt)]
|
|||
|
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
#记录该任务转发情况
|
|||
|
dfAllAccount.loc[mask, str(rn)] = forwarded
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
count += forwarded
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '今日头条',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, yTTdate],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, yTTurl],
|
|||
|
'市州': city,
|
|||
|
}], ignore_index=True)
|
|||
|
|
|||
|
# 记录该账号转发情况
|
|||
|
dfAllAccount.loc[mask, '转发数'] = count
|
|||
|
sR['转发数'] = count
|
|||
|
|
|||
|
|
|||
|
# 全市总转发文章篇数
|
|||
|
cctt = dfAllAccount.loc[maskCTT, '转发数'].sum()
|
|||
|
# 全市总转发率
|
|||
|
rcc = cctt/accountNumCTT/dfTask.shape[0]
|
|||
|
print(' ', city, '共有', accountNumCTT, '个头条号,获取数据', len(accounts), '个。共转发', cctt, '次,转发率{:.1f}%'.format(rcc*100) )
|
|||
|
|
|||
|
#countTtForewards = dfRR.shape[0] - countWxForewards - countWbForewards
|
|||
|
#print(' 获取 TT 账号数', len(dfTT['account'].unique()),'参与转发账号数', countTtForewards)
|
|||
|
|
|||
|
if isDoWX or isDoWB or isDoTT:
|
|||
|
print('=============================================================')
|
|||
|
print('---- STATISTICS ----')
|
|||
|
print('=============================================================')
|
|||
|
|
|||
|
dfAllAccount.to_excel(strOutputPath + '甘肃省_转发账号.xlsx')
|
|||
|
dfO.to_excel(strOutputPath + '甘肃省_转发文章.xlsx')
|
|||
|
|
|||
|
|
|||
|
print('---- 统计市州转发率 ----')
|
|||
|
for city in cities:
|
|||
|
#if city in ['兰州新区', '省直部门']:
|
|||
|
# continue
|
|||
|
print(" add up city", city)
|
|||
|
|
|||
|
|
|||
|
maskC = ( (dfAllAccount['账号类型'] == '新浪微博')
|
|||
|
| (dfAllAccount['账号类型'] == '微信服务号')
|
|||
|
| (dfAllAccount['账号类型'] == '微信订阅号')
|
|||
|
| (dfAllAccount['账号类型'] == '今日头条') ) & (dfAllAccount['市/省局'] == city)
|
|||
|
|
|||
|
# dfdfC = dfAllAccount.loc[((dfAllAccount['账号类型'] == '新浪微博')
|
|||
|
# | (dfAllAccount['账号类型'] == '微信服务号')
|
|||
|
# | (dfAllAccount['账号类型'] == '微信订阅号')
|
|||
|
# | (dfAllAccount['账号类型'] == '今日头条'))
|
|||
|
# & (dfAllAccount['市/省局'] == city)].copy()
|
|||
|
|
|||
|
dfdfC = dfAllAccount.loc[maskC,:]
|
|||
|
dfdfC.to_excel(strOutputPath + city + '_转发账号.xlsx')
|
|||
|
|
|||
|
dfOCity = dfO[dfO['市州'] == city]
|
|||
|
dfO.to_excel(strOutputPath + city + '_转发文章.xlsx')
|
|||
|
|
|||
|
#dfRRCity = dfRR.loc[dfRR['市州'] == city].copy()
|
|||
|
|
|||
|
#########################################################################################################
|
|||
|
# 统计市/州直部门转发数
|
|||
|
dfdfCD = dfdfC.loc[dfdfC['区县/地方部门'].isin(['州直部门', '市直部门', '省直部门'])].copy()
|
|||
|
dfdfCDA = pd.pivot_table(dfdfCD, index=[sUnitName], values=['账号名称'],
|
|||
|
aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfdfCDC = pd.pivot_table(dfdfCD, index=[sUnitName], values=['转发数'],
|
|||
|
aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
|
|||
|
#dfdfCDR = pd.pivot_table(dfdfCD, index=['单位全称'], values=['阅读数'],
|
|||
|
# aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfdfCD_A = pd.concat([dfdfCDA, dfdfCDC], axis=1)
|
|||
|
|
|||
|
#print('-', dfdfCD_A.columns.values)
|
|||
|
# 合并多层索引MultiIndex
|
|||
|
dfdfCD_A.columns = ['_'.join(col) for col in dfdfCD_A.columns.values]
|
|||
|
#print('=', dfdfCD_A.columns.values)
|
|||
|
# 计算转发率
|
|||
|
dfdfCD_A['rate'] = dfdfCD_A.apply(
|
|||
|
lambda x: int(x['sum_转发数'] / x['count_账号名称'] / dfTask[sTaskTitle].count() * 1000) / 1000.0, axis=1)
|
|||
|
# 排序
|
|||
|
dfdfCD_AD = dfdfCD_A[0:dfdfCD_A.shape[0] - 1].sort_values(by='rate', ascending=False)
|
|||
|
|
|||
|
dfdfCD_AD = pd.concat([dfdfCD_AD, dfdfCD_A[dfdfCD_A.shape[0] - 1:dfdfCD_A.shape[0]]], axis=0)
|
|||
|
|
|||
|
dfdfCD_AD.to_excel(strOutputPath + city + '部门转发统计表.xlsx')
|
|||
|
#dfDD
|
|||
|
|
|||
|
|
|||
|
##########################################################################################
|
|||
|
# 全市/州账号按'区县'统计
|
|||
|
# 发现目前版本pivot_table函数aggfunc用列表时,前几列计算值不准确
|
|||
|
# 所以,暂时单列计算,再合并
|
|||
|
|
|||
|
|
|||
|
#dfdfCD = dfdfC.loc[dfdfC['区县/地方部门'].isin(['州直部门', '市直部门', '省直部门'])].copy()
|
|||
|
dfdfCA = pd.pivot_table(dfdfC, index=['区县/地方部门'], values=['账号名称'],
|
|||
|
aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfdfCC = pd.pivot_table(dfdfC, index=['区县/地方部门'], values=['转发数'],
|
|||
|
aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
|
|||
|
#dfdfCDR = pd.pivot_table(dfdfCD, index=['单位全称'], values=['阅读数'],
|
|||
|
# aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
|
|||
|
dfdfC_A = pd.concat([dfdfCA, dfdfCC], axis=1)
|
|||
|
|
|||
|
#print('-', dfdfCD_A.columns.values)
|
|||
|
# 合并多层索引MultiIndex
|
|||
|
dfdfC_A.columns = ['_'.join(col) for col in dfdfC_A.columns.values]
|
|||
|
#print('=', dfdfCD_A.columns.values)
|
|||
|
# 计算转发率
|
|||
|
dfdfC_A['rate'] = dfdfC_A.apply(
|
|||
|
lambda x: int(x['sum_转发数'] / x['count_账号名称'] / dfTask[sTaskTitle].count() * 1000) / 1000.0, axis=1)
|
|||
|
# 排序
|
|||
|
dfdfC_AD = dfdfC_A[0:dfdfC_A.shape[0] - 1].sort_values(by='rate', ascending=False)
|
|||
|
|
|||
|
dfdfC_AD = pd.concat([dfdfC_AD, dfdfC_A[dfdfC_A.shape[0] - 1:dfdfC_A.shape[0]]], axis=0)
|
|||
|
|
|||
|
dfdfC_AD.to_excel(strOutputPath + city + '转发统计表.xlsx')
|
|||
|
#dfCC
|
|||
|
|
|||
|
#########################################################
|
|||
|
#
|
|||
|
# Build the per-city Word report from the docx template.
tpl = DocxTemplate(fnTemplate)

# Two autonomous prefectures are labelled '州'; every other city is '市'.
sL0 = '州' if city in ['临夏回族自治州', '甘南藏族自治州'] else '市'

# Summary figures come from the last row of each statistics table.
countyTotal = dfdfC_AD.iloc[-1]
deptTotal = dfdfCD_AD.iloc[-1]
info = {
    "strL0": sL0,
    "strL1": "区县",
    "taskCount": dfTask[sTaskTitle].count(),
    "aNum": int(countyTotal['count_账号名称']),
    "fNum": int(countyTotal['sum_转发数']),
    "r": format(countyTotal['rate'] * 100.0, '.1f'),
    "dNum": int(deptTotal['count_账号名称']),      # total number of department accounts
    "dFNum": int(deptTotal['sum_转发数']),          # total number of department reposts
    "dr": format(deptTotal['rate'] * 100.0, '.1f'),  # average department repost rate (%)
}
context.update(info)

# County repost-rate table (the '总计' total row is left out).
t1_list = [
    {'county': str(name),
     'rate': format(rec['rate'] * 100.0, '.1f'),
     'account': int(rec['count_账号名称']),
     'fNum': int(rec['sum_转发数'])}
    for name, rec in dfdfC_AD.iterrows()
    if name != "总计"
]
context['t1_contents'] = t1_list

# Department repost-rate table (the '总计' total row is left out).
t2_list = [
    {'name': str(name),
     'rate': format(rec['rate'] * 100.0, '.1f'),
     'account': int(rec['count_账号名称']),
     'fNum': int(rec['sum_转发数'])}
    for name, rec in dfdfCD_AD.iterrows()
    if name != "总计"
]
context['t2_contents'] = t2_list

# List of repost tasks.
t3_list = [
    {'id': task['序号'],
     'title': task[sTaskTitle],
     'date': task[sTaskDate].strftime('%m月%d日')}
    for _, task in dfTask.iterrows()
]
context['t3_contents'] = t3_list

# Draw the county repost-rate bar chart (last row excluded) and embed it.
fnGraph = os.path.join(strOutputPath, '_' + city + '_graphCounty.png')
drawBar(dfdfC_AD['rate'][:-1], dfdfC_AD.index[:-1], '区县转发率', fnGraph)
context.update({'graphCounty': InlineImage(tpl, fnGraph, width=Mm(120))})

tpl.render(context)
tpl.save(strOutputPath + city + '转发统计报告_2023年{}月份.docx'.format(context['month']))
|
|||
|
|
|||
|
#########################################################
|
|||
|
# Province-wide statistics: one row per city/prefecture, plus a separate
# table for the provincial departments, followed by the provincial report.
if True:
    # Full unit name -> short name used for grouping in the report.
    # The original literal listed three keys twice; in a dict literal the last
    # occurrence wins, so the earlier values were dead. For
    # "甘肃省人民政府办公厅" and "甘肃省人民政府外事办公室" the later (effective)
    # value is kept. The trailing duplicate "甘肃省公安厅": "省广电局" has been
    # dropped so the department maps to "省公安厅" like every other police entry.
    # NOTE(review): confirm that the 省广电局 mapping was a copy/paste slip.
    dShortname = {
        "甘肃省交通运输厅": "省交通厅",
        "甘肃省文化和旅游厅": "省文旅厅",
        "甘肃省司法厅": "省司法厅",
        "甘肃省人民政府国有资产监督管理委员会": "省国资委",
        "甘肃省乡村振兴局": "省乡村振兴局",
        "甘肃省民政厅": "省民政厅",
        "甘肃省财政厅": "省财政厅",
        "甘肃省人民政府驻北京办事处": "省政府驻京办",
        "甘肃省人力资源和社会保障厅": "省人社厅",
        "甘肃省无线电监测站": "省工信厅",
        "甘肃省工业和信息化厅": "省工信厅",
        "甘肃省林业和草原局": "省林草局",
        "甘肃省水利厅": "省水利厅",
        "甘肃省公共资源交易中心": "省公共资源交易中心",
        "甘肃省文物局": "省文物局",
        "甘肃省药品监督管理局": "省药监局",
        "甘肃省农村饮水安全管理办公室": "省水利厅",
        "甘肃省应急管理厅": "省应急厅",
        "甘肃省粮食和物资储备局": "省粮食局",
        "甘肃省人民政府驻新疆办事处": "省政府驻疆办",
        "甘肃省景泰川电力提灌管理局(甘肃省景泰川电力提灌工程指挥部)": "省水利厅",
        "甘肃省生态环境厅": "省生态环境厅",
        "甘肃省商务厅": "省商务厅",
        "甘肃省社会保险事业管理局": "省人社厅",
        "甘肃省科学技术厅": "省科技厅",
        "甘肃省市场监督管理局": "省市场监管局",
        "甘肃省经济合作局": "省商务厅",
        "甘肃省体育局": "省体育局",
        "甘肃省发展和改革委员会": "省发改委",
        "甘肃省审计厅": "省审计厅",
        "甘肃省教育厅": "省教育厅",
        "甘肃省民族事务委员会": "省民委",
        "甘肃省农业农村厅": "省农业农村厅",
        "甘肃省人民政府外事办公室": "省外事办",
        "甘肃省自然资源厅": "省自然资源厅",
        "甘肃省统计局": "省统计局",
        "甘肃省退役军人事务厅": "省退役军人厅",
        "甘肃省疏勒河流域水资源局": "省水利厅",
        "甘肃省广播电视局": "省广电局",
        "甘肃省讨赖河流域水资源局": "省水利厅",
        "甘肃省卫生健康委员会": "省卫健委",
        "甘肃省药品检验研究院": "省药监局",
        "甘肃省住房和城乡建设厅": "省住建厅",
        "甘肃省公安厅": "省公安厅",
        "甘肃省供销合作社联合社": "省供销社",
        "甘肃省人民政府办公厅": "省政府办公厅",
        "甘肃警察职业学院": "省公安厅",
        "甘肃省教育考试院": "省教育厅",
        "甘肃省医疗保障局": "省医保局",
        "甘肃省公安厅刑事警察总队": "省公安厅",
        "甘肃省人力资源市场": "省人社厅",
        "甘肃省不动产登记事务中心": "省自然资源厅",
        "甘肃省人力资源考试中心": "省人社厅",
        "甘肃省人民政府驻上海办事处": "省政府驻上海办",
        "甘肃省公安厅交通警察总队": "省公安厅",
        "民航甘肃机场公安局": "省公安厅",
        "甘肃省农业信息中心": "省农业农村厅",
        "甘肃省高速路政执法总队": "省交通厅",
        "甘肃省兰州市司法局强制隔离戒毒所官方微博": "省司法厅",
        "甘肃省戒毒管理局": "省司法厅",
        "甘肃省兰州监狱": "省司法厅",
        "甘肃省合作监狱": "省司法厅",
        "甘肃省天水监狱": "省司法厅",
        "甘肃省女子强制隔离戒毒所官方微博": "省司法厅",
        "甘肃省平凉监狱": "省司法厅",
        "甘肃省武威监狱": "省司法厅",
        "甘肃省武都监狱": "省司法厅",
        "甘肃省永登监狱": "省司法厅",
        "甘肃省白银监狱": "省司法厅",
        "甘肃省第一强制隔离戒毒所": "省司法厅",
        "甘肃省第三强制隔离戒毒所官方微博": "省司法厅",
        "甘肃省第二强制隔离戒毒所": "省司法厅",
        "甘肃省酒泉监狱": "省司法厅",
        "甘肃省金昌监狱": "省司法厅",
    }
    # Assign the result back to the column. Calling replace(..., inplace=True)
    # on the selected column is chained assignment: with pandas copy-on-write
    # it modifies a temporary object and leaves dfAllAccount unchanged.
    dfAllAccount[sUnitName] = dfAllAccount[sUnitName].replace(dShortname)

    # Only these four account types are included in the statistics.
    maskA = dfAllAccount['账号类型'].isin(['新浪微博', '微信服务号', '微信订阅号', '今日头条'])
    dfRR = dfAllAccount.loc[maskA, :]

    # The number of tasks is the same for every row; compute it once.
    nTaskTotal = dfTask[sTaskTitle].count()

    def _repostRate(row):
        """Return reposts / accounts / tasks for one row, truncated to 3 decimals."""
        return int(row['sum_转发数'] / row['count_账号名称'] / nTaskTotal * 1000) / 1000.0

    def _sortKeepTotalLast(df):
        """Sort all rows but the last by 'rate' (descending); keep the last row last."""
        return pd.concat(
            [df.iloc[:-1].sort_values(by='rate', ascending=False), df.iloc[-1:]],
            axis=0)

    #######
    # Statistics per city/prefecture. Each aggregate gets its own pivot table
    # and the two are concatenated side by side.
    dfCountyA = pd.pivot_table(dfRR, index=['市/省局'], values=['账号名称'],
                               aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
    dfCountyC = pd.pivot_table(dfRR, index=['市/省局'], values=['转发数'],
                               aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
    dfCounty = pd.concat([dfCountyA, dfCountyC], axis=1)

    # Flatten the two-level header into 'count_账号名称' / 'sum_转发数'.
    dfCounty.columns = ['_'.join(col) for col in dfCounty.columns.values]
    dfCounty['rate'] = dfCounty.apply(_repostRate, axis=1)

    # Sort everything except the trailing '总计' row, then re-append that row.
    dfCC = _sortKeepTotalLast(dfCounty)
    dfCC.to_excel(strOutputPath + '甘肃省市州转发统计表.xlsx')

    # Statistics for the provincial departments only.
    # dfDD starts out empty so that the report code below can always test
    # dfDD.empty, even when there is no '省直部门' row at all.
    dfDD = pd.DataFrame()
    dfRRD = dfRR[dfRR['市/省局'] == '省直部门']
    if dfRRD.shape[0] > 0:
        dfDA = pd.pivot_table(dfRRD, index=[sUnitName], values=['账号名称'],
                              aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
        dfDC = pd.pivot_table(dfRRD, index=[sUnitName], values=['转发数'],
                              aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
        dfD = pd.concat([dfDA, dfDC], axis=1)

        # Flatten the two-level header, compute the rate, sort with total last.
        dfD.columns = ['_'.join(col) for col in dfD.columns.values]
        dfD['rate'] = dfD.apply(_repostRate, axis=1)
        dfDD = _sortKeepTotalLast(dfD)
        dfDD.to_excel(strOutputPath + '甘肃省直部门转发统计表.xlsx')

    #########################################################
    # Build the provincial Word report from the docx template.
    tpl = DocxTemplate(fnTemplate)
    cityTotal = dfCC.iloc[-1]
    info = {
        "strL0": "省",
        "strL1": "市州",
        "taskCount": nTaskTotal,
        "aNum": int(cityTotal['count_账号名称']),
        "fNum": int(cityTotal['sum_转发数']),
        "r": '%.1f' % (cityTotal['rate'] * 100.0),
    }
    if dfDD.empty:
        info.update({
            "dNum": 0,            # total number of department accounts
            "dFNum": 0,           # total number of department reposts
            "dr": '%.1f' % (0),   # average department repost rate (%)
        })
    else:
        deptTotal = dfDD.iloc[-1]
        info.update({
            "dNum": int(deptTotal['count_账号名称']),      # total number of department accounts
            "dFNum": int(deptTotal['sum_转发数']),          # total number of department reposts
            "dr": '%.1f' % (deptTotal['rate'] * 100.0),     # average department repost rate (%)
        })
    context.update(info)

    # Repost-rate table for all cities/prefectures ('总计' row left out).
    t1_list = [
        {'county': str(name),                          # city / prefecture
         'rate': '%.1f' % (rec['rate'] * 100.0),       # repost rate (%)
         'account': int(rec['count_账号名称']),         # number of accounts
         'fNum': int(rec['sum_转发数'])}                # number of reposts
        for name, rec in dfCC.iterrows()
        if name != "总计"
    ]
    context['t1_contents'] = t1_list

    # Repost-rate table for provincial departments; stays empty when dfDD is
    # empty because iterrows() then yields nothing.
    t2_list = [
        {'name': str(name),
         'rate': '%.1f' % (rec['rate'] * 100.0),
         'account': int(rec['count_账号名称']),
         'fNum': int(rec['sum_转发数'])}
        for name, rec in dfDD.iterrows()
        if name != "总计"
    ]
    context['t2_contents'] = t2_list

    # List of repost tasks. The title is read through sTaskTitle, the same
    # column the task count above and the per-city report use, instead of the
    # hard-coded '内容' column.
    t3_list = [
        {'id': task['序号'],
         'title': task[sTaskTitle],
         'date': task[sTaskDate].strftime('%m月%d日')}
        for _, task in dfTask.iterrows()
    ]
    context['t3_contents'] = t3_list

    # Draw the city/prefecture repost-rate bar chart ('总计' row excluded)
    # and embed it in the report.
    fnGraph = os.path.join(strOutputPath, '_ALL_graphCounty.png')
    drawBar(dfCC['rate'][:-1], dfCC.index[:-1], '市州转发率', fnGraph)
    context.update({'graphCounty': InlineImage(tpl, fnGraph, width=Mm(120))})

    tpl.render(context)
    # NOTE(review): the year in the output file name is hard-coded to 2023.
    tpl.save(strOutputPath + '甘肃省转发统计报告_2023年{}月份.docx'.format(context['month']))
|
|||
|
|
|||
|
|
|||
|
# Print the elapsed wall-clock time since starttime was taken.
endtime = datetime.datetime.now()
usedtime = endtime - starttime
print("time: ", usedtime)
|