965 lines
47 KiB
Python
965 lines
47 KiB
Python
|
import datetime
|
|||
|
import csv
|
|||
|
import pandas as pd
|
|||
|
import numpy as np
|
|||
|
import glob, os, re, time
|
|||
|
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
from matplotlib.ticker import FuncFormatter
|
|||
|
from difflib import SequenceMatcher
|
|||
|
from collections import Counter
|
|||
|
import difflib
|
|||
|
|
|||
|
|
|||
|
from docxtpl import DocxTemplate
|
|||
|
from docxtpl import InlineImage
|
|||
|
from docx.shared import Mm
|
|||
|
|
|||
|
import jieba
|
|||
|
import jieba.posseg as pseg
|
|||
|
|
|||
|
#---
|
|||
|
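# Goal: convert the date values that show up as Excel serial numbers in the
# column names into a standard date format. The reasoning is:
# 1. In Excel, verify that 2018-11-02 corresponds to the serial number 43406.
# 2. Subtract 43406 days from 2018-11-02 to find the day the count starts
#    from; the result is 1899-12-30.
# 3. Therefore the date for any serial number is simply 1899-12-30 plus that
#    many days.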
|
|||
|
def ts2date(dates, sf='%Y-%m-%d'):
    """Convert an Excel serial day number into a formatted date string.

    The serial number counts days from 1899-12-30 (serial 43406 is
    2018-11-02).

    Args:
        dates: Serial day number. Plain ints/floats as well as NumPy scalars
            (what pandas returns when reading a sheet) are accepted.
        sf: ``strftime`` format of the returned string.

    Returns:
        The date that lies ``dates`` days after 1899-12-30, formatted with
        ``sf``.
    """
    # datetime.timedelta refuses NumPy integer scalars, so normalise the
    # input to a built-in float first.
    offset = datetime.timedelta(days=float(dates))
    origin = datetime.datetime(1899, 12, 30)
    return (origin + offset).strftime(sf)
|
|||
|
#---
|
|||
|
|
|||
|
def fetch_chinese(s):
    """Return *s* with every character outside U+4E00..U+9FA5 removed."""
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)
|
|||
|
|
|||
|
# 画柱状图
|
|||
|
def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart with a percentage y axis and save it to *fn*.

    Args:
        data: Bar heights. The y axis is fixed to the range 0..1 and each
            tick is rendered as ``100 * value`` followed by ``%``.
        recipe: Category labels; used both as x positions and tick labels.
        title: Chart title.
        fn: Target path handed to ``plt.savefig``.
    """

    def _as_percent(value, _position):
        # Tick formatter: 0.25 -> '25%'.
        return '%2.0f' % (100 * value) + '%'

    plt.figure(figsize=(6, 4))
    # NOTE(review): SimHei is presumably selected so that Chinese labels
    # render; the setting is global to matplotlib, not local to this figure.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))

    plt.gca().yaxis.set_major_formatter(FuncFormatter(_as_percent))
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)
    # plt.show()

    # Tear the figure down so repeated calls do not accumulate figures.
    plt.cla()
    plt.clf()
    plt.close()
|
|||
|
|
|||
|
def getWBData(path, cities, hasBody=False):
    """Collect the per-account Weibo CSV exports below *path* into one frame.

    The directory tree is walked as
    ``path/<city dir>/<period dir>/<account name>/<account id>.csv``.
    City directories are translated to full city names through
    ``cityShorten``; directories whose name contains ``weixin`` or ``tt`` and
    hidden entries (leading ``.``) are skipped at the city and period level.

    Args:
        path: Root directory of the downloaded data.
        cities: Full city names to include; other city directories are
            ignored.
        hasBody: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame holding the 13 export columns plus ``weiboID`` (CSV file
        stem), ``weiboName`` (account directory name) and ``市州`` (full city
        name). All values read from the CSV files stay strings because the
        header row is read as data and then dropped.
    """
    # Directory name (upper/lower-case abbreviation or short Chinese name)
    # -> full city name as used in ``cities``.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市', 'LN': '陇南市', 'JYG': '嘉峪关市',
        'TS': '天水市', 'GN': '甘南藏族自治州', 'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',

        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
        'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',

        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # Column layout of the Weibo download tool's CSV export.
    cols = ['微博id', '微博正文', '头条文章url', '原始图片url', '被转发微博原始图片url', '是否为原创微博', '微博视频url', '发布位置', 'date', '发布工具', '点赞数',
            '转发数', '评论数']
    cs = cols + ['weiboID', 'weiboName', '市州']

    city_frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        city_dir = os.path.join(path, dirC)
        if dirC[:1] == '.' or not os.path.isdir(city_dir):
            continue
        if 'weixin' in dirC.lower() or 'tt' in dirC.lower():
            continue
        city = cityShorten.get(dirC)
        if city is None:
            # An unrelated folder next to the city folders must not abort
            # the whole run with a KeyError.
            print(' skip unknown city dir: ', dirC)
            continue
        if city not in cities:
            continue
        print(' city: ', city, dirC)
        cityCount += 1

        account_frames = []
        for dirCT in os.listdir(city_dir):
            # Period directories, e.g. weibo, weibo_1.
            period_dir = os.path.join(city_dir, dirCT)
            if dirCT[:1] == '.' or not os.path.isdir(period_dir):
                continue
            if 'weixin' in dirCT.lower() or 'tt' in dirCT.lower():
                continue
            print(' read WB... dir:', dirCT)
            for dirA in os.listdir(period_dir):
                # Each account-name directory holds the file named after
                # the account id.
                account_dir = os.path.join(period_dir, dirA)
                if dirA[:1] == '.' or not os.path.isdir(account_dir):
                    continue
                fileAs = os.listdir(account_dir)
                # Pick the first visible CSV so that the id and the file
                # that is actually read always belong together (hidden
                # files such as '._123.csv' are ignored).
                csv_names = sorted(
                    f for f in fileAs
                    if not f.startswith('.') and os.path.splitext(f)[-1] == '.csv'
                )
                if not csv_names:
                    continue
                csv_name = csv_names[0]
                dfdfwb = pd.read_csv(os.path.join(account_dir, csv_name), sep=',',
                                     header=None, names=cols, index_col=None)
                # The file's own header row was read as data; drop it.
                dfdfwb = dfdfwb.iloc[1:].copy()
                dfdfwb["weiboID"] = os.path.splitext(csv_name)[0]
                dfdfwb["weiboName"] = dirA
                account_frames.append(dfdfwb)

                if len(fileAs) > 1:
                    # Diagnostic: more than one entry in an account folder.
                    print(" +=+= ", fileAs)

        # DataFrame.append no longer exists in pandas 2.x; concatenate once.
        # The empty template keeps the column order stable.
        dfWBC = pd.concat([pd.DataFrame(columns=cols)] + account_frames)
        print(' ', dfWBC.shape)
        dfWBC['市州'] = city
        city_frames.append(dfWBC)

    dfWB = pd.concat([pd.DataFrame(columns=cs)] + city_frames)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB
|
|||
|
|
|||
|
# 从数据目录中读取xlsx文件,拼接到一起
|
|||
|
def getWXData(path, cities, hasBody=False):
    """Read the WeChat Excel exports below *path* and stack them.

    For every sub-directory of *path* (named after a city, see
    ``cityShorten``) two kinds of entries are read: Excel files lying
    directly in it, and Excel files inside sub-folders whose name contains
    ``weixin``.

    Args:
        path: Root directory of the downloaded data.
        cities: Full city names to include.
        hasBody: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame starting with the columns 公众号, 链接, 日期, 标题, 内容,
        头条, 市州, 阅读数, followed by any extra columns found in the files.
        ``市州`` is set to the full city name of the directory each file came
        from.
    """
    # Directory name (upper/lower-case abbreviation or short Chinese name)
    # -> full city name as used in ``cities``.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市', 'LN': '陇南市', 'JYG': '嘉峪关市',
        'TS': '天水市', 'GN': '甘南藏族自治州', 'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',
        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
        'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',
        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']

    def _is_excel(name):
        # Hidden files and Excel's '~$' lock files are not workbooks.
        if name.startswith(('.', '~$')):
            return False
        return os.path.splitext(name)[-1] in ('.xlsx', '.xls')

    frames = []
    countC = 0    # directories that contributed at least one file
    countFnC = 0  # files read in total
    for dirBatch in os.listdir(path):
        batch_dir = os.path.join(path, dirBatch)
        if not os.path.isdir(batch_dir):
            continue  # directories only

        # (file path, label used in the progress message)
        targets = []
        for fileC in os.listdir(batch_dir):
            if fileC[:1] == '.':
                continue
            entry = os.path.join(batch_dir, fileC)
            label = os.path.splitext(fileC)[0]
            if os.path.isdir(entry):
                # A directory is never read as a workbook itself; only
                # 'weixin' folders are searched for workbooks.
                if 'weixin' in fileC.lower():
                    print(' ', entry)
                    targets.extend(
                        (os.path.join(entry, f), label)
                        for f in os.listdir(entry) if _is_excel(f)
                    )
            elif _is_excel(fileC):
                targets.append((entry, label))
        if not targets:
            continue

        cityname = cityShorten.get(dirBatch)
        if cityname is None:
            # An unrelated folder must not abort the run with a KeyError.
            print(' skip unknown city dir: ', dirBatch)
            continue
        if cityname not in cities:
            continue

        count = 0
        for file_path, label in targets:
            dfdfwxc = pd.read_excel(file_path)
            dfdfwxc['市州'] = cityname
            print(' read wx: ', cityname, dirBatch, label, dfdfwxc.shape)
            frames.append(dfdfwxc)
            count += 1
        countFnC += count
        if count > 0:
            countC += 1

    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    # The empty template keeps the documented columns first.
    dfWX = pd.concat([pd.DataFrame(columns=cols)] + frames)
    print(" Read WX Finished. cities ", countC, '; Files', countFnC, '; lines ', dfWX.shape[0])
    return dfWX
|
|||
|
|
|||
|
# 从数据目录中读取xlsx文件,拼接到一起
|
|||
|
def getTTData(path, cities, hasBody=False):
    """Read the Toutiao Excel exports below *path* and stack them.

    The tree is walked as ``path/<city dir>/<dir containing 'tt'>/*.xlsx``.
    City directories are translated through ``cityShorten``; hidden entries
    and directories whose name contains ``weixin`` or ``weibo`` are skipped.
    A workbook that cannot be read is reported and skipped.

    Args:
        path: Root directory of the downloaded data.
        cities: Full city names to include.
        hasBody: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame starting with the columns account, date, title, nread,
        ncomment, content, url, origin, city, followed by any extra columns
        found in the files. ``city`` is set to the full city name.
    """
    # Directory name (upper/lower-case abbreviation or short Chinese name)
    # -> full city name as used in ``cities``.
    cityShorten = {
        'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市', 'LN': '陇南市', 'JYG': '嘉峪关市',
        'TS': '天水市', 'GN': '甘南藏族自治州', 'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
        'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区', 'LZXQ': '兰州新区',

        'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市', 'ln': '陇南市', 'jyg': '嘉峪关市',
        'ts': '天水市', 'gn': '甘南藏族自治州', 'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
        'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区', 'lzxq': '兰州新区',

        '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市', '陇南': '陇南市',
        '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市', '天水': '天水市', '武威': '武威市', '新区': '兰州新区',
        '兰州': '兰州市', '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
    }
    # account date title nread ncomment content url origin
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url', 'origin', 'city']

    city_frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        city_dir = os.path.join(path, dirC)
        if dirC[:1] == '.' or not os.path.isdir(city_dir):
            continue
        if 'weixin' in dirC.lower() or 'weibo' in dirC.lower():
            continue
        city = cityShorten.get(dirC)
        if city is None:
            # An unrelated folder must not abort the run with a KeyError.
            print(' skip unknown city dir: ', dirC)
            continue
        if city not in cities:
            continue
        print(' city: ', city, dirC)
        cityCount += 1

        file_frames = []
        for dirCT in os.listdir(city_dir):
            source_dir = os.path.join(city_dir, dirCT)
            if dirCT[:1] == '.' or not os.path.isdir(source_dir):
                continue
            lowered = dirCT.lower()
            if 'weixin' in lowered or 'weibo' in lowered:
                continue
            if 'tt' not in lowered:
                continue
            print(' read TT... dir:', dirCT)
            for fn in os.listdir(source_dir):
                if fn[:1] == '.' or fn[-5:] != '.xlsx':
                    continue
                fileAs = os.path.join(source_dir, fn)
                try:
                    dfdftt = pd.read_excel(fileAs)
                except Exception as exc:
                    # Best effort: report the unreadable workbook together
                    # with the reason and carry on with the next one.
                    print("read file failed. ", fileAs, exc)
                    continue
                file_frames.append(dfdftt)

        # DataFrame.append no longer exists in pandas 2.x; concatenate once.
        # The empty template keeps the documented columns first.
        dfTTC = pd.concat([pd.DataFrame(columns=cs)] + file_frames)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = city
        city_frames.append(dfTTC)

    dfTT = pd.concat([pd.DataFrame(columns=cs)] + city_frames)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT
|
|||
|
|
|||
|
|
|||
|
def fetch_chinese(s):
    """Strip everything except characters in U+4E00..U+9FA5 from *s*."""
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')
    return non_chinese.sub('', s)
|
|||
|
|
|||
|
def _export_weibo_by_account(dfAccount, dfwb, sDir):
    """Write one CSV per registered account found in *dfwb* below *sDir*.

    Args:
        dfAccount: Account registry; must contain 微信biz/oid/账号ID and
            账号名称.
        dfwb: Weibo posts with a ``user_id`` column and the 13 export
            columns already renamed to their Chinese headers.
        sDir: Output directory prefix (expected to end with a separator).

    Returns:
        ``(found, not_found)``: number of user ids with / without a registry
        entry.
    """
    # Column order of the Weibo download tool's export.
    out_cols = ['微博id', '微博正文', '头条文章url', '原始图片url',
                '被转发微博原始图片url', '是否为原创微博',
                '微博视频url', '发布位置', '发布时间', '发布工具',
                '点赞数', '转发数', '评论数']
    found = 0
    not_found = 0
    for uid in dfwb['user_id'].unique():
        matched = dfAccount[dfAccount['微信biz/oid/账号ID'] == uid]
        if matched.shape[0] == 0:
            not_found += 1
            continue
        found += 1
        sA = str(matched['账号名称'].iloc[0])
        account_dir = sDir + sA
        # makedirs tolerates an existing folder (re-runs, or two ids that
        # share one account name) and creates missing parents.
        os.makedirs(account_dir, exist_ok=True)
        dfwba = dfwb.loc[dfwb['user_id'] == uid, out_cols]
        # NOTE(review): reset_index() without drop=True keeps the old row
        # labels as a leading 'index' column in the CSV, exactly as before;
        # confirm whether downstream readers rely on it.
        dfwba = dfwba.reset_index()
        # quoting=1 is csv.QUOTE_ALL.
        dfwba.to_csv(account_dir + '/' + str(uid) + '.csv', encoding='utf_8_sig', index=0, quoting=1)
    return found, not_found


def doWBData():
    """Split two raw Weibo dumps into per-account CSV files.

    Reads the account registry and the dumps ``weibo1.csv`` / ``weibo2.csv``
    from hard-coded locations, renames their columns to the Chinese headers
    of the Weibo download tool, and writes one ``<user_id>.csv`` per
    registered 新浪微博 account into ``weibo_3`` / ``weibo_4``. For each dump
    the number of matched and unmatched user ids is printed.
    """
    dfAccount = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全国报送系统表单_2023.6.30.xlsx')
    # .copy() so the dtype conversion below works on an independent frame.
    dfAccount = dfAccount[dfAccount['账号类型'] == '新浪微博'].copy()
    dfAccount['微信biz/oid/账号ID'] = dfAccount['微信biz/oid/账号ID'].astype('int64')

    dfwb1 = pd.read_csv('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/weibo_2/weibo1.csv', sep=',', index_col=None)
    dfwb1 = dfwb1.fillna(0)
    dfwb1['user_id'] = dfwb1['user_id'].astype('int64')
    dfwb1.rename(columns={'id': '微博id', 'content': '微博正文', 'article_url': '头条文章url', 'original_pictures': '原始图片url',
                          'retweet_pictures': '被转发微博原始图片url', 'original': '是否为原创微博', 'video_url': '微博视频url',
                          'publish_place': '发布位置', 'publish_time': '发布时间', 'publish_tool': '发布工具',
                          'up_num': '点赞数', 'retweet_num': '转发数', 'comment_num': '评论数'}, inplace=True)

    print(dfAccount.shape)
    print(dfwb1.shape, dfwb1.dtypes)

    i, j = _export_weibo_by_account(
        dfAccount, dfwb1, 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全文/LN/weibo_3/')
    print('found ', i, '; nofound', j)

    dfwb2 = pd.read_csv('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/weibo_2/weibo2.csv', sep=',', index_col=None)
    dfwb2 = dfwb2.fillna(0)
    dfwb2['user_id'] = dfwb2['user_id'].astype('int64')
    # The second dump uses different source column names.
    dfwb2.rename(columns={'id': '微博id', 'text': '微博正文', 'article_url': '头条文章url', 'pics': '原始图片url',
                          'topics': '被转发微博原始图片url', 'source': '是否为原创微博', 'video_url': '微博视频url',
                          'location': '发布位置', 'created_at': '发布时间', 'bid': '发布工具',
                          'attitudes_count': '点赞数', 'reposts_count': '转发数', 'comments_count': '评论数'}, inplace=True)
    print(dfwb2.shape)

    i, j = _export_weibo_by_account(
        dfAccount, dfwb2, 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全文/LN/weibo_4/')
    print('found ', i, '; nofound', j)
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
|
|||
|
#doWBData()
|
|||
|
#exit(0)
|
|||
|
starttime = datetime.datetime.now()
|
|||
|
_RATIO = 0.5
|
|||
|
isDoWX = True
|
|||
|
isDoWB = True
|
|||
|
isDoTT = True
|
|||
|
cities = [
|
|||
|
'临夏回族自治州',
|
|||
|
'白银市',
|
|||
|
'定西市',
|
|||
|
'酒泉市',
|
|||
|
'嘉峪关市',
|
|||
|
'平凉市',
|
|||
|
'庆阳市',
|
|||
|
'天水市',
|
|||
|
'武威市',
|
|||
|
'兰州新区',
|
|||
|
'陇南市',
|
|||
|
'兰州市', '张掖市', '甘南藏族自治州', '金昌市',
|
|||
|
'省直部门', # 共12市2州1新区
|
|||
|
]
|
|||
|
'''
|
|||
|
cities = [
|
|||
|
'临夏回族自治州',
|
|||
|
'白银市',
|
|||
|
'定西市',
|
|||
|
'酒泉市',
|
|||
|
'天水市',
|
|||
|
'陇南市',
|
|||
|
|
|||
|
#'省直部门', # 共12市2州1新区
|
|||
|
]
|
|||
|
'''
|
|||
|
cities = ['陇南市',]
|
|||
|
#cities = ['陇南市', '临夏回族自治州', '白银市', '定西市', '酒泉市', '平凉市','武威市','天水市']
|
|||
|
#cities = ['陇南市']
|
|||
|
# 转发任务
|
|||
|
#dfTask = pd.read_excel('D:/Projects/POM/DATA/2022年S2/S2/全省政务新媒体二季度转发信息条目.xls')
|
|||
|
dfTask = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/陇南7月上旬转发台账.xlsx')
|
|||
|
sTaskTitle = '标题'
|
|||
|
sTaskDate = '推送时间'
|
|||
|
# 删除标题列为空的行
|
|||
|
dfTask.dropna(axis=0,subset = ["标题"])
|
|||
|
yT0 = dfTask.columns.get_loc('序号')
|
|||
|
yT1 = dfTask.columns.get_loc('标题')
|
|||
|
|
|||
|
# 账号信息
|
|||
|
strFnAccount = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全国报送系统表单_2023.6.30.xlsx'
|
|||
|
dfAllAccount = pd.read_excel(strFnAccount)
|
|||
|
# 添加列
|
|||
|
dfAllAccount.loc[:, '转发数'] = 0
|
|||
|
#dfAllAccount.loc[:, '阅读数'] = 0
|
|||
|
dfAllAccount = pd.concat([dfAllAccount, pd.DataFrame(np.zeros((dfAllAccount.shape[0], dfTask.shape[0])), columns=dfTask['序号'].astype(str).tolist())], axis=1)
|
|||
|
# 整理数据
|
|||
|
dfAllAccount['市/省局'] = dfAllAccount['市/省局'].fillna('省直部门')
|
|||
|
dfAllAccount['区县/地方部门'] = dfAllAccount['区县/地方部门'].fillna('市直部门')
|
|||
|
dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['临夏回族自治州', '甘南藏族自治州'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '州直部门'
|
|||
|
dfAllAccount.loc[(dfAllAccount['市/省局'].isin(['省直部门'])) & (dfAllAccount['区县/地方部门'] == '市直部门'), '区县/地方部门'] = '省直部门'
|
|||
|
# 过长名称替换为简称,便于绘图
|
|||
|
dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县', '区县/地方部门'] = '积石山县'
|
|||
|
dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '阿克塞哈萨克族自治县', '区县/地方部门'] = '阿克塞自治县'
|
|||
|
yAccountName = dfAllAccount.columns.get_loc('账号名称')
|
|||
|
yAccountCity = dfAllAccount.columns.get_loc('市/省局')
|
|||
|
yAccountCounty = dfAllAccount.columns.get_loc('区县/地方部门')
|
|||
|
yAccountUnit = dfAllAccount.columns.get_loc('单位全称')
|
|||
|
|
|||
|
# 省直部门账号部门简称
|
|||
|
fnTemplate = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/POM_ForewardTemplate.docx'
|
|||
|
|
|||
|
# 数据根目录,
|
|||
|
strPath = ['D:/Projects/POM/DATA/2023年7月/7.11陇南转发/全文/',
|
|||
|
]
|
|||
|
strOutputPath = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/转发/'
|
|||
|
|
|||
|
context = {
|
|||
|
"year": "2023",
|
|||
|
"month": "7",
|
|||
|
"pubMonth": "7",
|
|||
|
"dateStart": "2023年7月1日",
|
|||
|
"dateEnd": "2023年7月10日"
|
|||
|
}
|
|||
|
|
|||
|
################################################
|
|||
|
# 创建存储矩阵
|
|||
|
# 按照转发任务创建统计矩阵
|
|||
|
colRR = ['市州', '类型', '账号名称', '单位名称', '省直部门', '区县', '转发数', '阅读数']
|
|||
|
for ididid in dfTask['序号'][0:dfTask['标题'].count()].tolist():
|
|||
|
#for ididid in range(1, dfTask['标题'].count()):
|
|||
|
colRR.append(str(ididid))
|
|||
|
# 用于保存每一条转发任务的账号和文章
|
|||
|
dfO = pd.DataFrame(columns=['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州'] )
|
|||
|
|
|||
|
|
|||
|
|
|||
|
# WX
|
|||
|
if isDoWX:
|
|||
|
print('=============================================================')
|
|||
|
print('---- WX ----')
|
|||
|
dfWX = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getWXData(strP, cities)
|
|||
|
dfWX = dfWX.append(ddff)
|
|||
|
|
|||
|
dfWX = dfWX.fillna(value=0)
|
|||
|
yWXtitle = dfWX.columns.get_loc('标题')
|
|||
|
yWXnread = dfWX.columns.get_loc('阅读数')
|
|||
|
yWXdate = dfWX.columns.get_loc('日期')
|
|||
|
yWXurl = dfWX.columns.get_loc('链接')
|
|||
|
|
|||
|
# 公众号 链接 日期 标题 内容 头条 city
|
|||
|
## 逐个市州统计每个账号的转发情况
|
|||
|
#cities = dfWX['市州'].unique()
|
|||
|
for city in cities:
|
|||
|
print('---- WX title match', city, ' ----' )
|
|||
|
# 本市微信数据
|
|||
|
dataC = dfWX.loc[dfWX['市州'] == city].copy()
|
|||
|
# 获取微信账号数
|
|||
|
accounts = dataC['公众号'].unique()
|
|||
|
|
|||
|
# 所有微信账号数
|
|||
|
maskCWX = ( (dfAllAccount['账号类型'] == '微信服务号')|(dfAllAccount['账号类型'] == '微信订阅号') ) & (dfAllAccount['市/省局'] == city)
|
|||
|
accountNumCWX = maskCWX.tolist().count(True)
|
|||
|
|
|||
|
# 按获取得微信账号遍历
|
|||
|
for account in accounts:
|
|||
|
#print(account)
|
|||
|
# 该账号的所有文章
|
|||
|
dataA = dataC.loc[dataC['公众号'] == account].copy() # 一个公众号的所有文章
|
|||
|
sR = pd.Series(dtype='object')
|
|||
|
sR['类型'] = '微信'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
arn = 0
|
|||
|
|
|||
|
# 从账号信息中匹配该账号详细信息
|
|||
|
mask = ( (dfAllAccount['账号类型'] == '小程序+微信')
|
|||
|
| (dfAllAccount['账号类型'] == '微信服务号')
|
|||
|
| (dfAllAccount['账号类型'] == '微信订阅号') ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
|||
|
if mask.any():
|
|||
|
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
|||
|
if sxq.lower() !='nan':
|
|||
|
sR['区县'] = sxq
|
|||
|
sdwmc = str(dfAllAccount.loc[mask, '单位全称'].values[0])
|
|||
|
if sdwmc.lower() != 'nan':
|
|||
|
sR['单位名称'] = sdwmc
|
|||
|
else:
|
|||
|
print(' !!!! 微信', account, '在', city, '无详细信息' )
|
|||
|
continue
|
|||
|
|
|||
|
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
|||
|
for i in range(dfTask['标题'].count()):
|
|||
|
# 对于每一篇任务文章
|
|||
|
rn = dfTask.iloc[i, yT0] # 序号
|
|||
|
ssrt = str(dfTask.iloc[i, yT1]) # 标题/内容
|
|||
|
rt = fetch_chinese(ssrt) # 只取汉字
|
|||
|
forwarded = 0 # 转发数
|
|||
|
readNum = 0 # 阅读数
|
|||
|
# 查看该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str1 = fetch_chinese(str(dataA.iloc[j, yWXtitle])) # 只取汉字
|
|||
|
|
|||
|
# 任务标题过长,截取前半部分进行对比
|
|||
|
if len(rt) > len(str1):
|
|||
|
strRT = rt[:len(str1)]
|
|||
|
else:#文章标题过长,只比较任务标题长度部分
|
|||
|
strRT = rt
|
|||
|
str1 = str1[:len(rt)]
|
|||
|
|
|||
|
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
|||
|
|
|||
|
# 遇到相似的,认为已转发,即跳出不再查找
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
readNum += int(dataA.iloc[j, yWXnread])
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
sR[str(rn)] = forwarded # 记录该篇文章的转发数
|
|||
|
|
|||
|
count += forwarded # 累加该篇文章的转发数
|
|||
|
arn += readNum # 累加该篇文章的阅读数
|
|||
|
|
|||
|
# 记录该篇任务转发情况加入
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': ssrt,
|
|||
|
'类型': '微信',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, yWXdate],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, yWXurl],
|
|||
|
'市州': city,
|
|||
|
'阅读数': readNum,
|
|||
|
}], ignore_index=True)
|
|||
|
#记录该任务的转发情况
|
|||
|
dfAllAccount.loc[mask, str(rn)] = forwarded
|
|||
|
#记录该账号的总转发数
|
|||
|
dfAllAccount.loc[mask, '转发数'] = count
|
|||
|
sR['转发数'] = count
|
|||
|
sR['阅读数'] = arn
|
|||
|
# 全市总转发文章篇数
|
|||
|
ccwx = dfAllAccount.loc[maskCWX, '转发数'].sum()
|
|||
|
# 全市总转发率
|
|||
|
rcc = ccwx/accountNumCWX/dfTask.shape[0]
|
|||
|
print(' ', city, '共有', accountNumCWX, '个微信号,获取数据', len(accounts), '个。共转发', ccwx, '次,转发率{:.1f}%'.format(rcc*100) )
|
|||
|
#countWxForewards = dfRR.shape[0]
|
|||
|
#print(' 获取 WX 账号数', len(dfWX['公众号'].unique()),'参与转发账号数', countWxForewards)
|
|||
|
|
|||
|
# WB
|
|||
|
if isDoWB:
|
|||
|
print('=============================================================')
|
|||
|
print('---- WB data read ----')
|
|||
|
#获取微博数据
|
|||
|
dfWB = pd.read_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/微博全文.xlsx')
|
|||
|
#for strP in strPath:
|
|||
|
## ddff = getWBData(strP, cities)
|
|||
|
# dfWB = dfWB.append(ddff)
|
|||
|
print('----', dfWB.shape)
|
|||
|
#===========================================================================================
|
|||
|
|
|||
|
#===========================================================================================
|
|||
|
yWBcontent = dfWB.columns.get_loc('微博正文')
|
|||
|
yWBdate = dfWB.columns.get_loc('date')
|
|||
|
yWBurl = dfWB.columns.get_loc('头条文章url')
|
|||
|
#dfWB.to_excel('D:/Projects/POM/DATA/2023年7月/7.11陇南转发/转发/微博全文.xlsx')
|
|||
|
################################################
|
|||
|
# WB
|
|||
|
# 微博id 微博正文 头条文章url 原始图片url 被转发微博原始图片url 是否为原创微博 微博视频url 发布位置 date
|
|||
|
# 发布工具 点赞数 转发数 评论数 weiboID weiboName city
|
|||
|
#cities = dfWB['市州'].unique()
|
|||
|
for city in cities:
|
|||
|
print('---- WB match', city, ' ----' )
|
|||
|
# 本市微博数据
|
|||
|
dataC = dfWB.loc[dfWB['市州'] == city].copy()
|
|||
|
# 获取数据的微博账号
|
|||
|
accounts = dataC['weiboName'].unique()
|
|||
|
|
|||
|
# 本市所有微博账号
|
|||
|
maskCWB = (dfAllAccount['账号类型'] == '新浪微博') & (dfAllAccount['市/省局'] == city)
|
|||
|
accountNumCWB = maskCWB.tolist().count(True)
|
|||
|
|
|||
|
# 按获取的微博账号遍历
|
|||
|
for account in accounts:
|
|||
|
# print(account)
|
|||
|
# 该公众号的所有文章
|
|||
|
dataA = dataC.loc[dataC['weiboName'] == account].copy()
|
|||
|
sR = pd.Series(dtype='object')
|
|||
|
sR['类型'] = '新浪微博'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
|
|||
|
# 为转发账号匹配单位全称和所属县区
|
|||
|
mask = ( dfAllAccount['账号类型'] == '新浪微博' ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
|||
|
if mask.any():
|
|||
|
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
|||
|
if sxq.lower() !='nan':
|
|||
|
sR['区县'] = sxq
|
|||
|
sdwmc = str(dfAllAccount.loc[mask, '单位全称'].values[0])
|
|||
|
if sdwmc.lower() != 'nan':
|
|||
|
sR['单位名称'] = sdwmc
|
|||
|
else:
|
|||
|
print(' !!!! 微博', account, '在', city, '无详细信息' )
|
|||
|
continue
|
|||
|
|
|||
|
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
|||
|
for i in range(dfTask['标题'].count()):
|
|||
|
rn = dfTask.iloc[i, yT0] # 任务序号
|
|||
|
ssrt = str(dfTask.iloc[i, yT1]) # 任务标题
|
|||
|
rt = fetch_chinese(ssrt) # 只取中文
|
|||
|
forwarded = 0
|
|||
|
# 对该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str0 = str(dataA.iloc[j, yWBcontent])
|
|||
|
str1 = fetch_chinese(str0)
|
|||
|
str2 = str1[:len(rt)] # 取任务标题相同汉字数进行比较
|
|||
|
|
|||
|
ratio = difflib.SequenceMatcher(None, rt, str2).quick_ratio()
|
|||
|
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
#记记录该任务的转发情况
|
|||
|
dfAllAccount.loc[mask, str(rn)] = forwarded
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
# 转发数累加到本账号里
|
|||
|
count += forwarded
|
|||
|
|
|||
|
# 记录该篇任务转发情况加入
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '新浪微博',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, yWBdate],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, yWBurl],
|
|||
|
'市州': city,
|
|||
|
}], ignore_index=True)
|
|||
|
# 记录该账号的总转发数
|
|||
|
dfAllAccount.loc[mask, '转发数'] = count
|
|||
|
sR['转发数'] = count
|
|||
|
|
|||
|
# 全市总转发文章篇数
|
|||
|
ccwb = dfAllAccount.loc[maskCWB, '转发数'].sum()
|
|||
|
# 全市总转发率
|
|||
|
rcc = ccwb/accountNumCWB/dfTask.shape[0]
|
|||
|
print(' ', city, '共有', accountNumCWB, '个微博号,获取数据', len(accounts), '个。共转发', ccwb, '次,转发率{:.1f}%'.format(rcc*100) )
|
|||
|
|
|||
|
#countWbForewards = dfRR.shape[0] - countWxForewards
|
|||
|
#print(' 获取 WB 账号数', len(dfWB['weiboName'].unique()), '参与转发账号数', countWbForewards)
|
|||
|
|
|||
|
|
|||
|
# TT
|
|||
|
if isDoTT:
|
|||
|
print('=============================================================')
|
|||
|
print('---- TT data read ----')
|
|||
|
# id userId source city tid cellType title
|
|||
|
# time-stamp date url commentCount readNum likeNum showNum
|
|||
|
# 获取头条数据
|
|||
|
dfTT = pd.DataFrame()
|
|||
|
for strP in strPath:
|
|||
|
ddff = getTTData(strP, cities)
|
|||
|
dfTT = dfTT.append(ddff)
|
|||
|
|
|||
|
yTTtitle = dfTT.columns.get_loc('title')
|
|||
|
yTTdate = dfTT.columns.get_loc('date')
|
|||
|
yTTurl = dfTT.columns.get_loc('url')
|
|||
|
|
|||
|
# 逐个市州统计账号转发情况
|
|||
|
for city in cities:
|
|||
|
print("++++++++++++++++++++++++++++++++++++++++++++++++++")
|
|||
|
print('---- TT title match', city, ' ----' )
|
|||
|
# 本市头条数据
|
|||
|
dataC = dfTT.loc[dfTT['city'] == city].copy()
|
|||
|
# 获取数据的头条账号
|
|||
|
accounts = dataC['account'].unique()
|
|||
|
|
|||
|
# 本市所有头条账号信息
|
|||
|
maskCTT = (dfAllAccount['账号类型'] == '今日头条') & (dfAllAccount['市/省局'] == city)
|
|||
|
accountNumCTT = maskCTT.tolist().count(True)
|
|||
|
|
|||
|
# 按头条数据的账号遍历
|
|||
|
for account in accounts:
|
|||
|
#print(account)
|
|||
|
# 该账号的所有文章
|
|||
|
dataA = dataC[dataC['account']==account]
|
|||
|
sR = pd.Series([], dtype=pd.StringDtype())
|
|||
|
sR['类型'] = '今日头条'
|
|||
|
sR['市州'] = city
|
|||
|
sR['账号名称'] = account
|
|||
|
count = 0
|
|||
|
|
|||
|
# 为转发账号匹配单位全称和所属县区
|
|||
|
mask = ( dfAllAccount['账号类型'] == '今日头条' ) & (dfAllAccount['市/省局'] == city) & (dfAllAccount['账号名称']==account)
|
|||
|
if mask.any():
|
|||
|
sxq = str(dfAllAccount.loc[mask, '区县/地方部门'].values[0])
|
|||
|
if sxq.lower() !='nan':
|
|||
|
sR['区县'] = sxq
|
|||
|
sdwmc = str(dfAllAccount.loc[mask, '单位全称'].values[0])
|
|||
|
if sdwmc.lower() != 'nan':
|
|||
|
sR['单位名称'] = sdwmc
|
|||
|
else:
|
|||
|
print(' !!!! 头条', account, '在', city, '无详细信息' )
|
|||
|
continue
|
|||
|
|
|||
|
# 按任务标题逐个匹配所有发文,得到每篇任务的转发情况
|
|||
|
for i in range(dfTask['标题'].count()):
|
|||
|
# 对于每一篇任务文章
|
|||
|
rn = dfTask.iloc[i, yT0] # 任务序号
|
|||
|
ssrt = str(dfTask.iloc[i, yT1]) # 任务标题
|
|||
|
rt = fetch_chinese(ssrt) # 只取中文
|
|||
|
forwarded = 0
|
|||
|
|
|||
|
# 查看该账号的所有文章
|
|||
|
for j in range(dataA.shape[0]):
|
|||
|
str0 = str(dataA.iloc[j, yTTtitle])
|
|||
|
str1 = fetch_chinese(str0)
|
|||
|
#
|
|||
|
if len(rt) > len(str1): # 若任务标题过长,截取前半部分进行对比
|
|||
|
strRT = rt[:len(str1)]
|
|||
|
else: #若文章标题过长,只比较任务标题长度部分
|
|||
|
strRT = rt
|
|||
|
str1 = str1[:len(rt)]
|
|||
|
ratio = difflib.SequenceMatcher(None, strRT, str1).quick_ratio()
|
|||
|
if ratio > _RATIO:
|
|||
|
forwarded += 1
|
|||
|
if forwarded > 0:
|
|||
|
break
|
|||
|
#记录该任务转发情况
|
|||
|
dfAllAccount.loc[mask, str(rn)] = forwarded
|
|||
|
sR[str(rn)] = forwarded
|
|||
|
count += forwarded
|
|||
|
if forwarded > 0:
|
|||
|
dfO = dfO.append([{'任务序号': rn, '任务名称': rt,
|
|||
|
'类型': '今日头条',
|
|||
|
'公众号': account,
|
|||
|
'日期': dataA.iloc[j, yTTdate],
|
|||
|
'内容': str1,
|
|||
|
'链接': dataA.iloc[j, yTTurl],
|
|||
|
'市州': city,
|
|||
|
}], ignore_index=True)
|
|||
|
|
|||
|
# 记录该账号转发情况
|
|||
|
dfAllAccount.loc[mask, '转发数'] = count
|
|||
|
sR['转发数'] = count
|
|||
|
|
|||
|
|
|||
|
# 全市总转发文章篇数
|
|||
|
cctt = dfAllAccount.loc[maskCTT, '转发数'].sum()
|
|||
|
# 全市总转发率
|
|||
|
rcc = cctt/accountNumCTT/dfTask.shape[0]
|
|||
|
print(' ', city, '共有', accountNumCTT, '个头条号,获取数据', len(accounts), '个。共转发', cctt, '次,转发率{:.1f}%'.format(rcc*100) )
|
|||
|
|
|||
|
#countTtForewards = dfRR.shape[0] - countWxForewards - countWbForewards
|
|||
|
#print(' 获取 TT 账号数', len(dfTT['account'].unique()),'参与转发账号数', countTtForewards)
|
|||
|
|
|||
|
if isDoWX or isDoWB or isDoTT:
|
|||
|
print('=============================================================')
|
|||
|
print('---- STATISTICS ----')
|
|||
|
print('=============================================================')
|
|||
|
|
|||
|
dfAllAccount.to_excel(strOutputPath + '甘肃省_转发账号.xlsx')
|
|||
|
dfO.to_excel(strOutputPath + '甘肃省_转发文章.xlsx')
|
|||
|
|
|||
|
|
|||
|
print('---- 统计市州转发率 ----')


def _forwardRateTable(dfAccounts, groupCol, taskCount):
    """Build a per-group forward-rate table from an account DataFrame.

    Args:
        dfAccounts: account rows; must contain ``groupCol``, '账号名称'
            and '转发数'.
        groupCol: name of the column to group by (becomes the index).
        taskCount: number of forwarding tasks, used as the rate denominator.

    Returns:
        DataFrame indexed by ``groupCol`` with columns 'count_账号名称',
        'sum_转发数' and 'rate'. Data rows are sorted by 'rate' descending
        and the '总计' margin row is kept as the last row.
    """
    # The count and the sum are pivoted separately and concatenated: the
    # original author noted that passing several aggfuncs as a list to one
    # pivot_table call produced inaccurate values for the leading columns
    # in the pandas version in use.
    dfCount = pd.pivot_table(dfAccounts, index=[groupCol], values=['账号名称'],
                             aggfunc=['count'], fill_value='', margins=True, margins_name='总计')
    dfSum = pd.pivot_table(dfAccounts, index=[groupCol], values=['转发数'],
                           aggfunc=['sum'], fill_value='', margins=True, margins_name='总计')
    dfStats = pd.concat([dfCount, dfSum], axis=1)

    # Flatten the (aggfunc, column) MultiIndex into single-level names.
    dfStats.columns = ['_'.join(col) for col in dfStats.columns.values]

    # rate = forwards / accounts / tasks, truncated (not rounded) to 3 decimals.
    dfStats['rate'] = dfStats.apply(
        lambda x: int(x['sum_转发数'] / x['count_账号名称'] / taskCount * 1000) / 1000.0, axis=1)

    # Sort everything except the final row (the code relies on the '总计'
    # margin row being last), then re-append that row at the bottom.
    dfSorted = dfStats.iloc[:-1].sort_values(by='rate', ascending=False)
    return pd.concat([dfSorted, dfStats.iloc[-1:]], axis=0)


# Loop-invariant values, computed once instead of once per city / per row.
taskCount = dfTask['标题'].count()
platformTypes = ['新浪微博', '微信服务号', '微信订阅号', '今日头条']

# Forwarding-task list shown in every city report (depends only on dfTask).
t3_list = [
    {'id': row['序号'],
     'title': row['标题'],
     'date': ts2date(row[sTaskDate], '%m月%d日')}
    for _, row in dfTask.iterrows()
]

for city in cities:
    # (disabled by the original author)
    # if city in ['兰州新区', '省直部门']:
    #     continue
    print(" add up city", city)

    # Accounts of this city on the four tracked platforms.
    maskC = dfAllAccount['账号类型'].isin(platformTypes) & (dfAllAccount['市/省局'] == city)
    dfdfC = dfAllAccount.loc[maskC, :]
    dfdfC.to_excel(strOutputPath + city + '_转发账号.xlsx')

    # FIX: export only this city's articles. The original wrote the whole
    # dfO table into every per-city file and never used dfOCity.
    dfOCity = dfO[dfO['市州'] == city]
    dfOCity.to_excel(strOutputPath + city + '_转发文章.xlsx')

    #####################################################################
    # Forward statistics for departments directly under the city/prefecture/
    # province, grouped by full unit name.
    # NOTE(review): if a city has no such department accounts dfdfCD is
    # empty; how the pivot/apply behaves then is unverified - confirm.
    dfdfCD = dfdfC.loc[dfdfC['区县/地方部门'].isin(['州直部门', '市直部门', '省直部门'])].copy()
    dfdfCD_AD = _forwardRateTable(dfdfCD, '单位全称', taskCount)
    dfdfCD_AD.to_excel(strOutputPath + city + '部门转发统计表.xlsx')

    #####################################################################
    # Forward statistics for all accounts of the city, grouped by county.
    dfdfC_AD = _forwardRateTable(dfdfC, '区县/地方部门', taskCount)
    dfdfC_AD.to_excel(strOutputPath + city + '转发统计表.xlsx')

    #####################################################################
    # Generate the Word report.
    tpl = DocxTemplate(fnTemplate)
    # The two autonomous prefectures use the suffix '州'; all others use '市'.
    sL0 = '州' if city in ['临夏回族自治州', '甘南藏族自治州'] else '市'

    cityTotal = dfdfC_AD.iloc[-1]    # '总计' row of the county table
    deptTotal = dfdfCD_AD.iloc[-1]   # '总计' row of the department table
    info = {
        "strL0": sL0,
        "strL1": "区县",
        "taskCount": taskCount,
        "aNum": int(cityTotal['count_账号名称']),
        "fNum": int(cityTotal['sum_转发数']),
        "r": '%.1f' % (cityTotal['rate'] * 100.0),
        #
        "dNum": int(deptTotal['count_账号名称']),    # total department accounts
        "dFNum": int(deptTotal['sum_转发数']),       # total department forwards
        "dr": '%.1f' % (deptTotal['rate'] * 100.0),  # average department forward rate
    }
    context.update(info)

    # County forward-rate table (the '总计' row is reported via `info`).
    t1_list = [
        {'county': str(index), 'rate': '%.1f' % (row['rate'] * 100.0),
         'account': int(row['count_账号名称']), 'fNum': int(row['sum_转发数'])}
        for index, row in dfdfC_AD.iterrows()
        if index != "总计"
    ]
    context['t1_contents'] = t1_list

    # Department forward-rate table.
    t2_list = [
        {'name': str(index),
         'rate': '%.1f' % (row['rate'] * 100.0),
         'account': int(row['count_账号名称']),
         'fNum': int(row['sum_转发数'])}
        for index, row in dfdfCD_AD.iterrows()
        if index != "总计"
    ]
    context['t2_contents'] = t2_list

    # Forwarding-task list.
    context['t3_contents'] = t3_list

    # Draw the county forward-rate bar chart (margin row excluded) and
    # embed the saved image in the report.
    graphCountyPath = os.path.join(strOutputPath, '_' + city + '_graphCounty.png')
    drawBar(dfdfC_AD['rate'][:-1], dfdfC_AD.index[:-1],
            '区县转发率', graphCountyPath)

    dc = {
        'graphCounty': InlineImage(tpl, graphCountyPath, width=Mm(120)),
    }
    context.update(dc)

    tpl.render(context)
    tpl.save(strOutputPath+city+'转发统计报告_2023年{}月份.docx'.format(context['month']))
|
|||
|
|
|||
|
# Report the wall-clock time elapsed since `starttime` was recorded.
endtime = datetime.datetime.now()
usedtime = endtime - starttime
print("time: ", usedtime)
|