"""Forwarding statistics for government new-media accounts.

The script reads crawled WeChat (WX), Weibo (WB) and Toutiao (TT) posts,
fuzzy-matches their titles/contents against a list of forwarding tasks,
records per-account forward counts in the account register, and finally
writes Excel summaries plus one Word report per city.
"""

import csv
import datetime
import difflib
import glob
import os
import re
import time
from collections import Counter
from difflib import SequenceMatcher

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from docx.shared import Mm
from docxtpl import DocxTemplate
from docxtpl import InlineImage
from matplotlib.ticker import FuncFormatter

# Excel stores dates as a day count.  Serial 43406 corresponds to 2018-11-02,
# which puts day zero at 1899-12-30.
_EXCEL_EPOCH = datetime.datetime(1899, 12, 30)

# Similarity threshold above which a post is considered a forward of a task.
_RATIO = 0.5

# Root directory of the current reporting batch.
_DATA_ROOT = 'D:/Projects/POM/DATA/2023年7月/7.11陇南转发/'

# Directory-name abbreviations (upper case, lower case, short Chinese name)
# mapped to the full city / prefecture name used in the account register.
CITY_SHORTEN = {
    'LZ': '兰州市', 'LX': '临夏回族自治州', 'JC': '金昌市', 'ZY': '张掖市',
    'LN': '陇南市', 'JYG': '嘉峪关市', 'TS': '天水市', 'GN': '甘南藏族自治州',
    'BY': '白银市', 'JQ': '酒泉市', 'QY': '庆阳市', 'PL': '平凉市',
    'DX': '定西市', 'WW': '武威市', 'SZ': '省直部门', 'XQ': '兰州新区',
    'LZXQ': '兰州新区',
    'lz': '兰州市', 'lx': '临夏回族自治州', 'jc': '金昌市', 'zy': '张掖市',
    'ln': '陇南市', 'jyg': '嘉峪关市', 'ts': '天水市', 'gn': '甘南藏族自治州',
    'by': '白银市', 'jq': '酒泉市', 'qy': '庆阳市', 'pl': '平凉市',
    'dx': '定西市', 'ww': '武威市', 'sz': '省直部门', 'xq': '兰州新区',
    'lzxq': '兰州新区',
    '白银': '白银市', '定西': '定西市', '酒泉': '酒泉市', '嘉峪关': '嘉峪关市',
    '陇南': '陇南市', '临夏': '临夏回族自治州', '平凉': '平凉市', '庆阳': '庆阳市',
    '天水': '天水市', '武威': '武威市', '新区': '兰州新区', '兰州': '兰州市',
    '张掖': '张掖市', '甘南': '甘南藏族自治州', '省直': '省直部门', '金昌': '金昌市',
}

# Column layout produced by the Weibo download tool (one CSV per account).
_WB_TOOL_COLUMNS = ['微博id', '微博正文', '头条文章url', '原始图片url',
                    '被转发微博原始图片url', '是否为原创微博', '微博视频url',
                    '发布位置', 'date', '发布工具', '点赞数', '转发数', '评论数']

# Columns written by doWBData(); same layout, with the original time column name.
_WB_EXPORT_COLUMNS = ['微博id', '微博正文', '头条文章url', '原始图片url',
                      '被转发微博原始图片url', '是否为原创微博', '微博视频url',
                      '发布位置', '发布时间', '发布工具', '点赞数', '转发数', '评论数']

# Per-platform settings for the forward-matching pass in main().
#   city_types    - account types counted as "all accounts of the city"
#   account_types - account types accepted when looking up one account
#   symmetric     - True: truncate the longer of task/post text before comparing;
#                   False: only truncate the post text to the task length
#   raw_task_name - True: record the task title as written; False: Chinese chars only
_WX_CONFIG = {
    'label': '微信', 'short': '微信', 'banner': '---- WX title match',
    'separator': None, 'summary': '个微信号,获取数据',
    'city_types': ['微信服务号', '微信订阅号'],
    'account_types': ['小程序+微信', '微信服务号', '微信订阅号'],
    'city_col': '市州', 'account_col': '公众号', 'text_col': '标题',
    'date_col': '日期', 'url_col': '链接', 'read_col': '阅读数',
    'symmetric': True, 'raw_task_name': True,
}
_WB_CONFIG = {
    'label': '新浪微博', 'short': '微博', 'banner': '---- WB match',
    'separator': None, 'summary': '个微博号,获取数据',
    'city_types': ['新浪微博'], 'account_types': ['新浪微博'],
    'city_col': '市州', 'account_col': 'weiboName', 'text_col': '微博正文',
    'date_col': 'date', 'url_col': '头条文章url', 'read_col': None,
    'symmetric': False, 'raw_task_name': False,
}
_TT_CONFIG = {
    'label': '今日头条', 'short': '头条', 'banner': '---- TT title match',
    'separator': "++++++++++++++++++++++++++++++++++++++++++++++++++",
    'summary': '个头条号,获取数据',
    'city_types': ['今日头条'], 'account_types': ['今日头条'],
    'city_col': 'city', 'account_col': 'account', 'text_col': 'title',
    'date_col': 'date', 'url_col': 'url', 'read_col': None,
    'symmetric': True, 'raw_task_name': False,
}


def ts2date(dates, sf='%Y-%m-%d'):
    """Format an Excel date serial number as a date string.

    Args:
        dates: Excel serial number (days since 1899-12-30).  A ``date`` or
            ``datetime`` value (which ``pandas.read_excel`` commonly returns
            for date cells) is also accepted and formatted directly.
        sf: ``strftime`` format of the result.

    Returns:
        The formatted date string.
    """
    if isinstance(dates, (datetime.datetime, datetime.date)):
        return dates.strftime(sf)
    # float() also accepts numpy integer scalars, which timedelta rejects.
    moment = _EXCEL_EPOCH + datetime.timedelta(days=float(dates))
    return moment.strftime(sf)


def fetch_chinese(s):
    """Return ``s`` with every character outside U+4E00..U+9FA5 removed."""
    return re.sub(r'[^\u4e00-\u9fa5]', '', s)


def drawBar(data, recipe, title='', fn=''):
    """Draw a bar chart of rates (0..1, shown as percentages) and save it.

    Args:
        data: bar heights.
        recipe: bar labels, also used as x positions.
        title: chart title.
        fn: path of the image file to write.
    """
    plt.figure(figsize=(6, 4))
    # SimHei is needed to render the Chinese tick labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.bar(recipe, data, width=0.5)
    plt.xticks(recipe, recipe, rotation=35)
    plt.ylim((0, 1))

    def to_percent(temp, position):
        return '%2.0f' % (100 * temp) + '%'

    plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(fn)
    plt.cla()
    plt.clf()
    plt.close()


def _concat_frames(frames, columns):
    """Concatenate ``frames``, keeping ``columns`` first in the result.

    Replaces the repeated ``DataFrame.append`` calls (removed in pandas 2.0).
    Returns an empty frame with ``columns`` when ``frames`` is empty.
    """
    if not frames:
        return pd.DataFrame(columns=columns)
    merged = pd.concat(frames, sort=False)
    ordered = list(columns) + [c for c in merged.columns if c not in columns]
    return merged.reindex(columns=ordered)


def getWBData(path, cities, hasBody=False):
    """Read the Weibo CSV files below ``path`` into one DataFrame.

    Expected layout: ``path/<city dir>/<period dir>/<account name>/<account id>.csv``.
    Hidden entries and directories whose name contains ``weixin`` or ``tt``
    are skipped, as are city directories that are unknown or not in ``cities``.

    Args:
        path: data root directory.
        cities: full city names to include.
        hasBody: unused; kept for interface compatibility.

    Returns:
        DataFrame with the download-tool columns plus ``weiboID``,
        ``weiboName`` and ``市州``.
    """
    cs = _WB_TOOL_COLUMNS + ['weiboID', 'weiboName', '市州']
    all_frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if dirC[:1] == '.':
            continue
        if not os.path.isdir(os.path.join(path, dirC)):
            continue
        lowered = dirC.lower()
        if 'weixin' in lowered or 'tt' in lowered:
            continue
        city_name = CITY_SHORTEN.get(dirC)
        if city_name not in cities:
            continue
        print(' city: ', city_name, dirC)
        cityCount += 1

        city_frames = []
        for dirCT in os.listdir(os.path.join(path, dirC)):
            if dirCT[:1] == '.':
                continue
            # Period directories, e.g. weibo, weibo_1
            if not os.path.isdir(os.path.join(path, dirC, dirCT)):
                continue
            lowered = dirCT.lower()
            if 'weixin' in lowered or 'tt' in lowered:
                continue
            print(' read WB... dir:', dirCT)
            for dirA in os.listdir(os.path.join(path, dirC, dirCT)):
                if dirA[:1] == '.':
                    continue
                # Each account-name directory holds <account id>.csv
                account_dir = os.path.join(path, dirC, dirCT, dirA)
                if not os.path.isdir(account_dir):
                    continue
                fileAs = os.listdir(account_dir)
                # Use the first visible CSV: the file that is read and the id
                # derived from its name must be the same file.
                csv_files = [f for f in fileAs
                             if not f.startswith('.')
                             and os.path.splitext(f)[-1] == '.csv']
                if not csv_files:
                    continue
                wbId = csv_files[0][:-4]
                dfdfwb = pd.read_csv(os.path.join(account_dir, csv_files[0]),
                                     sep=',', header=None,
                                     names=_WB_TOOL_COLUMNS, index_col=None)
                # The first row is the file's own header line.
                dfdfwb = dfdfwb.iloc[1:].copy()
                dfdfwb["weiboID"] = wbId
                dfdfwb["weiboName"] = dirA
                city_frames.append(dfdfwb)
                if len(fileAs) > 1:
                    print(" +=+= ", fileAs)
        dfWBC = _concat_frames(city_frames, _WB_TOOL_COLUMNS)
        print(' ', dfWBC.shape)
        dfWBC['市州'] = city_name
        all_frames.append(dfWBC)
    dfWB = _concat_frames(all_frames, cs)
    print('Read WB finished. cities', cityCount, '; lines', dfWB.shape)
    return dfWB


def getWXData(path, cities, hasBody=False):
    """Read the WeChat Excel files below ``path`` and concatenate them.

    Each sub-directory of ``path`` is a city directory.  Excel files are read
    both directly from it and from any of its sub-directories whose name
    contains ``weixin``.

    Args:
        path: data root directory.
        cities: full city names to include.
        hasBody: unused; kept for interface compatibility.

    Returns:
        DataFrame of all articles with a ``市州`` column added.
    """
    cols = ['公众号', '链接', '日期', '标题', '内容', '头条', '市州', '阅读数']
    excel_exts = ('.xlsx', '.xls')
    frames = []
    countC = 0
    countFnC = 0
    for dirBatch in os.listdir(path):
        batch_dir = os.path.join(path, dirBatch)
        if not os.path.isdir(batch_dir):
            continue  # directories only
        cityname = CITY_SHORTEN.get(dirBatch)
        selected = cityname in cities
        count = 0
        for fileC in os.listdir(batch_dir):
            if fileC[:1] == '.':
                continue
            entry = os.path.join(batch_dir, fileC)
            fName = os.path.splitext(fileC)[0]
            # Sub-directory holding WeChat exports
            if os.path.isdir(entry) and 'weixin' in fileC.lower():
                print(' ', entry)
                for f in os.listdir(entry):
                    # Skip hidden files and Excel lock files.
                    if f.startswith('.') or f.startswith('~$'):
                        continue
                    if os.path.splitext(f)[-1] not in excel_exts or not selected:
                        continue
                    dfdfwxc = pd.read_excel(os.path.join(entry, f))
                    dfdfwxc['市州'] = cityname
                    print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
                    frames.append(dfdfwxc)
                    count = count + 1
            # Excel file stored directly in the city directory
            if os.path.splitext(fileC)[-1] not in excel_exts or not selected:
                continue
            dfdfwxc = pd.read_excel(entry)
            dfdfwxc['市州'] = cityname
            print(' read wx: ', cityname, dirBatch, fName, dfdfwxc.shape)
            frames.append(dfdfwxc)
            count = count + 1
        countFnC += count
        if count > 0:
            countC += 1
    dfWX = _concat_frames(frames, cols)
    print(" Read WX Finished. cities ", countC, '; Files', countFnC,
          '; lines ', dfWX.shape[0])
    return dfWX


def getTTData(path, cities, hasBody=False):
    """Read the Toutiao ``.xlsx`` files below ``path`` and concatenate them.

    Expected layout: ``path/<city dir>/<dir containing "tt">/*.xlsx``.
    Files that cannot be read are reported and skipped.

    Args:
        path: data root directory.
        cities: full city names to include.
        hasBody: unused; kept for interface compatibility.

    Returns:
        DataFrame of all articles with a ``city`` column set to the city name.
    """
    cs = ['account', 'date', 'title', 'nread', 'ncomment', 'content', 'url',
          'origin', 'city']
    all_frames = []
    cityCount = 0
    for dirC in os.listdir(path):
        if dirC[:1] == '.':
            continue
        if not os.path.isdir(os.path.join(path, dirC)):
            continue
        lowered = dirC.lower()
        if 'weixin' in lowered or 'weibo' in lowered:
            continue
        city_name = CITY_SHORTEN.get(dirC)
        if city_name not in cities:
            continue
        print(' city: ', city_name, dirC)
        cityCount += 1

        city_frames = []
        for dirCT in os.listdir(os.path.join(path, dirC)):
            if dirCT[:1] == '.':
                continue
            if not os.path.isdir(os.path.join(path, dirC, dirCT)):
                continue
            lowered = dirCT.lower()
            if 'weixin' in lowered or 'weibo' in lowered:
                continue
            if 'tt' not in lowered:
                continue
            print(' read TT... dir:', dirCT)
            for fn in os.listdir(os.path.join(path, dirC, dirCT)):
                if fn[:1] == '.':
                    continue
                if not fn[-5:] == '.xlsx':
                    continue
                fileAs = os.path.join(path, dirC, dirCT, fn)
                try:
                    city_frames.append(pd.read_excel(fileAs))
                except Exception as err:
                    # Best effort: one unreadable export must not stop the run.
                    print("read file failed. ", fileAs, err)
        dfTTC = _concat_frames(city_frames, cs)
        print(' 读入头条数据行数', dfTTC.shape)
        dfTTC['city'] = city_name
        all_frames.append(dfTTC)
    dfTT = _concat_frames(all_frames, cs)
    print('Read TT finished. cities', cityCount, '; lines', dfTT.shape)
    return dfTT


def _export_weibo_accounts(posts, accounts, out_dir):
    """Write one ``<account name>/<user id>.csv`` per registered Weibo user.

    Args:
        posts: Weibo posts with a ``user_id`` column and the export columns.
        accounts: account register rows of type Weibo.
        out_dir: output directory prefix (ends with a path separator).

    Returns:
        ``(found, missing)``: number of user ids with / without a register entry.
    """
    found = 0
    missing = 0
    for uid in posts['user_id'].unique():
        matched = accounts[accounts['微信biz/oid/账号ID'] == uid]
        if matched.empty:
            missing += 1
            continue
        found += 1
        name = str(matched['账号名称'].iloc[0])
        os.makedirs(out_dir + name, exist_ok=True)
        # NOTE(review): reset_index() keeps the old index as a leading
        # 'index' column in the CSV, as the original did — confirm that the
        # consumer of these files expects it.
        subset = posts.loc[posts['user_id'] == uid, _WB_EXPORT_COLUMNS].reset_index()
        subset.to_csv(out_dir + name + '/' + str(uid) + '.csv',
                      encoding='utf_8_sig', index=False, quoting=csv.QUOTE_ALL)
    return found, missing


def doWBData():
    """Split two bulk Weibo exports into per-account CSV files.

    Reads ``weibo1.csv`` and ``weibo2.csv``, renames their columns to the
    download-tool layout and writes the posts of every registered account
    to ``全文/LN/weibo_3/`` and ``全文/LN/weibo_4/`` respectively.
    """
    dfAccount = pd.read_excel(_DATA_ROOT + '全国报送系统表单_2023.6.30.xlsx')
    dfAccount = dfAccount[dfAccount['账号类型'] == '新浪微博'].copy()
    dfAccount['微信biz/oid/账号ID'] = dfAccount['微信biz/oid/账号ID'].astype('int64')

    dfwb1 = pd.read_csv(_DATA_ROOT + 'weibo_2/weibo1.csv', sep=',', index_col=None)
    dfwb1 = dfwb1.fillna(0)
    dfwb1['user_id'] = dfwb1['user_id'].astype('int64')
    dfwb1 = dfwb1.rename(columns={
        'id': '微博id', 'content': '微博正文', 'article_url': '头条文章url',
        'original_pictures': '原始图片url',
        'retweet_pictures': '被转发微博原始图片url',
        'original': '是否为原创微博', 'video_url': '微博视频url',
        'publish_place': '发布位置', 'publish_time': '发布时间',
        'publish_tool': '发布工具', 'up_num': '点赞数',
        'retweet_num': '转发数', 'comment_num': '评论数'})
    print(dfAccount.shape)
    print(dfwb1.shape, dfwb1.dtypes)
    found, missing = _export_weibo_accounts(
        dfwb1, dfAccount, _DATA_ROOT + '全文/LN/weibo_3/')
    print('found ', found, '; nofound', missing)

    dfwb2 = pd.read_csv(_DATA_ROOT + 'weibo_2/weibo2.csv', sep=',', index_col=None)
    dfwb2 = dfwb2.fillna(0)
    dfwb2['user_id'] = dfwb2['user_id'].astype('int64')
    dfwb2 = dfwb2.rename(columns={
        'id': '微博id', 'text': '微博正文', 'article_url': '头条文章url',
        'pics': '原始图片url', 'topics': '被转发微博原始图片url',
        'source': '是否为原创微博', 'video_url': '微博视频url',
        'location': '发布位置', 'created_at': '发布时间', 'bid': '发布工具',
        'attitudes_count': '点赞数', 'reposts_count': '转发数',
        'comments_count': '评论数'})
    print(dfwb2.shape)
    found, missing = _export_weibo_accounts(
        dfwb2, dfAccount, _DATA_ROOT + '全文/LN/weibo_4/')
    print('found ', found, '; nofound', missing)


def _find_forward(task_text, post_texts, threshold=_RATIO, symmetric=True):
    """Find the first post whose text resembles ``task_text``.

    Only Chinese characters are compared.  Texts that contain no Chinese
    characters are skipped: two empty strings compare as identical, which
    would otherwise count an untitled post as a forward of every task.

    Args:
        task_text: Chinese-only task title.
        post_texts: raw post titles/contents, in order.
        threshold: ``quick_ratio`` value that must be exceeded.
        symmetric: if True, the longer of the two texts is truncated to the
            length of the shorter one; if False, only the post text is
            truncated to the task length.

    Returns:
        ``(position, text)`` of the first match, or ``None``.  ``text`` is the
        compared (truncated) post text when ``symmetric``, else the full
        Chinese-only post text.
    """
    if not task_text:
        return None
    for pos, raw in enumerate(post_texts):
        text = fetch_chinese(str(raw))
        if not text:
            continue
        if symmetric and len(task_text) > len(text):
            left, right = task_text[:len(text)], text
        else:
            left, right = task_text, text[:len(task_text)]
        if difflib.SequenceMatcher(None, left, right).quick_ratio() > threshold:
            return pos, (right if symmetric else text)
    return None


def _tally_platform(posts, dfAllAccount, tasks, cities, cfg, forward_rows):
    """Match one platform's posts against the tasks, city by city.

    For every account that has data and is present in the register, the
    per-task flag columns and ``转发数`` of ``dfAllAccount`` are updated in
    place, and one dict per detected forward is appended to ``forward_rows``.

    Args:
        posts: posts of the platform.
        dfAllAccount: account register (modified in place).
        tasks: list of ``(number, raw_title, chinese_title)``.
        cities: city names to process.
        cfg: platform settings (one of the ``_*_CONFIG`` dicts).
        forward_rows: output list of forward records.
    """
    type_col = dfAllAccount['账号类型']
    for city in cities:
        if cfg['separator']:
            print(cfg['separator'])
        print(cfg['banner'], city, ' ----')
        in_city = dfAllAccount['市/省局'] == city
        city_posts = posts.loc[posts[cfg['city_col']] == city]
        # Accounts for which data was collected
        accounts = city_posts[cfg['account_col']].unique()
        # All registered accounts of this platform in the city
        city_mask = type_col.isin(cfg['city_types']) & in_city
        n_accounts = int(city_mask.sum())

        for account in accounts:
            acc_posts = city_posts.loc[city_posts[cfg['account_col']] == account]
            mask = (type_col.isin(cfg['account_types']) & in_city
                    & (dfAllAccount['账号名称'] == account))
            if not mask.any():
                print(' !!!! ' + cfg['short'], account, '在', city, '无详细信息')
                continue
            texts = acc_posts[cfg['text_col']].tolist()
            dates = acc_posts[cfg['date_col']].tolist()
            urls = acc_posts[cfg['url_col']].tolist()
            reads = acc_posts[cfg['read_col']].tolist() if cfg['read_col'] else None

            total = 0
            for number, raw_title, title in tasks:
                # The first similar post counts as the forward; stop there.
                hit = _find_forward(title, texts, _RATIO, cfg['symmetric'])
                forwarded = 0 if hit is None else 1
                dfAllAccount.loc[mask, str(number)] = forwarded
                total += forwarded
                if hit is None:
                    continue
                pos, content = hit
                row = {'任务序号': number,
                       '任务名称': raw_title if cfg['raw_task_name'] else title,
                       '类型': cfg['label'],
                       '公众号': account,
                       '日期': dates[pos],
                       '内容': content,
                       '链接': urls[pos],
                       '市州': city}
                if reads is not None:
                    row['阅读数'] = int(reads[pos])
                forward_rows.append(row)
            # Total number of forwards of this account
            dfAllAccount.loc[mask, '转发数'] = total

        # City-wide forward count and rate
        city_total = dfAllAccount.loc[city_mask, '转发数'].sum()
        denominator = n_accounts * len(tasks)
        rate = city_total / denominator if denominator else 0.0
        print(' ', city, '共有', n_accounts, cfg['summary'], len(accounts),
              '个。共转发', city_total, '次,转发率{:.1f}%'.format(rate * 100))


def _forward_rate_table(accounts, group_col, task_count):
    """Build the account-count / forward-count / rate table per ``group_col``.

    The two pivot tables are computed separately and then joined, because a
    single ``pivot_table`` call with a list of aggregations was observed to
    give inaccurate values.  Rows are sorted by rate (descending) with the
    ``总计`` margin row kept last.
    """
    counts = pd.pivot_table(accounts, index=[group_col], values=['账号名称'],
                            aggfunc=['count'], fill_value='', margins=True,
                            margins_name='总计')
    sums = pd.pivot_table(accounts, index=[group_col], values=['转发数'],
                          aggfunc=['sum'], fill_value='', margins=True,
                          margins_name='总计')
    table = pd.concat([counts, sums], axis=1)
    # Flatten the two-level column index, e.g. ('sum', '转发数') -> 'sum_转发数'
    table.columns = ['_'.join(col) for col in table.columns.values]
    # Rate truncated to three decimals
    table['rate'] = table.apply(
        lambda x: int(x['sum_转发数'] / x['count_账号名称'] / task_count * 1000) / 1000.0,
        axis=1)
    last = table.shape[0] - 1
    body = table[0:last].sort_values(by='rate', ascending=False)
    return pd.concat([body, table[last:table.shape[0]]], axis=0)


def _rate_rows(table, name_key):
    """Convert a rate table into the list of row dicts used by the template."""
    rows = []
    for index, row in table.iterrows():
        if index == "总计":
            continue
        rows.append({name_key: str(index),
                     'rate': '%.1f' % (row['rate'] * 100.0),
                     'account': int(row['count_账号名称']),
                     'fNum': int(row['sum_转发数'])})
    return rows


def main():
    """Run the whole forwarding analysis and write tables and reports."""
    starttime = datetime.datetime.now()

    isDoWX = True
    isDoWB = True
    isDoTT = True

    # Full list (12 cities, 2 prefectures, 1 new area, provincial departments):
    # '临夏回族自治州', '白银市', '定西市', '酒泉市', '嘉峪关市', '平凉市', '庆阳市',
    # '天水市', '武威市', '兰州新区', '陇南市', '兰州市', '张掖市', '甘南藏族自治州',
    # '金昌市', '省直部门'
    cities = ['陇南市', ]

    # Forwarding tasks
    dfTask = pd.read_excel(_DATA_ROOT + '陇南7月上旬转发台账.xlsx')
    sTaskDate = '推送时间'
    # Drop rows without a title (the result must be assigned back).
    dfTask = dfTask.dropna(axis=0, subset=["标题"]).reset_index(drop=True)
    task_count = dfTask.shape[0]
    tasks = [(number, str(title), fetch_chinese(str(title)))
             for number, title in zip(dfTask['序号'].tolist(),
                                      dfTask['标题'].tolist())]

    # Account register, with a total column and one flag column per task
    dfAllAccount = pd.read_excel(_DATA_ROOT + '全国报送系统表单_2023.6.30.xlsx')
    dfAllAccount.loc[:, '转发数'] = 0
    task_flags = pd.DataFrame(np.zeros((dfAllAccount.shape[0], task_count)),
                              columns=dfTask['序号'].astype(str).tolist())
    dfAllAccount = pd.concat([dfAllAccount, task_flags], axis=1)

    # Normalise city / county values
    dfAllAccount['市/省局'] = dfAllAccount['市/省局'].fillna('省直部门')
    dfAllAccount['区县/地方部门'] = dfAllAccount['区县/地方部门'].fillna('市直部门')
    is_city_level = dfAllAccount['区县/地方部门'] == '市直部门'
    dfAllAccount.loc[
        dfAllAccount['市/省局'].isin(['临夏回族自治州', '甘南藏族自治州']) & is_city_level,
        '区县/地方部门'] = '州直部门'
    dfAllAccount.loc[
        dfAllAccount['市/省局'].isin(['省直部门']) & is_city_level,
        '区县/地方部门'] = '省直部门'
    # Replace over-long county names by short ones so that charts stay readable
    dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '积石山保安族东乡族撒拉族自治县',
                     '区县/地方部门'] = '积石山县'
    dfAllAccount.loc[dfAllAccount['区县/地方部门'] == '阿克塞哈萨克族自治县',
                     '区县/地方部门'] = '阿克塞自治县'

    fnTemplate = _DATA_ROOT + 'POM_ForewardTemplate.docx'
    # Data root directories
    strPath = [_DATA_ROOT + '全文/', ]
    strOutputPath = _DATA_ROOT + '转发/'
    os.makedirs(strOutputPath, exist_ok=True)

    context = {
        "year": "2023",
        "month": "7",
        "pubMonth": "7",
        "dateStart": "2023年7月1日",
        "dateEnd": "2023年7月10日"
    }

    # One record per detected forward (task, account, matching post)
    forward_rows = []

    # WX
    if isDoWX:
        print('=============================================================')
        print('---- WX ----')
        dfWX = _concat_frames([getWXData(strP, cities) for strP in strPath], [])
        dfWX = dfWX.fillna(value=0)
        _tally_platform(dfWX, dfAllAccount, tasks, cities, _WX_CONFIG, forward_rows)

    # WB
    if isDoWB:
        print('=============================================================')
        print('---- WB data read ----')
        dfWB = pd.read_excel(_DATA_ROOT + '微博全文.xlsx')
        print('----', dfWB.shape)
        _tally_platform(dfWB, dfAllAccount, tasks, cities, _WB_CONFIG, forward_rows)

    # TT
    if isDoTT:
        print('=============================================================')
        print('---- TT data read ----')
        dfTT = _concat_frames([getTTData(strP, cities) for strP in strPath], [])
        _tally_platform(dfTT, dfAllAccount, tasks, cities, _TT_CONFIG, forward_rows)

    base_cols = ['任务序号', '任务名称', '类型', '公众号', '日期', '内容', '链接', '市州']
    dfO = pd.DataFrame(forward_rows)
    dfO = dfO.reindex(
        columns=base_cols + [c for c in dfO.columns if c not in base_cols])

    if isDoWX or isDoWB or isDoTT:
        print('=============================================================')
        print('---- STATISTICS ----')
        print('=============================================================')
        dfAllAccount.to_excel(strOutputPath + '甘肃省_转发账号.xlsx')
        dfO.to_excel(strOutputPath + '甘肃省_转发文章.xlsx')

        print('---- 统计市州转发率 ----')
        counted_types = ['新浪微博', '微信服务号', '微信订阅号', '今日头条']
        for city in cities:
            print(" add up city", city)
            maskC = (dfAllAccount['账号类型'].isin(counted_types)
                     & (dfAllAccount['市/省局'] == city))
            dfdfC = dfAllAccount.loc[maskC, :]
            dfdfC.to_excel(strOutputPath + city + '_转发账号.xlsx')
            # Only this city's forwards go into the per-city file.
            dfOCity = dfO[dfO['市州'] == city]
            dfOCity.to_excel(strOutputPath + city + '_转发文章.xlsx')

            # Forwards of city / prefecture level departments, per unit
            dfdfCD = dfdfC.loc[
                dfdfC['区县/地方部门'].isin(['州直部门', '市直部门', '省直部门'])].copy()
            dfdfCD_AD = _forward_rate_table(dfdfCD, '单位全称', task_count)
            dfdfCD_AD.to_excel(strOutputPath + city + '部门转发统计表.xlsx')

            # All accounts of the city, per county
            dfdfC_AD = _forward_rate_table(dfdfC, '区县/地方部门', task_count)
            dfdfC_AD.to_excel(strOutputPath + city + '转发统计表.xlsx')

            # Generate the report
            tpl = DocxTemplate(fnTemplate)
            if city in ['临夏回族自治州', '甘南藏族自治州']:
                sL0 = '州'
            else:
                sL0 = '市'
            city_total = dfdfC_AD.iloc[-1]
            dept_total = dfdfCD_AD.iloc[-1]
            context.update({
                "strL0": sL0,
                "strL1": "区县",
                "taskCount": task_count,
                "aNum": int(city_total['count_账号名称']),
                "fNum": int(city_total['sum_转发数']),
                "r": '%.1f' % (city_total['rate'] * 100.0),
                "dNum": int(dept_total['count_账号名称']),   # department accounts
                "dFNum": int(dept_total['sum_转发数']),      # department forwards
                "dr": '%.1f' % (dept_total['rate'] * 100.0),  # department rate
            })
            # County table, department table, task list
            context['t1_contents'] = _rate_rows(dfdfC_AD, 'county')
            context['t2_contents'] = _rate_rows(dfdfCD_AD, 'name')
            context['t3_contents'] = [
                {'id': row['序号'],
                 'title': row['标题'],
                 'date': ts2date(row[sTaskDate], '%m月%d日')}
                for _, row in dfTask.iterrows()]

            # County forward-rate chart
            graph_path = os.path.join(strOutputPath, '_' + city + '_graphCounty.png')
            drawBar(dfdfC_AD['rate'][:-1], dfdfC_AD.index[:-1], '区县转发率', graph_path)
            context.update({
                'graphCounty': InlineImage(tpl, graph_path, width=Mm(120)),
            })
            tpl.render(context)
            tpl.save(strOutputPath + city
                     + '转发统计报告_2023年{}月份.docx'.format(context['month']))

    endtime = datetime.datetime.now()
    usedtime = endtime - starttime
    print("time: ", usedtime)


if __name__ == "__main__":
    # To split the bulk Weibo exports first, call doWBData() here instead.
    main()