# 39 lines · 1.8 KiB · Python
import csv
import logging
import os
from typing import Iterator, Tuple

from bs4 import BeautifulSoup


logger = logging.getLogger(__name__)

# Root of the saved article pages; the pages themselves sit two
# sub-directory levels below this folder.
path = "D:/2020/舆论监测平台/庆阳/数据/镇原县/html"
# Destination CSV for the extracted comments.
OUTPUT_CSV = "D:/2020/舆论监测平台/庆阳/数据整理/镇原县/镇原县微信评论统计.csv"
# Header row: comment, user, time, official account, title.
CSV_HEADER = ["评论", "用户", "时间", "公众号", "标题"]
# CSS selector matching one comment block inside a saved article page.
COMMENT_SELECTOR = "body > div.rich_media > div.msgBox > div.msg > div.msgBody "

_USER_OPEN = '<p class="userName">'
_REPLY_OPEN = '<p class="replyBody">'
_LIKE_OPEN = '<span class="reply_like_num">'


def parse_file_name(file_name: str) -> Tuple[str, str]:
    """Split an article file name into ``(official_account, title)``.

    The text before the first ``.html`` is split on ``_``; field 0 is used
    as the official account and field 2 as the title.

    Raises:
        ValueError: if the name has fewer than three ``_``-separated fields.
    """
    # NOTE(review): a title that itself contains "_" is cut at the first
    # one, exactly as before -- confirm against the crawler's naming scheme.
    fields = file_name.split(".html")[0].split("_")
    if len(fields) < 3:
        raise ValueError(
            "file name %r does not have three '_'-separated fields" % file_name
        )
    return fields[0], fields[2]


def extract_comment_fields(block_html: str) -> Tuple[str, str, str]:
    """Pull ``(comment, username, date)`` out of one serialized comment block.

    The extraction is purely textual and mirrors the markup the script has
    always relied on: the user name is the text between
    ``<p class="userName">`` and the following ``<span>``, the date is the
    content of that ``<span>``, and the comment is the start of
    ``<p class="replyBody">`` up to the like counter or the closing ``</p>``.
    Values are returned as raw HTML text (entities are not decoded).

    Raises:
        ValueError: if any of the expected markers is missing.
    """
    try:
        user_section = block_html.split(_USER_OPEN)[1]
        name_and_rest = user_section.split("<span>")
        username = name_and_rest[0]
        date = name_and_rest[1].split("</span>")[0]
        reply_section = block_html.split(_REPLY_OPEN)[1]
    except IndexError as err:
        raise ValueError(
            "comment block lacks the expected userName/replyBody markup"
        ) from err
    comment = reply_section.split(_LIKE_OPEN)[0].split("</p>")[0]
    return comment, username, date


def iter_html_files(root: str) -> Iterator[Tuple[str, str]]:
    """Yield ``(full_path, file_name)`` for files two directory levels below *root*.

    Entries that are not directories (at the two upper levels) or not
    regular files (at the leaf level) are skipped instead of aborting the
    walk. Listings are sorted so the output order is reproducible.
    """
    for first in sorted(os.listdir(root)):
        first_dir = os.path.join(root, first)
        if not os.path.isdir(first_dir):
            continue
        for second in sorted(os.listdir(first_dir)):
            second_dir = os.path.join(first_dir, second)
            if not os.path.isdir(second_dir):
                continue
            for file_name in sorted(os.listdir(second_dir)):
                full_path = os.path.join(second_dir, file_name)
                if os.path.isfile(full_path):
                    yield full_path, file_name


def main(input_root: str = path, output_csv: str = OUTPUT_CSV) -> int:
    """Collect the comments of every saved article page into one CSV file.

    Args:
        input_root: folder whose grand-child directories hold the pages.
        output_csv: file to (over)write with the extracted rows.

    Returns:
        The number of comment rows written (the header is not counted).

    Raises:
        FileNotFoundError: if *input_root* is not an existing directory.
    """
    # Check the input before opening the output so a wrong path does not
    # truncate an existing result file.
    if not os.path.isdir(input_root):
        raise FileNotFoundError("input directory not found: %s" % input_root)

    rows_written = 0
    # utf-8-sig: the platform default code page cannot encode every character
    # that shows up in comments (emoji, rare glyphs); the BOM lets spreadsheet
    # tools detect the encoding.
    with open(output_csv, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile)
        # Write the column names first.
        writer.writerow(CSV_HEADER)

        for full_path, file_name in iter_html_files(input_root):
            try:
                account, title = parse_file_name(file_name)
            except ValueError as err:
                logger.warning("Skipping %s: %s", full_path, err)
                continue

            try:
                with open(full_path, "r", encoding="utf-8", newline="") as page:
                    soup = BeautifulSoup(page, "lxml")
                blocks = soup.select(COMMENT_SELECTOR)
            except Exception:
                # Per-file boundary of a best-effort batch job: one unreadable
                # or undecodable page must not stop the run, but the failure
                # has to be visible in the log.
                logger.exception("Skipping %s: could not read or parse it", full_path)
                continue

            for block in blocks:
                logger.debug("comment block in %s: %s", full_path, block)
                try:
                    comment, username, date = extract_comment_fields(str(block))
                except ValueError as err:
                    # Skip only the malformed comment, not the rest of the page.
                    logger.warning("Skipping a comment in %s: %s", full_path, err)
                    continue
                writer.writerow([comment, username, date, account, title])
                rows_written += 1

    return rows_written


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger.info("Wrote %d comment rows", main())