# MonitoringOfPublicOpinion/desk/WX(3).py

# 186 lines
# 8.5 KiB
# Python
# Raw Blame History

# This file contains ambiguous Unicode characters

# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from selenium import webdriver
import time
from Write_File_Excel import *
from Validate_Code import *
import datetime
from Extension_Check import *
def verify():
    """Solve captcha challenges until the browser leaves the anti-spider page.

    Uses the module-level ``driver``.  While the current URL contains
    ``'antispider'``, the captcha image is refreshed, saved to ``code.jpg``,
    handed to ``validateCode`` together with the two credential lines read
    from ``user.txt``, and the returned code is typed in and submitted.
    Returns ``None`` once the URL no longer contains ``'antispider'``.

    Raises:
        IndexError: if ``user.txt`` holds fewer than two lines.
    """
    # The loop condition is re-checked after every submission, so a wrong
    # answer simply triggers another attempt.
    while 'antispider' in driver.current_url:
        print('正在识别验证码..')

        # Ask the page for a fresh captcha image before capturing it.
        refresh_link = driver.find_element_by_xpath('//a[@id="change-img"]')
        driver.execute_script('arguments[0].click()', refresh_link)

        captcha_image = driver.find_element_by_xpath('//img[@id="seccodeImage"]')
        answer_box = driver.find_element_by_xpath('//input[@id="seccodeInput"]')
        answer_box.clear()
        submit_link = driver.find_element_by_xpath('//a[@id="submit"]')
        captcha_image.screenshot('code.jpg')

        # Credentials are re-read on every attempt: line 1 = account, line 2 = password.
        with open('user.txt', 'r') as credentials_file:
            credential_lines = credentials_file.read().split('\n')
        account, secret = credential_lines[0], credential_lines[1]

        code = validateCode('code.jpg', account, secret)
        print('验证码:' + code)
        time.sleep(1)
        answer_box.send_keys(code)
        time.sleep(0.5)
        driver.execute_script('arguments[0].click()', submit_link)
        time.sleep(2)
# XPath of the article title links on a search result page.
_ARTICLE_LINK_XPATH = '//a[contains(@uigs,"article_title_")]'
# XPath of the per-result date labels on a search result page.
_ARTICLE_DATE_XPATH = '//div[@class="s-p"]/span[@class="s2"]'
# Relative publish-time labels that get rewritten to a calendar date,
# mapped to the number of days to subtract from today.
_PUBLISH_LABEL_DAYS = {
    '1周前': 7,
    '6天前': 6,
    '5天前': 5,
    '4天前': 4,
    '3天前': 3,
    '2天前': 2,
    '前天': 2,
    '昨天': 1,
}


def _search_url(account, page=1):
    """Build the article-search URL for *account* and a 1-based *page* number."""
    from urllib.parse import quote

    # Percent-encode the query so characters such as '&', '#' or spaces
    # cannot break the URL; str() tolerates numeric IDs coming from the sheet.
    url = ('https://weixin.sogou.com/weixin?type=2&s_from=input&query='
           + quote(str(account), safe='') + '&ie=utf8')
    if page > 1:
        url += '&page=' + str(page)
    return url


def _page_count(result_total, per_page=10):
    """Return how many result pages hold *result_total* hits (always at least 1)."""
    # Ceiling division: 20 hits are 2 pages, not 3.
    return max(1, -(-result_total // per_page))


def _listing_date(label, today):
    """Turn a result-list date label into a midnight ``datetime``.

    ``YYYY-M-D`` labels are parsed directly.  Labels that mention days
    (``天``) and contain a number are treated as that many days before
    *today*.  Anything else (for example hour-based labels, or a missing
    label) falls back to *today*.
    """
    text = str(label).strip()
    parts = text.split('-')
    try:
        return datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
    except (IndexError, ValueError):
        pass
    if '天' in text:
        digits = re.search(r'\d+', text)
        if digits:
            try:
                # timedelta handles month/year boundaries, unlike `day - n`.
                return today - datetime.timedelta(days=int(digits.group(0)))
            except OverflowError:
                pass
    return today


def _publish_date_text(label):
    """Rewrite a known relative publish label to an ISO date string; keep others as-is."""
    days_ago = _PUBLISH_LABEL_DAYS.get(label)
    if days_ago is None:
        return str(label)
    return str(datetime.date.today() - datetime.timedelta(days=days_ago))


def _wait_for_login():
    """Click the login button if shown and block until the login iframe is gone."""
    loginBtn = driver.find_element_by_xpath('//a[@id="loginBtn"]')
    if '登录' not in loginBtn.text:
        return
    driver.execute_script('arguments[0].click()', loginBtn)
    time.sleep(2)
    while True:
        # As long as an iframe can still be found the login is treated as
        # pending; the lookup failing is the "logged in" signal.
        try:
            driver.find_element_by_xpath('//iframe[1]')
        except Exception:
            print('登录成功!')
            break
        time.sleep(3)


def _read_article_row(weixinhao):
    """Scrape the currently open article page into one output row."""
    try:
        title = driver.find_element_by_xpath('//*[@id="activity-name"]').text
    except Exception:
        title = ''
    try:
        gongzhonghao = driver.find_element_by_xpath('//*[@id="js_name"]').text
    except Exception:
        gongzhonghao = ''
    try:
        body = driver.find_element_by_xpath('//*[@id="js_content"]')
        # Body text followed by one line per image URL found in the markup.
        image_urls = re.findall(r'<img.*?src="(.*?)"', body.get_attribute('innerHTML'), re.S)
        content = '\n'.join([str(body.text).strip()] + image_urls)
    except Exception:
        content = ''
    try:
        publish_date = driver.find_element_by_xpath('//*[@id="publish_time"]').text
    except Exception:
        publish_date = ''
    collected_on = datetime.datetime.now().strftime('%Y/%m/%d')
    # 32767 is presumably the per-cell character limit of the Excel output
    # written by the project's `append` helper -- TODO confirm.
    return [title, content[:32767], gongzhonghao, weixinhao,
            _publish_date_text(publish_date), driver.current_url, collected_on]


def _crawl_listing_page(data, max_age_days, today):
    """Open every recent-enough article on the current result page and store it."""
    try:
        link_count = len(driver.find_elements_by_xpath(_ARTICLE_LINK_XPATH))
    except Exception:
        link_count = 0
    try:
        # Read the labels up front so later navigation cannot invalidate them.
        date_labels = [element.text
                       for element in driver.find_elements_by_xpath(_ARTICLE_DATE_XPATH)]
    except Exception:
        return
    listing_handle = driver.current_window_handle
    for position in range(link_count):
        # A result without a date label is treated as published today.
        label = date_labels[position] if position < len(date_labels) else ''
        published = _listing_date(label, today)
        if (today - published).days > max_age_days:
            continue
        print(str(published))
        # Re-locate the links on every iteration before clicking.
        try:
            links = driver.find_elements_by_xpath(_ARTICLE_LINK_XPATH)
        except Exception:
            links = []
        if position >= len(links):
            continue
        link = links[position]
        print(link.get_attribute('href'))
        driver.execute_script('arguments[0].click()', link)
        opened = [handle for handle in driver.window_handles if handle != listing_handle]
        if opened:
            driver.switch_to.window(opened[-1])
        verify()
        append('weixin', _read_article_row(data[0]))
        if opened:
            driver.close()
            driver.switch_to.window(listing_handle)
        else:
            # The article loaded in the listing window itself: go back rather
            # than closing the only window.
            driver.back()
        time.sleep(2)


def _crawl_account(data):
    """Search one ``[account, max_age_days]`` entry and walk all its result pages."""
    print('微信号:'+str(data))
    max_age_days = int(data[1])
    # Midnight today: the reference point for all age comparisons.
    today = datetime.datetime.combine(datetime.date.today(), datetime.time())
    driver.get(_search_url(data[0]))
    time.sleep(2)
    verify()
    try:
        result_nums = int(re.search(r'找到约(.*?)条结果', driver.page_source, re.S)
                          .group(1).replace(',', ''))
    except (AttributeError, ValueError):
        # No result counter on the page, or one that is not a number.
        result_nums = 0
    for page_number in range(1, _page_count(result_nums) + 1):
        if page_number > 1:
            driver.get(_search_url(data[0], page_number))
            verify()
        _crawl_listing_page(data, max_age_days, today)


def startCrawl(datas):
    """Crawl recent WeChat articles for every account in *datas*.

    Entry point.  Starts Chrome, waits for the user to log in to
    weixin.sogou.com, then searches each account and appends one row per
    sufficiently recent article to the ``'weixin'`` output via the project
    helper ``append``.  The browser is stored in the module-level ``driver``
    (also used by ``verify``) and is quit when the crawl ends or fails.

    Args:
        datas: sequence of ``[account, days]`` pairs, i.e.
            ``[['WeChat ID', 'days back from today'], ['WeChat ID 2', ...], ...]``.
            Articles older than ``days`` days are skipped.

    Raises:
        ValueError: if a ``days`` value cannot be converted to ``int``.
    """
    global driver

    check_extension()

    # Make sure the captcha-service credential file exists, then echo its
    # first two lines (account, password) as the original tool did.
    if not os.path.exists('user.txt'):
        with open('user.txt', 'w+') as f:
            f.write('')
        user = []
    else:
        with open('user.txt', 'r') as f:
            user = f.read().split('\n')
    try:
        print('打码狗账号:'+user[0])
        print('打码狗密码:'+user[1])
    except IndexError:
        print('未找到打码狗账号密码')

    create('weixin', ['标题', '正文', '公众号', '微信号', '发布日期', '链接', '采集日期'])

    # Force-kill any running Chrome / chromedriver processes (Windows
    # `taskkill`) before launching a new browser.
    os.system('taskkill /f /im chrome.exe')
    os.system('taskkill /f /im chromedriver.exe')

    option = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path="chromedriver.exe", options=option)
    try:
        driver.get('https://weixin.sogou.com')
        time.sleep(2)
        _wait_for_login()
        for data in datas:
            _crawl_account(data)
    finally:
        # Do not leave the browser and chromedriver running after the crawl.
        driver.quit()
if __name__ == '__main__':
    # Script entry: set up the 'datas' input via the project helper `create`
    # (headers: WeChat ID, days back from today), then crawl whatever rows
    # `read` returns from it.
    create('datas', ['微信号', '距离今天(天)'])
    startCrawl(read('datas'))