186 lines
8.5 KiB
Python
186 lines
8.5 KiB
Python
from selenium import webdriver
|
||
import time
|
||
from Write_File_Excel import *
|
||
from Validate_Code import *
|
||
import datetime
|
||
from Extension_Check import *
|
||
def verify():
    """Clear Sogou's anti-spider captcha page, retrying until it is gone.

    While the current URL of the module-level ``driver`` contains
    ``'antispider'``, this refreshes the captcha image, saves a screenshot
    of it to ``code.jpg``, passes that file plus the two first lines of
    ``user.txt`` to ``validateCode`` (presumably a captcha-solving
    service -- confirm against Validate_Code), types the returned code and
    submits the form. Returns ``None`` once the URL no longer matches.

    Raises:
        IndexError: if ``user.txt`` holds fewer than two lines.
    """
    while 'antispider' in driver.current_url:
        print('正在识别验证码..')

        # Request a fresh captcha image before capturing it.
        refresh_link = driver.find_element_by_xpath('//a[@id="change-img"]')
        driver.execute_script('arguments[0].click()', refresh_link)

        captcha_image = driver.find_element_by_xpath('//img[@id="seccodeImage"]')
        answer_box = driver.find_element_by_xpath('//input[@id="seccodeInput"]')
        answer_box.clear()
        submit_link = driver.find_element_by_xpath('//a[@id="submit"]')
        captcha_image.screenshot('code.jpg')

        # Credentials are re-read on every attempt: line 1 is the account,
        # line 2 the password.
        with open('user.txt', 'r') as credential_file:
            credential_lines = credential_file.read().split('\n')
        account, password = credential_lines[0], credential_lines[1]

        answer = validateCode('code.jpg', account, password)
        print('验证码:' + answer)
        time.sleep(1)
        answer_box.send_keys(answer)
        time.sleep(0.5)
        driver.execute_script('arguments[0].click()', submit_link)
        # Give the page time to navigate before re-checking the URL.
        time.sleep(2)
|
||
# Entry point. Pass datas = [['wechat_id', 'max_age_in_days'], ['wechat_id_2', 'max_age_in_days_2'], ..., ['wechat_id_n', 'max_age_in_days_n']]
|
||
# Relative publish-time labels read from the article page, mapped to the
# number of days before today they stand for.
_RELATIVE_PUBLISH_DAYS = {
    '1周前': 7,
    '6天前': 6,
    '5天前': 5,
    '4天前': 4,
    '3天前': 3,
    '2天前': 2,
    '前天': 2,
    '昨天': 1,
}

_ARTICLE_LINK_XPATH = '//a[contains(@uigs,"article_title_")]'
_ARTICLE_DATE_XPATH = '//div[@class="s-p"]/span[@class="s2"]'


def _page_count(result_nums):
    """Return how many 10-result pages are needed (always at least one)."""
    return max(1, (result_nums + 9) // 10)


def _listing_date(text, today):
    """Convert a result-list date label into a datetime.

    Accepts ``YYYY-M-D`` labels and labels containing ``天`` with a day
    count (e.g. ``3天前``). Anything else falls back to ``today``.
    """
    import re

    parts = str(text).split('-')
    try:
        return datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]))
    except (ValueError, IndexError, OverflowError):
        pass

    if '天' in text:
        match = re.search(r'\d+', text)
        if match:
            try:
                # timedelta crosses month/year boundaries correctly;
                # subtracting from the day-of-month does not.
                return today - datetime.timedelta(days=int(match.group(0)))
            except OverflowError:
                pass
    return today


def _normalize_publish_date(text):
    """Return ``text`` as a string, resolving known relative labels to a date."""
    days_ago = _RELATIVE_PUBLISH_DAYS.get(text)
    if days_ago is None:
        return str(text)
    return str(datetime.date.today() - datetime.timedelta(days=days_ago))


def _element_text(xpath):
    """Return the text of the first element matching ``xpath``, or ''."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except Exception:
        return ''


def _article_content():
    """Return the article body text followed by its image URLs, one per line.

    Returns '' when the body element cannot be read.
    """
    import re

    try:
        body = driver.find_element_by_xpath('//*[@id="js_content"]')
        text = str(body.text).strip()
        image_urls = re.findall(r'<img.*?src="(.*?)"', body.get_attribute('innerHTML'), re.S)
    except Exception:
        return ''
    return '\n'.join([text] + image_urls)


def startCrawl(datas):
    """Crawl Sogou WeChat search for each account and store recent articles.

    Starts Chrome through ``chromedriver.exe`` (killing running Chrome and
    chromedriver processes first via ``taskkill``), waits for the user to
    log in, then for every entry of ``datas`` searches
    ``weixin.sogou.com``, opens each listed article that is recent enough
    and appends one row per article to the ``weixin`` sheet via ``append``.

    Args:
        datas: iterable of ``[wechat_id, max_age_days]`` pairs.
            ``wechat_id`` is used as the search query; articles whose
            listing date is more than ``max_age_days`` days before today
            are skipped.

    Side effects:
        Rebinds the module-level ``driver``; creates ``user.txt`` if it is
        missing; prints progress to stdout.
    """
    global driver

    import os
    import re
    from urllib.parse import quote

    check_extension()

    if not os.path.exists('user.txt'):
        # Create an empty credentials file for the user to fill in.
        with open('user.txt', 'w+') as f:
            f.write('')
    else:
        with open('user.txt', 'r') as f:
            user = f.read().split('\n')
        try:
            print('打码狗账号:' + user[0])
            print('打码狗密码:' + user[1])
        except IndexError:
            print('未找到打码狗账号密码')

    create('weixin', ['标题', '正文', '公众号', '微信号', '发布日期', '链接', '采集日期'])

    # Make sure no earlier browser/driver instance is still running.
    os.system('taskkill /f /im chrome.exe')
    os.system('taskkill /f /im chromedriver.exe')

    option = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path="chromedriver.exe", options=option)
    driver.get('https://weixin.sogou.com')
    time.sleep(2)

    loginBtn = driver.find_element_by_xpath('//a[@id="loginBtn"]')
    if '登录' in loginBtn.text:
        driver.execute_script('arguments[0].click()', loginBtn)
        time.sleep(2)
        # The login dialog lives in an iframe; once no iframe can be found
        # the login is treated as complete.
        while True:
            try:
                driver.find_element_by_xpath('//iframe[1]')
            except Exception:
                print('登录成功!')
                break
            time.sleep(3)
        # Let the page settle after login before navigating away.
        time.sleep(3)

    for data in datas:
        print('微信号:' + str(data))
        now = datetime.datetime.now()
        today = datetime.datetime(now.year, now.month, now.day)

        # str() tolerates numeric ids read from a spreadsheet; quote()
        # keeps characters such as '&' or '#' from breaking the query.
        search_url = ('https://weixin.sogou.com/weixin?type=2&s_from=input&query='
                      + quote(str(data[0])) + '&ie=utf8')
        driver.get(search_url)
        time.sleep(2)
        verify()
        html = driver.page_source

        try:
            result_nums = int(re.search(r'找到约(.*?)条结果', html, re.S).group(1).replace(',', ''))
        except (AttributeError, ValueError):
            # No result counter on the page: still scan the first page.
            result_nums = 0

        for index in range(_page_count(result_nums)):
            if index > 0:
                driver.get(search_url + '&page=' + str(index + 1))
                verify()

            try:
                articles = driver.find_elements_by_xpath(_ARTICLE_LINK_XPATH)
            except Exception:
                articles = []
            try:
                articles_date = driver.find_elements_by_xpath(_ARTICLE_DATE_XPATH)
            except Exception:
                continue

            h0 = driver.current_window_handle
            for article_index in range(len(articles)):
                # A missing or unreadable date label is treated like an
                # unparseable one: the article counts as published today.
                try:
                    date_text = articles_date[article_index].text
                except Exception:
                    date_text = ''
                listed_on = _listing_date(date_text, today)
                if (today - listed_on).days > int(data[1]):
                    continue
                print(str(listed_on))

                # Look the links up again before clicking.
                try:
                    articles = driver.find_elements_by_xpath(_ARTICLE_LINK_XPATH)
                except Exception:
                    articles = []
                if article_index >= len(articles):
                    # The listing changed under us; nothing left to open here.
                    break

                link = articles[article_index]
                article_url = link.get_attribute('href')
                print(article_url)
                driver.execute_script('arguments[0].click()', link)

                # Move to the window the click opened.
                for h in driver.window_handles:
                    if not h == driver.current_window_handle:
                        driver.switch_to.window(h)
                verify()

                title = _element_text('//*[@id="activity-name"]')
                gongzhonghao = _element_text('//*[@id="js_name"]')
                weixinhao = data[0]
                content = _article_content()
                publish_date = _normalize_publish_date(_element_text('//*[@id="publish_time"]'))
                date = datetime.datetime.now().strftime('%Y/%m/%d')

                # Content is capped at 32767 characters -- presumably the
                # spreadsheet cell limit; confirm against Write_File_Excel.
                append('weixin', [title, content[:32767], gongzhonghao, weixinhao,
                                  publish_date, driver.current_url, date])

                driver.close()
                driver.switch_to.window(h0)
                time.sleep(2)
|
||
if __name__ == '__main__':
    # Script entry point: make sure the 'datas' sheet exists with its two
    # header columns, load its rows, and crawl every account listed there.
    input_columns = ['微信号', '距离今天(天)']
    create('datas', input_columns)
    startCrawl(read('datas'))
|