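"""Scrape a web page and forward the extracted text to a message server.

selenium(): starts a headless browser with Selenium, visits the target page,
extracts the dynamically loaded content, builds a text message and sends it
through send_request().

reptile(): fetches the page with a plain HTTP request, parses the static data,
builds a text message and calls send_request() to send it.

send_request(): sends the generated text message to the configured message
server.
"""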
import requests
from fake_useragent import UserAgent
import time
import random
from lxml import etree
from selenium import webdriver
# Request headers used when fetching the target page.
ua = UserAgent()
headers = {
    'user-agent': ua.random  # random User-Agent, chosen once when the module loads
}
# Page to scrape; read by both selenium() and reptile().
url = 'https://xxx.com'
# Message-server endpoint and the base query parameters sent with every message.
msg_url = 'http://aaa.com'
params = {
    'sendkey': 'abc'
}
def selenium():
    """Scrape the rendered table with headless Chrome and send the result.

    Opens the module-level ``url`` in a headless Chrome session, waits a
    fixed three seconds for the page to finish loading, then parses the
    rendered HTML with lxml. One line is produced per table row that has a
    link in its second cell: the text of the first cell followed by the link
    text. The assembled message is printed and passed to ``send_request``.

    The browser is always shut down, even if loading the page raises.
    """
    row_xpath = '//*[@id="page"]/div[2]/div[2]/div[1]/div[2]/div/div[1]/table/tbody/tr'

    # Start the headless browser.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)  # fixed wait so dynamically loaded content is present
        html_content = driver.page_source
    finally:
        # Quit in ``finally`` so a failed page load does not leave a Chrome
        # process behind; the HTML snapshot is all that is needed afterwards.
        driver.quit()

    tree = etree.HTML(html_content)
    lines = ['爬取内容:']
    for row in tree.xpath(row_xpath):
        # Join the text nodes directly. Slicing ``str(list)`` would leak
        # repr escapes and "', '" separators into the message.
        num = ''.join(row.xpath('td[1]/text()'))
        text = row.xpath('td[2]/a/text()')
        if text:  # skip rows without a link in the second cell
            lines.append(num + ''.join(text))

    allText = '\n'.join(lines) + '\n'
    print(allText)
    send_request(allText)
# Scrape the page with a plain HTTP request.
def reptile(max_rows=15):
    """Fetch the page over plain HTTP, extract the table rows and send them.

    Requests the module-level ``url`` with the module-level ``headers``,
    decodes the body using the encoding detected by ``requests`` and parses
    it with lxml. One line is produced per table row that has a link in its
    second cell: the text of the first cell followed by the first text node
    of the link. The assembled message is printed and passed to
    ``send_request``.

    Args:
        max_rows: Maximum number of table rows to read, counted from the
            top. Defaults to 15, the number of rows previously hard-coded.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not respond within 10 seconds.
    """
    row_xpath = '//*[@id="page"]/div[2]/div[2]/div[1]/div[2]/div/div[1]/table/tbody/tr'

    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Use the encoding detected from the body rather than the header default.
    response.encoding = response.apparent_encoding
    tree = etree.HTML(response.text)

    lines = ['爬取内容:']
    # Iterate over the rows that actually exist instead of indexing tr[1..15],
    # which raised IndexError for short tables or rows without a link.
    for row in tree.xpath(row_xpath)[:max_rows]:
        num = ''.join(row.xpath('td[1]/text()'))
        titles = row.xpath('td[2]/a/text()')
        if titles:  # only rows where link text was extracted
            lines.append(num + titles[0])

    allText = '\n'.join(lines) + '\n'
    print(allText)
    send_request(allText)
# Send the message to the message server.
def send_request(content):
    """Send ``content`` to the message server at ``msg_url``.

    The message is sent as the ``text`` query parameter of a GET request,
    prefixed with ``'Msg:\\n'``, together with the module-level ``params``.
    The outcome is reported on stdout; failures are printed, not raised, so
    a delivery problem never aborts the caller.

    Args:
        content: Text to deliver.
    """
    # Build a per-call query dict so the shared module-level ``params`` is
    # not modified as a side effect of sending.
    query = {**params, 'text': 'Msg:\n' + content}
    try:
        # A timeout keeps an unresponsive server from blocking indefinitely.
        response = requests.get(msg_url, params=query, timeout=10)
    except Exception as e:
        # Best-effort delivery: report the failure and carry on.
        print('请求发送失败', e)
        return

    if response.status_code == 200:
        print('请求发送成功')
    else:
        print(f'请求发送失败,状态码:{response.status_code}')
# Run the scrape. The Selenium (headless browser) variant is the active one.
selenium()
# Alternative: the plain-HTTP variant, currently disabled.
# reptile()







Comments | NOTHING