抓取数据时,经常遇到有的数据是通过ajax异步调取的。如何通过selenium获取网址所加载的全部请求url地址呢?即我们打开开发者工具后,network面板中记录的请求url列表。可以参考下面代码:
# -*- coding=utf-8 -*-
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
url = "http://www.phper163.com/"

# Keys of every record in the final list, in output order.  "netloc" belongs
# to the record layout but nothing in this script fills it in, so it is
# always the empty string.
RECORD_FIELDS = (
    "url",
    "method",
    "status",
    "statusText",
    "type",
    "initiator",
    "netloc",
    "remoteIPAddress",
    "remotePort",
)


def _new_record(**values):
    """Return a record holding every RECORD_FIELDS key, "" where not given."""
    record = dict.fromkeys(RECORD_FIELDS, "")
    record.update(values)
    return record


def _pair_key(event):
    """Return the key used to pair a request event with its response event.

    The URL is part of the key so that events sharing a requestId but
    carrying different URLs are never merged into one record.
    """
    return (event.get("requestId"), event["url"])


def parse_performance_log(entries):
    """Extract the network events from Chrome performance-log entries.

    Args:
        entries: iterable of log entries as returned by
            ``driver.get_log('performance')``; each entry's ``"message"``
            value is a JSON string wrapping one DevTools event.

    Returns:
        A ``(sent, received)`` tuple of lists of dicts, in log order.
        ``sent`` holds one dict per ``Network.requestWillBeSent`` event,
        ``received`` one per ``Network.responseReceived`` event.  All other
        events are ignored.
    """
    sent = []
    received = []
    for entry in entries:
        event = json.loads(entry["message"])["message"]
        method = event.get("method")
        if method == "Network.requestWillBeSent":
            params = event["params"]
            request = params["request"]
            sent.append({
                "requestId": params.get("requestId"),
                "url": request["url"],
                "method": request["method"],
                "initiator": params.get("initiator", {}).get("type", ""),
                # The resource type is not always present on this event.
                "type": params.get("type", ""),
            })
        elif method == "Network.responseReceived":
            params = event["params"]
            response = params["response"]
            received.append({
                "requestId": params.get("requestId"),
                "url": response["url"],
                # Address and port can be missing; default to "" instead of
                # catching (and printing) a broad exception.
                "remoteIPAddress": response.get("remoteIPAddress", ""),
                "remotePort": response.get("remotePort", ""),
                "status": response["status"],
                "statusText": response["statusText"],
                "type": params.get("type", ""),
            })
    return sent, received


def merge_requests_and_responses(sent, received):
    """Combine request and response events into one record per request.

    Each request is paired with the earliest not-yet-used response that has
    the same requestId and URL, so a URL fetched several times produces one
    record per fetch.  Requests without a response keep empty response
    fields; responses without a request are appended afterwards, in log
    order, with empty request fields.

    Args:
        sent: request dicts as produced by ``parse_performance_log``.
        received: response dicts as produced by ``parse_performance_log``.

    Returns:
        A list of dicts, each containing exactly the RECORD_FIELDS keys.
    """
    # Index the responses once so pairing is a dict lookup per request
    # rather than a scan over every response.
    waiting = {}
    for index, response in enumerate(received):
        waiting.setdefault(_pair_key(response), []).append(index)

    claimed = set()
    records = []
    for request in sent:
        queue = waiting.get(_pair_key(request))
        if not queue:
            records.append(_new_record(
                url=request["url"],
                initiator=request["initiator"],
                method=request["method"],
                type=request["type"],
            ))
            continue
        index = queue.pop(0)
        claimed.add(index)
        response = received[index]
        records.append(_new_record(
            url=request["url"],
            initiator=request["initiator"],
            method=request["method"],
            # Prefer the type reported with the request; use the response's
            # when the request event did not carry one.
            type=request["type"] or response["type"],
            remoteIPAddress=response["remoteIPAddress"],
            remotePort=response["remotePort"],
            status=response["status"],
            statusText=response["statusText"],
        ))

    # Responses that no request claimed are still reported.
    for index, response in enumerate(received):
        if index not in claimed:
            records.append(_new_record(
                url=response["url"],
                remoteIPAddress=response["remoteIPAddress"],
                remotePort=response["remotePort"],
                status=response["status"],
                statusText=response["statusText"],
                type=response["type"],
            ))
    return records


def collect_network_requests(page_url, settle_seconds=5):
    """Load *page_url* in Chrome and return the network requests it made.

    Args:
        page_url: address of the page to open.
        settle_seconds: how long to wait after the page load so that
            asynchronous (ajax) requests get a chance to be logged.

    Returns:
        The list of records built by ``merge_requests_and_responses``.
    """
    chrome_options = Options()
    # Optional switches, e.g. for running without a display:
    # chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--disable-gpu')
    # chrome_options.add_argument('--headless')
    chrome_options.add_experimental_option('w3c', False)
    # Ask the driver to record the "performance" log, which is where the
    # Network.* events are read from below.
    caps = {
        'loggingPrefs': {
            'performance': 'ALL',
        }
    }
    # NOTE(review): `desired_capabilities`, 'loggingPrefs' and the 'w3c'
    # switch look like Selenium 3-era settings; confirm they are accepted by
    # the installed Selenium/chromedriver versions.
    driver = webdriver.Chrome(desired_capabilities=caps, options=chrome_options)
    try:
        driver.get(page_url)
        time.sleep(settle_seconds)
        entries = driver.get_log('performance')
    finally:
        # Always end the browser session, even when loading the page or
        # reading the log fails.
        driver.quit()
    sent, received = parse_performance_log(entries)
    return merge_requests_and_responses(sent, received)


if __name__ == "__main__":
    return_list = collect_network_requests(url)
    print(return_list)

# How it works: start Selenium with its log collection enabled, gather all
# of the log entries, pick out the network ones, and reshape them to pull
# out the data that is needed.
