51Testing软件测试论坛
标题:
Python数据采集Selenium、PantomJS浅谈
[打印本页]
作者:
海鸥一飞
时间:
2018-2-7 16:46
标题:
Python数据采集Selenium、PantomJS浅谈
一直以来我觉得用在运维的Selenium、PantomJS是一个重器,
不到万不得已的时候不要祭出这个大杀器,
但是涉及到JavaScript及Ajax渲染的时候,Requests就完全懵逼了!
最近回过头来重新审视这货,
这个重器用反倒轻便了很多。
1.安装Selenium、PantomJS
Selenium可以直接通过pip安装,PantomJS则时一个exe可执行文件,需要下载解压。在使用的时候指定exe的绝对路径即可。
2.Selenium、PantomJS基本设置
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = DesiredCapabilities.PHANTOMJS
dcap[ "phantomjs.page.settings.userAgent"] = "Mozilla / 4.0(Windows NT 10.0; Win64;x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome/51.0.2704.79 Safari/ 537.36Edge/14.14393"
# 请求头不一样,自适应的窗口不一样,卧槽,坑爹!
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.set_page_load_timeout(10)
driver.set_script_timeout(10) # 设置页面退出时间,没有必要等一个网页加载完了采集
# 采集网页源码
try:
driver.get(inurl)
content = driver.page_source
# print(content)
time.sleep(1)
except:
driver.execute_script('window.stop()')
driver.close()
复制代码
3.Selenium、PantomJS基本操作
如果你的网络和机子足够好,基本上就不用等待网页渲染,
否则,还需要等待,如果用time.sleep(),则有点笨拙,
#等待页面渲染完成
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
...
try:
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
# 等某个标签元素出来,不见鸭子不撒鹰。
finally: # 撒鹰
print(driver.find_element_by_id("content").text)
driver.close()
复制代码
或者用
try:
elem == driver.find_element_by_tag_name("html")
# 抛出StaleElementReferenceException异常说明elem元素已经消失了,也就说明页面已经跳转了。
except StaleElementReferenceException:
return
复制代码
其他driver内置函数,可以通过查看源代码或者在pycharm提示获取。
4.Xpath定位Html标签
1.id定位:find_element_by_id(self, id_)
2.name定位:find_element_by_name(self, name)
3.class定位:find_element_by_class_name(self, name)
4.tag定位:find_element_by_tag_name(self, name)
5.link定位:find_element_by_link_text(self, link_text)
6.partial_link定位find_element_by_partial_link_text(self, link_text)
7.xpath定位:find_element_by_xpath(self, xpath)
8.css定位:find_element_by_css_selector(self, css_selector)
9.id复数定位find_elements_by_id(self, id_)
10.name复数定位find_elements_by_name(self, name)
11.class复数定位find_elements_by_class_name(self, name)
12.tag复数定位find_elements_by_tag_name(self, name)
13.link复数定位find_elements_by_link_text(self, text)
14.partial_link复数定位find_elements_by_partial_link_text(self, link_text)
15.xpath复数定位find_elements_by_xpath(self, xpath)
16.css复数定位find_elements_by_css_selector(self, css_selector
17.find_element(self, by='id', value=None)
18.find_elements(self, by='id', value=None)
复制代码
其中element方法定位到是是单数,是直接定位到元素;elements方法是复数,这个学过英文的
都知道,定位到的是一组元素,返回的是list队列。可参照Re函数中的findall理解。
5.完整例子
这个例子属于标准化操作,在实际中可以适当简化,并结合上面的Xpath定位完成。
from selenium import webdriver
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe', desired_capabilities=dcap)
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
time.sleep(3)
print(driver.find_element_by_id("content").text)
driver.close()
#设置PHANTOMJS的USER-AGENT
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
driver = webdriver.PhantomJS(executable_path='./phantomjs.exe', desired_capabilities=dcap)
driver.get("http://dianping.com/")
cap_dict = driver.desired_capabilities #查看所有可用的desired_capabilities属性。
for key in cap_dict:
print('%s: %s' % (key, cap_dict[key]))
print(driver.current_url)
driver.quit()
#等待页面渲染完成
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://pythonscraping.com/pages/javascript/ajaxDemo.html")
try:
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loadedButton")))
finally:
print(driver.find_element_by_id("content").text)
driver.close()
#处理Javascript重定向
from selenium import webdriver
import time
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
def waitForLoad(driver):
elem = driver.find_element_by_tag_name("html")
count = 0
while True:
count += 1
if count > 20:
print("Timing out after 10 seconds and returning")
return
time.sleep(.5)
try:
elem == driver.find_element_by_tag_name("html")
except StaleElementReferenceException:
return
driver = webdriver.PhantomJS(executable_path=r'C:\Users\taojw\Desktop\pywork\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
######
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver import ActionChains
driver = webdriver.PhantomJS(executable_path='phantomjs/bin/phantomjs')
driver.get('http://pythonscraping.com/pages/javascript/draggableDemo.html')
print(driver.find_element_by_id("message").text)
element = driver.find_element_by_id("draggable")
target = driver.find_element_by_id("div2")
actions = ActionChains(driver)
actions.drag_and_drop(element, target).perform()
print(driver.find_element_by_id("message").text)
#######
#截屏
driver.get_screenshot_as_file('tmp/pythonscraping.png')
####
#登陆知乎,然后能自动点击页面下方的“更多”,以载入更多的内容
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
import time
import sys
driver = webdriver.PhantomJS(executable_path='C:\Users\Gentlyguitar\Desktop\phantomjs-1.9.7-windows\phantomjs.exe')
driver.get("http://www.zhihu.com/#signin")
#driver.find_element_by_name('email').send_keys('your email')
driver.find_element_by_xpath('//input[@name="password"]').send_keys('your password')
#driver.find_element_by_xpath('//input[@name="password"]').send_keys(Keys.RETURN)
time.sleep(2)
driver.get_screenshot_as_file('show.png')
#driver.find_element_by_xpath('//button[@class="sign-button"]').click()
driver.find_element_by_xpath('//form[@class="zu-side-login-box"]').submit()
try:
#等待页面加载完毕
dr=WebDriverWait(driver,5)
dr.until(lambda the_driver:the_driver.find_element_by_xpath('//a[@class="zu-top-nav-userinfo "]').is_displayed())
except:
print('登录失败')
sys.exit(0)
driver.get_screenshot_as_file('show.png')
#user=driver.find_element_by_class_name('zu-top-nav-userinfo ')
#webdriver.ActionChains(driver).move_to_element(user).perform() #移动鼠标到我的用户名
loadmore=driver.find_element_by_xpath('//a[@id="zh-load-more"]')
actions = ActionChains(driver)
actions.move_to_element(loadmore)
actions.click(loadmore)
actions.perform()
time.sleep(2)
driver.get_screenshot_as_file('show.png')
print(driver.current_url)
print(driver.page_source)
driver.quit()
复制代码
作者:
梦想家
时间:
2018-5-14 16:48
欢迎光临 51Testing软件测试论坛 (http://bbs.51testing.com/)
Powered by Discuz! X3.2