python 用 Selenium 爬取网页，运行后浏览器地址栏只显示 data:,
我在练习一本爬虫书上 Selenium 的实战，但写完代码运行后只弹出一个空白网页，没有任何输出。代码如下：

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urljoin
from selenium.webdriver import ChromeOptions
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s: %(message)s')
# Template for list pages; filled in via INDEX_URL.format(page=n).
# BUG FIX: the original had the braces percent-encoded ('%7Bpage%7D', a
# copy-paste from a browser address bar), so str.format() found no '{page}'
# placeholder and left the URL unchanged — every request hit the same
# invalid address instead of pages 1..TOTAL_PAGE.
INDEX_URL = 'https://spa2.scrape.center/page/{page}'
# Seconds WebDriverWait polls before raising TimeoutException.
TIME_OUT = 10
# Number of index pages the site exposes.
TOTAL_PAGE = 10
# Chrome anti-automation-detection setup: hide the "Chrome is being
# controlled by automated test software" infobar and the automation extension.
option=ChromeOptions()
option.add_experimental_option('excludeSwitches',['enable-automation'])
option.add_experimental_option('useAutomationExtension',False)
browser=webdriver.Chrome(options=option)
# Inject a script into every new document (before page scripts run) that
# masks navigator.webdriver, which sites use to detect Selenium.
browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',{
'source':'Object.defineProperty(navigator,"webdriver",{get:() => undefined})'
})
# Shared explicit-wait helper reused by every scrape_page() call.
wait=WebDriverWait(browser,TIME_OUT)
def scrape_page(url,condition,locator):
    """Navigate to *url* and block until ``condition(locator)`` is satisfied.

    A TimeoutException is logged with its traceback instead of being
    re-raised, so the crawl continues with the next page.
    """
    logging.info('scraping %s',url)
    try:
        browser.get(url)
        wait.until(condition(locator))
    except TimeoutException:
        # logging.exception == logging.error(..., exc_info=True)
        logging.exception('error occurred while scraping %s',url)
def scrape_index(page):
    """Load index page number *page* and wait until the movie list renders."""
    target = INDEX_URL.format(page=page)
    scrape_page(
        target,
        condition=EC.visibility_of_all_elements_located,
        locator=(By.CSS_SELECTOR, '#index .el-row'),
    )
def parse_index():
    """Yield the absolute detail-page URL for every movie on the current page."""
    anchors = browser.find_elements(by=By.CSS_SELECTOR, value='#index .el-row .name')
    for anchor in anchors:
        yield urljoin(INDEX_URL, anchor.get_attribute('href'))
def main():
try:
for page in range(1,TOTAL_PAGE+1):
scrape_index(page)
detail_urls=parse_index()
logging.info('details urls %s',list(detail_urls))
finally:
browser.close()运行出来结果就是弹出个网页书上正常运行可以打印输出每部电影详情页的url
先检查 ChromeDriver 驱动版本是否与本机 Chrome 浏览器匹配，并检查运行环境。另外注意：这段代码只定义了 main() 却从未调用它，所以浏览器打开后停在空白的 data:, 页面——在脚本末尾加上 `if __name__ == '__main__': main()` 即可；同时 INDEX_URL 里的 `{page}` 被转义成了 `%7Bpage%7D`，format 不会替换，需要改回大括号。
页:
[1]