关于爬虫selenium的使用问题

测试积点老人 · 发表于 2022-5-6 10:17:13

这是爬虫文件，项目里就这一个爬虫

class FirstSpiderSpider(scrapy.Spider):
    """Spider that requests the Douban Read landing page and prints one title."""

    name = 'first_spider'
    # Must cover the host used in start_urls. The original value was
    # 'movie.douban.com', which does not match read.douban.com and would
    # cause Scrapy's offsite filtering to drop follow-up requests to it.
    allowed_domains = ['read.douban.com']
    start_urls = ['https://read.douban.com/?dcm=original-nav']

    def parse(self, response):
        """Print the text of the first node matched by the title XPath.

        ``extract_first()`` returns ``None`` when the XPath matches nothing,
        in which case ``None`` is printed.
        """
        title = response.xpath(
            '//*[@id="react-root"]/div/div/div[3]/div/div[2]/div/div/div[2]'
            '/div/div[1]/div[2]/h4/a/span/text()'
        ).extract_first()
        print(title)

复制代码

有一个下载中间件，而且启用了

class LolDownloaderMiddleware:
    """Downloader middleware that renders each request with Selenium/PhantomJS."""

    def process_request(self, request, spider):
        """Load the request URL in a browser and return the rendered page.

        Opens the URL in PhantomJS, clicks the first link under the third
        ``div`` of ``#react-root``, waits one second, and wraps the resulting
        page source in an ``HtmlResponse``. Returning a response here means
        the page is not fetched again by Scrapy's own downloader.

        Any exception raised by Selenium (for example when the element is
        not found) propagates to the caller; the browser is shut down first.
        """
        url = request.url
        # Start Selenium.
        driver = webdriver.PhantomJS(executable_path=r'D:\tool\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        try:
            driver.get(url)
            c = driver.find_element(By.XPATH, '//*[@id="react-root"]/div/div/div[3]/a[1]')
            c.click()
            # NOTE(review): this sleep blocks the calling thread for the
            # whole wait; kept as-is to preserve the original timing.
            time.sleep(1)
            data = driver.page_source  # Grab the rendered page source.
        finally:
            # quit() ends the whole browser session and its process; close()
            # only closes the current window, so a PhantomJS process was left
            # behind for every request, and on any exception as well.
            driver.quit()
        return HtmlResponse(url=url, body=data, encoding='utf-8', request=request)