最近用selenium写了一个小爬虫,需要循环在搜索框内输入内容,然后模拟点击搜索,在跳转后的新页面获取数据,之后重复上述步骤,直到搜索结束。爬虫刚开始运行的时候速度还可以,大约一秒一个页面;但随着运行时间变长,速度越来越慢。这里贴出代码: - browser = webdriver.Chrome(executable_path="D:\GeckoDriver\chromedriver")
# Crawl qcc.com search results with Selenium: log in by hand once, replay the
# session cookies in a second browser, then search every name in companyNames
# and record the first hit.
#
# NOTE(review): the pasted original lost its indentation; the nesting below is
# reconstructed from the statement order -- confirm against the real script.

# --- Step 1: open the site and let the operator log in manually -------------
browser.get("https://www.qcc.com/")
# Maximize the browser window.
browser.maximize_window()
login = browser.find_element(By.XPATH, '/html/body/header/div/ul/li[10]/a')
login.click()
# Block until the operator confirms the manual login is finished.
input("登录后请按y")
cookies = browser.get_cookies()
browser.quit()

# --- Step 2: start a second browser with chrome_options, restore cookies ----
# Raw string: "\G" and "\c" are invalid escape sequences in a normal literal.
browser = webdriver.Chrome(
    executable_path=r"D:\GeckoDriver\chromedriver", options=chrome_options
)
# Load the site, add the saved cookies, then reload with them in place.
browser.get("https://www.qcc.com/")
for cookie in cookies:
    browser.add_cookie(cookie)
browser.get("https://www.qcc.com/")
browser.maximize_window()

# --- Step 3: run one search from the home page with a random company name ---
home_search_box = browser.find_element(By.CSS_SELECTOR, "#searchkey")
# random.randint(0, len(...)) is inclusive at both ends and could index one
# past the end of the list; randrange excludes the upper bound.
home_search_box.send_keys(companyNames[random.randrange(len(companyNames))])
qccbutton = browser.find_element(By.CSS_SELECTOR, ".index-searchbtn")
sleep(0.5)
qccbutton.click()
qccbutton = browser.find_element(By.CSS_SELECTOR, ".input-group-btn")
sleep(0.5)
qccbutton.click()

# --- Step 4: search every company name and collect the first result ---------
pbar = tqdm(companyNames)
for companyName in pbar:
    browser.forward()
    # Wait until the search box is present before acting (saves time compared
    # with a fixed sleep), and reuse the element the wait returns.
    search_box = WebDriverWait(browser, 15, 0.5).until(
        EC.presence_of_element_located((By.ID, "searchKey"))
    )
    search_box.clear()
    search_box.send_keys(companyName)
    browser.find_element(By.CSS_SELECTOR, ".btn-primary").click()

    # NOTE(review): page_source is read immediately after the click with no
    # wait for the new results; it may still be the previous page -- confirm.
    html = etree.HTML(browser.page_source)
    cookies = browser.get_cookies()
    try:
        # Both XPath expressions are written without tbody.
        found_name = html.xpath(
            'normalize-space(/html/body/div[1]/div[2]/div[2]/div[3]/div/div[2]/div/table/tr[1]/td[3]/div/a[1])')
        urls = html.xpath(
            '/html/body/div[1]/div[2]/div[2]/div[3]/div/div[2]/div/table/tr[1]/td[3]/div/a[1]/@href')
        getTaxpayerNumber(urls, cookies)
    except Exception:
        # Best-effort: record placeholders and continue with the next company.
        getCompanyNames.append("无法搜索到公司")
        taxpayerNumber.append("无法查到税号")
    else:
        # Append the name only after the lookup succeeded, so a failure does
        # not leave both the real name and the placeholder in the list.
        getCompanyNames.append(found_name)

    # Reset the browser's cookies to the snapshot taken above.
    # NOTE(review): assumed to be inside the loop; verify against the script.
    browser.delete_all_cookies()
    for cookie in cookies:
        browser.add_cookie(cookie)
复制代码
|