51Testing软件测试论坛

标题: selenium模拟登录及爬取信息 [打印本页]

作者: 测试积点老人    时间: 2022-9-1 10:06
标题: selenium模拟登录及爬取信息
怎么定位“下一页”按钮，并在翻页后继续爬取？
  1. import re
  2. from selenium import webdriver
  3. from selenium.webdriver.common.by import By
  4. from selenium.webdriver.support.ui import WebDriverWait
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from pyquery import PyQuery as pq
  7. from selenium.webdriver import ActionChains
  8. import time
  9. from lxml import etree
  10. driver = webdriver.Chrome()
  11. driver.get("https://www.taobao.com/")
  12. driver.find_element(by=By.XPATH,value='/html/body/div[3]/div[2]/div[2]/div[2]/div[5]/div/div[2]/div[1]/a[1]').click()
  13. time.sleep(6)
  14. handles = driver.window_handles
  15. driver.switch_to.window(handles[-1])
  16. driver.find_element(by=By.ID, value='fm-login-id').send_keys('1')
  17. driver.find_element(by=By.ID, value='fm-login-password').send_keys('1')
  18. driver.find_element(by=By.XPATH,value='/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form/div[4]/button').click()
  19. time.sleep(6)
  20. driver.switch_to.frame('baxia-dialog-content')
  21. el1 = driver.find_element(by=By.XPATH, value='//*[@id="nc_1_n1z"]')
  22. print(el1.size['width'])
  23. print(el1.size['height'])
  24. el2 = driver.find_element(by=By.XPATH, value='//*[@id="nc_1__scale_text"]')
  25. print(el2.size['width'])
  26. print(el2.size['height'])
  27. time.sleep(2)
  28. driver.maximize_window()
  29. chains = ActionChains(driver)
  30. chains.drag_and_drop_by_offset(el1, el2.size['width'], -el2.size['height'])
  31. # 事件提交
  32. chains.perform()
  33. # driver.close()
  34. # 退出浏览器
  35. # driver.quit()
  36. wait = WebDriverWait(driver, 10)



  37. def search():


  38.     try:
  39.         input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
  40.         submit = wait.until(
  41.             EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
  42.         input.send_keys('美食')
  43.         submit.click()
  44.         total = wait.until(
  45.             EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))

  46.         get_products()

  47.         return total.text
  48.         driver.find_element(by=By.XPATH,
  49.                             value='//*[@id="mainsrp-pager"]/div/div/div/ul/li[8]/a/span[1]').click()

  50.     except:
  51.         search()






  52. def get_products():
  53.     # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-itemlist.items.item')))#加载宝贝信息并等待
  54.     html = driver.page_source
  55.     doc = pq(html)
  56.     items = doc('#mainsrp-itemlist .items .item').items()  # 得到所有宝贝的内容
  57.     for item in items:
  58.         product = {
  59.             'image': item.find('.pic .img').attr('data-src'),  # 图片链接
  60.             'price': item.find('.price').text(),  # 商品价格
  61.             'deal': item.find('.deal-cnt').text()[:-3],  # 付款人数,-3是为了去掉人付款这几个字
  62.             'title': item.find('.title').text(),  # 商品名称
  63.             'shop': item.find('.shop').text(),  # 店铺名称
  64.             'location': item.find('.location').text()
  65.         }
  66.         print(product)





  67. def main():


  68.     total = search()
  69.     # 用来写csv文件的标题
  70.     start_csv = True

  71.     #total1 = int(re.compile('(\d+)').search(total).group(1))  # 转换为数值型
  72.     #print(total1)




  73. if __name__ == '__main__':
  74.        main()

复制代码
运行结果及报错内容

只能打印第一页内容

我想要达到的结果

正常点击下一页,并继续爬取,直到全部爬取完



作者: qqq911    时间: 2022-9-2 11:50
代码中要遍历
作者: jingzizx    时间: 2022-9-2 14:07
没有操作吗




欢迎光临 51Testing软件测试论坛 (http://bbs.51testing.com/) Powered by Discuz! X3.2