51Testing软件测试论坛

标题: 用python爬取数据出错 [打印本页]

作者: 测试积点老人    时间: 2022-1-7 09:55
标题: 用python爬取数据出错
  1. import pandas as pd
  2. from selenium import webdriver
  3. from selenium.webdriver.chrome.options import Options
  4. from time import sleep
  5. import csv
  6. # 这个爬虫爬取结果的最后几列需要手工处理(可能会多出来几列)
  7. def get_infos(ID):
  8.     driver.get(r"http://192.168.3.252/xhlisweb-inspection_id/XHlisWebReport.aspx")
  9.     sleep(1)
  10.     driver.find_element_by_name("txtoutpatient_id").send_keys(ID)
  11.     driver.find_element_by_name("btnConfirm").click()
  12.     cols = driver.find_elements_by_xpath('''//tr[contains(@onclick, "return btnClick")]''')
  13.     times = len(cols)
  14.     # 思路:挨个去点击一行的病员号,然后获取下方表格的信息
  15.     # print(driver.page_source)
  16.     # cols = driver.find_elements_by_xpath("//td[text()=" + ID + "]")
  17.     # col = cols[4]
  18.     # col_info = col.text.split(' ')[:14]
  19.     # col.click()
  20.     # items = driver.find_elements_by_xpath("//div[@id='report-content']//tbody//tr")[1:]
  21.     # item = items[0]
  22.     infos = []
  23.     for i in range(times):
  24.         driver.get(r"http://192.168.3.252/xhlisweb-inspection_id/XHlisWebReport.aspx")
  25.         sleep(2)
  26.         driver.find_element_by_name("txtoutpatient_id").send_keys(ID)
  27.         driver.find_element_by_name("btnConfirm").click()
  28.         cols = driver.find_elements_by_xpath('''//tr[contains(@onclick, "return btnClick")]''')
  29.         col = cols[i]
  30.         col_info = col.text.split(' ')[:14]
  31.         col.click()
  32.         items = driver.find_elements_by_xpath("//div[@id='report-content']//tbody//tr")[1:]
  33.         for item in items:
  34.             a = item.text.split(' ')
  35.             try:
  36.                 a.remove('')
  37.             except:
  38.                 pass
  39.             # 这里要做点长度判断,如果a的长度大于7,那就截断;如果不够,就填充''
  40.             #if len(a) <= 7:
  41.             #    for i in range(7-len(a)):
  42.             #        a.append('')
  43.             #else:
  44.             #    a = a[:7]
  45.             infos.append([ID] + col_info + a)
  46.     return infos

  47. # start最小为0, end最大为641
  48. start = 200
  49. end = 641
  50. data = pd.read_excel(r"C:\Users\cc\Desktop\资料\数据录入\ALL_raw.xlsx")
  51. IDs = data['登记号'].tolist()[start:end]
  52. # IDs = ["0005248871", '0010610644']
  53. options = Options()
  54. options.binary_location = r"C:\Users\newceshi\Desktop\蒋丽莎病历检查\pzwj\google\chrome.exe"
  55. driver = webdriver.Chrome(r"C:\Users\newceshi\Desktop\蒋丽莎病历检查\pzwj\chromedriver.exe", chrome_options=options)
  56. driver.maximize_window()
  57. ALL = []
  58. for ID in IDs:
  59.     try:
  60.         infos = get_infos(ID)
  61.         ALL += infos
  62.     except:
  63.         pass
  64. headers = ['ID', '序号', '检验单', '病员号', '类型', '送检', '目的', '姓名', '性别', '年龄', '科别', '病区', '工作组', '审核人员', '审核日期', '审核时间', 'NO', '英文名称', '检验项目', '结果', '单位', '状态', '参考值']
  65. with open(r"result_检验_" + str(start) + "_" + str(end) +".csv", 'w', newline='') as f:
  66.     f_csv = csv.writer(f)
  67.     f_csv.writerow(headers)
  68.     for i in ALL:
  69.         f_csv.writerow(i)

  70. sleep(3)
  71. driver.quit()
复制代码
运行结果及报错内容
  1. C:\Users\cc\AppData\Local\Programs\Python\Python39\python.exe D:/Pycharm/data/chaxue4.py
  2. Traceback (most recent call last):
  3.   File "D:\Pycharm\data\chaxue4.py", line 58, in <module>
  4.     data = pd.read_excel(r"C:\Users\cc\Desktop\资料\数据录入\ALL_raw.xlsx")
  5.   File "C:\Users\cc\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
  6.     return func(*args, **kwargs)
  7.   File "C:\Users\cc\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\excel\_base.py", line 364, in read_excel
  8.     io = ExcelFile(io, storage_options=storage_options, engine=engine)
  9.   File "C:\Users\cc\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\excel\_base.py", line 1233, in __init__
  10.     self._reader = self._engines[engine](self._io, storage_options=storage_options)
  11.   File "C:\Users\cc\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\excel\_openpyxl.py", line 521, in __init__
  12.     import_optional_dependency("openpyxl")
  13.   File "C:\Users\cc\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\compat\_optional.py", line 118, in import_optional_dependency
  14.     raise ImportError(msg) from None
  15. ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
  16. Process finished with exit code 1
复制代码
我的解答思路和尝试过的方法

我从别人那搞到的代码,但我电脑上运行的结果是这样,看不懂什么意思



作者: 郭小贱    时间: 2022-1-10 09:44
ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
问题出在这:pandas 读取 .xlsx 文件依赖 openpyxl,执行 pip install openpyxl 安装后重新运行即可。
作者: qqq911    时间: 2022-1-10 11:07
下断点吧,一步一步调试
作者: jingzizx    时间: 2022-1-10 14:34
缺少依赖?




欢迎光临 51Testing软件测试论坛 (http://bbs.51testing.com/) Powered by Discuz! X3.2