TA的每日心情 | 无聊 4 天前 |
---|
签到天数: 530 天 连续签到: 2 天 [LV.9]测试副司令
|
1测试积点
本人正在练习通过selenium爬取上市公司年报,试了很久才写出如下效果,但其中的selenium定位语句,实在不知道怎么优化,试了很多次,每次都报错,所以期待有高人可以拨冗指点,谢谢!
-
- ```python
-
- import re
- from selenium import webdriver
- from selenium.webdriver import ActionChains
- from selenium.webdriver.chrome.options import Options
- import requests
-
-
def main(out_fold):
    """Download the PDF behind every report listed on the listing page.

    Opens the report listing page in Chrome, collects the title and link of
    each listed report, then visits each link, reads the ``src`` of the
    embedded PDF element and saves the document as ``<title>.pdf`` inside
    ``out_fold``.

    Args:
        out_fold: Directory the PDFs are written to. It must already exist.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the report
            table or the embedded PDF element cannot be located.
        requests.HTTPError: If a PDF download answers with an error status.
    """
    import os

    from selenium.webdriver.common.by import By

    url = "https://gu.qq.com/sh600018/gp/jbnb/"
    # Single XPath equivalent to: grandparent of the 'yk_on' link -> 3rd child
    # -> 1st child -> 2nd child. Doing it in one lookup also lets the implicit
    # wait cover the whole path instead of only the first hop.
    # NOTE(review): presumably this node is the PDF <iframe>; if the page only
    # has one, "//iframe" may be a simpler locator - confirm against the DOM.
    pdf_xpath = "(//a[@class='yk_on']/../..)[1]/*[3]/*[1]/*[2]"

    chrome_options = Options()
    chrome_options.add_argument('--log-level=3')
    chrome_options.add_argument('--disable-gpu')  # disable GPU
    chrome_options.add_argument('--mute-audio')  # mute audio
    # NOTE: executable_path is kept for Selenium 3 compatibility; Selenium
    # 4.10+ expects a Service object instead.
    driver = webdriver.Chrome(options=chrome_options, executable_path=r"E:/python_work/BrowseDriver/chromedriver.exe")
    try:
        # The implicit wait is a session-wide setting, so set it once.
        driver.implicitly_wait(2)
        driver.get(url)

        # The report table is the grandparent of the header cell.
        table = driver.find_element(By.XPATH, "//strong[text()='公告标题']/../..")

        # Collect (title, href) pairs before navigating away: the row elements
        # become stale as soon as the driver loads another page.
        reports = []
        for row in table.find_elements(By.XPATH, './*'):
            first_cells = row.find_elements(By.XPATH, './*[1]')
            if not first_cells:
                continue
            title = first_cells[0].text
            href = first_cells[0].get_attribute("href")
            # Skip the header row and anything that is not a real link.
            if title == "公告标题" or not href:
                continue
            reports.append((title, href))

        for title, href in reports:
            driver.get(href)
            pdf_src = driver.find_element(By.XPATH, pdf_xpath).get_attribute("src")

            response = requests.get(pdf_src, timeout=60)
            # Fail loudly rather than saving an HTML error page as a .pdf.
            response.raise_for_status()

            # Titles may contain characters that are illegal in file names.
            safe_title = re.sub(r'[\\/:*?"<>|]', "_", title).strip()
            filename_full_path = os.path.join(out_fold, safe_title + ".pdf")
            with open(filename_full_path, "wb") as f:
                f.write(response.content)
    finally:
        # Always release the browser process, even when a step above fails.
        driver.quit()

    print("over")
-
-
if __name__ == '__main__':
    # Every backslash is escaped: the original "\财" was an invalid escape
    # sequence (SyntaxWarning on Python 3.12+). The runtime value is unchanged.
    # The trailing separator is kept so the value still works as a path prefix.
    out_fold = 'D:\\data_work\\财报分析模型\\财报\\tt\\'
    main(out_fold)
-
-
复制代码
|
|