我有一台 Ubuntu Linux 服务器,想在上面每小时运行一次我的爬虫程序(用到了 selenium 和 chromedriver)。程序会打开不同的产品网页(大约 1000 个),输入不同的邮编(100 个),然后爬取价格。
但当我在服务器上运行了几个小时后,服务器的管理人员联系我,说我的程序占用了大量的 /tmp 空间(free space: / 3765 MB (53% inode=70%): /home 699 MB (81% inode=99%): /opt 1169 MB (43% inode=99%): /tmp 25 MB (1% inode=55%): /usr/local 819 MB (95% inode=99%): /var 5957 MB (78% inode=99%):)。请问这是怎么回事?这是我的代码: - data = read_csv("C:\\Users\\12987\\desktop\\zipcode\\zc.csv")
- # Convert the 'Zipcode' column of `data` (the CSV loaded above) to a plain list; the scraper iterates it once per product URL.
- zipCodeList = data['Zipcode'].tolist()
-
# NOTE(review): both paths below are the Windows paths from the original
# script. On the Ubuntu server they must point at the Linux chromedriver
# binary and at a writable output location.
CHROMEDRIVER_PATH = r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe'
OUTPUT_CSV_PATH = r'C:\Users\12987\PycharmProjects\python\Network\priceingAlgoriCoding\export_Target_dataframe.csv'

IMPLICIT_WAIT_SECONDS = 10
MAX_WORKERS = 10
RUN_INTERVAL_SECONDS = 3600

# Timestamps are recorded in Europe/London local time. Built once instead of
# once per zip code.
_LONDON_TZ = pytz.timezone('Europe/London')


def _extract_tcin_upc(page_source):
    """Return ``(tcin, upc)`` lists parsed from the product page HTML.

    Each list holds the text of every ``div`` under the specifications
    container that starts with "TCIN" / "UPC". Both lists are empty when the
    container is not present in the page.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    container = soup.find("div", {"class": "styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight"})
    if container is None:
        return [], []
    texts = [node.text for node in container.find_all("div")]
    tcin = [text for text in texts if text.startswith("TCIN")]
    upc = [text for text in texts if text.startswith("UPC")]
    return tcin, upc


def _scrape_zipcode(wd, zipcode):
    """Select the store for ``zipcode`` on the open product page and return one row.

    The row is ``[name, price, zipcode, tcin, upc, timestamp]``.
    """
    # Open the "My Store" overlay.
    wd.find_element(by=By.XPATH, value="//*[@id='web-store-id-msg-btn']/div[2]/div").click()
    sleep(0.5)

    # Enter the zip code.
    zip_input = wd.find_element(by=By.XPATH, value="//*[@id='zip-or-city-state']")
    zip_input.clear()
    zip_input.send_keys(zipcode)

    # Run the store lookup.
    wd.find_element(by=By.XPATH, value="//*[@id='overlay-1']/div[2]/div[1]/div/div[3]/div[2]/button").click()
    sleep(0.5)

    # Pick the first store in the result list.
    wd.find_element(by=By.XPATH, value="//*[@id='overlay-1']/div[2]/div[3]/div[2]/div[1]/button").click()

    name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
    price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
    timestamp = datetime.now(_LONDON_TZ).strftime("%Y-%m-%d %H:%M:%S")

    # "Show more" must be clicked before the TCIN and UPC appear in the page.
    show_more_xpath = '//*[@id="tabContent-tab-Details"]/div/button'
    WebDriverWait(wd, 5).until(EC.presence_of_element_located((By.XPATH, show_more_xpath)))
    show_more = wd.find_element(by=By.XPATH, value=show_more_xpath)
    sleep(2)
    show_more.click()
    tcin, upc = _extract_tcin_upc(wd.page_source)

    # Scroll back to the top so the "My Store" button is reachable for the
    # next zip code.
    wd.find_element(by=By.TAG_NAME, value='body').send_keys(Keys.CONTROL + Keys.HOME)

    return [name, price, zipcode, tcin, upc, timestamp]


def ScrapingTarget(url):
    """Scrape one product page for every zip code in ``zipCodeList``.

    Opens ``url`` in a fresh Chrome session, walks through all zip codes and
    appends one ``[name, price, zipcode, tcin, upc, timestamp]`` row per zip
    code to the module-level ``AArray``. Returns ``None``.

    The browser is always shut down with ``wd.quit()``, even when scraping
    raises. Without it every call leaves a Chrome/chromedriver process and
    its temporary profile directory behind, which is what fills up ``/tmp``
    on a long-running server.
    """
    CO = webdriver.ChromeOptions()
    CO.add_experimental_option('useAutomationExtension', False)
    CO.add_argument('--ignore-certificate-errors')
    CO.add_argument('--start-maximized')
    # NOTE(review): newer Selenium releases expect the driver path to be
    # passed through a Service object - confirm against the installed version.
    wd = webdriver.Chrome(CHROMEDRIVER_PATH, options=CO)
    try:
        wd.implicitly_wait(IMPLICIT_WAIT_SECONDS)
        wd.get(url)
        for zipcode in zipCodeList:
            AArray.append(_scrape_zipcode(wd, zipcode))
    finally:
        # quit() (not close()) ends the whole session: it closes every window,
        # stops chromedriver and lets it remove its temporary files.
        wd.quit()


# One pass scrapes every URL in urlList and appends the rows to the CSV file.
# The loop then sleeps for an hour, so passes start one hour after the
# previous pass finished.
while True:
    AArray = []

    with concurrent.futures.ThreadPoolExecutor(MAX_WORKERS) as executor:
        pending = {executor.submit(ScrapingTarget, url): url for url in urlList}
        for finished in concurrent.futures.as_completed(pending):
            # Surface worker failures; a discarded future would hide them.
            error = finished.exception()
            if error is not None:
                print(f"Scraping failed for {pending[finished]}: {error!r}")

    with open(OUTPUT_CSV_PATH, 'a', newline="", encoding='utf-8') as f:
        csv.writer(f).writerows(AArray)

    sleep(RUN_INTERVAL_SECONDS)
我觉得 driver.quit() 和 driver.close() 可能会有用,但不敢尝试。
希望能让这个程序每小时都顺利运行。
|