The site we are going to scrape is quotes.toscrape.com.
Workflow:
1. Scrape the first page.
2. Extract the quotes and the link to the next page.
3. Save the scraped results.
4. Page through the site: request the next page, parse its content, and extract the link to the page after that.
At the command line:
Create the project: scrapy startproject quote
Generate the spider file: scrapy genspider quotes quotes.toscrape.com
Then open the project in PyCharm.
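These two commands produce the usual Scrapy project skeleton, roughly as follows (exact file names may differ slightly across Scrapy versions):

quote/
├── scrapy.cfg          # deploy configuration
└── quote/
    ├── __init__.py
    ├── items.py        # item definitions (edited below)
    ├── middlewares.py
    ├── pipelines.py    # item pipelines (edited below)
    ├── settings.py     # project settings (shown below)
    └── spiders/
        ├── __init__.py
        └── quotes.py   # the spider created by genspider (edited below)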
quotes.py
# -*- coding: utf-8 -*-
import scrapy
from quote.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    # name identifies this spider when running "scrapy crawl"
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # parse is the default callback: when the crawl starts, the URLs in
        # start_urls are requested and each response is passed to parse.
        quotes = response.css('.quote')
        # select every quote block on the page
        for quote in quotes:  # loop over each block
            item = QuoteItem()
            # '.text::text' means: take the text content of the element with class "text"
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        next_page = response.css('.pager .next a::attr(href)').extract_first()
        # the href is only a relative part of the URL
        url = response.urljoin(next_page)
        # urljoin builds the full absolute URL
        yield scrapy.Request(url=url, callback=self.parse)
        # generate the request for the next page

# Saving the data from the command line: scrapy crawl quotes -o quotes.json
# Other supported feed formats: ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
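As a side note (not part of the generated project), Scrapy 1.4+ also provides response.follow, which accepts the relative href directly, so the urljoin step can be folded away. A minimal sketch of the same crawl written that way, yielding plain dicts instead of Items:

import scrapy


class QuotesFollowSpider(scrapy.Spider):
    # Hypothetical alternative spider, shown only for comparison
    name = 'quotes_follow'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('.quote'):
            yield {
                'text': quote.css('.text::text').extract_first(),
                'author': quote.css('.author::text').extract_first(),
                'tags': quote.css('.tags .tag::text').extract(),
            }
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page is not None:
            # response.follow resolves the relative URL against the current page
            yield response.follow(next_page, callback=self.parse)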
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for quote project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'quote'

SPIDER_MODULES = ['quote.spiders']
NEWSPIDER_MODULE = 'quote.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'quote (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'quote.middlewares.QuoteSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'quote.middlewares.QuoteDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'quote.pipelines.QuotePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
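One detail that is easy to miss: the TextPipeline defined in pipelines.py below only runs if it is registered in ITEM_PIPELINES. A minimal sketch of the addition to settings.py (the integer is the pipeline's order; lower values run first):

ITEM_PIPELINES = {
    'quote.pipelines.TextPipeline': 300,
}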
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
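A scrapy.Item behaves like a dict with a fixed set of declared keys, which is why the spider above can assign item['text'] and so on. A purely illustrative sketch with made-up values:

from quote.items import QuoteItem

item = QuoteItem()
item['text'] = 'The world as we have created it is a process of our thinking.'
item['author'] = 'Albert Einstein'
item['tags'] = ['change', 'deep-thoughts']
print(dict(item))          # Items convert cleanly to plain dicts
# item['birthday'] = '...' # would raise KeyError: the field is not declared on QuoteItem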
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Process each item here, e.g. clean it up or save it to a database.
from scrapy.exceptions import DropItem


class TextPipeline(object):
    def __init__(self):
        # maximum length kept for the quote text
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            # DropItem must be raised, not returned, for Scrapy to discard the item
            raise DropItem('Missing Text')
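The header comment mentions saving items to a database. Below is a minimal sketch of such a pipeline using pymongo; the MongoPipeline class, the MONGO_URI and MONGO_DB setting names, and the 'quotes' collection are all assumptions for illustration, and the class would also need to be registered in ITEM_PIPELINES (e.g. 'quote.pipelines.MongoPipeline': 400).

import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read connection info from settings.py (MONGO_URI / MONGO_DB are assumed names)
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'quotes'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Insert each item as a plain dict into an assumed 'quotes' collection
        self.db['quotes'].insert_one(dict(item))
        return item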