51Testing软件测试论坛

标题: scrapy框架爬取知乎信息实例详解(详细) [打印本页]

作者: 歪小Y_02    时间: 2019-2-25 17:23
标题: scrapy框架爬取知乎信息实例详解(详细)
爬取知乎的关注信息作为我们scrapy框架的详解例子,爬取的知乎大v是轮子哥,然后将爬取的信息存储进mongo数据库。我将所有解释都放进例子里。虽然每一句代码都有解释,但是最好还是有爬虫的基础。
整体思路:
1.选定一位有较多关注数的知乎达人作为我们的爬取对象
2.通过知乎接口获得该用户的关注列表和粉丝列表
3.通过递归的方法实现对列表中每一个用户的爬取,爬取他们的粉丝列表和关注列表。
4.通过知乎接口获得列表中的每位用户的详细信息。

zhihu.py
  1. # -*- coding: utf-8 -*-
  2. # 出现500服务器响应的错误,原因是检测到我们不是通过浏览器访问的,我们在setting文件中修改headers
  3. #   的值,将user-agent加入。
  4. import scrapy
  5. import json
  6. from scrapy import Request
  7. from zhihuuser.items import UserItem
  8. class ZhihuSpider(scrapy.Spider):
  9.     name = 'zhihu'
  10.     allowed_domains = ['www.zhihu.com']
  11.     start_url = ['http://www.zhihu.com/']
  12.     start_user='excited-vczh'
  13.     user_url='https://www.zhihu.com/api/v4/members/{user}?include={include}'
  14.     #user意思是用户的url,即轮子哥的url

  15.     user_query='allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
  16.     follows_url='https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
  17.     follows_query='data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
  18.     # follows意思是他关注的人的列表

  19.     followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
  20.     followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
  21.     # followers_query和user_query都是后面的include
  22.     #followers意思是关注他的列表
  23.     #limit和offset也是变量,limit是固定值,而offset则根据页面的变化而变化


  24.     def start_requests(self):
  25.         # url='https://www.zhihu.com/api/v4/members/{user}/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20'
  26.         # url='https://www.zhihu.com/api/v4/members/excited-vczh/publications?include=data%5B*%5D.cover%2Cebook_type%2Ccomment_count%2Cvoteup_count&offset=0&limit=5'
  27.         yield Request(self.user_url.format(user=self.start_url,include=self.user_query), self.parse_user)
  28.         yield Request(self.follows_url.format(user=self.start_user,include=self.follows_query,offset=0,limit=20), callback=self.parse_follows)
  29.         yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, offset=0, limit=20), callback=self.parse_followers)
  30. # 使用format方法是为了动态的构造url
  31. # 同时也需要指定回调函数来进行url的解析


  32.     def parse_user(self, response):
  33.         result = json.loads(response.text)  # 利用json.load声明一个json对象
  34.         item = UserItem()
  35.         for field in item.fields:
  36.             # 利用item的field属性进行赋值,实际上field输出的是所有集合的名称
  37.             if field in result.keys():
  38.                 # 如果field是result的键名之一则field进行赋值
  39.                 item[field] = result.get(field)
  40.         yield item
  41.         yield Request(self.follows_url.format(user=result.get('url_token'),include=self.follows_query,offset=0,limit=20,callback=self.parse_follows))
  42.         # 获取关注列表的request
  43.         yield Request(self.followers_url.format(user=result.get('url_token'), include=self.followers_query, offset=0, limit=20,callback=self.parse_followers))


  44.     def parse_follows(self, response):
  45.         result = json.loads(response.text)
  46.         if 'data' in result.keys():
  47.             for result in result.get('data'):
  48.                 yield Request(self.user_url.format(user=result.get('url_token'),include=self.user_query),callback=self.parse_user)
  49.         if 'paging' in result.keys() and result.get('pagging').get('is_end') == False:
  50.             next_page = result.get('pagging').get('next')
  51.             # 从pagging的next键得到分页下一页的链接
  52.             yield Request(next_page,self.parse_follows)
  53.             #传入url链接,新建一个request请求,然后回调parse_follows



  54.     def parse_followers(self, response):
  55.         result = json.loads(response.text)
  56.         if 'data' in result.keys():
  57.             for result in result.get('data'):
  58.                 yield Request(self.user_url.format(user=result.get('url_token'),include=self.user_query),callback=self.parse_user)
  59.         if 'paging' in result.keys() and result.get('pagging').get('is_end') == False:
  60.             next_page = result.get('pagging').get('next')
  61.             # 从pagging的next键得到分页下一页的链接
  62.             yield Request(next_page,self.parse_followers)
  63.             #传入url链接,新建一个request请求,然后回调parse_follows
复制代码
items.py
  1. # -*- coding: utf-8 -*-

  2. # Define here the models for your scraped items
  3. #
  4. # See documentation in:
  5. # https://doc.scrapy.org/en/latest/topics/items.html

  6. import scrapy
  7. from scrapy import Item, Field

  8. class UserItem(scrapy.Item):
  9.     id = Field()
  10.     name = Field()
  11.     avatar_url = Field()
  12.     headline = Field()
  13.     url_token =Field()
  14.     url = Field()
  15.     avatar_url_template = Field()
  16.     type = Field()

  17.     # define the fields for your item here like:
  18.     # name = scrapy.Field()
复制代码
settings.py
  1. # -*- coding: utf-8 -*-

  2. # Scrapy settings for zhihuuser project
  3. #
  4. # For simplicity, this file contains only settings considered important or
  5. # commonly used. You can find more settings consulting the documentation:
  6. #
  7. #     https://doc.scrapy.org/en/latest/topics/settings.html
  8. #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
  9. #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

  10. BOT_NAME = 'zhihuuser'

  11. SPIDER_MODULES = ['zhihuuser.spiders']
  12. NEWSPIDER_MODULE = 'zhihuuser.spiders'


  13. # Crawl responsibly by identifying yourself (and your website) on the user-agent
  14. #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'

  15. # Obey robots.txt rules
  16. ROBOTSTXT_OBEY = False
  17.    # Was True by default (obey the robots.txt protocol); changed to False for this crawl

  18. # Configure maximum concurrent requests performed by Scrapy (default: 16)
  19. #CONCURRENT_REQUESTS = 32

  20. # Configure a delay for requests for the same website (default: 0)
  21. # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
  22. # See also autothrottle settings and docs
  23. #DOWNLOAD_DELAY = 3
  24. # The download delay setting will honor only one of:
  25. #CONCURRENT_REQUESTS_PER_DOMAIN = 16
  26. #CONCURRENT_REQUESTS_PER_IP = 16

  27. # Disable cookies (enabled by default)
  28. #COOKIES_ENABLED = False

  29. # Disable Telnet Console (enabled by default)
  30. #TELNETCONSOLE_ENABLED = False

  31. # Override the default request headers. A browser-like User-Agent is included because requests that did not look like a browser were answered with HTTP 500:
  32. DEFAULT_REQUEST_HEADERS = {
  33.   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  34.   'Accept-Language': 'en',
  35. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
  36. }

  37. # Enable or disable spider middlewares
  38. # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  39. #SPIDER_MIDDLEWARES = {
  40. #    'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543,
  41. #}

  42. # Enable or disable downloader middlewares
  43. # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
  44. #DOWNLOADER_MIDDLEWARES = {
  45. #    'zhihuuser.middlewares.ZhihuuserDownloaderMiddleware': 543,
  46. #}

  47. # Enable or disable extensions
  48. # See https://doc.scrapy.org/en/latest/topics/extensions.html
  49. #EXTENSIONS = {
  50. #    'scrapy.extensions.telnet.TelnetConsole': None,
  51. #}

  52. # Configure item pipelines
  53. # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
  54. #ITEM_PIPELINES = {
  55. #    'zhihuuser.pipelines.ZhihuuserPipeline': 300,
  56. #}

  57. # Enable and configure the AutoThrottle extension (disabled by default)
  58. # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
  59. #AUTOTHROTTLE_ENABLED = True
  60. # The initial download delay
  61. #AUTOTHROTTLE_START_DELAY = 5
  62. # The maximum download delay to be set in case of high latencies
  63. #AUTOTHROTTLE_MAX_DELAY = 60
  64. # The average number of requests Scrapy should be sending in parallel to
  65. # each remote server
  66. #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
  67. # Enable showing throttling stats for every response received:
  68. #AUTOTHROTTLE_DEBUG = False

  69. # Enable and configure HTTP caching (disabled by default)
  70. # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
  71. #HTTPCACHE_ENABLED = True
  72. #HTTPCACHE_EXPIRATION_SECS = 0
  73. #HTTPCACHE_DIR = 'httpcache'
  74. #HTTPCACHE_IGNORE_HTTP_CODES = []
  75. #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
复制代码



作者: Miss_love    时间: 2020-12-31 08:53
支持分享




欢迎光临 51Testing软件测试论坛 (http://bbs.51testing.com/) Powered by Discuz! X3.2