|
爬取知乎的关注信息作为我们scrapy框架的详解例子,爬取的知乎大v是轮子哥,然后将爬取的信息存储进mongo数据库。我将所有解释都放进例子里。虽然每一句代码都有解释,但是最好还是有爬虫的基础。
整体思路:
1.选定一位有较多关注数的知乎达人作为我们的爬取对象
2.通过知乎接口获得该用户的关注列表和粉丝列表
3.通过递归的方法实现对列表中每一个用户的爬取,爬取他们的粉丝列表和关注列表。
4.通过知乎接口获得列表中的每位用户的详细信息。
zhihu.py
- # -*- coding: utf-8 -*-
- # 出现500服务器响应的错误,原因是检测到我们不是通过浏览器访问的,我们在setting文件中修改headers
- # 的值,将user-agent加入。
- import scrapy
- import json
- from scrapy import Request
- from zhihuuser.items import UserItem
class ZhihuSpider(scrapy.Spider):
    """Crawl Zhihu user profiles by walking followee/follower lists.

    The crawl is seeded with ``start_user``.  Every user-detail response is
    turned into a ``UserItem`` and schedules that user's followee and
    follower lists; every list response schedules a detail request for each
    listed user plus the next page of the same list.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    # Original (misspelled) attribute name, kept as an alias so existing
    # references keep working.  start_requests() builds its own requests and
    # never reads either name.
    start_url = start_urls
    # url_token of the account the crawl is seeded with.
    start_user = 'excited-vczh'
    # Number of entries requested per followee/follower page.
    page_limit = 20

    # Detail endpoint of a single user; ``include`` lists the extra fields.
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'

    # "follows": the accounts that ``user`` is following (followees).
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    # "followers": the accounts that follow ``user``.
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        """Yield the seed requests: the start user's profile and both lists."""
        # The profile URL must be built from the user's url_token
        # (start_user), not from the start URL list.
        yield self._user_request(self.start_user)
        yield self._follows_request(self.start_user)
        yield self._followers_request(self.start_user)

    def parse_user(self, response):
        """Parse a user-detail response.

        Yields one ``UserItem`` holding every declared field that is present
        in the JSON payload, followed by the requests for the first page of
        that user's followee and follower lists.
        """
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            # Copy only the keys that the item declares and the payload has.
            if field in result:
                item[field] = result[field]
        yield item

        url_token = result.get('url_token')
        if not url_token:
            # Without a url_token no list URL can be built for this user.
            return
        yield self._follows_request(url_token)
        yield self._followers_request(url_token)

    def parse_follows(self, response):
        """Parse one page of a followee list (see ``_parse_user_list``)."""
        return self._parse_user_list(response, self.parse_follows)

    def parse_followers(self, response):
        """Parse one page of a follower list (see ``_parse_user_list``)."""
        return self._parse_user_list(response, self.parse_followers)

    def _user_request(self, url_token):
        """Build the detail request for the user identified by ``url_token``."""
        url = self.user_url.format(user=url_token, include=self.user_query)
        return Request(url, callback=self.parse_user)

    def _follows_request(self, url_token, offset=0):
        """Build the request for one page of ``url_token``'s followee list."""
        url = self.follows_url.format(
            user=url_token,
            include=self.follows_query,
            offset=offset,
            limit=self.page_limit,
        )
        return Request(url, callback=self.parse_follows)

    def _followers_request(self, url_token, offset=0):
        """Build the request for one page of ``url_token``'s follower list."""
        url = self.followers_url.format(
            user=url_token,
            include=self.followers_query,
            offset=offset,
            limit=self.page_limit,
        )
        return Request(url, callback=self.parse_followers)

    def _parse_user_list(self, response, callback):
        """Shared parser for followee and follower list pages.

        Yields a user-detail request for every entry under ``data`` that has
        a ``url_token``.  When ``paging.is_end`` is false and ``paging.next``
        is set, also yields a request for that next page, handled again by
        ``callback``.
        """
        result = json.loads(response.text)

        # A separate loop variable keeps ``result`` bound to the whole
        # response, which the paging check below still needs.
        for user in result.get('data') or []:
            url_token = user.get('url_token')
            if url_token:
                yield self._user_request(url_token)

        paging = result.get('paging') or {}
        next_page = paging.get('next')
        if paging.get('is_end') is False and next_page:
            yield Request(next_page, callback=callback)
items.py
- # -*- coding: utf-8 -*-
- # Define here the models for your scraped items
- #
- # See documentation in:
- # https://doc.scrapy.org/en/latest/topics/items.html
- import scrapy
- from scrapy import Item, Field
class UserItem(Item):
    """Scrapy item declaring the user attributes that are kept.

    Each attribute below is a ``Field``; the item accepts exactly these
    eight keys.
    """

    # Identifiers
    id = Field()
    url_token = Field()
    url = Field()
    type = Field()

    # Profile text
    name = Field()
    headline = Field()

    # Avatar
    avatar_url = Field()
    avatar_url_template = Field()
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the zhihuuser project.
#
# Only the settings that are important or commonly used are listed here.
# The full reference is in the Scrapy documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "zhihuuser"

NEWSPIDER_MODULE = "zhihuuser.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the
# user-agent.
#USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'

# robots.txt handling: the project template enables it (True, i.e. obey the
# robots protocol); it is switched off for this crawl.
ROBOTSTXT_OBEY = False

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS = 32

# Delay between requests to the same website (default: 0).
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also the AutoThrottle settings and docs.
#DOWNLOAD_DELAY = 3
# The download delay honours only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies (enabled by default).
#COOKIES_ENABLED = False

# Telnet console (enabled by default).
#TELNETCONSOLE_ENABLED = False

# Headers attached to every request unless the request overrides them.
# The User-Agent is a desktop Chrome string.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
}

# Spider middlewares.
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543,
#}

# Downloader middlewares.
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zhihuuser.middlewares.ZhihuuserDownloaderMiddleware': 543,
#}

# Extensions.
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines.
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'zhihuuser.pipelines.ZhihuuserPipeline': 300,
#}

# AutoThrottle extension (disabled by default).
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# Initial download delay.
#AUTOTHROTTLE_START_DELAY = 5
# Maximum download delay to be set in case of high latencies.
#AUTOTHROTTLE_MAX_DELAY = 60
# Average number of requests Scrapy should send in parallel to each remote
# server.
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Show throttling stats for every response received.
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default).
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
复制代码
|
|