scrapy框架爬取知乎信息实例详解(详细)
爬取知乎的关注信息作为我们scrapy框架的详解例子,爬取的知乎大v是轮子哥,然后将爬取的信息存储进mongo数据库。我将所有解释都放进例子里。虽然每一句代码都有解释,但是最好还是有爬虫的基础。整体思路:
1.选定一位有较多关注数的知乎达人作为我们的爬取对象
2.通过知乎接口获得该用户的关注列表和粉丝列表
3.通过递归的方法实现对列表中每一个用户的爬取,爬取他们的粉丝列表和关注列表。
4.通过知乎接口获得列表中的每位用户的详细信息。
zhihu.py
# -*- coding: utf-8 -*-
# 如果出现500服务器响应错误,原因是服务器检测到我们不是通过浏览器访问的;
# 解决办法是在settings文件中修改DEFAULT_REQUEST_HEADERS的值,将User-Agent加入。
import scrapy
import json
from scrapy import Request
from zhihuuser.items import UserItem
class ZhihuSpider(scrapy.Spider):
    """Crawl Zhihu user profiles by recursively walking follow relations.

    Starting from ``start_user``, the spider requests the user's profile,
    the list of users they follow (followees) and the list of users who
    follow them (followers). Every user found in those lists is fetched in
    turn, and the same three requests are issued for them.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    # Unused: start_requests() below builds the initial requests itself.
    start_url = ['http://www.zhihu.com/']
    # url_token of the account the crawl starts from.
    start_user = 'excited-vczh'

    # Profile endpoint for a single user; ``include`` selects extra fields.
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'

    # Followees: the users that ``user`` follows.
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    # Followers: the users that follow ``user``.
    # ``limit`` is the page size and ``offset`` the position of the page.
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        """Yield the seed requests for ``start_user``.

        One request each for the profile, the first followees page and the
        first followers page. ``str.format`` fills in the URL templates and
        each request names the callback that parses its response.
        """
        # The user placeholder must be the url_token (start_user), not the
        # start_url list.
        yield Request(
            self.user_url.format(user=self.start_user, include=self.user_query),
            callback=self.parse_user,
        )
        yield Request(
            self.follows_url.format(
                user=self.start_user, include=self.follows_query,
                offset=0, limit=20),
            callback=self.parse_follows,
        )
        yield Request(
            self.followers_url.format(
                user=self.start_user, include=self.followers_query,
                offset=0, limit=20),
            callback=self.parse_followers,
        )

    def parse_user(self, response):
        """Parse one profile response.

        Yields a ``UserItem`` holding every declared field present in the
        JSON body, followed by requests for the first page of that user's
        followees and followers.
        """
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            # Copy only the keys that the item declares and the API returned.
            if field in result:
                item[field] = result.get(field)
        yield item

        url_token = result.get('url_token')
        if not url_token:
            # Without a url_token no follow-list URL can be built.
            return
        # ``callback`` belongs to Request(), not to str.format().
        yield Request(
            self.follows_url.format(
                user=url_token, include=self.follows_query,
                offset=0, limit=20),
            callback=self.parse_follows,
        )
        yield Request(
            self.followers_url.format(
                user=url_token, include=self.followers_query,
                offset=0, limit=20),
            callback=self.parse_followers,
        )

    def _parse_user_list(self, response, callback):
        """Yield profile requests for one list page, then the next page.

        ``callback`` is the method that should parse the next page of the
        same list (``parse_follows`` or ``parse_followers``).
        """
        result = json.loads(response.text)
        # Use a separate loop variable so ``result`` still refers to the
        # whole response when the paging data is read below.
        for user in result.get('data') or []:
            url_token = user.get('url_token')
            if url_token:
                yield Request(
                    self.user_url.format(
                        user=url_token, include=self.user_query),
                    callback=self.parse_user,
                )
        # The pagination object is stored under the key 'paging'.
        paging = result.get('paging') or {}
        next_page = paging.get('next')
        if paging.get('is_end') is False and next_page:
            yield Request(next_page, callback=callback)

    def parse_follows(self, response):
        """Parse a followees page: request each user, then the next page."""
        for request in self._parse_user_list(response, self.parse_follows):
            yield request

    def parse_followers(self, response):
        """Parse a followers page: request each user, then the next page."""
        for request in self._parse_user_list(response, self.parse_followers):
            yield request
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Item, Field
class UserItem(Item):
    """Scraped user record; each attribute declares one storable field."""

    # Identifiers
    id = Field()
    url_token = Field()
    type = Field()

    # Display information
    name = Field()
    headline = Field()
    url = Field()

    # Avatar image links
    avatar_url = Field()
    avatar_url_template = Field()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for zhihuuser project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package where spiders live / are generated.
BOT_NAME = 'zhihuuser'
SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# This was True by default (obey the robots protocol); it is changed to
# False here.
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# A browser-like User-Agent is included so that requests look like they come
# from a browser.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'zhihuuser.middlewares.ZhihuuserDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'zhihuuser.pipelines.ZhihuuserPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
支持分享
页:
[1]