pyspider使用实例 - 51Testing软件测试论坛

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-14 17:11:18
# Project: qiushi
from pyspider.libs.base_handler import *
import pymongo, requests
from fake_useragent import UserAgent
import time
def get_proxy(timeout=5):
    """Fetch one proxy entry from the local proxy-pool service.

    Sends a GET request to ``http://localhost:5010/get/`` and returns the
    response body as text.

    Args:
        timeout: Seconds to wait for the proxy-pool service. Without a
            timeout, ``requests`` would block forever if the service is
            unresponsive.

    Returns:
        The response body as a string, exactly as sent by the service.

    Raises:
        requests.exceptions.RequestException: If the service cannot be
            reached or does not answer within ``timeout`` seconds.

    NOTE: The free proxies handed out by the pool are unstable, so the
    crawler in this file no longer uses a proxy; this helper is kept only
    to show how one would be obtained.
    """
    return requests.get('http://localhost:5010/get/', timeout=timeout).text
class Handler(BaseHandler):
    """pyspider handler that crawls the qidian.com book listing into MongoDB.

    ``on_start`` schedules the listing page, ``index_page`` schedules every
    book link found on it, ``detail_page`` extracts the book fields, and
    ``on_result`` upserts each record into the ``q`` collection of the
    ``qidian`` database.
    """

    # NOTE: ``ua.random`` is evaluated once, when the class body executes, so
    # the same User-Agent string is stored in ``headers`` for all requests.
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        # 'Host':
        # 'Referer':
    }

    # Create the MongoDB client and the database handle.
    client = pymongo.MongoClient('localhost')
    db = client['qidian']

    # Global configuration for the whole crawler project: every self.crawl()
    # call loads this configuration when it issues its request.
    crawl_config = {
        'headers': headers,
        # 'proxy': get_proxy(),
        'itag': 'v0'
    }

    # Incremental crawl, case 1: when the crawler is restarted each day, only
    # fetch the data newly added to the page (use de-duplication).
    # Incremental crawl, case 2: the URL is unchanged but its data was
    # updated (de-duplication cannot be used; re-crawl every day).

    # The first function entered when the project starts.
    # @every: schedules the periodic crawl; the interval may be given in
    # minutes or in seconds.
    @every(seconds=2 * 60)
    def on_start(self):
        """Schedule the initial listing page; runs every two minutes."""
        # The initial URL to crawl.
        self.crawl('https://www.qidian.com/all', callback=self.index_page,
                   validate_cert=False)

    # age: mainly used to de-duplicate/filter task URLs (by taskid). Every URL
    # has one unique taskid; age is a time span in seconds, and a task whose
    # taskid was already seen within that span is discarded.
    @config(age=60)
    def index_page(self, response):
        """Schedule a detail-page crawl for every book link on the listing.

        Args:
            response: pyspider response for the listing page.
        """
        # response.doc returns a pyquery object.
        for each in response.doc('h4 > a').items():
            # For pages rendered by JavaScript, pass fetch_type='js' and
            # js_script="" to self.crawl().
            self.crawl(each.attr.href, callback=self.detail_page,
                       validate_cert=False)
        # Find the next page (pagination is currently disabled):
        # next_page = response.doc('.lbf-pagination-item-list > li:nth-of-type(9) > a')
        # self.crawl(next_page.attr.href, callback=self.index_page, validate_cert=False)

    # The default value of age is -1, meaning the task never expires.
    @config(priority=2, age=60)
    def detail_page(self, response):
        """Extract the book fields from a detail page.

        Args:
            response: pyspider response for the book detail page.

        Returns:
            dict with the keys ``time``, ``url``, ``title``, ``name``,
            ``author``, ``tag`` and ``info``.
        """
        # Parse the detail-page data out of the response.
        name = response.doc('h1 > em').text()
        author = response.doc('h1 a').text()
        tag = response.doc('.tag').text()
        info = response.doc('.intro').text()

        # Take the timestamp once so the printed value and the stored value
        # are the same.
        now = time.time()
        print(now)
        print(name)
        print(author)
        print(tag)
        print(info)
        return {
            "time": str(now),
            "url": response.url,
            "title": response.doc('title').text(),
            "name": name,
            "author": author,
            "tag": tag,
            "info": info,
        }

    # on_result is a fixed hook name: whenever a callback returns a value,
    # this function is called automatically.
    def on_result(self, data):
        """Save a record returned by ``detail_page`` into MongoDB.

        The record is upserted into the ``q`` collection, keyed by its
        ``name`` field.

        Args:
            data: The dict returned by a callback, or ``None`` when the
                callback returned nothing.

        Raises:
            pymongo.errors.PyMongoError: Re-raised after the failure message
                is printed, so the error is not hidden from the caller.
        """
        if data is None:
            print('空')
            return

        # update_one() returns an UpdateResult object, which is always truthy,
        # so a failed write can only be detected through the exception.
        try:
            self.db['q'].update_one(
                {'name': data['name']}, {'$set': data}, upsert=True)
        except pymongo.errors.PyMongoError:
            print('数据保存失败')
            raise
        print('数据保存成功')

复制代码