I. Writing the Scrapy spider
Create the project: D:\scrapy>scrapy startproject Tencent
D:\scrapy>cd Tencent
Create the spider: D:\scrapy\Tencent>scrapy genspider tencent hr.tencent.com
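After these two commands, the generated project should look roughly like this (middlewares.py appears only in newer Scrapy versions):

```
Tencent/
├── scrapy.cfg
└── Tencent/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── tencent.py    # created by genspider
```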
# Tencent Careers: https://hr.tencent.com/position.php?&start=0#a
Fields to scrape:
Position name: positionName
Job detail link: positionLink
Position category: positionType
Number of openings: peopleNumber
Work location: workLocation
Publish time: publishTime

When Scrapy starts, it first reads the configuration file settings.py. The workflow is:
1. Write items.py to declare the data to extract.
2. Write the spider file spiders/xxxx.py to handle requests and responses and extract the data (yield item).
3. Write pipelines.py to process the item data returned by the spider.
4. Edit settings.py to enable the pipeline and make other related settings.
5. Run and debug the spider (see the commands below).
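For the last step, the basic commands are (run from the project directory; `-o` is an optional convenience for dumping items to a file while debugging):

```
D:\scrapy\Tencent>scrapy crawl tencent
D:\scrapy\Tencent>scrapy crawl tencent -o tencent.json
```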
II. Configuring the MySQL database
1. Install pymysql
In cmd: pip install pymysql
Then start Python and run `import pymysql` to check the installation; if no error is raised, it succeeded.
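A slightly stronger check is to connect to the local server. This is a minimal sketch assuming the same host, user, and empty password used in settings.py later:

```python
import pymysql

# Connect to the local MySQL server (no database selected yet).
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='')
with conn.cursor() as cur:
    cur.execute("SELECT VERSION()")
    print(cur.fetchone())  # e.g. ('5.7.23',) -- your version will differ
conn.close()
```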
2. Review items.py
```python
import scrapy


class TencentItem(scrapy.Item):
    # name = scrapy.Field()
    # Position name
    positionName = scrapy.Field()
    # Job detail link
    positionLink = scrapy.Field()
    # Position category
    positionType = scrapy.Field()
    # Number of openings
    peopleNumber = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publish time
    publishTime = scrapy.Field()
```
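An Item behaves like a dict restricted to the declared fields, which you can confirm in a Python shell (the sample value here is made up):

```python
>>> from Tencent.items import TencentItem
>>> item = TencentItem()
>>> item['positionName'] = '22989-Python工程师'
>>> dict(item)
{'positionName': '22989-Python工程师'}
>>> item['salary'] = '20k'  # undeclared field: rejected
KeyError: 'TencentItem does not support field: salary'
```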
3. Create the database and table

```sql
mysql> create database tencent DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;
mysql> use tencent;

CREATE TABLE `hr` (
  `id` int(10) NOT NULL AUTO_INCREMENT,  -- AUTO_INCREMENT so inserts can omit the id
  `positionName` varchar(100) NOT NULL COMMENT 'position name',
  `positionLink` varchar(150) DEFAULT NULL COMMENT 'job detail link',
  `positionType` varchar(30) DEFAULT NULL COMMENT 'position category',
  `peopleNumber` int(10) DEFAULT NULL COMMENT 'number of openings',
  `workLocation` varchar(30) DEFAULT NULL COMMENT 'work location',
  `publishTime` timestamp NULL ON UPDATE CURRENT_TIMESTAMP COMMENT 'publish time',
  PRIMARY KEY (`id`)
);
```
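Before wiring up the pipeline, you can sanity-check the schema with a manual insert that mirrors the statement the pipeline will execute (the values are made up):

```sql
mysql> INSERT INTO hr (positionName, positionLink, positionType, peopleNumber, workLocation, publishTime)
    -> VALUES ('test position', 'https://hr.tencent.com/position_detail.php?id=0', 'tech', 1, 'Shenzhen', '2018-09-28');
mysql> SELECT * FROM hr;
```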
4. settings.py
```python
ITEM_PIPELINES = {
    # 'Tencent.pipelines.TencentPipeline': 300,
    'Tencent.pipelines.TencentMysqlDBPipeline': 200,
}

# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'tencent'
MYSQL_USER = 'root'
MYSQL_PASSWD = ''
MYSQL_PORT = 3306
```
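Two settings not shown above are often needed in practice; both are standard Scrapy options, but whether you need them is an assumption about the target site and your own crawling policy:

```python
# Optional additions to settings.py:
ROBOTSTXT_OBEY = False  # newer Scrapy versions obey robots.txt by default
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0 Safari/537.36')
DOWNLOAD_DELAY = 1  # be polite: wait one second between requests
```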
5. pipelines.py
```python
import logging

import pymysql
from twisted.enterprise import adbapi


class TencentMysqlDBPipeline(object):

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            port=settings['MYSQL_PORT'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # adbapi wraps pymysql in a Twisted thread pool so inserts
        # don't block the crawl.
        dbpool = adbapi.ConnectionPool('pymysql', **dbargs)
        return cls(dbpool)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # Called by the pipeline for every item the spider yields.
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._conditional_insert, item, spider)  # run the insert in a pool thread
        d.addErrback(self._handle_error, item, spider)  # error handler
        d.addBoth(lambda _: item)  # pass the item on to later pipelines
        return d

    def _conditional_insert(self, conn, item, spider):
        # `conn` is the cursor-like Transaction object supplied by
        # runInteraction, hence conn.execute(...).
        conn.execute(
            "insert into hr (positionName, positionLink, positionType, peopleNumber, workLocation, publishTime) "
            "values (%s, %s, %s, %s, %s, %s)",
            (item['positionName'], item['positionLink'], item['positionType'],
             item['peopleNumber'], item['workLocation'], item['publishTime']))
        logging.debug("inserted one row into hr")

    def _handle_error(self, failure, item, spider):
        logging.error(failure)
```
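If the Twisted connection pool feels heavy, a simpler synchronous alternative is sketched below (it blocks the crawl on every insert but is easier to debug; the class name is mine, and you would register it in ITEM_PIPELINES in place of the pooled one):

```python
import pymysql


class TencentMysqlSyncPipeline(object):
    def open_spider(self, spider):
        # Read the same MYSQL_* values from settings.py.
        s = spider.settings
        self.conn = pymysql.connect(
            host=s['MYSQL_HOST'], user=s['MYSQL_USER'],
            password=s['MYSQL_PASSWD'], db=s['MYSQL_DBNAME'],
            port=s['MYSQL_PORT'], charset='utf8')

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        with self.conn.cursor() as cur:
            cur.execute(
                "insert into hr (positionName, positionLink, positionType,"
                " peopleNumber, workLocation, publishTime)"
                " values (%s, %s, %s, %s, %s, %s)",
                (item['positionName'], item['positionLink'], item['positionType'],
                 item['peopleNumber'], item['workLocation'], item['publishTime']))
        self.conn.commit()
        return item
```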
6. spiders/tencent.py
```python
# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem
# https://hr.tencent.com/position.php?&start=0#a


class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['tencent.com']
    baseURL = 'https://hr.tencent.com/position.php?&start='
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        # Job rows alternate between class="even" and class="odd".
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            # Under Python 3 the extracted strings are already str,
            # so no .encode("utf-8") is needed.
            item = TencentItem()
            item['positionName'] = node.xpath("./td[1]/a/text()").extract_first()
            item['positionLink'] = "https://hr.tencent.com/" + node.xpath("./td[1]/a/@href").extract_first()
            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath("./td[2]/text()").extract_first()
            else:
                item['positionType'] = "无类别"  # "no category"
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract_first()
            item['workLocation'] = node.xpath("./td[4]/text()").extract_first()
            item['publishTime'] = node.xpath("./td[5]/text()").extract_first()

            # yield hands the item to the pipeline, then resumes here
            yield item

        # Method 2: follow the "next page" link to crawl every page.
        # On the last page the "next" link gets class="noactive",
        # so this XPath matching nothing means there are more pages.
        if len(response.xpath("//a[@class='noactive' and @id='next']")) == 0:
            nextUrl = "https://hr.tencent.com/" + response.xpath("//a[@id='next']/@href").extract_first()
            yield scrapy.Request(nextUrl, callback=self.parse)

        # Method 1: crawl a fixed number of pages by stepping the offset.
        # if self.offset < 3980:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)
```
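To verify the XPath expressions before a full run, scrapy shell is handy (the row count and markup reflect the site as of this writing and may have changed since):

```
D:\scrapy\Tencent>scrapy shell "https://hr.tencent.com/position.php?&start=0"
>>> len(response.xpath("//tr[@class='even'] | //tr[@class='odd']"))
10
>>> response.xpath("//tr[@class='even']/td[1]/a/text()").extract_first()
```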