python scrapy 腾讯社会招聘爬虫摘要 - 51Testing软件测试论坛

class TencentItem(scrapy.Item):
    """One job posting scraped from the Tencent recruitment listing.

    The field names match the columns of the ``hr`` MySQL table that the
    storage pipeline inserts into.
    """

    # Job title.
    positionName = scrapy.Field()
    # Link to the job detail page.
    positionLink = scrapy.Field()
    # Job category.
    positionType = scrapy.Field()
    # Number of people being hired.
    peopleNumber = scrapy.Field()
    # Work location.
    workLocation = scrapy.Field()
    # Publication date of the posting.
    publishTime = scrapy.Field()

复制代码

mysql> create database tencent DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;
mysql> use tencent;
-- One row per scraped job posting; column names mirror the TencentItem fields.
CREATE TABLE `hr` (
  -- Surrogate key generated by MySQL: the pipeline's INSERT never supplies an id,
  -- so without AUTO_INCREMENT the second insert would collide on the primary key.
  `id` int(10) NOT NULL AUTO_INCREMENT ,
  `positionName` varchar(100) NOT NULL COMMENT '职位名' ,
  -- DEFAULT requires a value; a bare "DEFAULT COMMENT ..." is a syntax error.
  `positionLink` varchar(150) NULL DEFAULT NULL COMMENT '职位详情连接' ,
  `positionType` varchar(30) NULL COMMENT '职位类别' ,
  `peopleNumber` int(10) NULL COMMENT '招聘人数' ,
  `workLocation` varchar(30) NULL COMMENT '工作地点' ,
  -- Holds the scraped publication date, so it must not be rewritten with the
  -- current time on UPDATE (ON UPDATE CURRENT_TIMESTAMP removed).
  `publishTime` timestamp NULL DEFAULT NULL COMMENT '发布时间' ,
  PRIMARY KEY (`id`)
);

复制代码

# MySQL connection parameters; TencentMysqlDBPipeline.from_settings reads
# each of these keys to build its connection pool.
# NOTE(review): root with an empty password looks like a local-development
# default -- confirm before using anywhere else.
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = ''
MYSQL_DBNAME = 'tencent'

# Enabled item pipelines, keyed by import path, with their order number.
ITEM_PIPELINES = {
    # Disabled alternative pipeline:
    # 'Tencent.pipelines.TencentPipeline': 300,
    'Tencent.pipelines.TencentMysqlDBPipeline': 200,
}

复制代码

# import json
import pymysql
# from scrapy.conf import settings
from scrapy import log
from twisted.enterprise import adbapi
class TencentMysqlDBPipeline(object):
    """Item pipeline that stores every scraped item as a row of the ``hr`` table.

    Inserts go through a Twisted ``adbapi.ConnectionPool`` so that
    ``process_item`` returns a Deferred instead of running the query inline.

    NOTE(review): ``scrapy.log`` is a legacy module; newer Scrapy releases
    use the standard ``logging`` module instead -- confirm the Scrapy version
    this project targets.
    """

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from the ``MYSQL_*`` project settings.

        Args:
            settings: mapping-like settings object providing MYSQL_HOST,
                MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWD and MYSQL_PORT.

        Returns:
            A pipeline instance wrapping a new pymysql connection pool.
        """
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            port=settings['MYSQL_PORT'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbargs)
        return cls(dbpool)

    def __init__(self, dbpool):
        """Keep a reference to the connection pool used for all inserts."""
        self.dbpool = dbpool

    def process_item(self, item, spider):
        """Schedule the insert of ``item`` and hand the item on afterwards.

        Returns:
            A Deferred that fires with ``item`` whether the insert succeeded
            or failed; failures are reported by ``_handle_error`` first.
        """
        # Run the insert as one interaction on the pool.
        d = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        log.msg("-------------------连接好了-------------------")
        # Report a failed insert without propagating the failure.
        d.addErrback(self._handle_error, item, spider)
        # Always resolve to the item, regardless of the insert outcome.
        d.addBoth(lambda _: item)
        return d

    def _conditional_insert(self, conn, item, spider):
        """Insert one item into ``hr`` using a parameterized statement.

        ``conn`` is the object ``runInteraction`` passes to the callback;
        only its ``execute`` method is used here.
        """
        log.msg("-------------------打印-------------------")
        conn.execute(
            "insert into hr (positionName, positionLink, positionType, peopleNumber, workLocation, publishTime) values(%s, %s, %s, %s, %s, %s)",
            (
                item['positionName'],
                item['positionLink'],
                item['positionType'],
                item['peopleNumber'],
                item['workLocation'],
                item['publishTime'],
            ),
        )
        log.msg("-------------------一轮循环完毕-------------------")

    def _handle_error(self, failue, item, spider):
        """Report a failed insert through the crawl log.

        The original printed the failure to stdout, where it bypassed the
        log; the failure is still consumed here so the item keeps flowing.
        """
        log.msg("MySQL insert failed for item %r: %s" % (item, failue),
                level=log.ERROR)

    def close_spider(self, spider):
        """Close the connection pool when the spider finishes.

        Without this the pipeline never releases the pool it created in
        ``from_settings``.
        """
        self.dbpool.close()

复制代码

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem
# https://hr.tencent.com/position.php?&start=0#a
class TencentSpider(scrapy.Spider):
    """Spider that walks the Tencent job listing and yields one item per row.

    Starts at ``baseURL + str(offset)`` and keeps following the page's
    "next" link until that link is marked inactive.
    """

    name = 'tencent'
    allowed_domains = ['tencent.com']
    # Listing URL without the numeric ``start`` offset,
    # e.g. https://hr.tencent.com/position.php?&start=0#a
    baseURL = 'https://hr.tencent.com/position.php?&start='
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield a ``TencentItem`` per table row, then a request for the next page.

        Field values are kept as text (no ``.encode("utf-8")``): encoding to
        bytes here yields ``bytes`` items on Python 3, and the database
        connection already handles the character set. A row with a missing
        cell no longer raises ``IndexError`` and aborts the page: rows
        without a title are skipped, other missing cells become ``None``.
        """
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            position_name = node.xpath("./td[1]/a/text()").extract_first()
            if position_name is None:
                # Not a usable posting row without a title; skip it.
                continue

            href = node.xpath("./td[1]/a/@href").extract_first()

            item = TencentItem()
            item['positionName'] = position_name
            # Resolve the relative link against the page URL instead of a
            # hard-coded host.
            item['positionLink'] = response.urljoin(href) if href else None
            # Some rows have no category cell text; fall back to a placeholder.
            item['positionType'] = node.xpath("./td[2]/text()").extract_first(default="无类别")
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract_first()
            item['workLocation'] = node.xpath("./td[4]/text()").extract_first()
            item['publishTime'] = node.xpath("./td[5]/text()").extract_first()
            yield item

        # Pagination: follow the "next" link on every page. When the link
        # carries class 'noactive' this is the last page, so stop.
        # (Alternative approach: advance ``offset`` by 10 up to a fixed limit
        # and request ``baseURL + str(offset)``.)
        if not response.xpath("//a[@class='noactive' and @id='next']"):
            next_href = response.xpath("//a[@id='next']/@href").extract_first()
            if next_href:
                yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

复制代码