Scrapy

scrapy

  • Create a new project (the generated layout is sketched after this list)

    scrapy startproject lianjia
  • Enter the project directory

    cd lianjia
  • Generate a spider

    scrapy genspider fang wh.lianjia.com/ershoufang
  • Run the spider

    scrapy crawl fang
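
  The startproject command generates the standard Scrapy skeleton; every file edited in the next section lives inside it:

    lianjia/
        scrapy.cfg            # deploy configuration
        lianjia/              # project package
            __init__.py
            items.py          # field definitions
            middlewares.py
            pipelines.py      # item storage
            settings.py       # configuration parameters
            spiders/          # spider modules
                __init__.py
                fang.py       # created by genspider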

Project changes

  • settings.py
    Configuration parameters

    # robots.txt rules
    ROBOTSTXT_OBEY = False
    # download delay (seconds)
    DOWNLOAD_DELAY = 0.2
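
    The pipeline below only runs if it is also registered here; the number (0-1000) orders multiple pipelines, lower values run first:

    ITEM_PIPELINES = {
        'lianjia.pipelines.LianjiaPipeline': 300,
    }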
  • pipelines.py
    Item storage

    from itemadapter import ItemAdapter
    import csv
    import codecs


    class LianjiaPipeline:
        def process_item(self, item, spider):
            # one CSV file per district, appended to on every item
            # self.filename = 'a.csv'
            filename = str(item['local']) + '.csv'
            file = codecs.open(filename, 'a', encoding='gbk')  # or 'utf_8_sig'
            fieldnames = ['local', 'page', 'cnt', 'title', 'html', 'flood',
                          'type', 'area', 'dir', 'mode1', 'mode2', 'mode3',
                          'price', 'unit']  # , 'year'
            # fieldnames = ['title', 'flood', 'address', 'price', 'unit']
            w = csv.DictWriter(file, fieldnames=fieldnames)
            w.writerow(item)
            file.close()
            return item

        # single-file variant: open once, close once
        # def __init__(self):
        #     self.file = codecs.open('zhuankou.csv', 'w', encoding='gbk')  # 'utf_8_sig'

        # def close_spider(self, spider):
        #     self.file.close()
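
    Opening and closing the CSV file for every item works, but is wasteful; the commented-out hooks above point at the usual pattern of opening the file once per run. A minimal sketch of that variant, keeping the original filename and encoding (the header row is an addition):

    import csv
    import codecs


    class LianjiaPipeline:
        FIELDNAMES = ['local', 'page', 'cnt', 'title', 'html', 'flood',
                      'type', 'area', 'dir', 'mode1', 'mode2', 'mode3',
                      'price', 'unit']

        def open_spider(self, spider):
            # open the output file once when the spider starts
            self.file = codecs.open('zhuankou.csv', 'w', encoding='gbk')
            self.writer = csv.DictWriter(self.file, fieldnames=self.FIELDNAMES)
            self.writer.writeheader()  # header row, written once

        def process_item(self, item, spider):
            self.writer.writerow(item)
            return item

        def close_spider(self, spider):
            # close the file once when the spider finishes
            self.file.close()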

  • items.py
    Field definitions

    import scrapy


    class LianjiaItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()

        local = scrapy.Field()
        page = scrapy.Field()
        cnt = scrapy.Field()
        title = scrapy.Field()
        html = scrapy.Field()
        flood = scrapy.Field()
        address = scrapy.Field()

        type = scrapy.Field()
        area = scrapy.Field()
        dir = scrapy.Field()
        mode1 = scrapy.Field()
        mode2 = scrapy.Field()
        # year = scrapy.Field()
        mode3 = scrapy.Field()

        price = scrapy.Field()
        unit = scrapy.Field()
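
    With the fields declared, LianjiaItem behaves like a dict in the spider and the pipeline (item['title'] = ...). For a quick test without the custom pipeline, Scrapy's built-in feed export can also dump items directly:

    scrapy crawl fang -o fang.csv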

  • spiders
    Parsing logic

    import scrapy
    from lianjia.items import LianjiaItem


    class FangSpider(scrapy.Spider):
        name = 'fang'
        allowed_domains = ['wh.lianjia.com']
        start_urls = ['http://wh.lianjia.com/ershoufang/']  # + pg1/
        getpathall = []
        urlnum = 0

        '''
        # disabled: crawl every district instead of a single one
        def start_requests(self):
            self.base = 'http://wh.lianjia.com'
            url = 'http://wh.lianjia.com/ershoufang/'
            yield scrapy.Request(url=url, callback=self.parse_url)

        def parse_url(self, response):
            districts = response.xpath('//div[@class="position"]//div[@data-role="ershoufang"]//a[@title]/@href').extract()
            # print(districts)
            self.getpathall.extend(districts)

            cnt = len(districts)
            # print(cnt)
            for path in districts:
                url = self.base + path
                # print(url)
                yield scrapy.Request(url=url, meta={'flag': cnt}, callback=self.parse_local)
                # print(path)

        def parse_local(self, response):
            self.urlnum += 1
            self.area = response.xpath('//div[@class="position"]//div[@data-role="ershoufang"]//div[2]//a/@href').extract()
            # print(self.area)
            self.getpathall.extend(self.area)
            # print(self.urlnum)
            num = response.meta['flag']
            if num == self.urlnum:
                # deduplicate before scheduling the list pages
                self.getpathall = list(set(self.getpathall))
                for path in self.getpathall:
                    url = self.base + path
                    local = path.split('/')[2]
                    yield scrapy.Request(url=url, meta={'local': local}, callback=self.parse_list)
                    print('fetched district list')
                    print(path)
        '''

        def start_requests(self):
            # def parse_list(self, response):
            #     local = response.meta['local']
            total = 100  # requests pages 1..99
            local = 'guanggudong'
            print('parsing district: ' + local)
            for i in range(1, total):
                if i > 1:
                    url = 'http://wh.lianjia.com/ershoufang/{0}/pg{1}/'.format(local, i)
                    # print(url)
                else:
                    url = 'http://wh.lianjia.com/ershoufang/{}'.format(local)
                yield scrapy.Request(url=url, meta={'local': local, 'page': i}, callback=self.parse)

        def parse(self, response):
            listings = response.xpath('//div[@class="info clear"]')
            print('fetching page {}'.format(response.meta['page']))
            cnt = 1
            for node in listings:
                # build a fresh item per listing; mutating one shared
                # instance would overwrite fields of items already yielded
                item = LianjiaItem()
                item['local'] = response.meta['local']
                item['page'] = response.meta['page']
                item['cnt'] = cnt
                item['title'] = node.xpath('.//div[@class="title"]//a/text()').get()
                # print(item['title'])
                item['html'] = node.xpath('.//div[@class="title"]//a/@href').get()
                item['flood'] = node.xpath('.//div[@class="flood"]//a/text()').get()
                # item['address'] = node.xpath('.//div[@class="address"]//div/text()').get()
                strinfo = node.xpath('.//div[@class="address"]//div/text()').get() or ''
                # "type | area | direction | ..." -- pad so short rows don't crash
                parts = [p.strip() for p in strinfo.split('|', 5)]
                parts += [''] * (6 - len(parts))
                item['type'] = parts[0]
                item['area'] = parts[1]
                item['dir'] = parts[2]
                item['mode1'] = parts[3]
                item['mode2'] = parts[4]
                item['mode3'] = parts[5]

                item['price'] = node.xpath('.//div[@class="totalPrice totalPrice2"]//span//text()').get() or ''
                item['price'] += node.xpath('.//div[@class="totalPrice totalPrice2"]//i[2]//text()').get() or ''
                item['unit'] = node.xpath('.//div[@class="unitPrice"]//span//text()').get()
                cnt += 1
                yield item
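
    Hard-coding total = 100 requests pages that may not exist. The more common Scrapy pattern is to read the page count from the response and schedule only real pages; the sketch below would sit at the end of parse(), but the page-data XPath is an assumption about lianjia's paginator markup, not something verified here:

    import json

    # at the end of FangSpider.parse(), after the item loop;
    # assumes a paginator like
    # <div class="page-box house-lst-page-box" page-data='{"totalPage":..,"curPage":..}'>
    page_data = response.xpath(
        '//div[@class="page-box house-lst-page-box"]/@page-data').get()
    if page_data:
        info = json.loads(page_data)
        if info['curPage'] < info['totalPage']:
            next_page = info['curPage'] + 1
            next_url = 'http://wh.lianjia.com/ershoufang/{0}/pg{1}/'.format(
                response.meta['local'], next_page)
            yield scrapy.Request(next_url,
                                 meta={'local': response.meta['local'],
                                       'page': next_page},
                                 callback=self.parse)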
