Forums

How do I use Scrapy to crawl data?

I have already crawled some data from one page with Scrapy, but the rest of the data is on another page, and both pages are about the same topic. How do I combine the data from the two pages?

Conrad! Your QQ was not in my QQ list?

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request 
from hbzc.items import HbzcItem


class HbzcsSpider(scrapy.Spider):
    """Spider that merges data from two kinds of announcement pages.

    Crawl flow:

    1. ``parse`` reads the pre-award announcement list (``start_urls``) and
       requests every detail page it links to.
    2. ``parse_item`` fills the first four item fields from one detail page,
       then requests the bidding-announcement list, carrying the partially
       filled item along in ``meta``.
    3. ``parse_item0`` requests every bidding-announcement detail page in
       that list, each with its own copy of the item.
    4. ``parse_item1`` adds the remaining two fields and emits the item.

    NOTE(review): this pairs every pre-award item with every bidding
    announcement in the list, exactly as the original control flow did.
    If the two pages should instead be matched by a shared id, that
    matching still has to be added -- confirm the intended pairing.
    """

    name = "hbzcs"
    city = '?citycode=130000000-130700000-130728000&cityname=河北省张家口市怀安县'
    allowed_domains = ["www.ccgp-hebei.gov.cn"]
    start_urls = (
        # Pre-award announcement list.
        'http://www.ccgp-hebei.gov.cn/zfcg/web/getPreWinAnncList_1.html' + city,
    )

    @staticmethod
    def _detail_urls(response, table_id, prefix):
        """Build detail-page URLs from the ``onclick`` values of a list table.

        For each ``tr`` of the element with id ``table_id``, the text between
        the first pair of single quotes in its ``onclick`` attribute is
        inserted between ``prefix`` and ``.html``.

        Rows whose ``onclick`` contains no quoted part are skipped instead
        of raising ``IndexError``.
        """
        handlers = response.xpath(
            '//*[@id="%s"]/tr/@onclick' % table_id).extract()
        urls = []
        for handler in handlers:
            pieces = handler.split("'")
            if len(pieces) < 2:
                continue
            urls.append(prefix + pieces[1] + '.html')
        return urls

    def parse(self, response):
        """Request every pre-award announcement detail page in the list."""
        detail_urls = self._detail_urls(
            response,
            'moreprewinannctable',
            'http://www.ccgp-hebei.gov.cn/zfcg/preBidingAnncDetail_',
        )
        for preurl in detail_urls:
            yield Request(preurl, callback=self.parse_item)

    def parse_item0(self, response):
        """Request every bidding-announcement detail page in the list.

        Expects the partially filled item under ``response.meta['item']``.
        """
        item = response.meta['item']
        detail_urls = self._detail_urls(
            response,
            'moredingannctable',
            'http://www.ccgp-hebei.gov.cn/zfcg/1/bidingAnncDetail_',
        )
        for dingurl in detail_urls:
            yield Request(
                url=dingurl,
                callback=self.parse_item1,
                # Give each request its own copy: sharing one mutable item
                # would let later callbacks overwrite earlier results.
                meta={'item': item.copy()},
                # The same detail URLs are requested once per pre-award
                # item; without this the duplicate filter keeps only the
                # first round.
                dont_filter=True,
            )

    def parse_item(self, response):
        """Fill the first four fields, then go on to the bidding list."""
        sel = response.xpath
        item = HbzcItem()
        item['fs'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[1]/td/table/tr[1]/td[4]/text()').extract()
        item['dl'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[1]/td/table/tr[3]/td[6]/text()').extract()
        item['zb'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[7]/td/span/span[1]/text()').extract()
        item['mc'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[1]/td/table/tr[2]/td[2]/text()').extract()
        # Every detail page asks for this same list URL.  Scrapy drops
        # repeated requests by default, so without dont_filter=True only
        # the first item would ever reach parse_item0.
        return Request(
            'http://www.ccgp-hebei.gov.cn/zfcg/web/getBidingList_1.html?citycode=130000000-130700000-130728000',
            callback=self.parse_item0,
            meta={'item': item},
            dont_filter=True,
        )

    def parse_item1(self, response):
        """Add the last two fields from the detail page and emit the item."""
        sel = response.xpath
        item = response.meta['item']
        item['ys'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[7]/td/span/span[8]/text()').extract()
        item['kb'] = sel('//*[@id="bidopentime2"]/text()').extract()
        # Parenthesised form prints the same on Python 2 and also parses
        # on Python 3.
        print(item)
        return item

I guess something is wrong with this line, but I can't debug it:

return Request('http://www.ccgp-hebei.gov.cn/zfcg/web/getBidingList_1.html?citycode=130000000-130700000-130728000',callback=self.parse_item0,meta={'item':item})

Hi there, free users are restricted to a whitelist of sites, so you'd have to upgrade to a paying account for that script to work...

There's more info on the whitelist here. We sometimes do add sites to the whitelist, if they have an official public API...