yummy-tealY
Apify & Crawlee3y ago
2 replies
yummy-teal

Why does my Scrapy crawler get a different amount of data every time it runs?

There are 20 pages of data on the website, with 14 items on each page — 280 in total. I have run the crawler several times and get a different, inconsistent amount of data each time, yet there are no errors in the log.
To get the multi-page data and the detail page data, the code is as follows
class GzDfjrjdSpider(scrapy.Spider):
    """Spider for the Guangzhou Local Financial Supervision Bureau notice list.

    Crawls the paginated announcement index (20 pages known at time of
    writing), follows each announcement link, and yields one
    ``GzStBureauItem`` per detail page.
    """

    name = 'gz_dfjrjd'
    allowed_domains = ['jrjgj.gz.gov.cn']
    start_urls = ['http://jrjgj.gz.gov.cn/tzgg/index.html']
    base_url = 'http://jrjgj.gz.gov.cn/tzgg/index'
    page = 1
    # Known total number of index pages; without a bound the spider kept
    # requesting non-existent pages forever, which is one source of the
    # run-to-run inconsistency the author reported.
    max_pages = 20
    # Counter of list entries seen. BUG FIX: this was read via
    # `self.ww = self.ww + 1` without ever being initialized, raising
    # AttributeError on the very first item.
    ww = 0

    def parse(self, response):
        """Parse one index page: yield a detail request per list entry,
        then schedule the next index page (up to ``max_pages``)."""
        li_list = response.xpath('//div[@id="main"]/ul/li')
        for li in li_list:
            self.ww += 1
            url = li.xpath('./a/@href').extract_first()
            name = li.xpath('./a/@title').extract_first()
            time_str = li.xpath('./span/text()').extract_first()
            # .strip(): strptime('%Y-%m-%d') fails on surrounding whitespace,
            # which gov.cn list pages frequently include.
            date = datetime.datetime.strptime(time_str.strip(), '%Y-%m-%d')
            # urljoin: hrefs on the list page may be relative; a raw relative
            # URL makes scrapy.Request fail silently in some configurations.
            yield scrapy.Request(url=response.urljoin(url), callback=self.parse_second,
                                 meta={'title': name, 'time': date,
                                       'url': response.urljoin(url),
                                       'download_timeout': 30})
        if self.page < self.max_pages:
            self.page += 1
            # BUG FIX: original read `self.baseurl`, but the attribute is
            # `base_url` — the AttributeError broke pagination.
            # NOTE(review): many gov.cn sites paginate as `index_2.html`
            # (underscore separator) rather than `index2.html` — confirm the
            # actual URL pattern against the live site.
            next_url = self.base_url + '' + str(self.page) + '.html'
            yield scrapy.Request(url=next_url, callback=self.parse,
                                 meta={'download_timeout': 20})

    def parse_second(self, response):
        """Parse one announcement detail page into a GzStBureauItem."""
        content = response.xpath('//div[@class="info_cont"]').getall()
        policy_file_id = uuid.uuid1()
        create_time = datetime.datetime.now()
        policy = GzStBureauItem(policy_file_id=policy_file_id, title=response.meta['title'],
                                goverment='gz_ed', area='gzp',
                                date=response.meta['time'], content="".join(content),
                                url=response.meta['url'], create_time=create_time)
        yield policy


-----------------------------------------------------------------------
Was this page helpful?