Apify & Crawlee
rubber-blue • 2y ago • 2 replies

How can I wait with processing further logic until all requests from a batch are processed?

Hi

I have this code:
  async processBatch(batch) {
    // requests: { url: string; userData: CrawlerUserData }[]
    const requests = this.generateRequests(batch)
    await this.crawler.addRequests(requests)

    return this.processResults(requests)
  }
...
  async processResults(requests){
    ...
    for (const request of requests) {
      const userData = request.userData as CrawlerUserData
      if (userData.error) {
        this.statistics.incrementErrors()
        continue
      }

      if (userData.results) {
        ...
        await this.saveResults(userData)
      }
    }

    return batchResults
  }
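For reference, the ordering I am after can be sketched like this. This is only an illustration: MockCrawler and its inline handler are stand-ins for PlaywrightCrawler and my route handler (so the control flow runs on its own); the key point is that run() resolves only after every request has been handled, so the results are there before any processResults-style logic reads them.

```typescript
interface CrawlerUserData {
  error?: string
  results?: string[]
}

interface BatchRequest {
  url: string
  userData: CrawlerUserData
}

// Stand-in for PlaywrightCrawler: run() resolves only once every
// request has gone through the handler (i.e. the queue is drained).
class MockCrawler {
  constructor(private handler: (req: BatchRequest) => Promise<void>) {}

  async run(requests: BatchRequest[]): Promise<void> {
    for (const req of requests) await this.handler(req)
  }
}

async function processBatch(batch: string[]): Promise<number> {
  const requests: BatchRequest[] = batch.map((url) => ({ url, userData: {} }))

  const crawler = new MockCrawler(async (req) => {
    // The route handler writes its output onto the request's userData.
    req.userData.results = [`data from ${req.url}`]
  })

  // Awaiting run() guarantees every userData.results exists before
  // the results are inspected below.
  await crawler.run(requests)

  return requests.filter((r) => r.userData.results !== undefined).length
}
```

With the real crawler, I believe the analogous move is to await a full crawler run per batch rather than only addRequests (which merely enqueues), but I have not verified that against my setup.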


and this is my route handler:

import { createPlaywrightRouter } from 'crawlee'

export const router = createPlaywrightRouter()

router.addDefaultHandler(async ({ page, request, log }) => {
  const userData = request.userData as CrawlerUserData
  try {
    await page.waitForLoadState('networkidle', { timeout: 5000 })

    const analyzer = new AlertsProximityAnalyzer(userData, callbackCheckingIfDataExist)

    await analyzer.analyze(page) // executing callback

    userData.results = analyzer.results
    // Do I need to save the results here?
  } catch (error) {
    ...
  } finally {
    // Instead of closing the page, reset it for the next use
    await page.evaluate(() => window.stop())
    await page.setContent('<html></html>')
  }
})


The problem is that the crawling only starts once the whole code in processBatch is done, i.e. all batches are added to the request queue and processResults has already executed (with no data, since userData.results has not been created yet). So what I want to know is: do I need to move my logic for saving results to the DB into the route handler, or is there some way to pause this function, let the route handler run, and then resume processResults?
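The first option (saving from inside the route handler) can be sketched like this. Everything here is a hypothetical stand-in: store plays the role of my DB client and handleRequest the role of my handler body; the point is just that nothing would depend on processResults running afterwards.

```typescript
interface CrawlerUserData {
  error?: string
  results?: string[]
}

// Hypothetical stand-in for the DB (or dataset) the results go into.
const store: { url: string; results: string[] }[] = []

// What the route handler body would do once the analyzer has filled
// userData.results: persist immediately, per page.
async function handleRequest(url: string, userData: CrawlerUserData): Promise<void> {
  userData.results = [`data from ${url}`] // produced by the analyzer
  if (userData.results) {
    store.push({ url, results: userData.results })
  }
}

// A later processResults pass would then only aggregate what was already saved.
function countSaved(): number {
  return store.length
}
```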

In a response below I will paste a pseudo-algorithm of what I expect.
Similar Threads

Replicate XHR requests to wait for cheerio page to load further · military-pink / crawlee-js · 4y ago
How to find the end of all request handlers? · brilliant-lime / crawlee-js · 3y ago
How to tell PlaywrightCrawler to wait · full-green / crawlee-js · 4y ago
How can I configure PuppeteerCrawler to not save request information to disk? · worthy-azure / crawlee-js · 3y ago