wise-white•2y ago

Async link parsing for faster results

I have previously used enqueueLinks with a selector to add more URLs to the queue, with good results - it's nearly instant on pages with around 100 links. I now need to modify the unique ID so I'm looping through the results on the page - even though the links are already loaded, this is very slow. Is there a way to do this faster, while still getting the additional attributes?

const processResults = async (locator: Locator) => {
  const queue: {
      [key: string]: {
          name: string | null
          address: string | null
      }
  } = {}
  for await (const result of await locator.all()) {
      try {
          const resultLinkLocator = result.locator(`a[aria-label]`)
          const addressShortLocator = result.locator(
              `span[aria-hidden]:has-text("·") + span:not([role="img"])`
          )
          const name = await resultLinkLocator.getAttribute(
              "aria-label",
              {
                  timeout: 5_000,
              }
          )
          log.info(`Result name: ${name}`)
          const address = await addressShortLocator.textContent({
              timeout: 5_000,
          })
          const url = await resultLinkLocator.getAttribute("href", {
              timeout: 5_000,
          })

          if (!url) {
              log.info(`No url found for result ${name}`)
              continue
          }
          queue[url] = {
              name,
              address,
          }
      } catch (e: any) {
          log.info(`Error queueing result. Error: ${e}`)
      }
  }
  return queue
}
const urls = Object.keys(linkQueue)
await enqueueLinks({
    label: "PLACE_DETAIL",
    urls: urls,
    transformRequestFunction: (request) => {
        request.uniqueKey = `${linkQueue[request.url].name}|${
            linkQueue[request.url].address ?? location
        }`
        return request
    },
    strategy: "same-domain",
    userData,
})

const processResults = async (locator: Locator) => {
  const queue: {
      [key: string]: {
          name: string | null
          address: string | null
      }
  } = {}
  for await (const result of await locator.all()) {
      try {
          const resultLinkLocator = result.locator(`a[aria-label]`)
          const addressShortLocator = result.locator(
              `span[aria-hidden]:has-text("·") + span:not([role="img"])`
          )
          const name = await resultLinkLocator.getAttribute(
              "aria-label",
              {
                  timeout: 5_000,
              }
          )
          log.info(`Result name: ${name}`)
          const address = await addressShortLocator.textContent({
              timeout: 5_000,
          })
          const url = await resultLinkLocator.getAttribute("href", {
              timeout: 5_000,
          })

          if (!url) {
              log.info(`No url found for result ${name}`)
              continue
          }
          queue[url] = {
              name,
              address,
          }
      } catch (e: any) {
          log.info(`Error queueing result. Error: ${e}`)
      }
  }
  return queue
}
const urls = Object.keys(linkQueue)
await enqueueLinks({
    label: "PLACE_DETAIL",
    urls: urls,
    transformRequestFunction: (request) => {
        request.uniqueKey = `${linkQueue[request.url].name}|${
            linkQueue[request.url].address ?? location
        }`
        return request
    },
    strategy: "same-domain",
    userData,
})

1 Reply

Lukas Krivka•2y ago

What exactly is slow, do you have timers at specific points? 100 links is nothing. Perf problems only happen with crazy code usually

Async link parsing for faster results

Did you find this page helpful?