Deduplicating requests with a custom RequestQueue, but it is not behaving as expected.

Description

My custom RequestQueue isn't filtering the way I expect. Very few jobs end up being processed, even though I pass in many more job IDs than the queue reports as unique.

import { RequestQueue } from "crawlee";

let jobQueue: RequestQueue;

/**
 * Lazily opens the named deduplication queue on first use.
 * Subsequent calls are no-ops once the queue handle exists.
 */
async function initializeJobQueue(): Promise<void> {
  if (jobQueue) return; // already opened — nothing to do
  jobQueue = await RequestQueue.open("job-deduplication-queue");
}

/**
 * Filters `jobIds` down to the IDs whose job URLs have never been seen by the
 * persistent deduplication queue, then logs how many duplicates were dropped.
 *
 * @param page    Playwright/Puppeteer page used by the downstream fetch logic.
 * @param jobIds  Candidate job IDs scraped from a listing page.
 * @param origin  Site origin used to build each `/viewjob?jk=<id>` URL.
 */
async function fetchJobPages(page: Page, jobIds: string[], origin: string): Promise<void> {
  await initializeJobQueue();

  // Annotated explicitly so the array is `string[]` under strict mode rather
  // than relying on evolving-array inference.
  const filteredJobIds: string[] = [];
  if (saveOnlyUniqueItems) {
    for (const jobId of jobIds) {
      const jobUrl = `${origin}/viewjob?jk=${jobId}`;
      // addRequest() dedupes on the request's uniqueKey, which Crawlee derives
      // from the (normalized) URL; `wasAlreadyPresent` is true when a request
      // with the same uniqueKey already exists in the queue.
      //
      // NOTE(review): "job-deduplication-queue" is a *named* queue, so it is
      // NOT purged between runs. Every job added in any previous run reports
      // wasAlreadyPresent === true forever — which would explain "very few
      // jobs are being processed". If you only want per-run deduplication,
      // call `await jobQueue.drop()` at startup (then re-open), or use an
      // unnamed queue, which is purged automatically.
      const { wasAlreadyPresent } = await jobQueue.addRequest({ url: jobUrl });
      if (!wasAlreadyPresent) filteredJobIds.push(jobId);
    }
  } else {
    // Deduplication disabled: process every incoming ID.
    filteredJobIds.push(...jobIds);
  }

  myLog(
    `Filtered ${jobIds.length - filteredJobIds.length} duplicates, ` +
    `processing ${filteredJobIds.length} unique jobs.`
  );

  // fetchJobWithRetry and batching logic follows...
}


Am I using the RequestQueue correctly? I am not using the crawler's default queue because my scraping logic does not allow it.
Was this page helpful?