// CURRENT EXPENSIVE APPROACH:
// Enqueues a whole new request, costing 1 Write + 1 Handled per item
/*
await crawler.addRequests([{
url: detailUrl,
label: 'EXTRA_DATA',
userData: { ... }
}]);
*/
// PROPOSED COST-SAVING APPROACH:
// Fetches inline to avoid touching the Request Queue DB
/**
 * ITEM_PAGE handler.
 *
 * Extracts the main item data and, when a detail URL is found, fetches the
 * detail page inline via the context's `sendRequest` instead of enqueueing a
 * second request. The parsed detail payload is stored with the item so the
 * inline fetch is not wasted.
 *
 * How closely this mimics the parent request (per the Crawlee `sendRequest`
 * docs; verify against the installed Crawlee version):
 *  - `sendRequest` defaults to the parent request's method, headers and
 *    payload, the context's proxy URL, and a cookie jar backed by the current
 *    session. Only the options passed here override those defaults.
 *  - It is a plain HTTP call (got-scraping), not a browser navigation, so
 *    anything the detail page renders client-side will not be in `body`.
 *  - If the parent request is not a plain GET, the detail call inherits its
 *    method/payload; override `method` explicitly in that case.
 *
 * A rejected `sendRequest` is not caught here, so it propagates out of the
 * handler and nothing is pushed for this item on that attempt.
 */
router.addHandler('ITEM_PAGE', async ({ proxyInfo, sendRequest, pushData }) => {
    // 1. Scrape the main item data.
    const mainData = extractMainData();
    const detailUrl = extractDetailUrl();

    // No detail page: store the main record as-is.
    if (!detailUrl) {
        await pushData(mainData);
        return;
    }

    // 2. Fetch extra details inline without touching the request queue.
    // `session` is intentionally NOT passed: it is not a documented override
    // option, and the session's cookies are already bound by `sendRequest`.
    const { body } = await sendRequest({
        url: detailUrl,
        // Explicit for clarity; this should match sendRequest's own default.
        proxyUrl: proxyInfo?.url,
    });
    const extraData = extractExtraData(body);

    // 3. Store a single record. The detail payload is nested under `extraData`
    // so it cannot overwrite any key of the main record.
    // NOTE(review): assumes `mainData` is a plain object — confirm.
    await pushData({ ...mainData, extraData });
});
// CURRENT EXPENSIVE APPROACH:
// Enqueues a whole new request, costing 1 Write + 1 Handled per item
/*
await crawler.addRequests([{
url: detailUrl,
label: 'EXTRA_DATA',
userData: { ... }
}]);
*/
// PROPOSED COST-SAVING APPROACH:
// Fetches inline to avoid touching the Request Queue DB
/**
 * ITEM_PAGE handler.
 *
 * NOTE(review): an identical `router.addHandler('ITEM_PAGE', ...)` call
 * appears earlier in this file. Crawlee's Router is expected to throw when a
 * label is registered twice ("Route for label ... is already defined") —
 * confirm and keep only one of the two registrations.
 *
 * Extracts the main item data and, when a detail URL is found, fetches the
 * detail page inline via the context's `sendRequest` instead of enqueueing a
 * second request. The parsed detail payload is stored with the item so the
 * inline fetch is not wasted.
 *
 * How closely this mimics the parent request (per the Crawlee `sendRequest`
 * docs; verify against the installed Crawlee version):
 *  - `sendRequest` defaults to the parent request's method, headers and
 *    payload, the context's proxy URL, and a cookie jar backed by the current
 *    session. Only the options passed here override those defaults.
 *  - It is a plain HTTP call (got-scraping), not a browser navigation, so
 *    anything the detail page renders client-side will not be in `body`.
 *  - If the parent request is not a plain GET, the detail call inherits its
 *    method/payload; override `method` explicitly in that case.
 *
 * A rejected `sendRequest` is not caught here, so it propagates out of the
 * handler and nothing is pushed for this item on that attempt.
 */
router.addHandler('ITEM_PAGE', async ({ proxyInfo, sendRequest, pushData }) => {
    // 1. Scrape the main item data.
    const mainData = extractMainData();
    const detailUrl = extractDetailUrl();

    // No detail page: store the main record as-is.
    if (!detailUrl) {
        await pushData(mainData);
        return;
    }

    // 2. Fetch extra details inline without touching the request queue.
    // `session` is intentionally NOT passed: it is not a documented override
    // option, and the session's cookies are already bound by `sendRequest`.
    const { body } = await sendRequest({
        url: detailUrl,
        // Explicit for clarity; this should match sendRequest's own default.
        proxyUrl: proxyInfo?.url,
    });
    const extraData = extractExtraData(body);

    // 3. Store a single record. The detail payload is nested under `extraData`
    // so it cannot overwrite any key of the main record.
    // NOTE(review): assumes `mainData` is a plain object — confirm.
    await pushData({ ...mainData, extraData });
});