What do you think of this approach?
import { PuppeteerCrawler, ProxyConfiguration, Dataset } from 'crawlee';
import * as cheerio from 'cheerio';
// Crawls pages, preferring a cheap plain-HTTP request (skipNavigation) and
// falling back to a full Puppeteer browser navigation when that fails.
const crawler = new PuppeteerCrawler({
  async requestHandler({ request, sendRequest, parseWithCheerio }) {
    if (request.skipNavigation) {
      // Fast path: plain HTTP request, no browser overhead.
      const { statusCode, body } = await sendRequest();
      if (statusCode === 200) {
        const $ = cheerio.load(body);
        const title = $('h1').text();
        // Await so persistence errors surface in this handler.
        await Dataset.pushData({ title, url: request.url });
      } else {
        // Fallback: re-enqueue the same URL for a full browser visit.
        // The re-added request has no skipNavigation flag, so it takes the
        // browser path below. useExtendedUniqueKey prevents the queue from
        // deduplicating it against the original request.
        // Maybe there is a keepDuplicateUrls option 🤔
        await crawler.addRequests([{ url: request.url, useExtendedUniqueKey: true }]);
      }
    } else {
      // Browser path: Puppeteer has rendered the page; parse its HTML
      // with Cheerio via the crawling-context helper.
      const $ = await parseWithCheerio();
      const title = $('h1').text();
      await Dataset.pushData({ title, url: request.url });
    }
  },
});

await crawler.run([{ url: 'https://nowsecure.nl', skipNavigation: true }]);