foreign-sapphire•3y ago
Reset Crawler
Is it possible to completely reset the crawler when I get multiple 403 errors?
I tried it in failedRequestHandler and errorHandler but with no success :/
Thanks in advance 🙂
2 Replies
continuing-cyan•3y ago
import { CheerioCrawler, RequestQueue } from 'crawlee';
import type { RequestOptions } from 'crawlee';

// Seed requests the crawler starts from — and is re-seeded with after a restart.
const startRequests: RequestOptions[] = [
    {
        url: 'https://google.com',
    },
    {
        url: 'https://crawlee.dev',
    },
    {
        url: 'https://yahoo.com',
    },
    {
        // labelled so the requestHandler can simulate a 403 for this request
        url: 'https://apify.com',
        label: 'apify',
    },
];

const crawler = new CheerioCrawler({
    requestHandler: ({ log, request }) => {
        // if it's going to apify.com, throw an error to simulate a 403 response
        if (request.label === 'apify') throw new Error('403');
        // otherwise, just log something
        log.info(`${request.url} handled`);
    },
    /**
     * Counts 403 errors across requests via the crawler's persisted state and,
     * once 2 (or more) have been seen, tears the crawler down, swaps in a fresh
     * request queue, and re-runs it from the original start requests.
     *
     * NOTE(review): `crawler.run()` is awaited from inside the errorHandler of
     * the run being torn down; if 403s keep occurring this restarts (and nests)
     * indefinitely — consider capping the number of restarts in real code.
     */
    errorHandler: async ({ crawler, log }, error) => {
        // store the number of 403 errors hit in the crawler's state
        const state = await crawler.useState<{ count403: number }>({ count403: 0 });
        // if the error was a 403 error, increment the error count
        if (error.message.includes('403')) state.count403++;
        // if the total number of 403s hit is 2 (or greater),
        if (state.count403 >= 2) {
            log.warning('Restarting crawler.');
            // reset the count so the next cycle starts fresh
            state.count403 = 0;
            // stop the crawler
            await crawler.teardown();
            // drop the existing request queue so requests can be handled again
            await crawler.requestQueue?.drop();
            // create a fresh request queue under a unique, timestamp-based name.
            // (The previous Math.round(Math.random() * 100) scheme had only 101
            // possible names, so a restart could reopen a stale queue from an
            // earlier restart and silently skip already-handled requests.)
            crawler.requestQueue = await RequestQueue.open(`restart-${Date.now()}`);
            // re-run the crawler from the original start requests
            await crawler.run(startRequests);
        }
    },
});

await crawler.run(startRequests);
import { CheerioCrawler, RequestQueue } from 'crawlee';
import type { RequestOptions } from 'crawlee';

// Initial batch of requests; also used to seed the crawler again on restart.
const startRequests: RequestOptions[] = [
    { url: 'https://google.com' },
    { url: 'https://crawlee.dev' },
    { url: 'https://yahoo.com' },
    { url: 'https://apify.com', label: 'apify' },
];

const crawler = new CheerioCrawler({
    requestHandler: ({ log, request }) => {
        // anything not labelled 'apify' is considered handled successfully
        if (request.label !== 'apify') {
            log.info(`${request.url} handled`);
            return;
        }
        // simulate a 403 response for the apify.com request
        throw new Error('403');
    },
    // Tracks 403 errors in persisted crawler state; after the second one,
    // tears the crawler down, swaps in a brand-new request queue, and
    // re-runs it from the original start requests.
    errorHandler: async ({ crawler, log }, error) => {
        const state = await crawler.useState<{ count403: number }>({ count403: 0 });

        const sawForbidden = error.message.includes('403');
        if (sawForbidden) state.count403 += 1;

        // nothing to do until at least two 403s have accumulated
        if (state.count403 < 2) return;

        log.warning('Restarting crawler.');

        // clear the counter before the next crawl cycle begins
        state.count403 = 0;

        // shut the current crawl down and discard its queue so the same
        // requests are eligible to be processed again
        await crawler.teardown();
        await crawler.requestQueue?.drop();

        // open a replacement queue under a randomly chosen name
        const queueName = Math.round(Math.random() * 100).toString();
        crawler.requestQueue = await RequestQueue.open(queueName);

        // kick the crawl off again from the top
        await crawler.run(startRequests);
    },
});

await crawler.run(startRequests);
foreign-sapphireOP•3y ago
oh thank you so much !