Crawl sitemap
Hi guys i am trying to extract links from sitemap it works fine when i call for first time but if i call it again with same sitemap url it not extract links from sitemap again what is the problem ? this is my code
async crawlSitemap({
url,
containerId,
channelId,
}: {
url: string;
containerId: string;
channelId: string;
}): Promise<Result<ICrawlerInfo>> {
const infos: ICrawlerExtraxtedInfo[] = [];
const crawler = new PlaywrightCrawler({
async requestHandler({ request, page, log }) {
const title = await page.title();
log.info(`Title of ${request.loadedUrl} is '${title}'`);
infos.push({ url: request.loadedUrl, title: title });
},
// TODO: make this configurable
maxRequestsPerCrawl: 50,
});
const result = await this.crawlerInfoService.createCrawlerInfo({
containerId,
channelId,
url,
});
const { urls } = await Sitemap.load(url);
await crawler.addRequests(urls);
crawler.run().then(() => {
this.logger.log(`Crawler finished ${url} with ${infos.length} links`);
try {
result.extractedInfo = infos;
this.crawlerInfoService.updateCrawlerInfo(result.id, {
status: CrawlerInfoStatus.COMPLETED,
extractedInfo: infos,
});
} catch (error) {
this.logger.error(`${error.message} ${error.stack}`);
this.crawlerInfoService.updateCrawlerInfo(result.id, {
status: CrawlerInfoStatus.FAILED,
});
}
});
return Ok(result);
}
async crawlSitemap({
url,
containerId,
channelId,
}: {
url: string;
containerId: string;
channelId: string;
}): Promise<Result<ICrawlerInfo>> {
const infos: ICrawlerExtraxtedInfo[] = [];
const crawler = new PlaywrightCrawler({
async requestHandler({ request, page, log }) {
const title = await page.title();
log.info(`Title of ${request.loadedUrl} is '${title}'`);
infos.push({ url: request.loadedUrl, title: title });
},
// TODO: make this configurable
maxRequestsPerCrawl: 50,
});
const result = await this.crawlerInfoService.createCrawlerInfo({
containerId,
channelId,
url,
});
const { urls } = await Sitemap.load(url);
await crawler.addRequests(urls);
crawler.run().then(() => {
this.logger.log(`Crawler finished ${url} with ${infos.length} links`);
try {
result.extractedInfo = infos;
this.crawlerInfoService.updateCrawlerInfo(result.id, {
status: CrawlerInfoStatus.COMPLETED,
extractedInfo: infos,
});
} catch (error) {
this.logger.error(`${error.message} ${error.stack}`);
this.crawlerInfoService.updateCrawlerInfo(result.id, {
status: CrawlerInfoStatus.FAILED,
});
}
});
return Ok(result);
}
Solution:Jump to solution
nvm i add
and pass it to PlaywrightCrawler fix...
const config = new Configuration({ persistStorage: false });
const config = new Configuration({ persistStorage: false });
1 Reply
Solution
nvm i add
and pass it to PlaywrightCrawler fix
const config = new Configuration({ persistStorage: false });
const config = new Configuration({ persistStorage: false });