2 replies

Crawl sitemap

Hi guys i am trying to extract links from sitemap it works fine when i call for first time but if i call it again with same sitemap url it not extract links from sitemap again what is the problem ? this is my code

 async crawlSitemap({
    url,
    containerId,
    channelId,
  }: {
    url: string;
    containerId: string;
    channelId: string;
  }): Promise<Result<ICrawlerInfo>> {
    const infos: ICrawlerExtraxtedInfo[] = [];
    const crawler = new PlaywrightCrawler({
      async requestHandler({ request, page, log }) {
        const title = await page.title();
        log.info(`Title of ${request.loadedUrl} is '${title}'`);

        infos.push({ url: request.loadedUrl, title: title });
      },
      // TODO: make this configurable
      maxRequestsPerCrawl: 50,
    });

    const result = await this.crawlerInfoService.createCrawlerInfo({
      containerId,
      channelId,
      url,
    });
    const { urls } = await Sitemap.load(url);
    await crawler.addRequests(urls);

    crawler.run().then(() => {
      this.logger.log(`Crawler finished ${url} with ${infos.length} links`);
      try {
        result.extractedInfo = infos;
        this.crawlerInfoService.updateCrawlerInfo(result.id, {
          status: CrawlerInfoStatus.COMPLETED,
          extractedInfo: infos,
        });
      } catch (error) {
        this.logger.error(`${error.message} ${error.stack}`);
        this.crawlerInfoService.updateCrawlerInfo(result.id, {
          status: CrawlerInfoStatus.FAILED,
        });
      }
    });
    return Ok(result);
  }

 async crawlSitemap({
    url,
    containerId,
    channelId,
  }: {
    url: string;
    containerId: string;
    channelId: string;
  }): Promise<Result<ICrawlerInfo>> {
    const infos: ICrawlerExtraxtedInfo[] = [];
    const crawler = new PlaywrightCrawler({
      async requestHandler({ request, page, log }) {
        const title = await page.title();
        log.info(`Title of ${request.loadedUrl} is '${title}'`);

        infos.push({ url: request.loadedUrl, title: title });
      },
      // TODO: make this configurable
      maxRequestsPerCrawl: 50,
    });

    const result = await this.crawlerInfoService.createCrawlerInfo({
      containerId,
      channelId,
      url,
    });
    const { urls } = await Sitemap.load(url);
    await crawler.addRequests(urls);

    crawler.run().then(() => {
      this.logger.log(`Crawler finished ${url} with ${infos.length} links`);
      try {
        result.extractedInfo = infos;
        this.crawlerInfoService.updateCrawlerInfo(result.id, {
          status: CrawlerInfoStatus.COMPLETED,
          extractedInfo: infos,
        });
      } catch (error) {
        this.logger.error(`${error.message} ${error.stack}`);
        this.crawlerInfoService.updateCrawlerInfo(result.id, {
          status: CrawlerInfoStatus.FAILED,
        });
      }
    });
    return Ok(result);
  }

Solution

nvm i add

const config = new Configuration({ persistStorage: false });

const config = new Configuration({ persistStorage: false });

and pass it to PlaywrightCrawler fix

Jump to solution

Crawl sitemap

Similar Threads