```python
parsed_url = urlparse(context.request.url)
path_name = parsed_url.path

results = _get_regex_matches(path_name)
if not results:
    context.log.info(
        f'No match found for URL: {context.request.url} in path: '
        f'{path_name}'
    )
    # TODO: CANCEL REQUEST
```
```python
await request_list.mark_request_as_handled(request)
```

But I don't think I have access to a `request_list`, or anything similar, on the `PlaywrightPreNavCrawlingContext`.
You can filter the links at enqueue time instead, so the unwanted URLs never enter the queue and nothing has to be cancelled later. For example, using the `selector` argument of `enqueue_links`:

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'The title of {context.request.url} ...')
        await context.enqueue_links(selector='a[href*="changelog"], a[href*="quick-start"]')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```
If you don't want to rely on the `enqueue_links` selector, you can extract the links yourself, filter them, and pass them to `add_requests`:
```python
import asyncio

from yarl import URL

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'The title of {context.request.url} ...')

        next_requests = []
        for link in context.parsed_content.select('a'):
            href = link.get('href')
            # Some anchors have no href attribute; guard before the substring checks.
            if not href:
                continue
            if 'changelog' in href or 'quick-start' in href:
                # Resolve relative links against the current page URL.
                url = URL(context.request.url).join(URL(href))
                next_requests.append(str(url))

        await context.add_requests(next_requests)

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```
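Since the question is about `PlaywrightCrawler`, the same selector-based filtering should work there as well — `enqueue_links` on the Playwright context also accepts a `selector` argument. A minimal sketch, assuming a reasonably recent crawlee version:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Only anchors matching the CSS selector are enqueued, so the
        # filtered-out URLs never enter the queue in the first place.
        await context.enqueue_links(selector='a[href*="changelog"], a[href*="quick-start"]')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```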
Another option is to use Playwright's `route` in a pre-navigation hook and fulfill the filtered-out requests with a placeholder response, so you don't have to make a real request:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Pages served from the route handler below contain only the marker,
        # so we can recognize and skip them here.
        page_content = await context.page.content()
        if '||skip||' in page_content:
            context.log.info(f'Skip {context.request.url} ...')
            return

        await context.enqueue_links()

    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        if context.request.url == 'https://crawlee.dev/':
            return
        if 'changelog' not in context.request.url and 'quick-start' not in context.request.url:
            # Answer the navigation with a tiny placeholder body instead of
            # letting the browser hit the network.
            await context.page.route(
                context.request.url,
                lambda route, _: route.fulfill(
                    status=200,
                    body=b'||skip||',
                ),
            )

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```
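If you want to actually cancel the navigation rather than fake a response, `route.abort()` would be the Playwright-level tool for it. A minimal sketch of that variant is below — the caveat (and my assumption about crawlee's behaviour) is that an aborted navigation surfaces as a request error, so it goes through the crawler's error-handling/retry path instead of being quietly skipped, which is why the placeholder-fulfill approach above is usually the nicer fit:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        await context.enqueue_links()

    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        if context.request.url == 'https://crawlee.dev/':
            return
        if 'changelog' not in context.request.url and 'quick-start' not in context.request.url:
            # Abort the navigation outright. Playwright raises a navigation
            # error for the page, which the crawler sees as a failed request.
            await context.page.route(
                context.request.url,
                lambda route, _: route.abort('aborted'),
            )

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```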