Routers not working as expected
Hello everyone
First of all, thanks for this project — it looks really good and promising!
I'm considering using Crawlee as an alternative to Scrapy.
I'm trying to use a router to run different processes based on the URL.
But the request is never captured by the handler.
I’d appreciate any insights — am I missing something here?
Here’s my crawl.py:
and here my routes:
First of all, thanks for this project — it looks really good and promising!
I'm considering using Crawlee as an alternative to Scrapy.
I'm trying to use a router to run different processes based on the URL.
But the request is never captured by the handler.
I’d appreciate any insights — am I missing something here?
Here’s my crawl.py:
import asyncio

from crawlee import service_locator
from crawlee.crawlers import AdaptivePlaywrightCrawler

from routes import router


async def main() -> None:
    """Configure storage behaviour, build the adaptive crawler, and start crawling."""
    # Disable on-disk persistence so runs leave no storage artifacts behind.
    configuration = service_locator.get_configuration()
    configuration.persist_storage = False
    configuration.write_metadata = False

    # Static BeautifulSoup parsing with Playwright fallback; all requests are
    # dispatched through the label-based router defined in routes.py.
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        request_handler=router,
        max_requests_per_crawl=5,
    )

    await crawler.run(['https://investor.agenusbio.com/news/default.aspx'])


if __name__ == '__main__':
    asyncio.run(main())
from crawlee.crawlers import AdaptivePlaywrightCrawler
from crawlee import service_locator
from routes import router
async def main() -> None:
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
request_handler=router,
max_requests_per_crawl=5,
)
await crawler.run(['https://investor.agenusbio.com/news/default.aspx'])
if __name__ == '__main__':
asyncio.run(main())and here my routes:
from __future__ import annotations
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router
from crawlee import RequestOptions, RequestTransformAction
# Router shared with crawl.py; the handlers below register themselves on it
# via decorators, keyed by request label.
router = Router[AdaptivePlaywrightCrawlingContext]()
def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    """Label PDF links so they are dispatched to the 'pdf_handler' route.

    Args:
        request_options: Options for a request about to be enqueued; only the
            'url' key is inspected and only 'label' is (possibly) written.

    Returns:
        The same ``request_options`` mapping, with ``label`` set to
        ``'pdf_handler'`` when the URL ends in ``.pdf``.
    """
    # Original version had leftover debug prints and a duplicated
    # `return request_options` in both branches — collapsed to one return.
    if request_options.get('url', '').endswith('.pdf'):
        request_options['label'] = 'pdf_handler'
    return request_options
@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Fallback handler: discover links on the page and enqueue them.

    Outgoing requests pass through ``transform_request`` so that PDF links
    are tagged with the 'pdf_handler' label.
    """
    await context.enqueue_links(transform_request_function=transform_request)
@router.handler(label='pdf_handler')
async def pdf_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Handle requests carrying the 'pdf_handler' label (URLs ending in .pdf)."""
    # Lazy %-style args keep formatting off the hot path.
    context.log.info('Processing PDF: %s', context.request.url)
from __future__ import annotations

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router

# Router shared with crawl.py; handlers register themselves via decorators.
router = Router[AdaptivePlaywrightCrawlingContext]()


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    """Attach the 'pdf_handler' label to requests whose URL ends in .pdf."""
    url = request_options.get('url', '')
    if url.endswith('.pdf'):
        print(f"Request options: {request_options} before")
        request_options['label'] = 'pdf_handler'
        print(f"Request options: {request_options} after")
        return request_options
    return request_options


@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Enqueue links found on the page, labelling PDF links for pdf_handler."""
    await context.enqueue_links(
        transform_request_function=transform_request,
    )


@router.handler(label='pdf_handler')
async def pdf_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Log PDF requests routed here via the 'pdf_handler' label."""
    context.log.info('Processing PDF: %s', context.request.url)