Routers not working as expected
Hello everyone
First of all, thanks for this project — it looks really good and promising!
I'm considering using Crawlee as an alternative to Scrapy.
I'm trying to use a router to run different processes based on the URL.
But the request is never captured by the handler.
I’d appreciate any insights — am I missing something here?
Here’s my crawl.py:
and here my routes:
First of all, thanks for this project — it looks really good and promising!
I'm considering using Crawlee as an alternative to Scrapy.
I'm trying to use a router to run different processes based on the URL.
But the request is never captured by the handler.
I’d appreciate any insights — am I missing something here?
Here’s my crawl.py:
import asyncio

from crawlee import service_locator
from crawlee.crawlers import AdaptivePlaywrightCrawler

from routes import router


async def main() -> None:
    """Configure storage behaviour, build the adaptive crawler, and start crawling."""
    # Disable on-disk persistence so runs leave no storage artifacts behind.
    configuration = service_locator.get_configuration()
    configuration.persist_storage = False
    configuration.write_metadata = False

    # Static BeautifulSoup parsing with Playwright fallback; all requests are
    # dispatched through the label-based router defined in routes.py.
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        request_handler=router,
        max_requests_per_crawl=5,
    )

    await crawler.run(['https://investor.agenusbio.com/news/default.aspx'])


if __name__ == '__main__':
    asyncio.run(main())
from crawlee.crawlers import AdaptivePlaywrightCrawler
from crawlee import service_locator
from routes import router
async def main() -> None:
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
request_handler=router,
max_requests_per_crawl=5,
)
await crawler.run(['https://investor.agenusbio.com/news/default.aspx'])
if __name__ == '__main__':
asyncio.run(main())and here my routes:
from __future__ import annotations
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router
from crawlee import RequestOptions, RequestTransformAction
# Router shared with crawl.py; the handlers below register themselves on it
# via decorators, keyed by request label.
router = Router[AdaptivePlaywrightCrawlingContext]()
def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    """Label PDF links so they are dispatched to the 'pdf_handler' route.

    Args:
        request_options: Options for a request about to be enqueued; only the
            'url' key is inspected and only 'label' is (possibly) written.

    Returns:
        The same ``request_options`` mapping, with ``label`` set to
        ``'pdf_handler'`` when the URL ends in ``.pdf``.
    """
    # Original version had leftover debug prints and a duplicated
    # `return request_options` in both branches — collapsed to one return.
    if request_options.get('url', '').endswith('.pdf'):
        request_options['label'] = 'pdf_handler'
    return request_options
@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Fallback handler: discover links on the page and enqueue them.

    Outgoing requests pass through ``transform_request`` so that PDF links
    are tagged with the 'pdf_handler' label.
    """
    await context.enqueue_links(transform_request_function=transform_request)
@router.handler(label='pdf_handler')
async def pdf_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Handle requests carrying the 'pdf_handler' label (URLs ending in .pdf)."""
    # Lazy %-style args keep formatting off the hot path.
    context.log.info('Processing PDF: %s', context.request.url)
from __future__ import annotations

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router

# Router shared with crawl.py; handlers register themselves via decorators.
router = Router[AdaptivePlaywrightCrawlingContext]()


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    """Attach the 'pdf_handler' label to requests whose URL ends in .pdf."""
    url = request_options.get('url', '')
    if url.endswith('.pdf'):
        print(f"Request options: {request_options} before")
        request_options['label'] = 'pdf_handler'
        print(f"Request options: {request_options} after")
        return request_options
    return request_options


@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Enqueue links found on the page, labelling PDF links for pdf_handler."""
    await context.enqueue_links(
        transform_request_function=transform_request,
    )


@router.handler(label='pdf_handler')
async def pdf_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Log PDF requests routed here via the 'pdf_handler' label."""
    context.log.info('Processing PDF: %s', context.request.url)