molecular-blue · 11mo ago

How to store data in the same dict from different URLs?

I have a list of results, and I enqueue the link for each item. For each item, I need to crawl its internal pages (tabs), extract the data from the tables there, and add it all to the same dict. With a router and enqueue_links I can extract the data from all the pages, but I am not able to gather all the data for one item into a single dict. What is the best way to do this?
5 Replies
Hall · 11mo ago
This post has been pushed to the community knowledgebase. Any replies in this thread will be synced to the community site.
molecular-blue (OP) · 11mo ago
@router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')
    # Wait for the results list, then enqueue each item and the pagination links.
    await context.page.wait_for_selector('li.resultado-busqueda > a')
    await context.enqueue_links(
        selector='li.resultado-busqueda > a',
        label='ITEM',
    )
    next_button = await context.page.query_selector('.paginar2 > ul > li > a')
    if next_button:
        await context.enqueue_links(
            selector='.paginar2 > ul > li > a',
            label='RESULTS',
        )


@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    # Enqueue every tab of the item detail page.
    await context.page.wait_for_selector('#tabs > ul > li > a')
    await context.enqueue_links(
        selector='#tabs > ul > li > a',
        label='TAB',
    )


@router.handler('TAB')
async def tab_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')
    # Extract key/value pairs from every table on the tab.
    tables = await context.page.query_selector_all('table')
    data = {}
    for table in tables:
        rows = await table.query_selector_all('tr')
        for row in rows:
            key = await row.query_selector('th')
            value = await row.query_selector('td')
            if key and value:
                key_text = await key.text_content()
                value_text = await value.text_content()
                data[key_text.strip()] = value_text.strip()
    context.log.info(data)
    # This pushes one dict per tab, not one per item - the problem described above.
    await context.push_data(data)
molecular-blue (OP) · 11mo ago
Hi @Mantisus, thank you for your response. I checked the thread, but I'm still trying to figure out how to gather all the data for the same item into the same dataset record. This is my attempt:

@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    # Find all the links in the navlist
    await context.page.wait_for_selector('#tabs > ul > li > a')
    label = await context.page.locator('#idBloqueDatos1 > table > tbody > tr > td').text_content()
    tabs = await context.page.query_selector_all('#tabs > ul > li > a')
    for tab in tabs:
        url = await tab.get_attribute('href')
        tab_name = await tab.text_content()
        await context.add_requests([
            Request.from_url(
                url=url,
                user_data={'label': label, 'tab_name': tab_name}),
        ])
Mantisus · 11mo ago
Yeah, I can see that won't work for you. I would use some sort of external storage, like a global dictionary or some kind of class-based storage, where your results get aggregated. Here's a simple code sample that implements this:
import json

from crawlee import Request
from crawlee.crawlers import HttpCrawlingContext  # import path may differ in older crawlee versions
from crawlee.router import Router

router = Router[HttpCrawlingContext]()

# Global in-memory store, keyed by item URL, where partial results are aggregated.
storage = {}


@router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')

    url = "https://httpbin.org/get?a=item"

    storage[url] = {}

    # Carry the item URL in user_data so every later request knows which item it belongs to.
    await context.add_requests([
        Request.from_url(
            url=url,
            label='ITEM',
            user_data={"item_url": url}),
    ])


@router.handler('ITEM')
async def item_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    url = context.request.user_data["item_url"]
    tabs = [f"https://httpbin.org/get?tab{i}={i}" for i in range(11)]
    # Remember how many tabs we expect so we know when the item is complete.
    storage[url]["all_tabs"] = len(tabs)
    storage[url]["processed_tabs"] = 0
    requests = [Request.from_url(
        url=tab,
        label='TAB',
        user_data={"item_url": url}) for tab in tabs]
    await context.add_requests(requests)


@router.handler('TAB')
async def tab_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')

    url = context.request.user_data["item_url"]

    data = json.loads(context.http_response.read())

    # Merge this tab's fields into the item's shared dict.
    for key, value in data["args"].items():
        storage[url][key] = value

    # When the last tab has been processed, push the aggregated item and clean up.
    storage[url]["processed_tabs"] += 1
    if storage[url]["processed_tabs"] == storage[url]["all_tabs"]:
        del storage[url]["processed_tabs"]
        del storage[url]["all_tabs"]
        await context.push_data(storage[url])
        del storage[url]
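One note on the counter: the handlers run on a single asyncio event loop and there is no await between the increment and the completeness check, so processed_tabs can't race even when tabs are crawled concurrently. To try the example, you'd wire the router into a crawler; a minimal sketch, assuming a recent crawlee for Python (import paths may differ between versions):

import asyncio

from crawlee.crawlers import HttpCrawler  # older versions: crawlee.http_crawler


async def main() -> None:
    crawler = HttpCrawler(request_handler=router)
    # The start URL is just a placeholder; default_handler enqueues the item itself.
    await crawler.run(['https://httpbin.org/get?start=1'])


if __name__ == '__main__':
    asyncio.run(main())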
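And if you'd rather not keep a module-level global, the same idea works as a small class; a sketch (ItemAggregator is just an illustrative name, not a crawlee API):

class ItemAggregator:
    """Collects tab data per item URL and signals when an item is complete."""

    def __init__(self) -> None:
        self._data: dict[str, dict] = {}
        self._expected: dict[str, int] = {}
        self._done: dict[str, int] = {}

    def start(self, url: str, expected_tabs: int) -> None:
        # Called from the ITEM handler once the number of tabs is known.
        self._data[url] = {}
        self._expected[url] = expected_tabs
        self._done[url] = 0

    def add(self, url: str, fields: dict) -> dict | None:
        # Called from the TAB handler; returns the full item dict on the last tab, else None.
        self._data[url].update(fields)
        self._done[url] += 1
        if self._done[url] == self._expected[url]:
            del self._expected[url], self._done[url]
            return self._data.pop(url)
        return None

In tab_handler you would then do: if (item := aggregator.add(url, data["args"])) is not None: await context.push_data(item)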
