molecular-blue•11mo ago
How to store data in the same dict from different URLs?
I have a list of results where I enqueue a link for each item. For each item, I need to crawl its internal pages (tabs), extract the data from the tables, and add it all to the same dict. I can extract the data from all the pages with the router and enqueue_links, but I am not able to gather all the data into a single dict per item. What is the best way to do it?
5 Replies
molecular-blueOP•11mo ago
@router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')
    await context.page.wait_for_selector('li.resultado-busqueda > a')
    await context.enqueue_links(
        selector='li.resultado-busqueda > a',
        label='ITEM',
    )
    next_button = await context.page.query_selector('.paginar2 > ul > li > a')
    if next_button:
        await context.enqueue_links(
            selector='.paginar2 > ul > li > a',
            label='RESULTS',
        )
@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    await context.page.wait_for_selector('#tabs > ul > li > a')
    await context.enqueue_links(
        selector='#tabs > ul > li > a',
        label='TAB',
    )
@router.handler('TAB')
async def tab_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')
    tables = await context.page.query_selector_all('table')
    data = {}
    for table in tables:
        rows = await table.query_selector_all('tr')
        for row in rows:
            key = await row.query_selector('th')
            value = await row.query_selector('td')
            if key and value:
                key_text = await key.text_content()
                value_text = await value.text_content()
                data[key_text.strip()] = value_text.strip()
    context.log.info(data)
    await context.push_data(data)
Hi @Kike, perhaps this will be useful for you.
https://discord.com/channels/801163717915574323/1285637563675050005/1285659407149174805
molecular-blueOP•11mo ago
Hi @Mantisus, thank you for your response. I checked the thread, but I'm still trying to figure out how to gather all the data for the same item into a single record in the dataset. This is my attempt:
@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    # Find all the tab links in the nav list
    await context.page.wait_for_selector('#tabs > ul > li > a')
    label = await context.page.locator('#idBloqueDatos1 > table > tbody > tr > td').text_content()
    tabs = await context.page.query_selector_all('#tabs > ul > li > a')
    for tab in tabs:
        url = await tab.get_attribute('href')
        tab_name = await tab.text_content()
        await context.add_requests([
            Request.from_url(
                url=url,
                user_data={'label': label, 'tab_name': tab_name},
            ),
        ])
Yeah, I can see that won't work for you.
I would use some sort of external storage, like a global dictionary or some kind of class-level storage, where your results would be aggregated.
Here's a simple code sample that would implement this.
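Below is a minimal sketch of that approach (not the original sample), assuming Crawlee for Python with a Playwright-based router. The item_results and pending_tabs dictionaries and the tab-counting logic are illustrative assumptions, not code from this thread, and on older Crawlee versions the context type may live under crawlee.playwright_crawler instead of crawlee.crawlers.

from collections import defaultdict
from urllib.parse import urljoin

from crawlee import Request
from crawlee.crawlers import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()

# Module-level storage shared by every handler invocation:
# one partial result dict per item URL, plus a counter of tabs
# that still have to be scraped before the item is complete.
item_results: dict[str, dict] = defaultdict(dict)
pending_tabs: dict[str, int] = defaultdict(int)


@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    item_url = context.request.url
    await context.page.wait_for_selector('#tabs > ul > li > a')

    tabs = await context.page.query_selector_all('#tabs > ul > li > a')
    pending_tabs[item_url] = len(tabs)

    requests = []
    for tab in tabs:
        href = await tab.get_attribute('href')
        requests.append(
            Request.from_url(
                url=urljoin(item_url, href),  # tab hrefs may be relative
                label='TAB',
                # Carry the parent item URL so the TAB handler knows
                # which dict to merge its tables into.
                user_data={'item_url': item_url},
            )
        )
    await context.add_requests(requests)


@router.handler('TAB')
async def tab_handler(context: PlaywrightCrawlingContext) -> None:
    item_url = context.request.user_data['item_url']

    data = {}
    for table in await context.page.query_selector_all('table'):
        for row in await table.query_selector_all('tr'):
            key = await row.query_selector('th')
            value = await row.query_selector('td')
            if key and value:
                key_text = (await key.text_content() or '').strip()
                value_text = (await value.text_content() or '').strip()
                if key_text:
                    data[key_text] = value_text

    # Merge this tab's data into the item's shared dict.
    item_results[item_url].update(data)
    pending_tabs[item_url] -= 1

    # Push the combined record only once, after the item's last tab.
    if pending_tabs[item_url] == 0:
        await context.push_data({'url': item_url, **item_results.pop(item_url)})
        del pending_tabs[item_url]

Because the crawler runs everything in a single asyncio event loop, the shared dictionaries need no extra locking; each item is pushed to the dataset exactly once, after its last tab has been processed.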