molecular-blue · 11mo ago

How to store data in the same dict from different URLs?

I have a list of results, and I enqueue the link for each item. For each item, I need to crawl its internal pages (tabs), extract the data from the tables there, and add it all to the same dict. With a router and enqueue_links I can extract the data from all the pages, but I am not able to gather all the data for one item into a single dict. What is the best way to do this?
5 Replies
Hall · 11mo ago
This post has been pushed to the community knowledgebase. Any replies in this thread will be synced to the community site.
molecular-blue (OP) · 11mo ago
@router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')
    # Wait for the results list, then enqueue each item and the pagination links.
    await context.page.wait_for_selector('li.resultado-busqueda > a')
    await context.enqueue_links(
        selector='li.resultado-busqueda > a',
        label='ITEM',
    )
    next_button = await context.page.query_selector('.paginar2 > ul > li > a')
    if next_button:
        await context.enqueue_links(
            selector='.paginar2 > ul > li > a',
            label='RESULTS',
        )


@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    # Enqueue every tab of the item detail page.
    await context.page.wait_for_selector('#tabs > ul > li > a')
    await context.enqueue_links(
        selector='#tabs > ul > li > a',
        label='TAB',
    )


@router.handler('TAB')
async def tab_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')
    # Extract key/value pairs from every table on the tab.
    tables = await context.page.query_selector_all('table')
    data = {}
    for table in tables:
        rows = await table.query_selector_all('tr')
        for row in rows:
            key = await row.query_selector('th')
            value = await row.query_selector('td')
            if key and value:
                key_text = await key.text_content()
                value_text = await value.text_content()
                data[key_text.strip()] = value_text.strip()
    context.log.info(data)
    # This pushes one dict per tab, not one per item - the problem described above.
    await context.push_data(data)
molecular-blue (OP) · 11mo ago
Hi @Mantisus, thank you for your response. I checked the thread, but I'm still trying to figure out how to gather all the data for the same item into the same dataset record. This is my attempt:

@router.handler('ITEM')
async def item_handler(context: PlaywrightCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    # Find all the links in the navlist
    await context.page.wait_for_selector('#tabs > ul > li > a')
    label = await context.page.locator('#idBloqueDatos1 > table > tbody > tr > td').text_content()
    tabs = await context.page.query_selector_all('#tabs > ul > li > a')
    for tab in tabs:
        url = await tab.get_attribute('href')
        tab_name = await tab.text_content()
        await context.add_requests([
            Request.from_url(
                url=url,
                user_data={'label': label, 'tab_name': tab_name}),
        ])
Mantisus · 11mo ago
Yeah, I can see that won't work for you. I would use some sort of external storage, like a global dictionary or some kind of class-based storage, where your results get aggregated. Here's a simple code sample that implements this:
import json

from crawlee import Request
from crawlee.crawlers import HttpCrawlingContext  # import path may differ in older crawlee versions
from crawlee.router import Router

router = Router[HttpCrawlingContext]()

# Global in-memory store, keyed by item URL, where partial results are aggregated.
storage = {}


@router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'default_handler is processing {context.request.url}')

    url = "https://httpbin.org/get?a=item"

    storage[url] = {}

    # Carry the item URL in user_data so every later request knows which item it belongs to.
    await context.add_requests([
        Request.from_url(
            url=url,
            label='ITEM',
            user_data={"item_url": url}),
    ])


@router.handler('ITEM')
async def item_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'item_handler is processing {context.request.url}')
    url = context.request.user_data["item_url"]
    tabs = [f"https://httpbin.org/get?tab{i}={i}" for i in range(11)]
    # Remember how many tabs we expect so we know when the item is complete.
    storage[url]["all_tabs"] = len(tabs)
    storage[url]["processed_tabs"] = 0
    requests = [Request.from_url(
        url=tab,
        label='TAB',
        user_data={"item_url": url}) for tab in tabs]
    await context.add_requests(requests)


@router.handler('TAB')
async def tab_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'tab_handler is processing {context.request.url}')

    url = context.request.user_data["item_url"]

    data = json.loads(context.http_response.read())

    # Merge this tab's fields into the item's shared dict.
    for key, value in data["args"].items():
        storage[url][key] = value

    # When the last tab has been processed, push the aggregated item and clean up.
    storage[url]["processed_tabs"] += 1
    if storage[url]["processed_tabs"] == storage[url]["all_tabs"]:
        del storage[url]["processed_tabs"]
        del storage[url]["all_tabs"]
        await context.push_data(storage[url])
        del storage[url]
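One note on the counter: the handlers run on a single asyncio event loop and there is no await between the increment and the completeness check, so processed_tabs can't race even when tabs are crawled concurrently. To try the example, you'd wire the router into a crawler; a minimal sketch, assuming a recent crawlee for Python (import paths may differ between versions):

import asyncio

from crawlee.crawlers import HttpCrawler  # older versions: crawlee.http_crawler


async def main() -> None:
    crawler = HttpCrawler(request_handler=router)
    # The start URL is just a placeholder; default_handler enqueues the item itself.
    await crawler.run(['https://httpbin.org/get?start=1'])


if __name__ == '__main__':
    asyncio.run(main())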
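And if you'd rather not keep a module-level global, the same idea works as a small class; a sketch (ItemAggregator is just an illustrative name, not a crawlee API):

class ItemAggregator:
    """Collects tab data per item URL and signals when an item is complete."""

    def __init__(self) -> None:
        self._data: dict[str, dict] = {}
        self._expected: dict[str, int] = {}
        self._done: dict[str, int] = {}

    def start(self, url: str, expected_tabs: int) -> None:
        # Called from the ITEM handler once the number of tabs is known.
        self._data[url] = {}
        self._expected[url] = expected_tabs
        self._done[url] = 0

    def add(self, url: str, fields: dict) -> dict | None:
        # Called from the TAB handler; returns the full item dict on the last tab, else None.
        self._data[url].update(fields)
        self._done[url] += 1
        if self._done[url] == self._expected[url]:
            del self._expected[url], self._done[url]
            return self._data.pop(url)
        return None

In tab_handler you would then do: if (item := aggregator.add(url, data["args"])) is not None: await context.push_data(item)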
