Drogbata · 2w ago

set user-agents for BeautifulSoupCrawler

Is it possible to set user agents for bs4 crawling when there's no need to use a browser crawler? And what about for specific sessions?
3 Replies
Mantisus · 2w ago
Hey @Drogbata, here are a few ways you can do this. I'll give both general examples and examples using sessions.

Client level
import asyncio
from json import loads

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # header_generator=None disables the default generated headers, so only
    # the explicit User-Agent below is sent.
    crawler = BeautifulSoupCrawler(
        http_client=HttpxHttpClient(header_generator=None, headers={'User-Agent': 'MyCrawler/1.0'}),
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        data = loads(await context.http_response.read())

        if data['headers']['User-Agent'] == 'MyCrawler/1.0':
            context.log.info('User-Agent is valid.')
        else:
            context.log.warning('User-Agent is invalid.')

    await crawler.run(['http://httpbin.org/get'])


asyncio.run(main())
Request level
import asyncio
from datetime import timedelta
from json import loads

from crawlee import Request
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        data = loads(await context.http_response.read())

        if data['headers']['User-Agent'] == 'MyCrawler/1.0':
            context.log.info('User-Agent is valid.')
        else:
            context.log.warning('User-Agent is invalid.')

    # Here the User-Agent is set on the individual request.
    await crawler.run([Request.from_url('http://httpbin.org/get', headers={'User-Agent': 'MyCrawler/1.0'})])


asyncio.run(main())
In a pre_navigation_hook, for a request bound to a session
import asyncio
from json import loads

from crawlee import Request, HttpHeaders
from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
from crawlee.sessions import SessionPool


async def main() -> None:
    async with SessionPool() as session_pool:
        session = await session_pool.get_session()

        crawler = BeautifulSoupCrawler(
            max_request_retries=1,
            session_pool=session_pool,
        )

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            data = loads(await context.http_response.read())

            if data['headers']['User-Agent'] == 'MyCrawler/1.0':
                context.log.info('User-Agent is valid.')
            else:
                context.log.warning('User-Agent is invalid.')

        @crawler.pre_navigation_hook
        async def some_hook(context: BasicCrawlingContext) -> None:
            # Set the User-Agent only when the request has a session attached.
            if context.session and context.session.id:
                context.request.headers = context.request.headers | HttpHeaders({'User-Agent': 'MyCrawler/1.0'})

        # Bind the request to the specific session via session_id.
        await crawler.run([Request.from_url('http://httpbin.org/get', session_id=session.id)])


asyncio.run(main())
With a pre_navigation_hook and user_data stored on the session (in this case, I'm binding the request to a session simply to make the example work).
import asyncio
from json import loads

from crawlee import Request, HttpHeaders
from crawlee.sessions import SessionPool
from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    async with SessionPool() as session_pool:
        session = await session_pool.get_session()
        # Store the per-session headers in the session's user_data.
        session.user_data['headers'] = HttpHeaders({'User-Agent': 'MyCrawler/1.0'})

        crawler = BeautifulSoupCrawler(
            session_pool=session_pool,
        )

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

            data = loads(await context.http_response.read())

            if data['headers']['User-Agent'] == 'MyCrawler/1.0':
                context.log.info('User-Agent is valid.')
            else:
                context.log.warning('User-Agent is invalid.')

        @crawler.pre_navigation_hook
        async def some_hook(context: BasicCrawlingContext) -> None:
            # Apply whatever headers the bound session carries.
            if context.session and context.session.user_data:
                context.request.headers = context.request.headers | context.session.user_data['headers']

        # Run the crawler with the initial list of URLs.
        await crawler.run([Request.from_url('http://httpbin.org/get', session_id=session.id)])


asyncio.run(main())
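If you need a different User-Agent for specific sessions, the last pattern extends to several sessions at once. Here is a minimal sketch using only the APIs from the examples above (the UA strings and hook name are placeholders, and it assumes the pool hands back two distinct sessions):

import asyncio
from json import loads

from crawlee import Request, HttpHeaders
from crawlee.sessions import SessionPool
from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    async with SessionPool() as session_pool:
        # Each session carries its own User-Agent in user_data.
        session_a = await session_pool.get_session()
        session_a.user_data['headers'] = HttpHeaders({'User-Agent': 'CrawlerA/1.0'})

        session_b = await session_pool.get_session()
        session_b.user_data['headers'] = HttpHeaders({'User-Agent': 'CrawlerB/1.0'})

        crawler = BeautifulSoupCrawler(session_pool=session_pool)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            data = loads(await context.http_response.read())
            ua = data['headers']['User-Agent']
            context.log.info(f'{context.request.url} was fetched with User-Agent: {ua}')

        @crawler.pre_navigation_hook
        async def apply_session_headers(context: BasicCrawlingContext) -> None:
            # Apply whatever headers the session bound to this request carries.
            if context.session and 'headers' in context.session.user_data:
                context.request.headers = context.request.headers | context.session.user_data['headers']

        # Different URLs avoid request deduplication; each request is bound
        # to its own session via session_id.
        await crawler.run([
            Request.from_url('http://httpbin.org/get', session_id=session_a.id),
            Request.from_url('http://httpbin.org/get?session=b', session_id=session_b.id),
        ])


asyncio.run(main())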
Exp · 2w ago
Hello, refer to this code:

import requests
import bs4

s = requests.Session()
s.headers.update({"User-Agent": "Mozilla/5.0"})
r = s.get("https://example.com")
soup = bs4.BeautifulSoup(r.text, "html.parser")
print(soup.title.text)
Drogbata (OP) · 2w ago
I think the trick with the pre-navigation hook is what I was looking for. Thank you very much, I'll try it.
