set user-agents for BeautifulSoupCrawler
Is it possible to set user agents for bs4 crawling when there is no need to use a browser crawler? And what about for specific sessions?
3 Replies
Hey, @Drogbata
Here are a few ways you can do this. I will give both general examples and examples using sessions:
1. At the client level
2. At the request level
3. In a pre_navigation_hook, for a request bound to a session
4. With a pre_navigation_hook and user_data in the session (in this case, I am using session-based request binding simply to make the example work)
1. At the client level:

import asyncio
from json import loads

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # Disable the default header generator and send a fixed User-Agent
    # with every request made through this HTTP client.
    crawler = BeautifulSoupCrawler(
        http_client=HttpxHttpClient(header_generator=None, headers={'User-Agent': 'MyCrawler/1.0'}),
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # httpbin echoes the request headers back, so we can verify the User-Agent.
        data = loads(await context.http_response.read())
        if data['headers']['User-Agent'] == 'MyCrawler/1.0':
            context.log.info('User-Agent is valid.')
        else:
            context.log.warning('User-Agent is invalid.')

    await crawler.run(['http://httpbin.org/get'])


if __name__ == '__main__':
    asyncio.run(main())
2. At the request level:

import asyncio
from datetime import timedelta
from json import loads

from crawlee import Request
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        max_request_retries=1,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        data = loads(await context.http_response.read())
        if data['headers']['User-Agent'] == 'MyCrawler/1.0':
            context.log.info('User-Agent is valid.')
        else:
            context.log.warning('User-Agent is invalid.')

    # The User-Agent is set on this individual request only.
    await crawler.run([Request.from_url('http://httpbin.org/get', headers={'User-Agent': 'MyCrawler/1.0'})])


if __name__ == '__main__':
    asyncio.run(main())
3. In a pre_navigation_hook, for a request bound to a session:

import asyncio
from json import loads

from crawlee import HttpHeaders, Request
from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
from crawlee.sessions import SessionPool


async def main() -> None:
    async with SessionPool() as session_pool:
        session = await session_pool.get_session()
        crawler = BeautifulSoupCrawler(
            max_request_retries=1,
            session_pool=session_pool,
        )

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')
            data = loads(await context.http_response.read())
            if data['headers']['User-Agent'] == 'MyCrawler/1.0':
                context.log.info('User-Agent is valid.')
            else:
                context.log.warning('User-Agent is invalid.')

        @crawler.pre_navigation_hook
        async def some_hook(context: BasicCrawlingContext) -> None:
            # Set the User-Agent only for requests that carry a session.
            if context.session and context.session.id:
                context.request.headers = context.request.headers | HttpHeaders({'User-Agent': 'MyCrawler/1.0'})

        # Bind the request to the specific session.
        await crawler.run([Request.from_url('http://httpbin.org/get', session_id=session.id)])


if __name__ == '__main__':
    asyncio.run(main())
4. With a pre_navigation_hook and user_data in the session:

import asyncio
from json import loads

from crawlee import HttpHeaders, Request
from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
from crawlee.sessions import SessionPool


async def main() -> None:
    async with SessionPool() as session_pool:
        session = await session_pool.get_session()
        # Store the desired headers on the session itself.
        session.user_data['headers'] = HttpHeaders({'User-Agent': 'MyCrawler/1.0'})
        crawler = BeautifulSoupCrawler(
            session_pool=session_pool,
        )

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')
            data = loads(await context.http_response.read())
            if data['headers']['User-Agent'] == 'MyCrawler/1.0':
                context.log.info('User-Agent is valid.')
            else:
                context.log.warning('User-Agent is invalid.')

        @crawler.pre_navigation_hook
        async def some_hook(context: BasicCrawlingContext) -> None:
            # Pull the headers stored in the session's user_data.
            if context.session and context.session.user_data:
                context.request.headers = context.request.headers | context.session.user_data['headers']

        # Run the crawler with the initial list of URLs.
        await crawler.run([Request.from_url('http://httpbin.org/get', session_id=session.id)])


if __name__ == '__main__':
    asyncio.run(main())
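Since you also asked about specific sessions: below is a minimal sketch building on the last example, where two sessions carry different User-Agent values and each request is bound to one of them. The CrawlerA/1.0 and CrawlerB/1.0 strings are just placeholders, and I am assuming get_session hands out a fresh session while the pool is below capacity.

import asyncio
from json import loads

from crawlee import HttpHeaders, Request
from crawlee.crawlers import (
    BasicCrawlingContext,
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
from crawlee.sessions import SessionPool


async def main() -> None:
    async with SessionPool() as session_pool:
        # Each session carries its own User-Agent in user_data.
        session_a = await session_pool.get_session()
        session_a.user_data['headers'] = HttpHeaders({'User-Agent': 'CrawlerA/1.0'})
        session_b = await session_pool.get_session()
        session_b.user_data['headers'] = HttpHeaders({'User-Agent': 'CrawlerB/1.0'})

        crawler = BeautifulSoupCrawler(session_pool=session_pool)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            # httpbin echoes the request headers, so log which User-Agent was sent.
            data = loads(await context.http_response.read())
            context.log.info(f'{context.request.url} sent {data["headers"]["User-Agent"]}')

        @crawler.pre_navigation_hook
        async def set_session_headers(context: BasicCrawlingContext) -> None:
            # Merge the session-specific headers into the outgoing request.
            if context.session and 'headers' in context.session.user_data:
                context.request.headers = context.request.headers | context.session.user_data['headers']

        # Bind each request to a specific session (different URLs to avoid deduplication).
        await crawler.run([
            Request.from_url('http://httpbin.org/get', session_id=session_a.id),
            Request.from_url('http://httpbin.org/headers', session_id=session_b.id),
        ])


if __name__ == '__main__':
    asyncio.run(main())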
Hello, refer to this code:

import bs4
import requests

# Plain requests + BeautifulSoup (no Crawlee): set the User-Agent once on the session.
s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0'})
r = s.get('https://example.com')
soup = bs4.BeautifulSoup(r.text, 'html.parser')
print(soup.title.text)
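If you need a different User-Agent for a single call on that session, requests also accepts per-request headers that override the session defaults (a small sketch; the URL and UA string are placeholders):

# Override the session's User-Agent for this one request only.
r = s.get('https://example.com', headers={'User-Agent': 'MyCrawler/1.0'})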
I think the trick with the pre_navigation_hook is what I was looking for. Thank you very much, will try it.