[nuxt] [request error] [unhandled] [500] Cannot find module '/app/server/node_modules/puppeteer/lib/cjs/puppeteer/puppeteer.js'
Browser logs: Chromium sandboxing failed! ================================ To avoid the sandboxing issue, do either of the following: - (preferred): Configure your environment to support sandboxing - (alternative): Launch Chromium without sandbox using 'chromiumSandbox: false' option ================================
crawlee[playwright]
to 0.5.2
[90m[crawlee.events._event_manager][0m [34mDEBUG[0m LocalEventManager.on.listener_wrapper(): Awaiting listener task... [90m[crawlee.events._event_manager][0m [34mDEBUG[0m LocalEventManager.on.listener_wrapper(): Awaiting listener task... '[90m[crawlee._autoscaling.autoscaled_pool][0m [34mDEBUG[0m Not scheduling new tasks - system is overloaded '[90m[crawlee.storages._request_queue][0m [34mDEBUG[0m There are still ids in the queue head that are pending processing [90m({"queue_head_ids_pending": 1})[0m [90m[crawlee._utils.system][0m [34mDEBUG[0m Calling get_memory_info()... '[90m[crawlee._autoscaling.autoscaled_pool][0m [34mDEBUG[0m Not scheduling new tasks - system is overloaded '[90m[crawlee.storages._request_queue][0m [34mDEBUG[0m There are still ids in the queue head that are pending processing [90m({"queue_head_ids_pending": 1})[0m '[90m[crawlee._autoscaling.autoscaled_pool][0m [34mDEBUG[0m Not scheduling new tasks - system is overloaded '[90m[crawlee.storages._request_queue][0m [34mDEBUG[0m There are still ids in the queue head that are pending processing [90m({"queue_head_ids_pending": 1})[0m [90m[crawlee._utils.system][0m [34mDEBUG[0m Calling get_cpu_info()... '[90m[crawlee._autoscaling.autoscaled_pool][0m [34mDEBUG[0m Not scheduling new tasks - system is overloaded '[90m[crawlee.storages._request_queue][0m [34mDEBUG[0m There are still ids in the queue head that are pending processing [90m({"queue_head_ids_pending": 1})[0m '[90m[crawlee._autoscaling.autoscaled_pool][0m [34mDEBUG[0m Not scheduling new tasks - system is overloaded
// Default route: enqueue every same-domain link on the page that matches `globs`.
// NOTE(review): the original passed a transformRequestFunction that returned the
// request unchanged — a no-op — so it has been removed; reintroduce it only when
// you actually need to mutate or filter requests (return `false` to skip one).
router.addDefaultHandler(async ({ request, enqueueLinks, parseWithCheerio, querySelector, log, page }) => {
    await enqueueLinks({
        strategy: 'same-domain',
        globs,
    });
});
# Check the request URL's path against the configured regex patterns and log
# when nothing matches. (Behavior unchanged; only reformatted from one line.)
parsed_url = urlparse(context.request.url)
path_name = parsed_url.path
results = _get_regex_matches(path_name)
if not results:
    context.log.info(
        f'No match found for URL: {context.request.url} in path: '
        f'{path_name}'
    )
    # TODO: CANCEL REQUEST
# Mark this request as handled on the request list so it is not retried or
# picked up again. NOTE(review): requires a `request_list` in scope — the
# surrounding discussion questions whether one is available in this context.
await request_list.mark_request_as_handled(request)
but I don't think I have access to a request_list or anything similar in the PlaywrightPreNavCrawlingContext.
WARN
, which spawns another playwright instance. finished condition
is met.[crawlee.storages._request_queue] WARN The request queue seems to be stuck for 300.0s, resetting internal state. ({"queue_head_ids_pending": 0, "in_progress": ["tEyKIytjmqjtRvA"]})
# Create a request router parameterized with BeautifulSoupCrawlingContext;
# handlers registered on it will receive that context type.
router = Router[BeautifulSoupCrawlingContext]()
// Adaptive crawler: decides per-request between plain HTTP and browser
// rendering; 10% of requests are sampled to detect the rendering type,
// and the crawl stops after 50 requests.
const crawler = new AdaptivePlaywrightCrawler({
    renderingTypeDetectionRatio: 0.1,
    maxRequestsPerCrawl: 50,
    // Only destructure what the handler actually uses (the original also
    // pulled out parseWithCheerio, querySelector, and a nonexistent `urls`).
    async requestHandler({ request, enqueueLinks, log }) {
        console.log(request.url, request.uniqueKey);
        await enqueueLinks();
    },
});

// Await the run so a crawl failure rejects here instead of becoming an
// unhandled floating promise (the original never awaited it).
await crawler.run(['https://crawlee.dev']);
/**
 * Launch headless Chromium through Playwright and return a fresh page in a
 * pre-configured browser context (viewport, user agent, Freiburg geolocation,
 * locale, and persisted auth state). Identical behavior to the original;
 * reformatted from a single minified line.
 */
const launchPlaywright = async () => {
    const browser = await playwright.chromium.launch({
        headless: true,
        args: ["--disable-blink-features=AutomationControlled"],
    });
    const context = await browser.newContext({
        viewport: { width: 1280, height: 720 },
        userAgent:
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        geolocation: { longitude: 7.8421, latitude: 47.9978 },
        permissions: ["geolocation"],
        locale: "en-US",
        storageState: "playwright/auth/user.json",
    });
    return await context.newPage();
};