apify push
[1], this uploads my project to the Apify cloud and builds an actor from it. gitignore
file?
page.setViewportSize()
(https://playwright.dev/docs/api/class-page#page-set-viewport-size)
const crawler = new PlaywrightCrawler({
// Stop crawling after 5 pages
maxRequestsPerCrawl: 5,
// https://crawlee.dev/api/playwright-crawler/interface/PlaywrightLaunchContext
launchContext: {
// https://crawlee.dev/api/playwright-crawler/interface/PlaywrightLaunchContext#launchOptions
launchOptions: {
stealth: true,
headless: false,
},
?????
}, .....
enqueueLinks(...)
2022-10-11 08:52:58.931 WARN PlaywrightCrawler: Reclaiming failed request back to the list or queue. page.goto: net::ERR_HTTP_RESPONSE_CODE_FAILURE at https://httpbin.org/status/404
=========================== logs ===========================
navigating to "https://httpbin.org/status/404", waiting until "load"
============================================================ {"id":"sOcDKee4CooEnLF","url":"https://httpbin.org/status/404","retryCount":1}
2022-10-11 08:53:03.429 ERROR PlaywrightCrawler: Request failed and reached maximum retries. page.goto: net::ERR_HTTP_RESPONSE_CODE_FAILURE at https://httpbin.org/status/404
=========================== logs ===========================
navigating to "https://httpbin.org/status/404", waiting until "load"
============================================================
at gotoExtended (c:\Users\HERNOUX-06523\Desktop\Dev\NodeJS\test-crawlee\node_modules\@crawlee\playwright\internals\utils\playwright-utils.js:149:17)
at PlaywrightCrawler._navigationHandler (c:\Users\HERNOUX-06523\Desktop\Dev\NodeJS\test-crawlee\node_modules\@crawlee\playwright\internals\playwright-crawler.js:105:52)
at PlaywrightCrawler._handleNavigation (c:\Users\HERNOUX-06523\Desktop\Dev\NodeJS\test-crawlee\node_modules\@crawlee\browser\internals\browser-crawler.js:268:51)
at async PlaywrightCrawler._runRequestHandler (c:\Users\HERNOUX-06523\Desktop\Dev\NodeJS\test-crawlee\node_modules\@crawlee\browser\internals\browser-crawler.js:215:17)
at async PlaywrightCrawler._runRequestHandler (c:\Users\HERNOUX-06523\Desktop\Dev\NodeJS\test-crawlee\node_modules\@crawlee\playwright\internals\playwright-crawler.js:102:9)
....
browserContext.setDefaultTimeout(timeout)
browserContext.setDefaultNavigationTimeout(timeout)
navigationTimeoutSecs
? (https://crawlee.dev/api/playwright-crawler/interface/PlaywrightCrawlerOptions#navigationTimeoutSecs)
skipTime
option
const Apify = require('apify');
const { utils: { log } } = Apify;
log.setOptions({ logger: new log.LoggerText({ skipTime: false }), });
Uncaught TypeError TypeError: Cannot read properties of undefined (reading 'log')
import Apify from 'apify'
const { utils: { log } } = Apify;
log.setOptions({ logger: new log.LoggerText({ skipTime: false }), });
// https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler
const crawler = new PlaywrightCrawler(
launchContext: { launchOptions: { headless: true, stealth: true, viewport: { width:600, height:300 } }, },
async requestHandler({ request, page, enqueueLinks, log }) {
const title = await page.title();
log.info(`Titre: ${title} Url: ${request.loadedUrl}`);
}
preNavigationHooks
array in CheerioCrawlerOptions [1]
const crawlerOptions = { ... preNavigationHooks: [], };
const jsFunction = "async ({ page, request }) => { log.info(`preNavigationHook ${request.url}`); }";
crawlerOptions.preNavigationHooks.push( ??? WHAT ???)
const myCrawler = new CheerioCrawler(crawlerOptions);
crawlerOptions.preNavigationHooks.push(jsFunction);
, when I run the crawler, I get this error:
WARN CheerioCrawler: Reclaiming failed request back to the list or queue. TypeError: hook is not a function
at CheerioCrawler._executeHooks (D:\Developpement\NodeJS\Nowis_Scraper\node_modules\@crawlee\basic\internals\basic-crawler.js:834:23)
at CheerioCrawler._handleNavigation (D:\Developpement\NodeJS\Nowis_Scraper\node_modules\@crawlee\http\internals\http-crawler.js:326:20)
at CheerioCrawler._runRequestHandler (D:\Developpement\NodeJS\Nowis_Scraper\node_modules\@crawlee\http\internals\http-crawler.js:286:24)
await crawler.run(['https://crawlee.dev'], { userData: { depth: 0 } });
userData
to exist, got [object Object]
in object options
userData
in option?