// Abbreviated crawler setup (other options are elided with `...`): browser fingerprints are enabled and the fingerprint generator is limited to Firefox on Linux, with `firefox` passed as the launcher.
const crawler = new PlaywrightCrawler({ ... browserPoolOptions: { useFingerprints: true, fingerprintOptions: { fingerprintGeneratorOptions: { browsers: ['firefox'], operatingSystems: ['linux'], }, }, }, launchContext: { launcher: firefox }, });
import { firefox, webkit } from 'playwright'; import { PlaywrightCrawler, Dataset, ProxyConfiguration, Request, log, sleep } from 'crawlee'; import { launchPlaywright, playwrightUtils } from 'crawlee'; import * as crypt from 'crypto'; const crawler = new PlaywrightCrawler({ autoscaledPoolOptions: { minConcurrency: 2, maxConcurrency: 4, loggingIntervalSecs: null, }, maxRequestRetries: 0, navigationTimeoutSecs: 130, requestHandlerTimeoutSecs: 110, useSessionPool: false, persistCookiesPerSession: false, headless: true, browserPoolOptions: { useFingerprints: true, operationTimeoutSecs: 40, fingerprintOptions: { fingerprintGeneratorOptions: { browsers: ['firefox'], operatingSystems: ['linux'], }, }, }, launchContext: { useIncognitoPages: true, launcher: firefox }, async requestHandler( {request, response, page, enqueueLinks, log, proxyInfo} ) { const uniqueKey = crypt.randomBytes(16).toString("hex"); let url = new URL(request.url); let host = url.host; let scrFile = `${host}-${uniqueKey}.png`; log.info(`GET ${request.url} Wait1 ...`); await sleep(40*1000); log.info(`GET ${request.url} Wait2, Pressing Enter ...`); await page.keyboard.press('Enter'); await sleep(40*1000); log.info(`GET ${request.url} Writing into ${scrFile} ...`); await page.screenshot( {path:scrFile, fullPage:true} ); log.info(`GET ${request.url} DONE`); }, }); await crawler.run([ "https://infosimples.github.io/detect-headless/", "https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html", "https://webscraping.pro/wp-content/uploads/2021/02/testresult2.html" ]);
useSessionPool: true,
chromium
instead of firefox
as launcher, there is no "Plugins length" error. fingerprint-injector
& Playwright [1], crawler = new PlaywrightCrawler({ ... });
plugins
in the preNavigationHooks
// `pluginContent` below is a string of browser-side script: it redefines navigator.plugins and navigator.mimeTypes as getters that each return a one-element array-like object (a fake PDF plugin / a 'text/pdf' MIME type).
// NOTE(review): enabledPlugin is set to the string 'PDF Plugin', not to the plugin object, and every getter call builds new objects — confirm whether detection scripts check either of these.
- not sure this is the optimal solution...const pluginContent = ` Object.defineProperty(navigator, 'plugins', { get: () => { const PDFPlugin = Object.create(Plugin.prototype, { description: { value: 'Portable Document Format', enumerable: false }, filename: { value: 'internal-pdf-viewer', enumerable: false }, name: { value: 'PDF Plugin', enumerable: false }, }); return Object.create(PluginArray.prototype, { length: { value: 1 }, 0: { value: PDFPlugin }, }); }, }); Object.defineProperty(navigator, 'mimeTypes', { get: () => { const PDFMimeTypeTxt = Object.create(MimeType.prototype, { type: { value: 'text/pdf', enumerable: false }, suffixes: { value: 'pdf', enumerable: false }, description: { value: 'Portable Document Format', enumerable: false }, enabledPlugin: { value: 'PDF Plugin', enumerable: false }, }); return Object.create(MimeTypeArray.prototype, { length: { value: 1 }, 0: { value: PDFMimeTypeTxt }, }); }, }); `
mimeTypes: text/pdf, pdf, Portable Document Format
preNavigationHooks
[1]. Is in BrowserCrawlerOptions
, so it can be used with Puppeteer. preLaunchHooks
and in some cases - in prePageCreateHooks
preNavigationHooks: [ async ({ page, request }) => { log.info(`preNavigationHook: GET=${request.url} START`); const preloadFile = fs.readFileSync('./preload.js', 'utf8'); await page.evaluateOnNewDocument(preloadFile); log.info(`preNavigationHook: GET=${request.url} END`); } ],
// Browser-side preload script, meant to be injected before any page script runs.
// It replaces navigator.plugins and navigator.mimeTypes with a single fake PDF
// plugin / MIME type so that "plugins length"-style headless checks see a
// non-empty list.
//
// The fake objects are built lazily on first read (so Plugin, PluginArray,
// MimeType and MimeTypeArray are only touched when a page actually asks) and
// then cached, so that:
//   - navigator.plugins === navigator.plugins (and likewise for mimeTypes);
//   - mimeTypes[0].enabledPlugin is the very same object as plugins[0],
//     instead of a bare string.
// Wrapped in an IIFE to avoid leaking helper names into the page's globals.
(() => {
  let pdfPlugin;
  let fakePlugins;
  let fakeMimeTypes;

  // Returns the shared fake PDF plugin, creating it on first use.
  const getPdfPlugin = () => {
    if (pdfPlugin === undefined) {
      pdfPlugin = Object.create(Plugin.prototype, {
        description: { value: 'Portable Document Format', enumerable: false },
        filename: { value: 'internal-pdf-viewer', enumerable: false },
        name: { value: 'PDF Plugin', enumerable: false },
      });
    }
    return pdfPlugin;
  };

  Object.defineProperty(navigator, 'plugins', {
    get: () => {
      if (fakePlugins === undefined) {
        fakePlugins = Object.create(PluginArray.prototype, {
          length: { value: 1 },
          0: { value: getPdfPlugin() },
        });
      }
      return fakePlugins;
    },
  });

  Object.defineProperty(navigator, 'mimeTypes', {
    get: () => {
      if (fakeMimeTypes === undefined) {
        const pdfMimeType = Object.create(MimeType.prototype, {
          type: { value: 'text/pdf', enumerable: false },
          suffixes: { value: 'pdf', enumerable: false },
          description: { value: 'Portable Document Format', enumerable: false },
          // Must reference the plugin object itself, not its name.
          enabledPlugin: { value: getPdfPlugin(), enumerable: false },
        });
        fakeMimeTypes = Object.create(MimeTypeArray.prototype, {
          length: { value: 1 },
          0: { value: pdfMimeType },
        });
      }
      return fakeMimeTypes;
    },
  });
})();
// Browser-side init script that masks the WebGL vendor/renderer strings.
// It wraps WebGLRenderingContext.prototype.getParameter and answers the two
// WEBGL_debug_renderer_info queries with fixed values; every other parameter
// is forwarded to the native implementation.
//
// Two details matter for correctness:
//   - the native method lives on the prototype, not on the constructor;
//   - it must be invoked with the calling context as `this`, hence `.call`.
// The script is kept multi-line so the `//` comments inside it do not
// swallow the code that follows them.
const webGLContent = `
  const getParameter = WebGLRenderingContext.prototype.getParameter;
  WebGLRenderingContext.prototype.getParameter = function (parameter) {
    // UNMASKED_VENDOR_WEBGL
    if (parameter === 37445) {
      return 'Intel Open Source Technology Center';
    }
    // UNMASKED_RENDERER_WEBGL
    if (parameter === 37446) {
      return 'Mesa DRI Intel(R) Ivybridge Mobile ';
    }
    return getParameter.call(this, parameter);
  };
`;
// ...... (surrounding crawler code elided)
// Register the script so it runs in every new document before page scripts.
await page.addInitScript({ content: webGLContent });
// ...... (surrounding crawler code elided)
Retina/HiDPI Hairline Feature
. const webGLContent = ... Excellent! What we really need is a list of 100-200 such strings and a piece of JS code that randomly returns a "webGL string"... (in other words, this functionality should be in the next version of Crawlee)