import { Actor } from 'apify';
// For more information, see https://crawlee.dev
import { PlaywrightCrawler, RequestOptions, useState } from 'crawlee';
// This is an ESM project, so relative imports must include the file extension.
// Read more here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
// Note that the extension must be `.js`, even inside TS files.
import { router } from './routes.js';
/**
 * Proxy settings read from the actor input and handed unchanged to
 * `Actor.createProxyConfiguration`.
 *
 * Field meanings follow the Apify SDK proxy configuration options; consult the
 * SDK documentation for their exact semantics.
 */
interface ProxyConfig {
    useApifyProxy: boolean;
    proxyUrls?: string[];
    /**
     * Proxy groups to use. Declared here because the built-in default proxy
     * config in this file supplies `groups`, so the type must allow it.
     */
    groups?: string[];
}
/**
 * Per-network on/off flags supplied through the actor input.
 *
 * Exported so other modules can share the shape. Nothing in this file reads
 * the individual flags, so their exact effect is decided by the consumer
 * (presumably the route handlers — verify there).
 */
export interface SocialsConfig {
    facebook: boolean;
    instagram: boolean;
    tiktok: boolean;
    pinterest: boolean;
}
/**
 * Shape of the actor input as consumed by this entry point.
 *
 * Every field is given a fallback where the input is destructured, so a
 * missing field does not leave the corresponding setting undefined.
 */
interface Input {
    /** URLs handed to `crawler.run()` as the initial requests. */
    startUrls: string[];
    /** Forwarded to the `PlaywrightCrawler` option of the same name. */
    maxRequestsPerCrawl: number;
    /** Forwarded to the `PlaywrightCrawler` option of the same name. */
    navigationTimeoutSecs: number;
    /** Passed to `Actor.createProxyConfiguration`. */
    proxyConfig: ProxyConfig;
    /** Per-network flags; see `SocialsConfig`. */
    socialsConfig: SocialsConfig;
}
// Initialize the Apify SDK
await Actor.init();

// Structure of input is defined in input_schema.json.
// The input (or any single field of it) may be absent, so it is typed as
// Partial<Input> instead of asserting that an empty object is a full Input.
// Every field then receives its fallback in the destructuring below.
const input: Partial<Input> = (await Actor.getInput<Input>()) ?? {};
const {
    startUrls = ['https://enzosbbq.com/'],
    proxyConfig = {
        useApifyProxy: true,
        groups: ['RESIDENTIAL'],
    },
    // NOTE(review): socialsConfig is resolved here but never read in this
    // file — confirm whether it should be forwarded to the route handlers.
    socialsConfig = {
        facebook: true,
        instagram: true,
        tiktok: true,
        pinterest: true,
    },
    maxRequestsPerCrawl = 100,
    navigationTimeoutSecs = 15,
} = input;

// TODO should this be input?
const maxRequestRetries = 2;

const proxyConfiguration = await Actor.createProxyConfiguration(proxyConfig);

// Initial value of the shared crawler state. `playwrightDetails` starts as an
// empty list, so it is typed as an array of not-yet-known items, not `any`.
const DEFAULT_STATE: { playwrightDetails: unknown[]; listingDone: boolean } = {
    playwrightDetails: [],
    listingDone: false,
};

// Register the default state up front. The returned object is not needed in
// this file (presumably the route handlers obtain it via their own
// `useState()` call — verify in routes.js).
await useState(undefined, DEFAULT_STATE);

const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    maxRequestsPerCrawl,
    navigationTimeoutSecs,
    maxRequestRetries,
    requestHandler: router,
});

console.log('Initiating Crawler...');
await crawler.run(startUrls);

// Exit successfully
await Actor.exit();