There was an uncaught exception during the run of the Actor and it was not handled.
const crawler = new PuppeteerCrawler({
    launchContext: {
        launchOptions: {
            headless: true,
            args: [
                "--no-sandbox", // Mitigates the "sandboxed" process issue in Docker containers
                "--ignore-certificate-errors",
                "--disable-dev-shm-usage",
                "--disable-infobars",
                "--disable-extensions",
                "--disable-setuid-sandbox",
                "--disable-gpu", // Mitigates the "crashing GPU process" issue in Docker containers
            ],
        },
    },
    maxRequestRetries: 1,
    navigationTimeoutSecs: 60,
    autoscaledPoolOptions: { minConcurrency: 30 },
    maxSessionRotations: 5,
    preNavigationHooks: [
        async ({ blockRequests }, goToOptions) => {
            if (goToOptions) goToOptions.waitUntil = "domcontentloaded"; // Set waitUntil here
            await blockRequests({
                urlPatterns: [ ... ],
            });
        },
    ],
    proxyConfiguration,
    requestHandler: router,
});

await crawler.run(startUrls);
await Actor.exit();
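When the run dies with an unhandled exception like this, it helps to see which call is actually throwing. A minimal sketch (not the original code) that wraps the run and fails the Actor with the captured error instead of letting it escape:

try {
    await crawler.run(startUrls);
    await Actor.exit();
} catch (err) {
    // Surface the underlying exception in the run log, then fail the run explicitly.
    console.error('Crawler run failed:', err);
    await Actor.fail(`Crawler run failed: ${err.message}`);
}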
The stringList editor doesn't work in the web console. The problem field is marked below; note that its key is the only one containing a dot.

{
    "title": "Test",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "search.location": { "title": "Locations #1", "type": "array", "description": "", "editor": "stringList", "prefill": ["Bandung"] },   ### <-- Problem
        "search_location": { "title": "Locations #2", "type": "array", "description": "", "editor": "stringList", "prefill": ["Bandung"] }
    }
}
Values added with the +Add button look fine, but when they are edited using the Bulk button, the resulting JSON is weird: it automatically becomes an object structure, which is a nice effect. I'm not sure whether this is really a bug or a new feature?

import { crawler } from './main.js'; // Import the exported crawler from the main file
import express from "express";

const app = express();
app.use(express.json());

const BASE_URL = "https.....";

app.post("/scrape", async (req, res) => {
    if (!req.body || !req.body.usernames) {
        return res.status(400).json({ error: "Invalid input" });
    }
    const { usernames } = req.body;
    const urls = usernames.map(username => `${BASE_URL}${username}`);
    try {
        await crawler.run(urls);
        const dataset = await crawler.getData();
        return res.status(200).json({ data: dataset });
    } catch (error) {
        console.error("Scraping error:", error);
        return res.status(500).json({ error: "Scraping failed" });
    }
});

const PORT = parseInt(process.env.PORT) || 3000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
const proxies = [...]; // my proxy list

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: proxies,
});

export const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: async ({ request, json, proxyInfo }) => {
        log.info(JSON.stringify(proxyInfo, null, 2));
        // Scraping logic
        await Dataset.pushData({
            // pushing data
        });
    },
}, new Configuration({
    persistStorage: false,
}));
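One caveat with reusing the single exported crawler from the Express handler (this is a sketch added here, not part of the original code): if two /scrape requests overlap, the second crawler.run() call happens while the first run is still active, which Crawlee refuses. A minimal way to serialize the runs:

let runChain = Promise.resolve();

// Chain runs so overlapping /scrape requests never call crawler.run() while a previous run is active.
export function runSerialized(urls) {
    const next = runChain.then(() => crawler.run(urls));
    runChain = next.catch(() => {}); // keep the chain usable even if one run fails
    return next;
}

// In the /scrape handler, use `await runSerialized(urls);` instead of `await crawler.run(urls);`.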
I was using crawl4ai but switched since Crawlee seems much better at anti-blocking.

2025-03-13T11:58:38.513Z [Crawler] [INFO ℹ️] Finished! Total 0 requests: 0 succeeded, 0 failed. {"terminal":true}
2025-03-13T11:58:38.513Z [Crawler] [ERROR ❌] BrowserLaunchError: Failed to launch browser. Please check the following:
- Check whether the provided executable path "/Users/dp420/.cache/camoufox/Camoufox.app/Contents/MacOS/camoufox" is correct.
- Try installing the required dependencies by running `npx playwright install --with-deps` (https://playwright.dev/docs/browsers).
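The error suggests the Camoufox binary at the reported path is missing or not runnable. As a sketch (the path is copied from the error message and the crawler setup is an assumption, not the original code), you can check the path before launching and pass it explicitly via Playwright's Firefox launcher, since Camoufox is Firefox-based:

import fs from 'node:fs';
import { PlaywrightCrawler } from 'crawlee';
import { firefox } from 'playwright';

// Path taken from the error message above; adjust to wherever Camoufox was actually fetched.
const CAMOUFOX_PATH = '/Users/dp420/.cache/camoufox/Camoufox.app/Contents/MacOS/camoufox';

if (!fs.existsSync(CAMOUFOX_PATH)) {
    throw new Error(`Camoufox binary not found at ${CAMOUFOX_PATH}; re-run your Camoufox fetch/install step.`);
}

const crawler = new PlaywrightCrawler({
    launchContext: {
        launcher: firefox, // Camoufox is a Firefox build, so launch it with Playwright's Firefox launcher
        launchOptions: { executablePath: CAMOUFOX_PATH },
    },
    requestHandler: async ({ page, request, log }) => {
        log.info(`Loaded ${request.url}: ${await page.title()}`);
    },
});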
{
"actor-start": {
"eventTitle": "Price for Actor start",
"eventDescription": "Flat fee for starting an Actor run.",
"eventPriceUsd": 0.1
},
"task-completed": {
"eventTitle": "Price for completing the task",
"eventDescription": "Flat fee for completing the task.",
"eventPriceUsd": 0.4
}
}
async def main():
    """Runs the AI Travel Planner workflow."""
    async with Actor:
        await Actor.charge('actor-start')

        actor_input = await Actor.get_input() or {}
        Actor.log.info(f"Received input: {actor_input}")
        travel_query = TravelState(**actor_input)

        # Execute workflow
        final_state = travel_workflow.invoke(travel_query)
        Actor.log.info(f"Workflow completed. Final state: {final_state}")
        await Actor.charge('task-completed')

        # Save the final report
        await save_report(final_state)
When I run the project with

python -m src

it's using the 3.13 version, which is my default version, and throwing an error, so for the project I have used

python3.10 -m src

crawler.run(["https://website.com/1234"]);

works locally, while on the Apify cloud it breaks with the following error:

Reclaiming failed request back to the list or queue. TypeError: Invalid URL
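That error usually means the value handed to the request queue is not a parseable URL string; on the platform the URL is often built from the Actor input rather than hard-coded, so it can arrive as undefined or as an object. A minimal validation sketch, assuming the URLs come from Actor input (the input shape here is an assumption, not the original code):

import { Actor } from 'apify';

const input = (await Actor.getInput()) ?? {};
// Accept both plain strings and { url: '...' } objects, then drop anything that isn't a valid URL.
const candidates = (input.startUrls ?? []).map((item) => (typeof item === 'string' ? item : item?.url));
const urls = candidates.filter((url) => {
    try {
        new URL(url); // throws "TypeError: Invalid URL" for undefined or malformed values
        return true;
    } catch {
        console.warn(`Skipping invalid start URL: ${JSON.stringify(url)}`);
        return false;
    }
});

await crawler.run(urls);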
2025-03-09T00:13:41.538Z ACTOR: Pulling Docker image of build 20IgkKFk3QAzeFbk9 from repository.
2025-03-09T00:13:42.170Z ACTOR: Creating Docker container.
2025-03-09T00:13:42.237Z ACTOR: Starting Docker container.
2025-03-09T00:13:44.148Z Downloading model definition files...
2025-03-09T00:13:44.419Z Error downloading fingerprint-network.zip: [Errno 13] Permission denied: '/usr/local/lib/python3.13/site-packages/browserforge/fingerprints/data/fingerprint-network.zip'
2025-03-09T00:13:44.430Z Downloading model definition files...
2025-03-09T00:13:44.452Z Error downloading input-network.zip: [Errno 13] Permission denied: '/usr/local/lib/python3.13/site-packages/browserforge/headers/data/input-network.zip'
...
2025-03-09T00:13:44.580Z   File "/usr/local/lib/python3.13/site-packages/browserforge/bayesian_network.py", line 288, in extract_json
2025-03-09T00:13:44.582Z     with zipfile.ZipFile(path, 'r') as zf:
2025-03-09T00:13:44.583Z          ~~~~~~~~~~~~~~~^^^^^^^^^^^
2025-03-09T00:13:44.586Z   File "/usr/local/lib/python3.13/zipfile/__init__.py", line 1367, in __init__
2025-03-09T00:13:44.588Z     self.fp = io.open(file, filemode)
2025-03-09T00:13:44.590Z               ~~~~~~~^^^^^^^^^^^^^^^^
2025-03-09T00:13:44.592Z FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/lib/python3.13/site-packages/browserforge/headers/data/input-network.zip'
2025-03-07T21:22:12.478Z ACTOR: Pulling Docker image of build aJ5w2MnrBdaZRxGeA from repository.
2025-03-07T21:22:13.611Z ACTOR: Creating Docker container.
2025-03-07T21:22:13.835Z ACTOR: Starting Docker container.
2025-03-07T21:22:14.208Z Starting X virtual framebuffer using: Xvfb :99 -ac -screen 0 1920x1080x24+32 -nolisten tcp
2025-03-07T21:22:14.210Z Executing main command
2025-03-07T21:22:15.368Z INFO System info {"apifyVersion":"3.3.2","apifyClientVersion":"2.12.0","crawleeVersion":"3.13.0","osType":"Linux","nodeVersion":"v20.18.3"}
2025-03-07T21:22:15.498Z INFO Starting the crawl process {"startUrls":[{"url":"https://salesblaster.ai"}],"maxRequestsPerCrawl":100,"datasetName":"default"}
2025-03-07T21:22:15.905Z ERROR Error running scraper: {"error":"Request options are not valid, provide either a URL or an object with 'url' property (but without 'id' property), or an object with 'requestsFromUrl' property. Input: {\n  url: { url: 'https://salesblaster.ai' },\n  userData: {\n    datasetName: 'default',\n    initialUrl: { url: 'https://salesblaster.ai' }\n  }\n}"}
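The error message itself shows the cause: each startUrls item is already an object of the form { url: '...' }, and the scraper wraps it into another object's url field, producing the nested url shown in the log. A sketch of unwrapping the input before starting the crawl (the input shape and userData keys are taken from the log; the surrounding code is assumed):

import { Actor } from 'apify';

const input = (await Actor.getInput()) ?? {};
const { startUrls = [], datasetName = 'default' } = input;

// Each startUrls item is { url: 'https://...' } (see the log above), so pass the string,
// not the whole object, as the request's `url`.
const requests = startUrls.map((item) => {
    const url = typeof item === 'string' ? item : item.url;
    return {
        url,
        userData: { datasetName, initialUrl: url },
    };
});

await crawler.run(requests);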