// Entry point(s) for the crawl.
const startUrls = ['https://weaviate.io/developers/weaviate'];

// Resolves '../storage/datasets/default' relative to this file's directory.
// NOTE(review): not referenced in this section — presumably consumed further
// down the file; confirm before removing.
const storageDir = path.join(__dirname, '../storage/datasets/default');

// Routes are registered BEFORE the crawler is started: `crawler.run()` only
// resolves once the crawl has finished, so any handler attached after that
// await would never be invoked during the crawl.

// Default route: handles requests that carry no label (the start URLs).
// It only enqueues the links found on the page, tagging each with the
// "devDocs" label so they are dispatched to the handler registered below.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
  log.info(`enqueueing new URLs`);
  await enqueueLinks({
    label: "devDocs", // has to match first arg of addHandler()
  });
});

// "devDocs" route: logs the page title together with the loaded URL, then
// stores whatever `scrapePage(page)` returns in the default Dataset.
router.addHandler("devDocs", async ({ request, page, log }) => {
  const title = await page.title();
  const url = request.loadedUrl;
  log.info(`${title}`, { url: url });
  await Dataset.pushData(await scrapePage(page));
});

// The crawler delegates every request to the router configured above.
const crawler = new PlaywrightCrawler({
  requestHandler: router,
});

// Start crawling; resolves when the crawl has finished.
await crawler.run(startUrls);