exotic-emerald · 2y ago

Not Outputting a File??

Hey there, I'm running this in an Appwrite Function and would like to avoid writing output to a file. I'm getting an error along the lines of "unable to write file.stream.output" because file is null. The crawler finishes either way, but I want to upload the JSON straight to my storage rather than have it as a file on disk, and all the config options seem to point at an output file.
exotic-emerald (OP) · 2y ago
@Helper just curious if there’s an easy way to do this (I assume I’m supposed to ping this role?)
Oleg V. · 2y ago
Please provide a reproduction plus the error stack trace.
exotic-emerald (OP) · 2y ago
Here's the log output:
An error occurred during the crawling process: TypeError: file.stream.pipe is not a function
// Imports reconstructed from usage; the paths for configureRouter and the
// request-options schema are assumptions based on this snippet.
import { execSync } from 'node:child_process';
import { Client, Storage, ID, Query, Permission, Role } from 'node-appwrite';
import { PuppeteerCrawler, downloadListOfUrls } from 'crawlee';
import { configureRouter } from './router.js';
import { crawlerReqOptions, type CrawlerReqOptions } from './schema.js';

export default async ({ req, res, log, error }: any) => {
  try {
    const client = new Client()
      .setEndpoint('https://cloud.appwrite.io/v1')
      .setProject(process.env.APPWRITE_FUNCTION_PROJECT_ID!)
      .setKey(process.env.APPWRITE_API_KEY!);
    const storage = new Storage(client);

    log('Starting to install puppeteer (execSync)');
    execSync('apk add /usr/local/server/src/function/*.apk');

    log('Starting to crawl URL');
    let data: CrawlerReqOptions;
    if (typeof req.body === 'string') {
      data = crawlerReqOptions.parse(JSON.parse(req.body));
    } else {
      data = crawlerReqOptions.parse(req.body);
    }
    const {
      name,
      baseUrl,
      maxLinks,
      globMatch,
      regexMatch,
      stayOnDomain,
      guildId,
    } = data;

    const router = configureRouter({
      // `??` instead of `||`: `stayOnDomain || true` is always true,
      // even when the caller explicitly passes false.
      stayOnDomain: stayOnDomain ?? true,
      globs: globMatch ? [globMatch] : undefined,
      regexps: regexMatch ? [new RegExp(regexMatch)] : undefined,
    });

    const crawler = new PuppeteerCrawler({
      requestHandler: router,
      maxRequestsPerCrawl: maxLinks,
      launchContext: {
        launchOptions: {
          executablePath: '/usr/bin/chromium-browser',
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--headless',
            '--disable-dev-shm-usage',
          ],
        },
      },
    });

    const isUrlASitemap = /sitemap.*\.xml$/.test(baseUrl);
    if (isUrlASitemap) {
      const listOfUrls = await downloadListOfUrls({ url: baseUrl });
      await crawler.addRequests(listOfUrls);
      // Note: this call is a no-op as written — the returned storage client
      // is never wired into the crawler's configuration.
      crawler.config.createMemoryStorage({});
      await crawler.run();
    } else {
      await crawler.run([baseUrl]);
    }

    const dataSet = await crawler.getData();
    log(`Crawling finished, ${dataSet.count} pages crawled`);

    // This is the line that ends up throwing: node-appwrite's createFile()
    // does not accept a web Blob, hence "file.stream.pipe is not a function".
    const file = new Blob([JSON.stringify(dataSet.items)], {
      type: 'application/json',
    });

    const found_buckets = await storage.listBuckets([
      Query.equal('$id', guildId),
    ]);
    if (found_buckets.total !== 0) {
      const found_bucket = await storage.getBucket(guildId);
      const fileUploaded = await storage.createFile(
        found_bucket.$id,
        ID.unique(),
        file
      );
      return res.send(fileUploaded.$id);
    } else {
      const bucket_name = `Guild_${guildId}`;
      const bucket = await storage.createBucket(
        guildId,
        bucket_name,
        [Permission.read(Role.any())],
        true,
        true,
        30000000, // 30 MB
        undefined,
        'gzip',
        true,
        true
      );
      const fileUploaded = await storage.createFile(
        bucket.$id,
        ID.unique(),
        file
      );
      return res.send(fileUploaded.$id);
    }
  } catch (err) {
    error(`An error occurred during the crawling process: ${err}`);
    return res.send({
      error: 'An error occurred during the crawling process.',
    });
  }
};
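
For what it's worth, the TypeError here most likely comes from passing a web Blob to node-appwrite's createFile(), which expects the SDK's InputFile type (internally it pipes a Node stream, which a Blob doesn't have). A minimal sketch of the upload path, assuming node-appwrite exposes InputFile.fromBuffer as it has in recent versions — the results.json filename is just an example:

import { InputFile } from 'node-appwrite';

// Serialize the dataset entirely in memory — no temp file needed.
const payload = Buffer.from(JSON.stringify(dataSet.items));

// InputFile.fromBuffer wraps the buffer so the SDK can stream it itself.
const fileUploaded = await storage.createFile(
  bucketId, // existing or freshly created bucket ID
  ID.unique(),
  InputFile.fromBuffer(payload, 'results.json')
);

That would let the function upload the crawl results without ever touching the filesystem.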
exotic-emerald (OP) · 2y ago
Here's my configureRouter:
// createPuppeteerRouter comes from crawlee; waitForXPath and getPageHtml
// are local helpers (sketched below).
import {
  createPuppeteerRouter,
  type PuppeteerCrawlingContext,
  type RegExpInput,
} from 'crawlee';
import { waitForXPath, getPageHtml } from './helpers.js';

export function configureRouter(options: {
  stayOnDomain: boolean; // note: currently accepted but never used below
  globs?: string[];
  regexps?: RegExpInput[];
  detailHandler?: (context: PuppeteerCrawlingContext) => Promise<void>;
  selector?: string;
}) {
  const router = createPuppeteerRouter();

  // Use the provided globs/regexps, if any.
  const globs = options.globs;
  const regexps = options.regexps;
  const selector = options.selector || 'body'; // default to 'body' if no selector is provided

  router.addDefaultHandler(async ({ enqueueLinks, log, page }) => {
    log.info(`Enqueueing new URLs`);
    await enqueueLinks({
      globs,
      regexps,
      label: 'detail',
    });

    // Extract content from the page using the (defaulted) selector.
    if (selector.startsWith('/')) {
      // A leading slash means the selector is an XPath; wait for it first.
      await waitForXPath(page, selector);
    }
    const content = await getPageHtml(page, selector);
    log.info(`Extracted content using selector: ${selector}`, { content });
  });

  // Use the provided detailHandler or fall back to a default implementation.
  const detailHandler =
    options.detailHandler ||
    (async ({ request, page, log, pushData }: PuppeteerCrawlingContext) => {
      const title = await page.title();
      log.info(`${title}`, { url: request.loadedUrl });

      // Extract content using the provided selector or XPath.
      const content = await getPageHtml(page, selector);
      await pushData({
        url: request.loadedUrl,
        title,
        content, // include the extracted content in the pushed data
      });
    });

  router.addHandler('detail', detailHandler);

  return router;
}
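
Since waitForXPath and getPageHtml aren't shown above, here is a minimal sketch of what they would need to look like for this router to run. These are assumed local helpers, not part of crawlee, and the implementations below are just one plausible version (page.waitForXPath is the Puppeteer API of that era):

import type { Page } from 'puppeteer';

// Wait until the XPath matches something on the page.
export async function waitForXPath(page: Page, xpath: string, timeout = 30_000) {
  await page.waitForXPath(xpath, { timeout });
}

// Return the HTML of the first node matched by a CSS selector or an XPath
// (a leading slash is treated as XPath, mirroring the router's convention).
export async function getPageHtml(page: Page, selector: string): Promise<string> {
  return page.evaluate((sel) => {
    if (sel.startsWith('/')) {
      const result = document.evaluate(
        sel,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
      );
      const node = result.singleNodeValue as HTMLElement | null;
      return node ? node.innerHTML : '';
    }
    const el = document.querySelector(sel);
    return el ? el.innerHTML : '';
  }, selector);
}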
@Oleg V. One thing to know, though: Appwrite Functions run in a Docker environment, so the error is more than likely related to that. But it wouldn't be an error at all if I could just avoid writing the output to a file.
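
On the "don't write to a file" part: Crawlee can be told not to persist its request queue and dataset to disk at all. A sketch, assuming a Crawlee 3.x Configuration with the persistStorage option (as documented for recent 3.x releases) and the router/maxLinks values from the function above:

import { PuppeteerCrawler, Configuration } from 'crawlee';

// Pass a Configuration as the second constructor argument so this crawler
// keeps its storages in memory instead of writing ./storage to disk.
const crawler = new PuppeteerCrawler(
  {
    requestHandler: router,
    maxRequestsPerCrawl: maxLinks,
  },
  new Configuration({ persistStorage: false })
);

await crawler.run([baseUrl]);

// The dataset API works the same — it just never touches the filesystem.
const { items } = await crawler.getData();

Combined with InputFile.fromBuffer for the upload, nothing in the function should need the filesystem at all.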
exotic-emerald (OP) · 2y ago
@Helper I guess I'll ping again, or just open a GitHub issue?
