exotic-emerald · 2y ago

Not Outputting a File??

Hey there, I'm running this in an Appwrite Function and would like to avoid writing output to a file. I'm getting an error along the lines of "unable to write file.stream.output" because file is null. The crawler finishes either way, but I want to upload the JSON straight to my storage rather than have it as a file on disk, and all the config options seem to point at an output file.
exotic-emerald (OP) · 2y ago
@Helper just curious if there’s an easy way to do this (I assume I’m supposed to ping this role?)
Oleg V. · 2y ago
Please provide a reproduction plus the error stack trace.
exotic-emerald (OP) · 2y ago
Here's the log output:
An error occurred during the crawling process: TypeError: file.stream.pipe is not a function
// Imports reconstructed from usage; the paths for configureRouter and the
// request-options schema are assumptions based on this snippet.
import { execSync } from 'node:child_process';
import { Client, Storage, ID, Query, Permission, Role } from 'node-appwrite';
import { PuppeteerCrawler, downloadListOfUrls } from 'crawlee';
import { configureRouter } from './router.js';
import { crawlerReqOptions, type CrawlerReqOptions } from './schema.js';

export default async ({ req, res, log, error }: any) => {
  try {
    const client = new Client()
      .setEndpoint('https://cloud.appwrite.io/v1')
      .setProject(process.env.APPWRITE_FUNCTION_PROJECT_ID!)
      .setKey(process.env.APPWRITE_API_KEY!);
    const storage = new Storage(client);

    log('Starting to install puppeteer (execSync)');
    execSync('apk add /usr/local/server/src/function/*.apk');

    log('Starting to crawl URL');
    let data: CrawlerReqOptions;
    if (typeof req.body === 'string') {
      data = crawlerReqOptions.parse(JSON.parse(req.body));
    } else {
      data = crawlerReqOptions.parse(req.body);
    }
    const {
      name,
      baseUrl,
      maxLinks,
      globMatch,
      regexMatch,
      stayOnDomain,
      guildId,
    } = data;

    const router = configureRouter({
      // `??` instead of `||`: `stayOnDomain || true` is always true,
      // even when the caller explicitly passes false.
      stayOnDomain: stayOnDomain ?? true,
      globs: globMatch ? [globMatch] : undefined,
      regexps: regexMatch ? [new RegExp(regexMatch)] : undefined,
    });

    const crawler = new PuppeteerCrawler({
      requestHandler: router,
      maxRequestsPerCrawl: maxLinks,
      launchContext: {
        launchOptions: {
          executablePath: '/usr/bin/chromium-browser',
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--headless',
            '--disable-dev-shm-usage',
          ],
        },
      },
    });

    const isUrlASitemap = /sitemap.*\.xml$/.test(baseUrl);
    if (isUrlASitemap) {
      const listOfUrls = await downloadListOfUrls({ url: baseUrl });
      await crawler.addRequests(listOfUrls);
      // Note: this call is a no-op as written — the returned storage client
      // is never wired into the crawler's configuration.
      crawler.config.createMemoryStorage({});
      await crawler.run();
    } else {
      await crawler.run([baseUrl]);
    }

    const dataSet = await crawler.getData();
    log(`Crawling finished, ${dataSet.count} pages crawled`);

    // This is the line that ends up throwing: node-appwrite's createFile()
    // does not accept a web Blob, hence "file.stream.pipe is not a function".
    const file = new Blob([JSON.stringify(dataSet.items)], {
      type: 'application/json',
    });

    const found_buckets = await storage.listBuckets([
      Query.equal('$id', guildId),
    ]);
    if (found_buckets.total !== 0) {
      const found_bucket = await storage.getBucket(guildId);
      const fileUploaded = await storage.createFile(
        found_bucket.$id,
        ID.unique(),
        file
      );
      return res.send(fileUploaded.$id);
    } else {
      const bucket_name = `Guild_${guildId}`;
      const bucket = await storage.createBucket(
        guildId,
        bucket_name,
        [Permission.read(Role.any())],
        true,
        true,
        30000000, // 30 MB
        undefined,
        'gzip',
        true,
        true
      );
      const fileUploaded = await storage.createFile(
        bucket.$id,
        ID.unique(),
        file
      );
      return res.send(fileUploaded.$id);
    }
  } catch (err) {
    error(`An error occurred during the crawling process: ${err}`);
    return res.send({
      error: 'An error occurred during the crawling process.',
    });
  }
};
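
For what it's worth, the TypeError here most likely comes from passing a web Blob to node-appwrite's createFile(), which expects the SDK's InputFile type (internally it pipes a Node stream, which a Blob doesn't have). A minimal sketch of the upload path, assuming node-appwrite exposes InputFile.fromBuffer as it has in recent versions — the results.json filename is just an example:

import { InputFile } from 'node-appwrite';

// Serialize the dataset entirely in memory — no temp file needed.
const payload = Buffer.from(JSON.stringify(dataSet.items));

// InputFile.fromBuffer wraps the buffer so the SDK can stream it itself.
const fileUploaded = await storage.createFile(
  bucketId, // existing or freshly created bucket ID
  ID.unique(),
  InputFile.fromBuffer(payload, 'results.json')
);

That would let the function upload the crawl results without ever touching the filesystem.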
exotic-emerald (OP) · 2y ago
Here's my configureRouter:
// createPuppeteerRouter comes from crawlee; waitForXPath and getPageHtml
// are local helpers (sketched below).
import {
  createPuppeteerRouter,
  type PuppeteerCrawlingContext,
  type RegExpInput,
} from 'crawlee';
import { waitForXPath, getPageHtml } from './helpers.js';

export function configureRouter(options: {
  stayOnDomain: boolean; // note: currently accepted but never used below
  globs?: string[];
  regexps?: RegExpInput[];
  detailHandler?: (context: PuppeteerCrawlingContext) => Promise<void>;
  selector?: string;
}) {
  const router = createPuppeteerRouter();

  // Use the provided globs/regexps, if any.
  const globs = options.globs;
  const regexps = options.regexps;
  const selector = options.selector || 'body'; // default to 'body' if no selector is provided

  router.addDefaultHandler(async ({ enqueueLinks, log, page }) => {
    log.info(`Enqueueing new URLs`);
    await enqueueLinks({
      globs,
      regexps,
      label: 'detail',
    });

    // Extract content from the page using the (defaulted) selector.
    if (selector.startsWith('/')) {
      // A leading slash means the selector is an XPath; wait for it first.
      await waitForXPath(page, selector);
    }
    const content = await getPageHtml(page, selector);
    log.info(`Extracted content using selector: ${selector}`, { content });
  });

  // Use the provided detailHandler or fall back to a default implementation.
  const detailHandler =
    options.detailHandler ||
    (async ({ request, page, log, pushData }: PuppeteerCrawlingContext) => {
      const title = await page.title();
      log.info(`${title}`, { url: request.loadedUrl });

      // Extract content using the provided selector or XPath.
      const content = await getPageHtml(page, selector);
      await pushData({
        url: request.loadedUrl,
        title,
        content, // include the extracted content in the pushed data
      });
    });

  router.addHandler('detail', detailHandler);

  return router;
}
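
Since waitForXPath and getPageHtml aren't shown above, here is a minimal sketch of what they would need to look like for this router to run. These are assumed local helpers, not part of crawlee, and the implementations below are just one plausible version (page.waitForXPath is the Puppeteer API of that era):

import type { Page } from 'puppeteer';

// Wait until the XPath matches something on the page.
export async function waitForXPath(page: Page, xpath: string, timeout = 30_000) {
  await page.waitForXPath(xpath, { timeout });
}

// Return the HTML of the first node matched by a CSS selector or an XPath
// (a leading slash is treated as XPath, mirroring the router's convention).
export async function getPageHtml(page: Page, selector: string): Promise<string> {
  return page.evaluate((sel) => {
    if (sel.startsWith('/')) {
      const result = document.evaluate(
        sel,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
      );
      const node = result.singleNodeValue as HTMLElement | null;
      return node ? node.innerHTML : '';
    }
    const el = document.querySelector(sel);
    return el ? el.innerHTML : '';
  }, selector);
}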
@Oleg V. One thing to know, though: Appwrite Functions run in a Docker environment, so the error is more than likely related to that. But it wouldn't be an error at all if I could just avoid writing the output to a file.
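
On the "don't write to a file" part: Crawlee can be told not to persist its request queue and dataset to disk at all. A sketch, assuming a Crawlee 3.x Configuration with the persistStorage option (as documented for recent 3.x releases) and the router/maxLinks values from the function above:

import { PuppeteerCrawler, Configuration } from 'crawlee';

// Pass a Configuration as the second constructor argument so this crawler
// keeps its storages in memory instead of writing ./storage to disk.
const crawler = new PuppeteerCrawler(
  {
    requestHandler: router,
    maxRequestsPerCrawl: maxLinks,
  },
  new Configuration({ persistStorage: false })
);

await crawler.run([baseUrl]);

// The dataset API works the same — it just never touches the filesystem.
const { items } = await crawler.getData();

Combined with InputFile.fromBuffer for the upload, nothing in the function should need the filesystem at all.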
exotic-emerald (OP) · 2y ago
@Helper I guess I'll ping again, or just open a GitHub issue?
