Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions Dockerfile.backend
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
FROM --platform=$BUILDPLATFORM node:20-slim
FROM node:20-slim

# Set working directory
WORKDIR /app

COPY .sequelizerc .sequelizerc
COPY .env .env

# Install node dependencies
COPY package*.json ./
COPY src ./src
COPY public ./public
COPY public ./public
COPY server ./server
COPY tsconfig.json ./
COPY server/tsconfig.json ./server/
# COPY server/start.sh ./
# COPY server/start.sh ./

# Install dependencies
RUN npm install --legacy-peer-deps

# Build TypeScript server
RUN npm run build:server

# Expose backend port
EXPOSE ${BACKEND_PORT:-8080}

# Run migrations & start backend using start script
# Run migrations & start backend using plain node
CMD ["npm", "run", "server"]
# CMD ["sh", "-c", "npm run migrate && npm run server"]
2 changes: 1 addition & 1 deletion Dockerfile.frontend
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM node:18-alpine AS builder
FROM node:18-alpine AS builder

WORKDIR /app

Expand Down
7 changes: 3 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "maxun",
"version": "0.0.28",
"version": "0.0.29",
"author": "Maxun",
"license": "AGPL-3.0-or-later",
"dependencies": {
Expand Down Expand Up @@ -83,12 +83,11 @@
"winston": "^3.5.1"
},
"scripts": {
"start": "concurrently -k \"npm run server\" \"npm run client\"",
"server": "cross-env NODE_OPTIONS='--max-old-space-size=8000' nodemon server/src/server.ts",
"start": "npm run build:server && concurrently -k \"npm run server\" \"npm run client\"",
"server": "cross-env NODE_OPTIONS='--max-old-space-size=512' node server/dist/server/src/server.js",
"client": "vite",
"build": "vite build",
"build:server": "tsc -p server/tsconfig.json",
"start:server": "cross-env NODE_OPTIONS='--max-old-space-size=8000' server/dist/server/src/server.js",
"preview": "vite preview",
"lint": "./node_modules/.bin/eslint .",
"migrate": "sequelize-cli db:migrate",
Expand Down
4 changes: 2 additions & 2 deletions server/docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ wait_for_postgres() {
wait_for_postgres

# Run the application with migrations before startup
NODE_OPTIONS="--max-old-space-size=4096" node -e "require('./server/src/db/migrate')().then(() => { console.log('Migration process completed.'); })"
NODE_OPTIONS="--max-old-space-size=4096" node -e "require('./server/dist/server/src/db/migrate')().then(() => { console.log('Migration process completed.'); })"

# Run the server normally
# Run the server normally
exec "$@"
49 changes: 46 additions & 3 deletions server/src/api/record.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import { WorkflowFile } from "maxun-core";
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
import { convertPageToHTML, convertPageToMarkdown, convertPageToScreenshot } from '../markdownify/scrape';

const router = Router();

Expand Down Expand Up @@ -689,7 +689,9 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[

// Override if API request defines formats
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
formats = requestedFormats.filter((f): f is 'markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage' =>
['markdown', 'html', 'screenshot-visible', 'screenshot-fullpage'].includes(f)
);
}

await run.update({
Expand All @@ -707,6 +709,7 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
let markdown = '';
let html = '';
const serializableOutput: any = {};
const binaryOutput: any = {};

const SCRAPE_TIMEOUT = 120000;

Expand All @@ -728,14 +731,52 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
serializableOutput.html = [{ content: html }];
}

if (formats.includes("screenshot-visible")) {
const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);

if (!binaryOutput['screenshot-visible']) {
binaryOutput['screenshot-visible'] = {
data: screenshotBuffer.toString('base64'),
mimeType: 'image/png'
};
}
}

if (formats.includes("screenshot-fullpage")) {
const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);

if (!binaryOutput['screenshot-fullpage']) {
binaryOutput['screenshot-fullpage'] = {
data: screenshotBuffer.toString('base64'),
mimeType: 'image/png'
};
}
}

await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ')} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
binaryOutput,
});

// Upload binary output (screenshots) to MinIO if present
let uploadedBinaryOutput: Record<string, string> = {};
if (Object.keys(binaryOutput).length > 0) {
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
uploadedBinaryOutput = await binaryOutputService.uploadAndStoreBinaryOutput(run, binaryOutput);
await run.update({ binaryOutput: uploadedBinaryOutput });
}

logger.log('info', `Markdown robot execution completed for API run ${id}`);

// Push success socket event
Expand Down Expand Up @@ -775,6 +816,8 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[

if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
if (uploadedBinaryOutput['screenshot-visible']) webhookPayload.screenshot_visible = uploadedBinaryOutput['screenshot-visible'];
if (uploadedBinaryOutput['screenshot-fullpage']) webhookPayload.screenshot_fullpage = uploadedBinaryOutput['screenshot-fullpage'];

try {
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
Expand Down
25 changes: 25 additions & 0 deletions server/src/markdownify/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,28 @@ export async function convertPageToHTML(url: string, page: Page): Promise<string
throw error;
}
}

/**
 * Captures a PNG screenshot of a page after navigating it to the given URL.
 *
 * The supplied page is first pointed at `url` through `gotoWithFallback`, and
 * `page.screenshot` is then invoked with `type: 'png'` and the requested
 * `fullPage` setting.
 *
 * @param url - The URL to navigate to and screenshot
 * @param page - Existing Playwright page instance to use
 * @param fullPage - Whether to capture the full scrollable page (true) or just visible viewport (false)
 * @returns The screenshot image data produced by `page.screenshot`
 * @throws Rethrows, after logging, whatever was thrown during navigation or capture
 */
export async function convertPageToScreenshot(url: string, page: Page, fullPage: boolean = false): Promise<Buffer> {
  try {
    const screenshotType = fullPage ? 'full page' : 'visible viewport';
    logger.log('info', `[Scrape] Taking ${screenshotType} screenshot of ${url}`);

    await gotoWithFallback(page, url);

    // Awaited inside the try block so a rejected capture is logged by the catch below.
    return await page.screenshot({
      type: 'png',
      fullPage
    });
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error instance; narrow before
    // reading `.message` so logging can never fail and mask the original failure.
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`[Scrape] Error during screenshot: ${reason}`);
    throw error;
  }
}
2 changes: 1 addition & 1 deletion server/src/models/Robot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ interface RobotMeta {
params: any[];
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[];
}

interface RobotWorkflow {
Expand Down
4 changes: 2 additions & 2 deletions server/src/models/Run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ interface RunAttributes {
runByUserId?: string;
runByScheduleId?: string;
runByAPI?: boolean;
serializableOutput: Record<string, any[]>;
serializableOutput: Record<string, any>;
binaryOutput: Record<string, string>;
retryCount?: number;
}
Expand All @@ -45,7 +45,7 @@ class Run extends Model<RunAttributes, RunCreationAttributes> implements RunAttr
public runByUserId!: string;
public runByScheduleId!: string;
public runByAPI!: boolean;
public serializableOutput!: Record<string, any[]>;
public serializableOutput!: Record<string, any>;
public binaryOutput!: Record<string, any>;
public retryCount!: number;
}
Expand Down
46 changes: 43 additions & 3 deletions server/src/pgboss-worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } fr
import { io as serverIo } from "./server";
import { sendWebhook } from './routes/webhook';
import { BinaryOutputService } from './storage/mino';
import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
import { convertPageToMarkdown, convertPageToHTML, convertPageToScreenshot } from './markdownify/scrape';

if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
Expand Down Expand Up @@ -244,6 +244,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
let markdown = '';
let html = '';
const serializableOutput: any = {};
const binaryOutput: any = {};

const SCRAPE_TIMEOUT = 120000;

Expand All @@ -265,15 +266,52 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
serializableOutput.html = [{ content: html }];
}

if (formats.includes("screenshot-visible")) {
const screenshotPromise = convertPageToScreenshot(url, currentPage, false);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);

if (!binaryOutput['screenshot-visible']) {
binaryOutput['screenshot-visible'] = {
data: screenshotBuffer.toString('base64'),
mimeType: 'image/png'
};
}
}

if (formats.includes("screenshot-fullpage")) {
const screenshotPromise = convertPageToScreenshot(url, currentPage, true);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Screenshot conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
const screenshotBuffer = await Promise.race([screenshotPromise, timeoutPromise]);

if (!binaryOutput['screenshot-fullpage']) {
binaryOutput['screenshot-fullpage'] = {
data: screenshotBuffer.toString('base64'),
mimeType: 'image/png'
};
}
}

// Success update
await run.update({
status: 'success',
finishedAt: new Date().toLocaleString(),
log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
serializableOutput,
binaryOutput: {},
binaryOutput,
});

let uploadedBinaryOutput: Record<string, string> = {};
if (Object.keys(binaryOutput).length > 0) {
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
uploadedBinaryOutput = await binaryOutputService.uploadAndStoreBinaryOutput(run, binaryOutput);
await run.update({ binaryOutput: uploadedBinaryOutput });
}

logger.log('info', `Markdown robot execution completed for run ${data.runId}`);

// Notify sockets
Expand Down Expand Up @@ -304,6 +342,8 @@ async function processRunExecution(job: Job<ExecuteRunData>) {

if (formats.includes('markdown')) webhookPayload.markdown = markdown;
if (formats.includes('html')) webhookPayload.html = html;
if (uploadedBinaryOutput['screenshot-visible']) webhookPayload.screenshot_visible = uploadedBinaryOutput['screenshot-visible'];
if (uploadedBinaryOutput['screenshot-fullpage']) webhookPayload.screenshot_fullpage = uploadedBinaryOutput['screenshot-fullpage'];

await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
Expand Down Expand Up @@ -427,7 +467,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {

logger.log('info', `Workflow execution completed for run ${data.runId}`);

const binaryOutputService = new BinaryOutputService('maxuncloud-run-screenshots');
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
const uploadedBinaryOutput = await binaryOutputService.uploadAndStoreBinaryOutput(
run,
interpretationInfo.binaryOutput
Expand Down
9 changes: 5 additions & 4 deletions server/src/routes/auth.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,13 @@ router.get(
requireSignIn,
async (req: AuthenticatedRequest, res) => {
try {
const { id } = req.params;
if (!id) {
return res.status(400).json({ message: "User ID is required" });
if (!req.user || !req.user.id) {
return res.status(401).json({ message: "Unauthorized" });
}

const user = await User.findByPk(id, {
const userId = req.user.id;

const user = await User.findByPk(userId, {
attributes: { exclude: ["password"] },
});

Expand Down
2 changes: 1 addition & 1 deletion server/src/routes/storage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedReques
}

// Validate format
const validFormats = ['markdown', 'html'];
const validFormats = ['markdown', 'html', 'screenshot-visible', 'screenshot-fullpage'];

if (!Array.isArray(formats) || formats.length === 0) {
return res.status(400).json({ error: 'At least one output format must be selected.' });
Expand Down
26 changes: 19 additions & 7 deletions server/src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -280,27 +280,39 @@ if (require.main === module) {
const run = await Run.findOne({ where: { browserId, status: 'running' } });
if (run) {
const limitedData = {
scrapeSchemaOutput: browser.interpreter.serializableDataByType?.scrapeSchema
? { "schema-tabular": browser.interpreter.serializableDataByType.scrapeSchema }
: {},
scrapeSchemaOutput: browser.interpreter.serializableDataByType?.scrapeSchema || {},
scrapeListOutput: browser.interpreter.serializableDataByType?.scrapeList || {},
binaryOutput: browser.interpreter.binaryData || []
};

const binaryOutputRecord = limitedData.binaryOutput.reduce((acc: Record<string, any>, item: any, index: number) => {
acc[`item-${index}`] = item;
const key = item.name || `Screenshot ${index + 1}`;
acc[key] = { data: item.data, mimeType: item.mimeType };
return acc;
}, {});

let uploadedBinaryOutput = {};
if (Object.keys(binaryOutputRecord).length > 0) {
try {
const { BinaryOutputService } = require('./storage/mino');
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
uploadedBinaryOutput = await binaryOutputService.uploadAndStoreBinaryOutput(run, binaryOutputRecord);
logger.log('info', `Successfully uploaded ${Object.keys(uploadedBinaryOutput).length} screenshots to MinIO for interrupted run`);
} catch (minioError: any) {
logger.log('error', `Failed to upload binary data to MinIO during shutdown: ${minioError.message}`);
uploadedBinaryOutput = binaryOutputRecord;
}
}

await run.update({
status: 'failed',
finishedAt: new Date().toLocaleString(),
log: 'Process interrupted during execution - partial data preserved',
serializableOutput: {
scrapeSchema: Object.values(limitedData.scrapeSchemaOutput),
scrapeList: Object.values(limitedData.scrapeListOutput),
scrapeSchema: limitedData.scrapeSchemaOutput,
scrapeList: limitedData.scrapeListOutput,
},
binaryOutput: binaryOutputRecord
binaryOutput: uploadedBinaryOutput
});
}
}
Expand Down
Loading