diff --git a/package.json b/package.json index 5eae375b5..a3ca8d826 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,7 @@ "author": "Maxun", "license": "AGPL-3.0-or-later", "dependencies": { + "@anthropic-ai/sdk": "^0.71.2", "@cliqz/adblocker-playwright": "^1.30.0", "@emotion/react": "^11.9.0", "@emotion/styled": "^11.8.1", diff --git a/server/src/api/sdk.ts b/server/src/api/sdk.ts new file mode 100644 index 000000000..1a049d580 --- /dev/null +++ b/server/src/api/sdk.ts @@ -0,0 +1,717 @@ +/** + * SDK API Routes + * Separate API endpoints specifically for Maxun SDKs + * All routes require API key authentication + */ + +import { Router, Request, Response } from 'express'; +import { requireAPIKey } from "../middlewares/api"; +import Robot from "../models/Robot"; +import Run from "../models/Run"; +import { v4 as uuid } from 'uuid'; +import { WorkflowFile } from "maxun-core"; +import logger from "../logger"; +import { capture } from "../utils/analytics"; +import { handleRunRecording } from "./record"; +import { WorkflowEnricher } from "../sdk/workflowEnricher"; +import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule'; +import { computeNextRun } from "../utils/schedule"; +import moment from 'moment-timezone'; + +const router = Router(); + +interface AuthenticatedRequest extends Request { + user?: any; +} + +/** + * Create a new robot programmatically + * POST /api/sdk/robots + */ +router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => { + try { + const user = req.user; + const workflowFile: WorkflowFile = req.body; + + if (!workflowFile.meta || !workflowFile.workflow) { + return res.status(400).json({ + error: "Invalid workflow structure. 
Expected { meta, workflow }" + }); + } + + if (!workflowFile.meta.name) { + return res.status(400).json({ + error: "Robot name is required in meta.name" + }); + } + + const type = (workflowFile.meta as any).type || 'extract'; + + let enrichedWorkflow: any[] = []; + let extractedUrl: string | undefined; + + if (type === 'scrape') { + enrichedWorkflow = []; + extractedUrl = (workflowFile.meta as any).url; + + if (!extractedUrl) { + return res.status(400).json({ + error: "URL is required for scrape robots" + }); + } + } else { + const enrichResult = await WorkflowEnricher.enrichWorkflow(workflowFile.workflow, user.id); + + if (!enrichResult.success) { + logger.error("[SDK] Error in Selector Validation:\n" + JSON.stringify(enrichResult.errors, null, 2)) + + return res.status(400).json({ + error: "Workflow validation failed", + details: enrichResult.errors + }); + } + + enrichedWorkflow = enrichResult.workflow!; + extractedUrl = enrichResult.url; + } + + const robotId = uuid(); + const metaId = uuid(); + + const robotMeta: any = { + name: workflowFile.meta.name, + id: metaId, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + pairs: enrichedWorkflow.length, + params: [], + type, + url: extractedUrl, + formats: (workflowFile.meta as any).formats || [], + }; + + const robot = await Robot.create({ + id: robotId, + userId: user.id, + recording_meta: robotMeta, + recording: { + workflow: enrichedWorkflow + } + }); + + capture("maxun-oss-robot-created", { + robot_meta: robot.recording_meta, + recording: robot.recording, + }); + + return res.status(201).json({ + data: robot, + message: "Robot created successfully" + }); + + } catch (error: any) { + logger.error("[SDK] Error creating robot:", error); + return res.status(500).json({ + error: "Failed to create robot", + message: error.message + }); + } +}); + +/** + * List all robots for the authenticated user + * GET /api/sdk/robots + */ +router.get("/sdk/robots", requireAPIKey, async (req: 
AuthenticatedRequest, res: Response) => {
+    try {
+        // Scope the listing to the owner of the API key, as the route contract
+        // ("for the authenticated user") states. An unfiltered findAll() returned
+        // every user's robots to any caller holding a valid key.
+        const robots = await Robot.findAll({
+            where: {
+                userId: req.user.id
+            }
+        });
+
+        return res.status(200).json({
+            data: robots
+        });
+    } catch (error: any) {
+        logger.error("[SDK] Error listing robots:", error);
+        return res.status(500).json({
+            error: "Failed to list robots",
+            message: error.message
+        });
+    }
+});
+
+/**
+ * Get a specific robot by ID
+ * GET /api/sdk/robots/:id
+ *
+ * `:id` is the robot's `recording_meta.id` (not the primary key of the row).
+ * Responds 404 when no robot matches, 500 on lookup failure.
+ */
+router.get("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
+    try {
+        const robotId = req.params.id;
+
+        // NOTE(review): the lookup is by recording_meta.id only and is not scoped to
+        // req.user — confirm whether robots are meant to be shared across users.
+        const robot = await Robot.findOne({
+            where: {
+                'recording_meta.id': robotId
+            }
+        });
+
+        if (!robot) {
+            return res.status(404).json({
+                error: "Robot not found"
+            });
+        }
+
+        return res.status(200).json({
+            data: robot
+        });
+    } catch (error: any) {
+        logger.error("[SDK] Error getting robot:", error);
+        return res.status(500).json({
+            error: "Failed to get robot",
+            message: error.message
+        });
+    }
+});
+
+/**
+ * Update a robot
+ * PUT /api/sdk/robots/:id
+ *
+ * Only the fields present in the request body are written; everything else on
+ * the robot is left as is. `meta` is shallow-merged into the stored
+ * `recording_meta`, `workflow` replaces the stored recording.
+ */
+router.put("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
+    try {
+        const robotId = req.params.id;
+        const updates = req.body;
+
+        const robot = await Robot.findOne({
+            where: {
+                'recording_meta.id': robotId
+            }
+        });
+
+        if (!robot) {
+            return res.status(404).json({
+                error: "Robot not found"
+            });
+        }
+
+        const updateData: any = {};
+
+        if (updates.workflow) {
+            updateData.recording = {
+                workflow: updates.workflow
+            };
+        }
+
+        if (updates.meta) {
+            updateData.recording_meta = {
+                ...robot.recording_meta,
+                ...updates.meta,
+                // The meta id is the key every route looks the robot up by and the
+                // value runs are linked through (robotMetaId); a client-supplied
+                // `meta.id` must never be able to replace it.
+                id: robot.recording_meta.id,
+                updatedAt: new Date().toISOString()
+            };
+        }
+
+        if (updates.google_sheet_email !== undefined) {
+            updateData.google_sheet_email = updates.google_sheet_email;
+        }
+        if (updates.google_sheet_name !== undefined) {
+            updateData.google_sheet_name = updates.google_sheet_name;
+        }
+        if (updates.airtable_base_id !== undefined) {
+            updateData.airtable_base_id = updates.airtable_base_id;
+        }
+        if 
(updates.airtable_table_name !== undefined) { + updateData.airtable_table_name = updates.airtable_table_name; + } + + if (updates.schedule !== undefined) { + if (updates.schedule === null) { + try { + await cancelScheduledWorkflow(robotId); + } catch (cancelError) { + logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`); + } + updateData.schedule = null; + } else { + const { + runEvery, + runEveryUnit, + timezone, + startFrom = 'SUNDAY', + dayOfMonth = 1, + atTimeStart = '00:00', + atTimeEnd = '23:59' + } = updates.schedule; + + if (!runEvery || !runEveryUnit || !timezone) { + return res.status(400).json({ + error: "Missing required schedule parameters: runEvery, runEveryUnit, timezone" + }); + } + + if (!moment.tz.zone(timezone)) { + return res.status(400).json({ + error: "Invalid timezone" + }); + } + + const [startHours, startMinutes] = atTimeStart.split(':').map(Number); + const [endHours, endMinutes] = atTimeEnd.split(':').map(Number); + + if (isNaN(startHours) || isNaN(startMinutes) || isNaN(endHours) || isNaN(endMinutes) || + startHours < 0 || startHours > 23 || startMinutes < 0 || startMinutes > 59 || + endHours < 0 || endHours > 23 || endMinutes < 0 || endMinutes > 59) { + return res.status(400).json({ error: 'Invalid time format. Expected HH:MM (e.g., 09:30)' }); + } + + const days = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY']; + if (!days.includes(startFrom)) { + return res.status(400).json({ error: 'Invalid startFrom day. 
Must be one of: SUNDAY, MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY' });
+                }
+
+                // `runEvery` is interpolated into the cron string below, so it has to be
+                // a plain positive integer; anything else produces a malformed expression.
+                const every = Number(runEvery);
+                if (!Number.isInteger(every) || every < 1) {
+                    return res.status(400).json({
+                        error: "Invalid runEvery. Must be a positive integer"
+                    });
+                }
+
+                let cronExpression: string;
+                const dayIndex = days.indexOf(startFrom);
+
+                // Build a standard five-field cron expression:
+                //   minute  hour  day-of-month  month  day-of-week
+                switch (runEveryUnit) {
+                    case 'MINUTES':
+                        cronExpression = `*/${every} * * * *`;
+                        break;
+                    case 'HOURS':
+                        cronExpression = `${startMinutes} */${every} * * *`;
+                        break;
+                    case 'DAYS':
+                        cronExpression = `${startMinutes} ${startHours} */${every} * *`;
+                        break;
+                    case 'WEEKS':
+                        // Runs once a week on `startFrom`; `runEvery` is not encoded here.
+                        cronExpression = `${startMinutes} ${startHours} * * ${dayIndex}`;
+                        break;
+                    case 'MONTHS': {
+                        const monthDay = Number(dayOfMonth);
+                        if (!Number.isInteger(monthDay) || monthDay < 1 || monthDay > 31) {
+                            return res.status(400).json({
+                                error: "Invalid dayOfMonth. Must be an integer between 1 and 31"
+                            });
+                        }
+                        // The day is fully determined by `dayOfMonth`. Previously the weekday
+                        // index was appended as a SIXTH field whenever startFrom was not
+                        // SUNDAY, which is not a valid five-field expression (and a parser
+                        // that accepts a seconds column would shift every field by one).
+                        cronExpression = `${startMinutes} ${startHours} ${monthDay} */${every} *`;
+                        break;
+                    }
+                    default:
+                        return res.status(400).json({
+                            error: "Invalid runEveryUnit. Must be one of: MINUTES, HOURS, DAYS, WEEKS, MONTHS"
+                        });
+                }
+
+                // Replace, rather than stack, any schedule that already exists. A failed
+                // cancel is logged and does not block the new schedule.
+                try {
+                    await cancelScheduledWorkflow(robotId);
+                } catch (cancelError) {
+                    logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`);
+                }
+
+                try {
+                    await scheduleWorkflow(robotId, req.user.id, cronExpression, timezone);
+                } catch (scheduleError: any) {
+                    logger.error(`[SDK] Failed to schedule workflow for robot ${robotId}: ${scheduleError.message}`);
+                    return res.status(500).json({
+                        error: "Failed to schedule workflow",
+                        message: scheduleError.message
+                    });
+                }
+
+                const nextRunAt = computeNextRun(cronExpression, timezone);
+
+                // Persist the request parameters as received, plus the derived cron string.
+                updateData.schedule = {
+                    runEvery,
+                    runEveryUnit,
+                    timezone,
+                    startFrom,
+                    dayOfMonth,
+                    atTimeStart,
+                    atTimeEnd,
+                    cronExpression,
+                    lastRunAt: undefined,
+                    nextRunAt: nextRunAt || undefined,
+                };
+
+                logger.info(`[SDK] Scheduled robot ${robotId} with cron: ${cronExpression} in timezone: ${timezone}`);
+            }
+        }
+
+        if (updates.webhooks !== undefined) {
+            updateData.webhooks = updates.webhooks;
+        }
+
+        if (updates.proxy_url !== undefined) {
+            updateData.proxy_url = updates.proxy_url;
+        }
+        if (updates.proxy_username !== undefined) {
+            updateData.proxy_username = 
updates.proxy_username;
+        }
+        if (updates.proxy_password !== undefined) {
+            updateData.proxy_password = updates.proxy_password;
+        }
+
+        await robot.update(updateData);
+
+        logger.info(`[SDK] Robot updated: ${robotId}`);
+
+        return res.status(200).json({
+            data: robot,
+            message: "Robot updated successfully"
+        });
+    } catch (error: any) {
+        logger.error("[SDK] Error updating robot:", error);
+        return res.status(500).json({
+            error: "Failed to update robot",
+            message: error.message
+        });
+    }
+});
+
+/**
+ * Delete a robot
+ * DELETE /api/sdk/robots/:id
+ *
+ * Removes the robot identified by `recording_meta.id` together with all of its
+ * runs, and cancels any schedule registered for it.
+ */
+router.delete("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
+    try {
+        const robotId = req.params.id;
+
+        const robot = await Robot.findOne({
+            where: {
+                'recording_meta.id': robotId
+            }
+        });
+
+        if (!robot) {
+            return res.status(404).json({
+                error: "Robot not found"
+            });
+        }
+
+        // Best effort: drop any schedule registered for this robot so the scheduler
+        // does not keep firing for a robot that no longer exists. Same pattern as the
+        // update route; a failed cancel must not block the delete.
+        try {
+            await cancelScheduledWorkflow(robotId);
+        } catch (cancelError) {
+            logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`);
+        }
+
+        // Runs reference the robot through its meta id; remove them first.
+        await Run.destroy({
+            where: {
+                robotMetaId: robot.recording_meta.id
+            }
+        });
+
+        await robot.destroy();
+
+        logger.info(`[SDK] Robot deleted: ${robotId}`);
+
+        capture(
+            'maxun-oss-robot-deleted',
+            {
+                robotId: robotId,
+                user_id: req.user?.id,
+                deleted_at: new Date().toISOString(),
+            }
+        )
+
+        return res.status(200).json({
+            message: "Robot deleted successfully"
+        });
+    } catch (error: any) {
+        logger.error("[SDK] Error deleting robot:", error);
+        return res.status(500).json({
+            error: "Failed to delete robot",
+            message: error.message
+        });
+    }
+});
+
+/**
+ * Execute a robot
+ * POST /api/sdk/robots/:id/execute
+ *
+ * Starts a run and keeps the HTTP request open until the run reaches a terminal
+ * state, then returns its extracted data. A failed, aborted or timed-out run is
+ * reported as a 500.
+ */
+router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
+    try {
+        const user = req.user;
+        const robotId = req.params.id;
+
+        logger.info(`[SDK] Starting execution for robot ${robotId}`);
+
+        const runId = await handleRunRecording(robotId, user.id.toString());
+        if (!runId) {
+            throw new Error('Failed to start robot execution');
+        }
+
+        // waitForRunCompletion's second parameter is the poll interval in ms. The
+        // user id used to be passed here by mistake, turning the id into the polling
+        // delay; rely on the default interval instead.
+        const run = await waitForRunCompletion(runId);
+
+        // scrapeList output is accepted in three shapes: { scrapeList: [...] },
+        // a bare array, or an object whose first value is the array.
+        let listData: any[] = [];
+        if (run.serializableOutput?.scrapeList) {
+            const scrapeList: any = run.serializableOutput.scrapeList;
+
+            if (scrapeList.scrapeList && Array.isArray(scrapeList.scrapeList)) {
+                listData = scrapeList.scrapeList;
+            } else if (Array.isArray(scrapeList)) {
+                listData = scrapeList;
+            } else if (typeof scrapeList === 'object') {
+                const listValues = Object.values(scrapeList);
+                if (listValues.length > 0 && Array.isArray(listValues[0])) {
+                    listData = listValues[0] as any[];
+                }
+            }
+        }
+
+        return res.status(200).json({
+            data: {
+                runId: run.runId,
+                status: run.status,
+                data: {
+                    textData: run.serializableOutput?.scrapeSchema || {},
+                    listData: listData
+                },
+                screenshots: Object.values(run.binaryOutput || {})
+            }
+        });
+    } catch (error: any) {
+        logger.error("[SDK] Error executing robot:", error);
+        return res.status(500).json({
+            error: "Failed to execute robot",
+            message: error.message
+        });
+    }
+});
+
+/**
+ * Wait for run completion
+ *
+ * Polls the run row until its status is `success`, then resolves with the run
+ * as a plain object.
+ *
+ * @param runId    Identifier of the run to watch.
+ * @param interval Delay between polls in milliseconds. Values that are not a
+ *                 positive finite number fall back to the 2000 ms default.
+ * @throws Error when the run cannot be found, ends as `failed` or `aborted`,
+ *         or has not finished after 3 hours.
+ */
+async function waitForRunCompletion(runId: string, interval: number = 2000) {
+    const MAX_WAIT_TIME = 180 * 60 * 1000;
+    const DEFAULT_INTERVAL = 2000;
+    // Never let a bad argument (string, NaN, 0, negative) turn this into a tight
+    // loop of database queries.
+    const pollInterval = typeof interval === 'number' && Number.isFinite(interval) && interval > 0
+        ? interval
+        : DEFAULT_INTERVAL;
+    const startTime = Date.now();
+
+    while (true) {
+        if (Date.now() - startTime > MAX_WAIT_TIME) {
+            throw new Error('Run completion timeout after 3 hours');
+        }
+
+        const run = await Run.findOne({ where: { runId } });
+        if (!run) throw new Error('Run not found');
+
+        if (run.status === 'success') {
+            return run.toJSON();
+        } else if (run.status === 'failed') {
+            throw new Error('Run failed');
+        } else if (run.status === 'aborted') {
+            throw new Error('Run was aborted');
+        }
+
+        // Any other status is treated as still in progress.
+        await new Promise(resolve => setTimeout(resolve, pollInterval));
+    }
+}
+
+/**
+ * Get all runs for a robot
+ * GET /api/sdk/robots/:id/runs
+ */
+router.get("/sdk/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
+    try {
+        const robotId = req.params.id;
+
+        const robot = await Robot.findOne({
+            where: {
+                'recording_meta.id': robotId
+            }
+        });
+
+        if (!robot) {
+            return 
res.status(404).json({ + error: "Robot not found" + }); + } + + const runs = await Run.findAll({ + where: { + robotMetaId: robot.recording_meta.id + }, + order: [['startedAt', 'DESC']] + }); + + return res.status(200).json({ + data: runs + }); + } catch (error: any) { + logger.error("[SDK] Error getting runs:", error); + return res.status(500).json({ + error: "Failed to get runs", + message: error.message + }); + } +}); + +/** + * Get a specific run + * GET /api/sdk/robots/:id/runs/:runId + */ +router.get("/sdk/robots/:id/runs/:runId", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => { + try { + const robotId = req.params.id; + const runId = req.params.runId; + + const robot = await Robot.findOne({ + where: { + 'recording_meta.id': robotId + } + }); + + if (!robot) { + return res.status(404).json({ + error: "Robot not found" + }); + } + + const run = await Run.findOne({ + where: { + runId: runId, + robotMetaId: robot.recording_meta.id + } + }); + + if (!run) { + return res.status(404).json({ + error: "Run not found" + }); + } + + return res.status(200).json({ + data: run + }); + } catch (error: any) { + logger.error("[SDK] Error getting run:", error); + return res.status(500).json({ + error: "Failed to get run", + message: error.message + }); + } +}); + +/** + * Abort a running execution + * POST /api/sdk/robots/:id/runs/:runId/abort + */ +router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => { + try { + const robotId = req.params.id; + const runId = req.params.runId; + + const robot = await Robot.findOne({ + where: { + 'recording_meta.id': robotId + } + }); + + if (!robot) { + return res.status(404).json({ + error: "Robot not found" + }); + } + + const run = await Run.findOne({ + where: { + runId: runId, + robotMetaId: robot.recording_meta.id + } + }); + + if (!run) { + return res.status(404).json({ + error: "Run not found" + }); + } + + if (run.status !== 'running' && run.status !== 
'queued') { + return res.status(400).json({ + error: "Run is not in a state that can be aborted", + currentStatus: run.status + }); + } + + await run.update({ status: 'aborted' }); + + logger.info(`[SDK] Run ${runId} marked for abortion`); + + return res.status(200).json({ + message: "Run abortion initiated", + data: run + }); + } catch (error: any) { + logger.error("[SDK] Error aborting run:", error); + return res.status(500).json({ + error: "Failed to abort run", + message: error.message + }); + } +}); + +/** + * LLM-based extraction - generate workflow from natural language prompt + * POST /api/sdk/extract/llm + */ +router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => { + try { + const user = req.user + const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body; + + if (!url || !prompt) { + return res.status(400).json({ + error: "URL and prompt are required" + }); + } + + const workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, user.id, { + provider: llmProvider, + model: llmModel, + apiKey: llmApiKey, + baseUrl: llmBaseUrl + }); + + if (!workflowResult.success || !workflowResult.workflow) { + return res.status(400).json({ + error: "Failed to generate workflow from prompt", + details: workflowResult.errors + }); + } + + const robotId = uuid(); + const metaId = uuid(); + + const robotMeta: any = { + name: robotName || `LLM Extract: ${prompt.substring(0, 50)}`, + id: metaId, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + pairs: workflowResult.workflow.length, + params: [], + type: 'extract', + url: workflowResult.url, + }; + + const robot = await Robot.create({ + id: robotId, + userId: user.id, + recording_meta: robotMeta, + recording: { + workflow: workflowResult.workflow + }, + }); + + logger.info(`[SDK] Persistent robot created: ${metaId} for LLM extraction`); + + capture("maxun-oss-robot-created", { + robot_meta: 
robot.recording_meta, + recording: robot.recording, + }); + + return res.status(200).json({ + success: true, + data: { + robotId: metaId, + name: robotMeta.name, + description: prompt, + url: workflowResult.url, + workflow: workflowResult.workflow + } + }); + } catch (error: any) { + logger.error("[SDK] Error in LLM extraction:", error); + return res.status(500).json({ + error: "Failed to perform LLM extraction", + message: error.message + }); + } +}); + +export default router; diff --git a/server/src/browser-management/controller.ts b/server/src/browser-management/controller.ts index 4058fa56a..a0fbf5a2a 100644 --- a/server/src/browser-management/controller.ts +++ b/server/src/browser-management/controller.ts @@ -4,7 +4,7 @@ */ import { Socket } from "socket.io"; import { v4 as uuid } from "uuid"; - +import { Page } from "playwright-core"; import { createSocketConnection, createSocketConnectionForRun } from "../socket-connection/connection"; import { io, browserPool } from "../server"; import { RemoteBrowser } from "./classes/RemoteBrowser"; @@ -434,3 +434,63 @@ const initializeBrowserAsync = async (id: string, userId: string) => { throw error; } }; + +/** + * Creates a RemoteBrowser instance specifically for SDK validation + * Uses dummy socket and returns browser ID and Page for validation tasks + * @param userId User ID for browser ownership + * @returns Promise with browser ID and Page instance + * @category BrowserManagement-Controller + */ +export const createRemoteBrowserForValidation = async ( + userId: string +): Promise<{ browserId: string; page: Page }> => { + const id = uuid(); + + logger.log('info', `Creating validation browser ${id} for user ${userId}`); + + try { + const dummySocket = { + emit: (event: string, data?: any) => { + logger.log('debug', `Browser ${id} emitted ${event}`); + }, + on: () => {}, + off: () => {}, + id: `validation-${id}`, + } as any; + + const browserSession = new RemoteBrowser(dummySocket, userId, id); + + const 
VALIDATION_INIT_TIMEOUT = 45000;
+    const initPromise = browserSession.initialize(userId);
+    let initTimer: ReturnType<typeof setTimeout> | undefined;
+    const timeoutPromise = new Promise<never>((_, reject) => {
+      initTimer = setTimeout(() => reject(new Error('Validation browser initialization timeout')), VALIDATION_INIT_TIMEOUT);
+    });
+
+    try {
+      await Promise.race([initPromise, timeoutPromise]);
+    } catch (initError) {
+      // The session has not been added to the pool yet, so it is switched off
+      // directly here — the same way the "failed to add" branch below does —
+      // instead of relying on pool-based cleanup to find it.
+      try {
+        await browserSession.switchOff();
+      } catch (switchOffError) {
+        logger.log('warn', `Failed to switch off browser ${id}: ${switchOffError}`);
+      }
+      throw initError;
+    } finally {
+      // Always release the timer; otherwise it stays pending for the full
+      // timeout even after initialization has finished.
+      clearTimeout(initTimer);
+    }
+
+    const added = browserPool.addRemoteBrowser(id, browserSession, userId, true, 'run');
+    if (!added) {
+      await browserSession.switchOff();
+      throw new Error('Failed to add validation browser to pool');
+    }
+
+    const page = browserSession.getCurrentPage();
+    if (!page) {
+      await destroyRemoteBrowser(id, userId);
+      throw new Error('Failed to get page from validation browser');
+    }
+
+    logger.log('info', `Browser ${id} initialized successfully`);
+
+    return { browserId: id, page };
+  } catch (error: any) {
+    logger.log('error', `Failed to create validation browser ${id}: ${error.message}`);
+    // Best-effort cleanup of anything registered under this id; a cleanup failure
+    // is logged and the original error is rethrown.
+    try {
+      await destroyRemoteBrowser(id, userId);
+    } catch (cleanupError) {
+      logger.log('warn', `Failed to cleanup browser ${id}: ${cleanupError}`);
+    }
+    throw error;
+  }
+};
\ No newline at end of file
diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 3941b01ff..7d50b780d 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -894,7 +894,7 @@ router.put('/schedule/:id/', requireSignIn, async (req: AuthenticatedRequest, re logger.log('warn', `Failed to cancel existing schedule for robot ${id}: ${cancelError}`); } - const jobId = await scheduleWorkflow(id, req.user.id, cronExpression, timezone); + await scheduleWorkflow(id, req.user.id, cronExpression, timezone); const nextRunAt = computeNextRun(cronExpression, timezone); diff --git a/server/src/schedule-worker.ts b/server/src/schedule-worker.ts index 63c7bad46..75c578c29 100644 --- a/server/src/schedule-worker.ts +++ b/server/src/schedule-worker.ts @@ -13,7 +13,7 @@ if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || const 
pgBossConnectionString = `postgresql://${process.env.DB_USER}:${encodeURIComponent(process.env.DB_PASSWORD)}@${process.env.DB_HOST}:${process.env.DB_PORT}/${process.env.DB_NAME}`; -const pgBoss = new PgBoss({ +export const pgBoss = new PgBoss({ connectionString: pgBossConnectionString, max: 3, expireInHours: 23, @@ -87,8 +87,9 @@ async function registerScheduledWorkflowWorker() { /** * Register a worker for a specific queue + * Exported to allow dynamic registration when new schedules are created */ -async function registerWorkerForQueue(queueName: string) { +export async function registerWorkerForQueue(queueName: string) { try { if (registeredQueues.has(queueName)) { return; diff --git a/server/src/sdk/browserSide/pageAnalyzer.js b/server/src/sdk/browserSide/pageAnalyzer.js new file mode 100644 index 000000000..7215ab79d --- /dev/null +++ b/server/src/sdk/browserSide/pageAnalyzer.js @@ -0,0 +1,2649 @@ +/** + * Page Analyzer for pagination auto-detection, selector generation and grouping + */ + +(function () { + 'use strict'; + + /** + * Helper function to evaluate both CSS and XPath selectors + * Returns array of matching elements + */ + function evaluateSelector(selector, doc) { + try { + const isXPath = selector.startsWith('//') || selector.startsWith('(//'); + + if (isXPath) { + const result = doc.evaluate( + selector, + doc, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); + + const elements = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node); + } + } + return elements; + } else { + return Array.from(doc.querySelectorAll(selector)); + } + } catch (err) { + return []; + } + } + + /** + * Convert CSS selector to XPath + */ + function cssToXPath(cssSelector) { + if (cssSelector.startsWith('//') || cssSelector.startsWith('/')) { + return cssSelector; + } + + try { + let xpath = ''; + + const parts = 
cssSelector.split(/\s+(?![^[]*])/); + + for (let i = 0; i < parts.length; i++) { + const part = parts[i].trim(); + if (!part) continue; + if (part === '>') continue; + + const xpathPart = convertCssPart(part); + if (i === 0) { + xpath = '//' + xpathPart; + } else if (parts[i - 1] === '>') { + xpath += '/' + xpathPart; + } else { + xpath += '//' + xpathPart; + } + } + + return xpath || `//*`; + } catch (error) { + return `//*`; + } + } + + /** + * Convert a single CSS selector part to XPath + */ + function convertCssPart(cssPart) { + const tagMatch = cssPart.match(/^([a-zA-Z][\w-]*|\*)/); + const tag = tagMatch ? tagMatch[1] : '*'; + + const predicates = []; + + const idMatch = cssPart.match(/#([\w-]+)/); + if (idMatch) { + predicates.push(`@id='${idMatch[1]}'`); + } + + const classMatches = cssPart.match(/\.((?:\\.|[^.#[\s])+)/g); + if (classMatches) { + classMatches.forEach(cls => { + let className = cls.substring(1).replace(/\\/g, ''); + predicates.push(`contains(@class, '${className}')`); + }); + } + + const attrMatches = cssPart.match(/\[([^\]]+)\]/g); + if (attrMatches) { + attrMatches.forEach(attr => { + const content = attr.slice(1, -1); + const eqMatch = content.match(/([^=]+)="([^"]+)"/); + if (eqMatch) { + predicates.push(`@${eqMatch[1]}='${eqMatch[2]}'`); + } else { + predicates.push(`@${content}`); + } + }); + } + + if (predicates.length > 0) { + return `${tag}[${predicates.join(' and ')}]`; + } + return tag; + } + + /** + * Main entry point for SDK - auto-converts CSS to XPath + */ + window.autoDetectListFields = function (selector) { + try { + let xpathSelector = cssToXPath(selector); + + const testElements = evaluateXPath(xpathSelector, document); + + if (testElements.length === 0) { + console.error('No elements matched the XPath selector!'); + return { + fields: {}, + listSelector: xpathSelector, + listFallbackSelector: null, + error: 'Selector did not match any elements on the page' + }; + } + + if (testElements.length > 0 && 
!xpathSelector.includes('count(*)')) { + const childCounts = testElements.slice(0, 5).map(el => el.children.length); + const uniqueCounts = [...new Set(childCounts)]; + + if (uniqueCounts.length > 1 && childCounts.filter(c => c === 1).length > childCounts.length / 2) { + if (xpathSelector.includes('[') && xpathSelector.endsWith(']')) { + xpathSelector = xpathSelector.slice(0, -1) + ' and count(*)=1]'; + } else if (xpathSelector.includes('[')) { + xpathSelector = xpathSelector.replace(/\]$/, ' and count(*)=1]'); + } else { + const lastSlash = xpathSelector.lastIndexOf('/'); + if (lastSlash !== -1) { + const beforeTag = xpathSelector.substring(0, lastSlash + 1); + const tag = xpathSelector.substring(lastSlash + 1); + xpathSelector = beforeTag + tag + '[count(*)=1]'; + } else { + xpathSelector = xpathSelector + '[count(*)=1]'; + } + } + } + } + + const fields = window.getChildSelectors(xpathSelector); + + return { + fields: fields, + listSelector: xpathSelector, + listFallbackSelector: null, + error: Object.keys(fields).length === 0 ? 
'No valid fields could be auto-detected from the list items' : null + }; + } catch (error) { + console.error('Exception:', error); + return { + fields: {}, + error: error.message || 'Failed to auto-detect fields' + }; + } + }; + + const pathCache = new WeakMap(); + const descendantsCache = new WeakMap(); + const meaningfulCache = new WeakMap(); + const classCache = new Map(); + + /** + * Main entry point - returns detected fields for a list selector + */ + window.getChildSelectors = function (parentSelector) { + try { + const parentElements = evaluateXPath(parentSelector, document); + + if (parentElements.length === 0) { + console.error('No parent elements found!'); + return {}; + } + + const maxItems = 10; + const limitedParents = parentElements.slice(0, Math.min(maxItems, parentElements.length)); + + const allChildSelectors = []; + + for (let i = 0; i < limitedParents.length; i++) { + const parent = limitedParents[i]; + const otherListElements = limitedParents.filter((_, index) => index !== i); + + const selectors = generateOptimizedChildXPaths( + parent, + parentSelector, + otherListElements + ); + + allChildSelectors.push(...selectors); + } + + const childSelectors = Array.from(new Set(allChildSelectors)).sort() + + const fields = createFieldsFromSelectors( + childSelectors, + limitedParents, + parentSelector + ); + + return fields; + } catch (error) { + console.error('Exception:', error); + return {}; + } + }; + + /** + * Generate optimized XPath selectors for all meaningful children + */ + function generateOptimizedChildXPaths(parentElement, listSelector, otherListElements) { + const selectors = []; + const processedElements = new Set(); + + const allDescendants = getAllDescendantsIncludingShadow(parentElement); + + const batchSize = 25; + for (let i = 0; i < allDescendants.length; i += batchSize) { + const batch = allDescendants.slice(i, i + batchSize); + + for (const descendant of batch) { + if (processedElements.has(descendant)) continue; + 
processedElements.add(descendant); + + const xpath = buildOptimizedAbsoluteXPath( + descendant, + listSelector, + parentElement, + otherListElements + ); + + if (xpath.primary) { + selectors.push({ + primary: xpath.primary, + fallback: xpath.fallback, + element: descendant + }); + } + + if (selectors.length >= 250) { + break; + } + } + + if (selectors.length >= 250) { + break; + } + } + + return selectors; + } + + /** + * Get all meaningful descendants including shadow DOM + */ + function getAllDescendantsIncludingShadow(parentElement) { + if (descendantsCache.has(parentElement)) { + return descendantsCache.get(parentElement); + } + + const meaningfulDescendants = []; + const queue = [parentElement]; + const visited = new Set(); + visited.add(parentElement); + + const MAX_MEANINGFUL_ELEMENTS = 300; + const MAX_NODES_TO_CHECK = 1200; + const MAX_DEPTH = 20; + let nodesChecked = 0; + + const depths = [0]; + let queueIndex = 0; + + while (queueIndex < queue.length) { + const element = queue[queueIndex]; + const currentDepth = depths[queueIndex]; + queueIndex++; + nodesChecked++; + + if ( + nodesChecked > MAX_NODES_TO_CHECK || + meaningfulDescendants.length >= MAX_MEANINGFUL_ELEMENTS || + currentDepth > MAX_DEPTH + ) { + break; + } + + if (element !== parentElement && isMeaningfulElement(element)) { + meaningfulDescendants.push(element); + } + + if (currentDepth >= MAX_DEPTH) { + continue; + } + + // Process light DOM children + const children = element.children; + const childLimit = Math.min(children.length, 30); + for (let i = 0; i < childLimit; i++) { + const child = children[i]; + if (!visited.has(child)) { + visited.add(child); + queue.push(child); + depths.push(currentDepth + 1); + } + } + + // Process shadow DOM + if (element.shadowRoot && currentDepth < MAX_DEPTH - 1) { + const shadowChildren = element.shadowRoot.children; + const shadowLimit = Math.min(shadowChildren.length, 20); + for (let i = 0; i < shadowLimit; i++) { + const child = shadowChildren[i]; + if 
(!visited.has(child)) { + visited.add(child); + queue.push(child); + depths.push(currentDepth + 1); + } + } + } + } + + descendantsCache.set(parentElement, meaningfulDescendants); + return meaningfulDescendants; + } + + /** + * Check if element has meaningful content for extraction + */ + function isMeaningfulElement(element) { + if (meaningfulCache.has(element)) { + return meaningfulCache.get(element); + } + + const tagName = element.tagName.toLowerCase(); + + if (tagName === 'img' && element.hasAttribute('src')) { + meaningfulCache.set(element, true); + return true; + } + + if (tagName === 'a' && element.hasAttribute('href')) { + meaningfulCache.set(element, true); + return true; + } + + const text = (element.textContent || '').trim(); + const hasVisibleText = text.length > 0; + + if (hasVisibleText || element.querySelector('svg')) { + meaningfulCache.set(element, true); + return true; + } + + if (element.children.length > 0) { + meaningfulCache.set(element, false); + return false; + } + + meaningfulCache.set(element, false); + return false; + } + + /** + * Build optimized absolute XPath + */ + function buildOptimizedAbsoluteXPath(targetElement, listSelector, listElement, otherListElements) { + try { + let primary = null; + const pathFromList = getOptimizedStructuralPath( + targetElement, + listElement, + otherListElements + ); + + if (pathFromList) { + primary = listSelector + pathFromList; + } + + const fallback = generateMandatoryChildFallbackXPath(targetElement, listElement); + + return { primary, fallback }; + } catch (error) { + const fallback = generateMandatoryChildFallbackXPath(targetElement, listElement); + return { primary: null, fallback }; + } + } + + /** + * Get optimized structural path from element to root + */ + function getOptimizedStructuralPath(targetElement, rootElement, otherListElements) { + if (pathCache.has(targetElement)) { + return pathCache.get(targetElement); + } + + if (!elementContains(rootElement, targetElement) || targetElement 
/**
 * Build one XPath location step (`tag[...]`) for `element`.
 *
 * Preference order:
 *   1. classes returned by `getCommonClassesAcrossLists` (plus a sibling
 *      position when another same-tag element under `rootElement` carries
 *      all of those classes),
 *   2. a `role` / `type` attribute that `isAttributeCommonAcrossLists` accepts,
 *   3. the sibling position among same-tag siblings.
 * Steps 1 and 2 are skipped when `addPositionToAll` is true.
 *
 * String values are emitted as proper XPath 1.0 literals. XPath has no
 * backslash escapes, so a value containing `'` is wrapped in double quotes
 * (or assembled with `concat()` when it contains both quote kinds).
 *
 * @param element Element the step describes.
 * @param rootElement List item root used for the class-conflict scan (may be falsy).
 * @param addPositionToAll When true, always emit a purely positional step.
 * @param otherListElements Other list items used to decide which classes/attributes are shared.
 * @returns The XPath step, e.g. `div[contains(@class, 'title')]` or `span[2]`.
 */
function generateOptimizedStructuralStep(element, rootElement, addPositionToAll, otherListElements) {
  // Render `value` as an XPath 1.0 string literal.
  const toXPathLiteral = (value) => {
    const text = String(value);
    if (!text.includes("'")) return `'${text}'`;
    if (!text.includes('"')) return `"${text}"`;
    // Both quote kinds present: concat('a', "'", 'b"c')
    const pieces = text.split("'").map(part => `'${part}'`);
    return `concat(${pieces.join(`, "'", `)})`;
  };

  const tagName = element.tagName.toLowerCase();
  // Fall back to the shadow host when the element sits directly under a shadow root.
  const parent = element.parentElement ||
    ((element.getRootNode()).host);

  if (!parent) {
    return tagName;
  }

  const classes = getCommonClassesAcrossLists(element, otherListElements);
  if (classes.length > 0 && !addPositionToAll) {
    const classSelector = classes
      .map(cls => `contains(@class, ${toXPathLiteral(cls)})`)
      .join(' and ');

    const hasConflictingElement = rootElement
      ? queryElementsInScope(rootElement, tagName)
        .filter(el => el !== element)
        .some(el => classes.every(cls =>
          normalizeClasses(el.classList).split(' ').includes(cls)
        ))
      : false;

    if (!hasConflictingElement) {
      return `${tagName}[${classSelector}]`;
    }
    const position = getSiblingPosition(element, parent);
    return `${tagName}[${classSelector}][${position}]`;
  }

  if (!addPositionToAll) {
    const meaningfulAttrs = ['role', 'type'];
    for (const attrName of meaningfulAttrs) {
      if (!element.hasAttribute(attrName)) continue;

      // Compare the raw value: the other lists' attributes are read raw as well.
      const value = element.getAttribute(attrName);
      const isCommon = isAttributeCommonAcrossLists(
        element,
        attrName,
        value,
        otherListElements
      );
      if (isCommon) {
        return `${tagName}[@${attrName}=${toXPathLiteral(value)}]`;
      }
    }
  }

  // Reaching this point means either positions were requested for every step
  // or no shared class exists, so the positional form is the only option left.
  const position = getSiblingPosition(element, parent);
  return `${tagName}[${position}]`;
}
/**
 * Reduce a class list to its stable class names.
 *
 * Drops names that look generated: three or more consecutive digits, the
 * substrings `uuid`, `hash` or `id-` (case-insensitive), a trailing
 * `_<digits>`, and Angular's `_ngcontent-*`, `_nghost-*` and `ng-tns-c*-*`
 * markers. The survivors are sorted and joined with single spaces.
 *
 * NOTE(review): the `id-` alternative is a plain substring test, so names
 * such as `grid-item` are dropped as well.
 *
 * @param classList Any array-like/iterable of class name strings (e.g. `element.classList`).
 * @returns Space-separated, sorted stable class names ('' when none remain).
 */
function normalizeClasses(classList) {
  const dynamicToken = /\d{3,}|uuid|hash|id-|_\d+$/i;
  const angularTransition = /^ng-tns-c\d+-\d+$/;

  const stable = [];
  for (const cls of Array.from(classList)) {
    if (dynamicToken.test(cls)) continue;
    if (cls.startsWith('_ngcontent-')) continue;
    if (cls.startsWith('_nghost-')) continue;
    if (angularTransition.test(cls)) continue;
    stable.push(cls);
  }

  stable.sort();
  return stable.join(' ');
}
/**
 * Describe an element's location as child indices, outermost first.
 *
 * The walk follows `parentElement` until it runs out, so the path is relative
 * to the topmost ancestor reachable that way.
 *
 * @param element Element to locate.
 * @returns Array of indices into each ancestor's `children` (empty when there is no parent).
 */
function getElementPath(element) {
  const indices = [];
  for (let node = element; node && node.parentElement; node = node.parentElement) {
    const position = Array.from(node.parentElement.children).indexOf(node);
    indices.push(position);
  }
  // Collected innermost-first; callers expect outermost-first.
  return indices.reverse();
}

/**
 * Follow a child-index path downwards from `rootElement`.
 *
 * @param rootElement Element the path starts at.
 * @param path Child indices, outermost first.
 * @returns The element reached, or null when an index is past the end of a child list.
 */
function findCorrespondingElement(rootElement, path) {
  let node = rootElement;
  for (let step = 0; step < path.length; step++) {
    const kids = Array.from(node.children);
    const childIndex = path[step];
    if (childIndex >= kids.length) {
      return null;
    }
    node = kids[childIndex];
  }
  return node;
}

/**
 * One-based position of `element` among the children of `parent` that share
 * its tag name (the number XPath expects in `tag[n]`).
 *
 * @param element Element to locate.
 * @param parent Container whose `children` are scanned.
 * @returns The position, or 0 when `element` is not among `parent.children`.
 */
function getSiblingPosition(element, parent) {
  let position = 0;
  for (const sibling of Array.from(parent.children || [])) {
    if (sibling.tagName !== element.tagName) continue;
    position += 1;
    if (sibling === element) return position;
  }
  return 0;
}
/**
 * Build the XPath steps leading from `rootElement` down to `targetElement`,
 * preferring `data-mx-id` predicates and falling back to same-tag sibling
 * positions.
 *
 * The walk follows `parentElement` only. If it ends anywhere other than
 * `rootElement` (the target lives in a shadow tree, is detached, or is simply
 * not a descendant), no steps are returned: a partial path would be glued
 * onto the root selector by the caller and point at the wrong node.
 *
 * @param targetElement Element the path should end at.
 * @param rootElement Ancestor the path is relative to.
 * @returns Steps such as `div[1]` or `a[@data-mx-id='42']`, outermost first;
 *          empty when the target equals the root or cannot be reached from it.
 */
function getMandatoryFallbackPath(targetElement, rootElement) {
  const pathParts = [];
  let current = targetElement;

  while (current && current !== rootElement && current.parentElement) {
    const mxId = current.getAttribute('data-mx-id');
    const tagName = current.tagName.toLowerCase();

    if (mxId) {
      pathParts.unshift(`${tagName}[@data-mx-id='${mxId}']`);
    } else {
      const position = Array.from(current.parentElement.children)
        .filter(child => child.tagName === current.tagName)
        .indexOf(current) + 1;
      pathParts.unshift(`${tagName}[${position}]`);
    }

    current = current.parentElement;
  }

  // The loop also stops when the parent chain runs out before reaching the root.
  if (current !== rootElement) {
    return [];
  }

  return pathParts;
}

/**
 * Evaluate an XPath expression and return the matching elements.
 *
 * The owning document is resolved from the context node, so a Document, an
 * Element or a ShadowRoot can all be used as the starting point.
 *
 * @param xpath XPath expression to evaluate.
 * @param contextNode Node the expression is evaluated against.
 * @returns Matching element nodes in document order; an empty array when
 *          nothing matches or evaluation throws (e.g. malformed expression).
 */
function evaluateXPath(xpath, contextNode) {
  try {
    let doc;
    if (contextNode instanceof ShadowRoot) {
      doc = contextNode.host.ownerDocument;
    } else {
      // `ownerDocument` is null on a Document, which is then used directly.
      doc = contextNode.ownerDocument || contextNode;
    }

    const result = doc.evaluate(
      xpath,
      contextNode,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    const elements = [];
    for (let i = 0; i < result.snapshotLength; i++) {
      const node = result.snapshotItem(i);
      if (node && node.nodeType === Node.ELEMENT_NODE) {
        elements.push(node);
      }
    }

    return elements;
  } catch (error) {
    // Best effort by design: an unusable expression simply yields no matches.
    return [];
  }
}
/**
 * Extract the value a field would capture from `element` and describe how to
 * read it again.
 *
 * Without `forceAttribute`: images yield `src`; links yield `href` unless it
 * is empty, `#` or a `javascript:` URL, in which case their trimmed text is
 * used; everything else yields trimmed text. With `forceAttribute`, only
 * `'href'` and `'innerText'` produce data.
 *
 * @param element Element to read from.
 * @param selector Selector stored alongside the value.
 * @param forceAttribute Optional attribute to read instead of the tag default.
 * @returns `{ data, selectorObj: { selector, attribute, tag, isShadow } }`,
 *          or null when no non-empty value could be read.
 */
function createFieldData(element, selector, forceAttribute) {
  const tag = element.tagName.toLowerCase();
  const trimmedText = () => (element.textContent || '').trim();
  const attrOrEmpty = name => element.getAttribute(name) || '';

  // Resolve the [value, attribute-name] pair for this element.
  const pick = () => {
    if (forceAttribute) {
      switch (forceAttribute) {
        case 'href':
          return [attrOrEmpty('href'), forceAttribute];
        case 'innerText':
          return [trimmedText(), forceAttribute];
        default:
          return ['', forceAttribute];
      }
    }

    if (tag === 'img') {
      return [attrOrEmpty('src'), 'src'];
    }

    if (tag === 'a') {
      const href = attrOrEmpty('href');
      const text = trimmedText();
      const usableHref = href && href !== '#' && !href.startsWith('javascript:');
      if (usableHref) return [href, 'href'];
      if (text) return [text, 'innerText'];
      return ['', 'innerText'];
    }

    return [trimmedText(), 'innerText'];
  };

  const [data, attribute] = pick();
  if (!data) {
    return null;
  }

  return {
    data: data,
    selectorObj: {
      selector: selector,
      attribute: attribute,
      tag: tag.toUpperCase(),
      isShadow: element.getRootNode() instanceof ShadowRoot
    }
  };
}
/**
 * Turn field candidates into the `{ "Label N": {...} }` map stored in a
 * workflow, skipping repeats.
 *
 * A candidate is kept only when neither its content (trimmed, lower-cased)
 * nor its `selector::attribute` pair has been accepted before. Labels are
 * numbered from 1 in acceptance order.
 *
 * @param candidates Objects carrying `field.data` and `field.selectorObj`.
 * @returns Map from label to `{ selector, attribute, tag, isShadow }`.
 */
function removeDuplicateContentAndFormat(candidates) {
  const acceptedContent = new Set();
  const acceptedSelectors = new Set();
  const fields = {};

  candidates.forEach(({ field }) => {
    const { selector, attribute, tag, isShadow } = field.selectorObj;
    const contentKey = field.data.trim().toLowerCase();
    const selectorKey = `${selector}::${attribute}`;

    const isRepeat = acceptedContent.has(contentKey) || acceptedSelectors.has(selectorKey);
    if (isRepeat) return;

    acceptedContent.add(contentKey);
    acceptedSelectors.add(selectorKey);

    // Every accepted candidate adds exactly one content key, so the set size
    // doubles as the running label number.
    fields[`Label ${acceptedContent.size}`] = {
      selector: selector,
      attribute: attribute,
      tag: tag,
      isShadow: isShadow
    };
  });

  return fields;
}
/**
 * Report whether `text` is matched by at least one of the given patterns.
 *
 * @param text Text to test.
 * @param patterns Regular expressions, tried in order.
 * @returns true on the first match, false when none match (or the list is empty).
 */
function matchesAnyPattern(text, patterns) {
  for (const pattern of patterns) {
    if (pattern.test(text)) return true;
  }
  return false;
}

/**
 * Gather every element in the document that looks clickable: buttons, links,
 * `role="button"`, inline `onclick` handlers and `.btn` / `.button` classes.
 *
 * @returns Unique elements, ordered by the first selector that found them.
 */
function getClickableElements() {
  const unique = new Set();
  const clickableSelectors = ['button', 'a', '[role="button"]', '[onclick]', '.btn', '.button'];

  clickableSelectors.forEach(selector => {
    Array.from(document.querySelectorAll(selector)).forEach(match => unique.add(match));
  });

  return Array.from(unique);
}

/**
 * Report whether an element is rendered: not `display: none`, not
 * `visibility: hidden`, opacity other than `'0'`, and a non-zero box.
 *
 * @param element Element to inspect.
 * @returns true when every check passes.
 */
function isVisible(element) {
  const computed = window.getComputedStyle(element);

  if (computed.display === 'none') return false;
  if (computed.visibility === 'hidden') return false;
  if (computed.opacity === '0') return false;

  return element.offsetWidth > 0 && element.offsetHeight > 0;
}
/**
 * Generate a CSS selector for `input` that is unique under the configured root.
 *
 * Stores the merged configuration in the shared `config` and the query scope
 * in `rootDocument` before searching, then tries progressively cheaper search
 * modes (`Limit.All`, then `Limit.Two`, then `Limit.One`), each one being the
 * fallback of the previous. The winning path is shortened via `optimize`
 * when a shorter variant exists.
 *
 * @param input Element to describe.
 * @param options Optional overrides for the defaults below (root, name filters, limits).
 * @returns The selector string (`'html'` for the html element itself).
 * @throws Error when `input` is not an element node or no unique path is found.
 */
function finder(input, options) {
  const isElementNode = input.nodeType === Node.ELEMENT_NODE;
  if (!isElementNode) {
    throw new Error("Can't generate CSS selector for non-element node type.");
  }

  if (input.tagName.toLowerCase() === 'html') {
    return 'html';
  }

  const defaults = {
    root: iframeDoc.body,
    idName: (name) => true,
    className: (name) => true,
    tagName: (name) => true,
    attr: (name, value) => false,
    seedMinLength: 1,
    optimizedMinLength: 2,
    threshold: 900,
    maxNumberOfTries: 9000
  };

  config = Object.assign({}, defaults, options || {});
  rootDocument = findRootDocument(config.root, defaults);

  // Each cheaper mode is only attempted when the richer one bails out.
  const searchWithOne = () => bottomUpSearch(input, Limit.One);
  const searchWithTwo = () => bottomUpSearch(input, Limit.Two, searchWithOne);
  const found = bottomUpSearch(input, Limit.All, searchWithTwo);

  if (!found) {
    throw new Error('Selector was not found.');
  }

  const ranked = sort(optimize(found, input));
  const best = ranked.length > 0 ? ranked[0] : found;
  return selector(best);
}
/**
 * Walk from `input` up towards `config.root`, collecting candidate selector
 * nodes per ancestor level until some combination of them is unique.
 *
 * @param input Element the search starts at.
 * @param limit One of `Limit.All`, `Limit.Two`, `Limit.One`; controls how many
 *              candidates are kept per level.
 * @param fallback Optional function handed to `findUniquePath`.
 * @returns The path found by `findUniquePath`, or whatever it returns when
 *          nothing unique exists.
 */
function bottomUpSearch(input, limit, fallback) {
  let path = null;
  let stack = [];
  let current = input;
  let i = 0;

  // Stop once the walk has climbed past the configured root.
  while (current && current !== config.root.parentElement) {
    // Candidate kinds in order of preference: id, attributes, classes,
    // tag name, and finally the universal selector.
    let level = maybe(id(current)) ||
      maybe.apply(null, attr(current)) ||
      maybe.apply(null, classNames(current)) ||
      maybe(tagName(current)) ||
      [any()];

    const nth = index(current);

    if (limit === Limit.All) {
      // Keep every candidate and add a positional variant for each
      // candidate that `dispensableNth` accepts.
      if (nth) {
        level = level.concat(
          level.filter(dispensableNth).map(function (node) {
            return nthChild(node, nth);
          })
        );
      }
    } else if (limit === Limit.Two) {
      // Keep only the first candidate, plus its positional variant.
      level = level.slice(0, 1);
      if (nth) {
        level = level.concat(
          level.filter(dispensableNth).map(function (node) {
            return nthChild(node, nth);
          })
        );
      }
    } else if (limit === Limit.One) {
      // Keep a single candidate: the positional variant when one applies.
      const node = level[0];
      level = level.slice(0, 1);
      if (nth && dispensableNth(node)) {
        level = [nthChild(node, nth)];
      }
    }

    // Tag each candidate with its distance from `input`.
    for (let j = 0; j < level.length; j++) {
      level[j].level = i;
    }

    stack.push(level);

    if (stack.length >= config.seedMinLength) {
      path = findUniquePath(stack, fallback);
      if (path) {
        break;
      }
    }

    current = current.parentElement;
    i++;
  }

  // Final attempt with everything collected (covers walks shorter than seedMinLength).
  if (!path) {
    path = findUniquePath(stack, fallback);
  }

  return path;
}
/**
 * Build attribute-selector candidates (`[name="value"]`) for an element.
 *
 * Only attributes accepted by `config.attr` are used, and `data-mx-id` is
 * always excluded. For absolute `href` values the origin
 * (`protocol//host`) is removed from the value before it is written into the
 * selector.
 * NOTE(review): an `=` attribute selector compares against the literal
 * attribute text, so a stripped href only matches if the page's attribute is
 * itself origin-relative; confirm this is intended.
 *
 * The value is wrapped in double quotes, so it is escaped in cssesc's
 * `quotes: 'double'` mode; the default single-quote mode would leave a `"`
 * inside the value unescaped and produce an invalid selector.
 *
 * @param input Element whose attributes are inspected.
 * @returns Candidate nodes `{ name, penalty: 0.5 }`, one per accepted attribute.
 */
function attr(input) {
  const attrs = Array.from(input.attributes).filter(function (attr) {
    return config.attr(attr.name, attr.value) && attr.name !== 'data-mx-id';
  });

  return attrs.map(function (attr) {
    let attrValue = attr.value;

    if (attr.name === 'href' && attr.value.includes('://')) {
      try {
        const url = new URL(attr.value);
        const siteOrigin = url.protocol + '//' + url.host;
        attrValue = attr.value.replace(siteOrigin, '');
      } catch (e) {
        // Keep original if URL parsing fails
      }
    }

    const escapedName = cssesc(attr.name, { isIdentifier: true });
    const escapedValue = cssesc(attrValue, { quotes: 'double' });

    return {
      name: '[' + escapedName + '="' + escapedValue + '"]',
      penalty: 0.5
    };
  });
}
/**
 * Enumerate every path that takes exactly one candidate from each level of
 * `stack`, in order. The first level varies slowest.
 *
 * @param stack Array of levels, each an array of candidate nodes.
 * @param path Optional prefix every combination starts with (defaults to empty).
 * @returns Array of paths; `[prefix]` for an empty stack, `[]` when any level is empty.
 */
function combinations(stack, path) {
  let partials = [path || []];

  // Extend every partial path by each candidate of the next level.
  for (const level of stack) {
    const extended = [];
    for (const prefix of partials) {
      for (let k = 0; k < level.length; k++) {
        extended.push(prefix.concat(level[k]));
      }
    }
    partials = extended;
  }

  return partials;
}
// ===== CSSESC UTILITY =====
// Characters that may be escaped with a single backslash when escaping everything.
const regexAnySingleEscape = /[ -,\.\/:-@\[-\^`\{-~]/;
// Characters that need a backslash inside a CSS identifier.
const regexSingleEscape = /[ -,\.\/:-@\[\]\^`\{-~]/;
// A hex escape followed by its terminating space where the space is not needed.
const regexExcessiveSpaces = /(^|\\+)?(\\[A-F0-9]{1,6})\x20(?![a-fA-F0-9\x20])/g;

const defaultCssEscOptions = {
  escapeEverything: false,
  isIdentifier: false,
  quotes: 'single',
  wrap: false
};

/**
 * Escape a string for use in CSS, either as an identifier or as the contents
 * of a quoted string.
 *
 * @param string Text to escape.
 * @param opt Optional settings:
 *   - `isIdentifier`: escape for use as an identifier (class, id, attribute name);
 *   - `quotes`: `'single'` (default) or `'double'`, the quote character that
 *     gets escaped in string mode and is used by `wrap`;
 *   - `wrap`: in string mode, surround the result with the quote character;
 *   - `escapeEverything`: escape every printable ASCII character as well.
 * @returns The escaped text.
 */
function cssesc(string, opt) {
  const options = Object.assign({}, defaultCssEscOptions, opt || {});
  if (options.quotes != 'single' && options.quotes != 'double') {
    options.quotes = 'single';
  }
  const quote = options.quotes == 'double' ? '"' : "'";
  const isIdentifier = options.isIdentifier;

  const firstChar = string.charAt(0);
  let output = '';
  let counter = 0;
  const length = string.length;

  while (counter < length) {
    const character = string.charAt(counter++);
    let codePoint = character.charCodeAt(0);
    let value = undefined;

    if (codePoint < 0x20 || codePoint > 0x7e) {
      // Outside printable ASCII: combine surrogate pairs, then hex-escape.
      if (codePoint >= 0xd800 && codePoint <= 0xdbff && counter < length) {
        const extra = string.charCodeAt(counter++);
        if ((extra & 0xfc00) == 0xdc00) {
          codePoint = ((codePoint & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
        } else {
          // Unpaired high surrogate: re-read the next unit on its own.
          counter--;
        }
      }
      value = '\\' + codePoint.toString(16).toUpperCase() + ' ';
    } else {
      if (options.escapeEverything) {
        if (regexAnySingleEscape.test(character)) {
          value = '\\' + character;
        } else {
          value = '\\' + codePoint.toString(16).toUpperCase() + ' ';
        }
      } else if (/[\t\n\f\r\x0B]/.test(character)) {
        value = '\\' + codePoint.toString(16).toUpperCase() + ' ';
      } else if (
        character == '\\' ||
        (!isIdentifier && ((character == '"' && quote == character) || (character == "'" && quote == character))) ||
        (isIdentifier && regexSingleEscape.test(character))
      ) {
        value = '\\' + character;
      } else {
        value = character;
      }
    }
    output += value;
  }

  if (isIdentifier) {
    // Identifiers may not start with a digit or with "-" followed by "-" or a digit.
    if (/^-[-\d]/.test(output)) {
      output = '\\-' + output.slice(1);
    } else if (/\d/.test(firstChar)) {
      output = '\\3' + firstChar + ' ' + output.slice(1);
    }
  }

  // Drop the space after a hex escape wherever it is not required, leaving
  // sequences preceded by an odd number of backslashes untouched.
  output = output.replace(regexExcessiveSpaces, function ($0, $1, $2) {
    if ($1 && $1.length % 2) {
      return $0;
    }
    return ($1 || '') + $2;
  });

  if (!isIdentifier && options.wrap) {
    return quote + output + quote;
  }
  return output;
}

// ===== ELEMENT DETECTION =====
/**
 * Pick the element a user would consider to be "at" the given point.
 *
 * Resolution order:
 *   1. If any hit element has `role="dialog"`, return the most deeply nested
 *      hit element inside (or equal to) that dialog.
 *   2. Among the three topmost hits, return a `fixed`/`absolute` element with
 *      z-index above 50, or an `<svg>` element found in the first two hits.
 *   3. Otherwise return the hit with the longest ancestor chain.
 * For cases 2 and 3 the result is then followed into shadow roots (up to four
 * levels) using each root's own `elementFromPoint`.
 *
 * @param x Horizontal viewport coordinate.
 * @param y Vertical viewport coordinate.
 * @returns The chosen element, or null when nothing is at the point.
 */
function getDeepestElementFromPoint(x, y) {
  let elements = iframeDoc.elementsFromPoint(x, y);
  if (!elements || elements.length === 0) return null;

  // Check for dialog elements first
  const dialogElement = elements.find(function (el) {
    return el.getAttribute('role') === 'dialog';
  });

  if (dialogElement) {
    const dialogElements = elements.filter(function (el) {
      return el === dialogElement || dialogElement.contains(el);
    });

    const findDeepestInDialog = function (elems) {
      if (!elems.length) return null;
      if (elems.length === 1) return elems[0];

      let deepestElement = elems[0];
      let maxDepth = 0;

      for (let i = 0; i < elems.length; i++) {
        // Depth is measured relative to the dialog's parent.
        let depth = 0;
        let current = elems[i];

        while (current && current.parentElement && current !== dialogElement.parentElement) {
          depth++;
          current = current.parentElement;
        }

        if (depth > maxDepth) {
          maxDepth = depth;
          deepestElement = elems[i];
        }
      }

      return deepestElement;
    };

    return findDeepestInDialog(dialogElements);
  }

  // Standard deepest element detection
  const findDeepestElement = function (elems) {
    if (!elems.length) return null;
    if (elems.length === 1) return elems[0];

    // Check for positioned overlays
    for (let i = 0; i < Math.min(3, elems.length); i++) {
      const element = elems[i];
      const style = window.getComputedStyle(element);
      const zIndex = parseInt(style.zIndex, 10) || 0;

      if ((style.position === 'fixed' || style.position === 'absolute') && zIndex > 50) {
        return element;
      }

      // tagName keeps its original case for SVG elements ("svg"), so compare
      // case-insensitively rather than against the upper-case HTML form.
      if (element.tagName.toLowerCase() === 'svg' && i < 2) {
        return element;
      }
    }

    // Depth-based fallback
    let deepestElement = elems[0];
    let maxDepth = 0;

    for (let i = 0; i < elems.length; i++) {
      let depth = 0;
      let current = elems[i];

      while (current) {
        depth++;
        if (current.parentElement) {
          current = current.parentElement;
        } else {
          break;
        }
      }

      if (depth > maxDepth) {
        maxDepth = depth;
        deepestElement = elems[i];
      }
    }

    return deepestElement;
  };

  let deepestElement = findDeepestElement(elements);
  if (!deepestElement) return null;

  // Handle shadow DOM
  const traverseShadowDOM = function (element) {
    let current = element;
    let shadowRoot = current.shadowRoot;
    let deepest = current;
    let depth = 0;
    const MAX_SHADOW_DEPTH = 4;

    while (shadowRoot && depth < MAX_SHADOW_DEPTH) {
      const shadowElement = shadowRoot.elementFromPoint(x, y);
      // Stop when the shadow tree has nothing at the point or reports the host again.
      if (!shadowElement || shadowElement === current) break;

      deepest = shadowElement;
      current = shadowElement;
      shadowRoot = current.shadowRoot;
      depth++;
    }

    return deepest;
  };

  deepestElement = traverseShadowDOM(deepestElement);
  return deepestElement;
}
finder(element, { + idName: function () { return false; }, + attr: genValidAttributeFilter(element, attributes) + }) + : null; + } catch (e) { } + + return selector; + } + + function isCharacterNumber(char) { + return char && char.length === 1 && /[0-9]/.test(char); + } + + function generateMandatoryCSSFallback(element) { + const mxId = Math.floor(Math.random() * 10000).toString(); + element.setAttribute('data-mx-id', mxId); + return element.tagName.toLowerCase() + '[data-mx-id="' + mxId + '"]'; + } + + function genSelectors(element) { + if (element == null) { + return null; + } + + const href = element.getAttribute('href'); + + let generalSelector = null; + try { + generalSelector = finder(element); + } catch (e) { } + + let attrSelector = null; + try { + attrSelector = finder(element, { + attr: function () { return true; } + }); + } catch (e) { } + + const relSelector = genSelectorForAttributes(element, ['rel']); + const hrefSelector = genSelectorForAttributes(element, ['href']); + const formSelector = genSelectorForAttributes(element, ['name', 'placeholder', 'for']); + const accessibilitySelector = genSelectorForAttributes(element, ['aria-label', 'alt', 'title']); + const testIdSelector = genSelectorForAttributes(element, [ + 'data-testid', 'data-test-id', 'data-testing', + 'data-test', 'data-qa', 'data-cy' + ]); + + let idSelector = null; + try { + const elementId = element.getAttribute('id'); + idSelector = isAttributesDefined(element, ['id']) && !isCharacterNumber(elementId ? elementId[0] : '') + ? 
finder(element, { + attr: function (name) { return name === 'id'; } + }) + : null; + } catch (e) { } + + return { + id: idSelector, + generalSelector: generalSelector, + attrSelector: attrSelector, + testIdSelector: testIdSelector, + text: element.innerText, + href: href || undefined, + hrefSelector: hrefSelector, + accessibilitySelector: accessibilitySelector, + formSelector: formSelector, + relSelector: relSelector, + iframeSelector: null, + shadowSelector: null + }; + } + + // Main execution + const hoveredElement = getDeepestElementFromPoint(coordinates.x, coordinates.y); + + if (hoveredElement != null) { + const parentElement = hoveredElement.parentElement; + const element = (parentElement && parentElement.tagName === 'A') ? parentElement : hoveredElement; + + const generatedSelectors = genSelectors(element); + + return { + primary: generatedSelectors + }; + } + } catch (e) { + } + + return { primary: null }; + } + + + /** + * Check if element is near the list container + */ + function isNearList(element) { + try { + const listRect = listContainer.getBoundingClientRect(); + const elementRect = element.getBoundingClientRect(); + + if (elementRect.top >= listRect.bottom && elementRect.top <= listRect.bottom + 500) { + return true; + } + + if (elementRect.bottom <= listRect.top && elementRect.bottom >= listRect.top - 500) { + return true; + } + + const verticalOverlap = !(elementRect.bottom < listRect.top || elementRect.top > listRect.bottom); + if (verticalOverlap) { + const horizontalDistance = Math.min( + Math.abs(elementRect.left - listRect.right), + Math.abs(elementRect.right - listRect.left) + ); + if (horizontalDistance < 200) { + return true; + } + } + + return false; + } catch (error) { + return false; + } + } + + const clickableElements = getClickableElements(); + + let nextButton = null; + let nextButtonScore = 0; + const nextButtonCandidates = []; + + for (const element of clickableElements) { + if (!isVisible(element)) continue; + + const text = 
(element.textContent || '').trim(); + const ariaLabel = element.getAttribute('aria-label') || ''; + const title = element.getAttribute('title') || ''; + const combinedText = `${text} ${ariaLabel} ${title}`; + + let score = 0; + const reasons = []; + + if (matchesAnyPattern(combinedText, nextButtonPatterns)) { + score += 10; + reasons.push('text match (+10)'); + } + + if (isNearList(element)) { + score += 5; + reasons.push('near list (+5)'); + } + + if (element.tagName === 'BUTTON') { + score += 2; + reasons.push('button tag (+2)'); + } + + const className = element.className || ''; + if (/pagination|next|forward/i.test(className)) { + score += 3; + reasons.push('pagination class (+3)'); + } + + if (score > 0) { + nextButtonCandidates.push({ + element: element, + score: score, + text: text.substring(0, 50), + ariaLabel: ariaLabel, + tag: element.tagName, + className: className, + reasons: reasons + }); + } + + if (score > nextButtonScore) { + nextButtonScore = score; + nextButton = element; + } + } + + let loadMoreButton = null; + let loadMoreScore = 0; + + for (const element of clickableElements) { + if (!isVisible(element)) continue; + + const text = (element.textContent || '').trim(); + const ariaLabel = element.getAttribute('aria-label') || ''; + const title = element.getAttribute('title') || ''; + const combinedText = `${text} ${ariaLabel} ${title}`; + + let score = 0; + + if (matchesAnyPattern(combinedText, loadMorePatterns)) { + score += 10; + } + + if (isNearList(element)) { + score += 5; + } + + if (element.tagName === 'BUTTON') { + score += 2; + } + + if (score > loadMoreScore) { + loadMoreScore = score; + loadMoreButton = element; + } + } + + let prevButton = null; + let prevButtonScore = 0; + + for (const element of clickableElements) { + if (!isVisible(element)) continue; + + const text = (element.textContent || '').trim(); + const ariaLabel = element.getAttribute('aria-label') || ''; + const title = element.getAttribute('title') || ''; + const 
combinedText = `${text} ${ariaLabel} ${title}`; + + let score = 0; + + if (matchesAnyPattern(combinedText, prevButtonPatterns)) { + score += 10; + } + + if (isNearList(element)) { + score += 5; + } + + if (score > prevButtonScore) { + prevButtonScore = score; + prevButton = element; + } + } + + function detectInfiniteScrollScore() { + try { + const debugInfo = { + indicators: [], + score: 0, + threshold: 5 + }; + + const initialItemCount = listElements.length; + const initialHeight = document.documentElement.scrollHeight; + const viewportHeight = window.innerHeight; + const currentScrollY = window.scrollY; + + if (initialHeight <= viewportHeight) { + return 0; + } + + const loadingIndicators = [ + '[class*="loading"]', + '[class*="spinner"]', + '[class*="skeleton"]', + '[aria-busy="true"]', + '[data-loading="true"]', + '.loader', + '.load-more-spinner', + '[class*="load"]', + '[id*="loading"]', + '[id*="spinner"]' + ]; + + for (const selector of loadingIndicators) { + if (document.querySelector(selector)) { + debugInfo.score += 3; + debugInfo.indicators.push(`Loading indicator: ${selector} (+3)`); + break; + } + } + + const sentinelPatterns = [ + '[class*="sentinel"]', + '[class*="trigger"]', + '[data-infinite]', + '[data-scroll-trigger]', + '#infinite-scroll-trigger', + '[class*="infinite"]', + '[id*="infinite"]' + ]; + + for (const selector of sentinelPatterns) { + if (document.querySelector(selector)) { + debugInfo.score += 4; + debugInfo.indicators.push(`Sentinel element: ${selector} (+4)`); + break; + } + } + + const scrollToTopPatterns = [ + '[class*="scroll"][class*="top"]', + '[aria-label*="scroll to top"]', + '[title*="back to top"]', + '.back-to-top', + '#back-to-top', + '[class*="scrolltop"]', + '[class*="backtotop"]', + 'button[class*="top"]', + 'a[href="#top"]', + 'a[href="#"]' + ]; + + for (const selector of scrollToTopPatterns) { + const element = document.querySelector(selector); + if (element && isVisible(element)) { + debugInfo.score += 2; + 
debugInfo.indicators.push('Scroll-to-top button (+2)'); + break; + } + } + + if (initialHeight > viewportHeight * 3) { + debugInfo.score += 3; + debugInfo.indicators.push(`Very tall page (${(initialHeight / viewportHeight).toFixed(1)}x viewport) (+3)`); + } else if (initialHeight > viewportHeight * 2) { + debugInfo.score += 2; + debugInfo.indicators.push(`Tall page (${(initialHeight / viewportHeight).toFixed(1)}x viewport) (+2)`); + } + + if (initialItemCount >= 20) { + debugInfo.score += 2; + debugInfo.indicators.push(`Many list items (${initialItemCount}) (+2)`); + } else if (initialItemCount >= 10) { + debugInfo.score += 1; + debugInfo.indicators.push(`Good number of list items (${initialItemCount}) (+1)`); + } + + const infiniteScrollLibraries = [ + '.infinite-scroll', + '[data-infinite-scroll]', + '[data-flickity]', + '[data-slick]', + '.masonry', + '[data-masonry]', + '[class*="infinite-scroll"]', + '[class*="lazy-load"]', + '[data-lazy]' + ]; + + for (const selector of infiniteScrollLibraries) { + if (document.querySelector(selector)) { + debugInfo.score += 4; + debugInfo.indicators.push(`Infinite scroll library: ${selector} (+4)`); + break; + } + } + + const lastListItem = listElements[listElements.length - 1]; + if (lastListItem) { + const lastItemRect = lastListItem.getBoundingClientRect(); + const lastItemY = lastItemRect.bottom + currentScrollY; + const viewportBottom = currentScrollY + viewportHeight; + + if (lastItemY > viewportBottom + viewportHeight) { + debugInfo.score += 3; + debugInfo.indicators.push('List extends far below viewport (+3)'); + } else if (lastItemY > viewportBottom) { + debugInfo.score += 2; + debugInfo.indicators.push('List extends below viewport (+2)'); + } + } + + const hiddenLoadMore = document.querySelectorAll('[class*="load"], [class*="more"]'); + for (let i = 0; i < hiddenLoadMore.length; i++) { + const el = hiddenLoadMore[i]; + const style = window.getComputedStyle(el); + if (style.opacity === '0' || style.visibility === 
'hidden') { + debugInfo.score += 2; + debugInfo.indicators.push('Hidden load trigger element (+2)'); + break; + } + } + + const paginationControls = document.querySelectorAll('[class*="pagination"], [class*="pager"]'); + if (paginationControls.length === 0) { + debugInfo.score += 1; + debugInfo.indicators.push('No pagination controls found (+1)'); + } + + return debugInfo.score; + } catch (error) { + return 0; + } + } + + const infiniteScrollScore = (options && options.disableScrollDetection) + ? 0 + : detectInfiniteScrollScore(); + const hasStrongInfiniteScrollSignals = infiniteScrollScore >= 8; + const hasMediumInfiniteScrollSignals = infiniteScrollScore >= 5 && infiniteScrollScore < 8; + + if (loadMoreButton && loadMoreScore >= 15) { + const selector = generatePaginationSelector(loadMoreButton); + return { + type: 'clickLoadMore', + selector: selector, + confidence: 'high' + }; + } + + if (nextButton && nextButtonScore >= 15) { + const selector = generatePaginationSelector(nextButton); + return { + type: 'clickNext', + selector: selector, + confidence: 'high' + }; + } + + if (hasStrongInfiniteScrollSignals) { + const confidence = infiniteScrollScore >= 12 ? 'high' : infiniteScrollScore >= 10 ? 'medium' : 'low'; + return { + type: 'scrollDown', + selector: null, + confidence: confidence + }; + } + + if (loadMoreButton && loadMoreScore >= 10) { + const selector = generatePaginationSelector(loadMoreButton); + const confidence = 'medium'; + return { + type: 'clickLoadMore', + selector: selector, + confidence: confidence + }; + } + + if (nextButton && nextButtonScore >= 10) { + const selector = generatePaginationSelector(nextButton); + const confidence = 'medium'; + return { + type: 'clickNext', + selector: selector, + confidence: confidence + }; + } + + if (hasMediumInfiniteScrollSignals) { + const confidence = infiniteScrollScore >= 7 ? 
/**
 * Reduce a class list to its stable part: drop class names that look
 * generated, sort the rest and join them with single spaces.
 *
 * Dropped are names that
 *  - contain 3+ consecutive digits, "uuid", "hash" or "id-", or end in
 *    "_<digits>" (case-insensitive),
 *  - start with "_ngcontent-" or "_nghost-",
 *  - look like "ng-tns-c<digits>-<digits>".
 *
 * NOTE(review): the "id-" alternative also matches inside ordinary names
 * such as "grid-item"; confirm whether that is intended.
 *
 * @param classList  iterable of class names (DOMTokenList or array)
 * @returns {string} sorted, space-separated stable class names
 */
const normalizeClasses = (classList) => {
  const volatileName = /\d{3,}|uuid|hash|id-|_\d+$/i;
  const angularTransitionName = /^ng-tns-c\d+-\d+$/;

  const stable = [];
  for (const cls of Array.from(classList)) {
    if (volatileName.test(cls)) continue;
    if (cls.startsWith('_ngcontent-') || cls.startsWith('_nghost-')) continue;
    if (angularTransitionName.test(cls)) continue;
    stable.push(cls);
  }

  stable.sort();
  return stable.join(' ');
};
+ if (element.nodeType !== Node.ELEMENT_NODE) return null; + + const tagName = element.tagName.toLowerCase(); + const isCustomElement = tagName.includes('-'); + + const standardExcludeSelectors = ['script', 'style', 'meta', 'link', 'title', 'head']; + if (!isCustomElement && standardExcludeSelectors.includes(tagName)) { + return null; + } + + const children = Array.from(element.children); + let childrenStructureString; + + if (tagName === 'table') { + const thead = element.querySelector('thead'); + const representativeRow = thead ? thead.querySelector('tr') : element.querySelector('tr'); + + if (representativeRow) { + const structure = Array.from(representativeRow.children).map(child => ({ + tag: child.tagName.toLowerCase(), + classes: normalizeClasses(child.classList), + })); + childrenStructureString = JSON.stringify(structure); + } else { + childrenStructureString = JSON.stringify([]); + } + } else if (tagName === 'tr') { + const structure = children.map((child) => ({ + tag: child.tagName.toLowerCase(), + classes: normalizeClasses(child.classList), + })); + childrenStructureString = JSON.stringify(structure); + } else { + const structure = children.map((child) => ({ + tag: child.tagName.toLowerCase(), + classes: normalizeClasses(child.classList), + hasText: (child.textContent ?? 
/**
 * Compare two structural fingerprints and return a similarity in [0, 1].
 *
 * Weights (36 points in total):
 *   10  same tag name (a different tag short-circuits to 0)
 *    8  normalized classes: full credit when equal, otherwise the share
 *       of common classes relative to the longer list
 *    8  children: 8 when the structure string is equal, 4 when only the
 *       child count is equal
 *    5  attributes: same rule as classes, split on '|'
 *    2  depth: 2 when within 1 level, 1 when within 2 levels
 *    3  text traits: one point each for equal hasText, text length within
 *       40, and equal link AND image counts
 *
 * @param fp1  fingerprint object, or a falsy value
 * @param fp2  fingerprint object, or a falsy value
 * @returns {number} earned points divided by 36; 0 when either input is
 *                   missing or the tags differ
 */
const calculateSimilarity = (fp1, fp2) => {
  if (!fp1 || !fp2) return 0;
  if (fp1.tagName !== fp2.tagName) return 0;

  const TOTAL_POINTS = 10 + 8 + 8 + 5 + 2 + 3;

  // Share of tokens of `left` that also occur in `right`, relative to the
  // longer token list; 0 when either side has no tokens.
  const sharedShare = (left, right, separator) => {
    const leftTokens = left.split(separator).filter(Boolean);
    const rightTokens = right.split(separator).filter(Boolean);
    if (leftTokens.length === 0 || rightTokens.length === 0) return 0;
    const common = leftTokens.filter((token) => rightTokens.includes(token));
    return common.length / Math.max(leftTokens.length, rightTokens.length);
  };

  let earned = 10; // tag name already known to match

  if (fp1.normalizedClasses === fp2.normalizedClasses) {
    earned += 8;
  } else if (fp1.normalizedClasses && fp2.normalizedClasses) {
    earned += sharedShare(fp1.normalizedClasses, fp2.normalizedClasses, ' ') * 8;
  }

  if (fp1.childrenStructure === fp2.childrenStructure) {
    earned += 8;
  } else if (fp1.childrenCount === fp2.childrenCount) {
    earned += 4;
  }

  if (fp1.attributes === fp2.attributes) {
    earned += 5;
  } else if (fp1.attributes && fp2.attributes) {
    earned += sharedShare(fp1.attributes, fp2.attributes, '|') * 5;
  }

  const depthGap = Math.abs(fp1.depth - fp2.depth);
  if (depthGap <= 1) {
    earned += 2;
  } else if (depthGap <= 2) {
    earned += 1;
  }

  const textA = fp1.textCharacteristics;
  const textB = fp2.textCharacteristics;
  if (textA.hasText === textB.hasText) earned += 1;
  if (Math.abs(textA.textLength - textB.textLength) <= 40) earned += 1;
  if (textA.hasLinks === textB.hasLinks && textA.hasImages === textB.hasImages) earned += 1;

  return earned / TOTAL_POINTS;
};
allElements.push(element); + + if (element.shadowRoot) { + traverseContainer(element.shadowRoot); + } + } + }); + } catch (error) { + console.warn('Error traversing container:', error); + } + }; + + traverseContainer(document); + return allElements; + }; + + const allElements = getAllVisibleElements(); + const processedInTables = new Set(); + const elementGroups = new Map(); + const groupedElements = new Set(); + + // Group table rows + const tables = allElements.filter(el => el.tagName === 'TABLE'); + tables.forEach(table => { + const rows = Array.from(table.querySelectorAll('tbody > tr')).filter(row => { + const parent = row.parentElement; + if (!parent || !table.contains(parent)) return false; + + const rect = row.getBoundingClientRect(); + return rect.width > 0 && rect.height > 0; + }); + + if (rows.length >= 2) { + const representativeFingerprint = getStructuralFingerprint(rows[0]); + if (!representativeFingerprint) return; + + const group = { + elements: rows, + fingerprint: representativeFingerprint, + representative: rows[0], + }; + + rows.forEach(row => { + elementGroups.set(row, group); + groupedElements.add(row); + processedInTables.add(row); + }); + } + }); + + // Group other elements + const remainingElements = allElements.filter(el => !processedInTables.has(el)); + const elementFingerprints = new Map(); + remainingElements.forEach((element) => { + const fingerprint = getStructuralFingerprint(element); + if (fingerprint) { + elementFingerprints.set(element, fingerprint); + } + }); + + const processedElements = new Set(); + const similarityThreshold = 0.7; + const minGroupSize = 2; + const maxParentLevels = 5; + + elementFingerprints.forEach((fingerprint, element) => { + if (processedElements.has(element)) return; + + const currentGroup = [element]; + processedElements.add(element); + + elementFingerprints.forEach((otherFingerprint, otherElement) => { + if (processedElements.has(otherElement)) return; + + const similarity = 
calculateSimilarity(fingerprint, otherFingerprint); + if (similarity >= similarityThreshold) { + currentGroup.push(otherElement); + processedElements.add(otherElement); + } + }); + + if (currentGroup.length >= minGroupSize && hasAnyMeaningfulChildren(element)) { + let grouped = false; + + for (let level = 1; level <= maxParentLevels && !grouped; level++) { + let ancestor = currentGroup[0]; + for (let i = 0; i < level && ancestor; i++) { + ancestor = ancestor.parentElement; + } + + if (!ancestor) break; + + const allShareAncestor = currentGroup.every(el => { + let elAncestor = el; + for (let i = 0; i < level && elAncestor; i++) { + elAncestor = elAncestor.parentElement; + } + return elAncestor === ancestor; + }); + + if (allShareAncestor) { + const group = { + elements: currentGroup, + fingerprint, + representative: element, + }; + currentGroup.forEach((el) => { + elementGroups.set(el, group); + groupedElements.add(el); + }); + grouped = true; + } + } + + if (!grouped) { + currentGroup.forEach((el, idx) => { + if (idx > 0) processedElements.delete(el); + }); + } + } + }); + + // Convert to serializable format with XPath + const uniqueGroups = new Map(); + elementGroups.forEach((group) => { + const signature = group.fingerprint.signature; + if (!uniqueGroups.has(signature)) { + const tagName = group.fingerprint.tagName; + const classes = group.fingerprint.normalizedClasses.split(' ').filter(Boolean); + + let xpath = `//${tagName}`; + if (classes.length > 0) { + const classConditions = classes.map(cls => `contains(@class, '${cls}')`).join(' and '); + xpath += `[${classConditions}]`; + } + + // Get sample innerText from first 3 elements + const sampleTexts = group.elements.slice(0, 3).map((el) => { + return (el.textContent || '').trim().substring(0, 200); + }); + + // Get sample HTML structure + const sampleHTML = group.representative.outerHTML.substring(0, 500); + + uniqueGroups.set(signature, { + fingerprint: group.fingerprint, + count: group.elements.length, + 
/**
 * Initialize with an existing Page instance and navigate to URL.
 *
 * Navigation first waits for `networkidle`; if that attempt throws for any
 * reason, the failure is logged and the navigation is retried once waiting
 * only for `domcontentloaded`. A failure of the retry propagates to the
 * caller.
 *
 * Note: `this.page` is assigned before navigating, so it stays set even
 * when both attempts fail.
 *
 * @param page Page instance from RemoteBrowser
 * @param url URL to navigate to
 */
async initialize(page: Page, url: string): Promise<void> {
  const NAVIGATION_TIMEOUT_MS = 100000;

  this.page = page;
  try {
    await page.goto(url, {
      waitUntil: "networkidle",
      timeout: NAVIGATION_TIMEOUT_MS,
    });
  } catch (err: unknown) {
    // Previously swallowed silently; keep the fallback but leave a trace.
    const reason = err instanceof Error ? err.message : String(err);
    logger.warn(`Navigation to ${url} did not reach networkidle (${reason}); retrying with domcontentloaded`);
    await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: NAVIGATION_TIMEOUT_MS,
    });
  }
  logger.info(`Navigated to ${url} using RemoteBrowser page`);
}
/**
 * Validate and enrich multiple selectors.
 *
 * Each entry of `fields` is either a bare selector string or a
 * `SelectorInput`; bare strings are wrapped as `{ selector }`. Fields are
 * validated one after another through `validateSelector`, in the object's
 * own key order, so the error list follows the caller's field order.
 *
 * @param fields  field name -> selector string or selector input
 * @returns `{ valid: true, enriched }` when every field validated, otherwise
 *          `{ valid: false, errors }` with one `Field "<name>": <reason>`
 *          entry per failing field (no partial `enriched` is returned)
 */
async validateSchemaFields(
  fields: Record<string, string | SelectorInput>
): Promise<{ valid: boolean; enriched?: Record<string, EnrichedSelector>; errors?: string[] }> {
  const enriched: Record<string, EnrichedSelector> = {};
  const errors: string[] = [];

  for (const [fieldName, fieldInput] of Object.entries(fields)) {
    const input: SelectorInput =
      typeof fieldInput === 'string' ? { selector: fieldInput } : fieldInput;

    const result = await this.validateSelector(input);

    if (result.valid && result.enriched) {
      enriched[fieldName] = result.enriched;
    } else {
      errors.push(`Field "${fieldName}": ${result.error}`);
    }
  }

  return errors.length > 0
    ? { valid: false, errors }
    : { valid: true, enriched };
}
/**
 * Detect input type for a given selector.
 *
 * Selectors starting with `//` or `(//` are treated as XPath, everything
 * else is passed to the locator unchanged. Only the first match is inspected.
 *
 * @param selector  CSS or XPath selector of the form control
 * @returns the `type` of an `<input>` (falling back to `'text'`),
 *          `'textarea'`, `'select'`, or `'text'` for any other element
 * @throws Error `Browser not initialized` when no page is attached
 * @throws Error `Failed to detect input type: <reason>` when the selector
 *         matches nothing or the lookup/evaluation fails
 */
async detectInputType(selector: string): Promise<string> {
  if (!this.page) {
    throw new Error('Browser not initialized');
  }

  try {
    const isXPath = selector.startsWith('//') || selector.startsWith('(//');

    // Creating a locator is synchronous; only count()/evaluate() hit the page.
    const element = this.page.locator(isXPath ? `xpath=${selector}` : selector).first();

    const count = await element.count();
    if (count === 0) {
      throw new Error(`Selector "${selector}" did not match any elements`);
    }

    return await element.evaluate((el) => {
      if (el instanceof HTMLInputElement) {
        return el.type || 'text';
      }
      if (el instanceof HTMLTextAreaElement) {
        return 'textarea';
      }
      if (el instanceof HTMLSelectElement) {
        return 'select';
      }
      return 'text';
    });
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to detect input type: ${reason}`);
  }
}
(result.error || !result.fields || Object.keys(result.fields).length === 0) { + return { + success: false, + error: result.error || 'No fields detected from list selector' + }; + } + + const convertedListSelector = result.listSelector || listSelector; + + logger.info(`Auto-detected ${Object.keys(result.fields).length} fields from list`); + + return { + success: true, + fields: result.fields, + listSelector: convertedListSelector, + }; + } catch (error: any) { + logger.error('Field auto-detection error:', error); + return { + success: false, + error: `Field auto-detection failed: ${error.message}` + }; + } + } + + /** + * Auto-detect pagination type and selector from list selector + */ + async autoDetectPagination(listSelector: string): Promise<{ + success: boolean; + type?: string; + selector?: string | null; + error?: string; + }> { + if (!this.page) { + return { success: false, error: 'Browser not initialized' }; + } + + try { + const fs = require('fs'); + const path = require('path'); + const scriptPath = path.join(__dirname, 'browserSide/pageAnalyzer.js'); + const scriptContent = fs.readFileSync(scriptPath, 'utf8'); + + await this.page.evaluate((script) => { + eval(script); + }, scriptContent); + + const buttonResult = await this.page.evaluate((selector) => { + const win = window as any; + + if (typeof win.autoDetectPagination === 'function') { + const result = win.autoDetectPagination(selector); + return result; + } else { + console.error('autoDetectPagination function not found!'); + return { + type: '', + selector: null, + error: 'Pagination auto-detection function not loaded' + }; + } + }, listSelector); + + if (buttonResult.debug) { + logger.info(`Pagination debug info: ${JSON.stringify(buttonResult.debug)}`); + } + + if (buttonResult.error) { + logger.error(`Button detection error: ${buttonResult.error}`); + return { + success: false, + error: buttonResult.error + }; + } + + if (buttonResult.type && buttonResult.type !== '') { + if (buttonResult.type === 
/**
 * Verify a candidate "Load More" button by clicking it and checking whether
 * the list grew.
 *
 * The list is measured (item count and document scroll height) before and
 * after the click. The button counts as working when the item count rose or
 * the document became more than 100px taller.
 *
 * Note: the click is real, so the page keeps any content it loaded.
 *
 * @param buttonSelector CSS selector (optionally a comma-separated list of
 *   alternatives, tried in order) or a single XPath expression for the button.
 * @param listSelector CSS or XPath selector matching the list items.
 * @returns `true` when new content appeared after the click; `false` when the
 *   page is not initialised, nothing could be clicked, nothing changed, or an
 *   error occurred.
 */
private async testLoadMoreButton(buttonSelector: string, listSelector: string): Promise<boolean> {
  const page = this.page;
  if (!page) {
    return false;
  }

  // Executed inside the browser by page.evaluate, so it must stay
  // self-contained (no references to Node-side variables).
  const measureList = (selector: string) => {
    const isXPath = selector.startsWith('//') || selector.startsWith('(//');
    const itemCount = isXPath
      ? document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null).snapshotLength
      : document.querySelectorAll(selector).length;
    return {
      itemCount,
      scrollHeight: document.documentElement.scrollHeight
    };
  };

  try {
    const initialState = await page.evaluate(measureList, listSelector);

    // An XPath expression may legitimately contain commas, e.g.
    // contains(text(), 'Load more'), so only CSS selector lists are split.
    const trimmedButtonSelector = buttonSelector.trim();
    const isXPathButton = trimmedButtonSelector.startsWith('//') || trimmedButtonSelector.startsWith('(//');
    const candidates = isXPathButton
      ? [trimmedButtonSelector]
      : trimmedButtonSelector.split(',').map(s => s.trim()).filter(s => s.length > 0);

    try {
      let clicked = false;

      for (const candidate of candidates) {
        try {
          await page.click(candidate, { timeout: 1000 });
          clicked = true;
          break;
        } catch (e) {
          // This alternative is missing or not clickable; try the next one.
        }
      }

      if (!clicked) {
        return false;
      }

      // Fixed grace period for lazily loaded content to render.
      await page.waitForTimeout(2000);
    } catch (clickError: any) {
      logger.warn(`Failed to click button: ${clickError.message}`);
      return false;
    }

    const afterClickState = await page.evaluate(measureList, listSelector);

    logger.info(`After click: ${afterClickState.itemCount} items, scrollHeight: ${afterClickState.scrollHeight}`);

    const itemsAdded = afterClickState.itemCount > initialState.itemCount;
    // Require a clear height gain so small layout jitter is not mistaken for new content.
    const heightIncreased = afterClickState.scrollHeight > initialState.scrollHeight + 100;

    if (itemsAdded || heightIncreased) {
      const details = `Items: ${initialState.itemCount} → ${afterClickState.itemCount}, Height: ${initialState.scrollHeight} → ${afterClickState.scrollHeight}`;
      logger.info(`Content loaded after click: ${details}`);
      return true;
    }

    logger.info('No content change detected after clicking');
    return false;
  } catch (error: any) {
    logger.error('Error during Load More test:', error.message);
    return false;
  }
}
/** + * Test for infinite scroll by actually scrolling and checking if content loads + */ + private async testInfiniteScrollByScrolling(listSelector: string): Promise<{ + detected: boolean; + details?: string; + }> { + if (!this.page) { + return { detected: false }; + } + + try { + const initialState = await this.page.evaluate((selector) => { + function evaluateSelector(sel: string, doc: Document) { + const isXPath = sel.startsWith('//') || sel.startsWith('(//'); + if (isXPath) { + const result = doc.evaluate(sel, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + const elements = []; + for (let i = 0; i < result.snapshotLength; i++) { + elements.push(result.snapshotItem(i)); + } + return elements; + } else { + return Array.from(doc.querySelectorAll(sel)); + } + } + + const listElements = evaluateSelector(selector, document); + return { + itemCount: listElements.length, + scrollHeight: document.documentElement.scrollHeight, + scrollY: window.scrollY + }; + }, listSelector); + + logger.info(`Initial state: ${initialState.itemCount} items, scrollHeight: ${initialState.scrollHeight}`); + + await this.page.evaluate(() => { + window.scrollTo(0, document.documentElement.scrollHeight); + }); + + await this.page.waitForTimeout(2000); + + const afterScrollState = await this.page.evaluate((selector) => { + function evaluateSelector(sel: string, doc: Document) { + const isXPath = sel.startsWith('//') || sel.startsWith('(//'); + if (isXPath) { + const result = doc.evaluate(sel, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + const elements = []; + for (let i = 0; i < result.snapshotLength; i++) { + elements.push(result.snapshotItem(i)); + } + return elements; + } else { + return Array.from(doc.querySelectorAll(sel)); + } + } + + const listElements = evaluateSelector(selector, document); + return { + itemCount: listElements.length, + scrollHeight: document.documentElement.scrollHeight, + scrollY: window.scrollY + }; + }, listSelector); + + await 
/**
 * Enrich a simplified SDK workflow into the full workflow format.
 *
 * A remote browser is opened on the first non-`about:blank` URL found in the
 * workflow and used to validate/enrich each action:
 * - `type`: the value is encrypted and the input type auto-detected when absent.
 * - `scrapeSchema`: fields are validated and expanded to tag/selector/attribute.
 * - `scrapeList`: fields (and, unless supplied, pagination) are auto-detected.
 * - any other string action is copied through unchanged.
 * A `waitForLoadState('networkidle')` step is appended after each enriched action.
 *
 * @param simplifiedWorkflow Where/what pairs as sent by the SDK.
 * @param userId Owner of the temporary remote browser.
 * @returns `{ success: true, workflow, url }`, or `{ success: false, errors }`
 *   when the input is empty, has no URL, any action failed validation, or an
 *   unexpected error occurred. The method does not throw for those cases.
 */
static async enrichWorkflow(
  simplifiedWorkflow: SimplifiedWorkflowPair[],
  userId: string
): Promise<{ success: boolean; workflow?: any[]; errors?: string[]; url?: string }> {
  const errors: string[] = [];
  const enrichedWorkflow: any[] = [];

  // The workflow comes from a request body, so do not trust its shape.
  if (!Array.isArray(simplifiedWorkflow) || simplifiedWorkflow.length === 0) {
    return { success: false, errors: ['Workflow is empty'] };
  }

  let url: string | undefined;
  for (const step of simplifiedWorkflow) {
    const rawUrl = step?.where?.url;
    if (rawUrl && rawUrl !== 'about:blank') {
      url = typeof rawUrl === 'string' ? rawUrl : rawUrl.$regex;
      break;
    }
  }

  if (!url) {
    return { success: false, errors: ['No valid URL found in workflow'] };
  }

  const waitForNetworkIdle = () => ({
    action: 'waitForLoadState',
    args: ['networkidle']
  });

  let browserId: string | null = null;
  let failed = false;
  const validator = new SelectorValidator();

  try {
    logger.info('Creating RemoteBrowser for validation');
    const { browserId: id, page } = await createRemoteBrowserForValidation(userId);
    browserId = id;

    await validator.initialize(page, url);

    for (const step of simplifiedWorkflow) {
      const enrichedStep: any = {
        where: { ...step.where },
        what: []
      };

      const selectors: string[] = [];

      for (const action of step.what) {
        if (typeof action.action !== 'string') {
          continue;
        }

        if (action.action === 'type') {
          if (!action.args || action.args.length < 2) {
            errors.push('type action missing selector or value');
            continue;
          }

          const [selector, value, providedInputType] = action.args;
          selectors.push(selector);

          const encryptedValue = encrypt(value);

          let inputType = providedInputType;
          if (!inputType) {
            try {
              inputType = await validator.detectInputType(selector);
            } catch (error: any) {
              errors.push(`type action: ${error.message}`);
              continue;
            }
          }

          enrichedStep.what.push({
            ...action,
            args: [selector, encryptedValue, inputType]
          });
          enrichedStep.what.push(waitForNetworkIdle());
          continue;
        }

        if (action.action !== 'scrapeSchema' && action.action !== 'scrapeList') {
          enrichedStep.what.push(action);
          continue;
        }

        if (action.action === 'scrapeSchema') {
          if (!action.args || !action.args[0]) {
            errors.push('scrapeSchema action missing fields argument');
            continue;
          }
          const fields = action.args[0];
          const result = await validator.validateSchemaFields(fields);

          if (!result.valid) {
            errors.push(...(result.errors || []));
            continue;
          }

          const enrichedFields: Record<string, any> = {};
          for (const [fieldName, enrichedData] of Object.entries(result.enriched!)) {
            enrichedFields[fieldName] = {
              tag: enrichedData.tag,
              isShadow: enrichedData.isShadow,
              selector: enrichedData.selector,
              attribute: enrichedData.attribute
            };

            selectors.push(enrichedData.selector);
          }

          const enrichedAction: any = {
            action: 'scrapeSchema',
            actionId: `text-${uuid()}`,
            args: [enrichedFields]
          };
          if (action.name) {
            enrichedAction.name = action.name;
          }
          enrichedStep.what.push(enrichedAction);
          enrichedStep.what.push(waitForNetworkIdle());
        } else {
          // action.action === 'scrapeList'
          if (!action.args || !action.args[0]) {
            errors.push('scrapeList action missing config argument');
            continue;
          }
          const config = action.args[0];

          // Both auto-detection calls below are driven by the item selector.
          if (!config.itemSelector) {
            errors.push('scrapeList action missing itemSelector');
            continue;
          }

          let enrichedFields: Record<string, any> = {};
          let listSelector: string;

          try {
            const autoDetectResult = await validator.autoDetectListFields(config.itemSelector);

            if (!autoDetectResult.success || !autoDetectResult.fields || Object.keys(autoDetectResult.fields).length === 0) {
              errors.push(autoDetectResult.error || 'Failed to auto-detect fields from list selector');
              continue;
            }

            enrichedFields = autoDetectResult.fields;
            listSelector = autoDetectResult.listSelector!;
          } catch (error: any) {
            errors.push(`Field auto-detection failed: ${error.message}`);
            continue;
          }

          let paginationType = 'none';
          let paginationSelector = '';

          if (config.pagination && config.pagination.type) {
            // Explicit pagination from the caller wins over auto-detection.
            paginationType = config.pagination.type;
            paginationSelector = config.pagination.selector || '';
          } else {
            try {
              const paginationResult = await validator.autoDetectPagination(config.itemSelector);

              // An empty detected type means "nothing found": keep 'none'.
              if (paginationResult.success && paginationResult.type) {
                paginationType = paginationResult.type;
                paginationSelector = paginationResult.selector || '';
              }
            } catch (error: any) {
              logger.warn('Pagination auto-detection failed, using default (none):', error.message);
            }
          }

          const enrichedListAction: any = {
            action: 'scrapeList',
            actionId: `list-${uuid()}`,
            args: [{
              fields: enrichedFields,
              listSelector: listSelector,
              pagination: {
                type: paginationType,
                selector: paginationSelector
              },
              limit: config.maxItems || 100
            }]
          };
          if (action.name) {
            enrichedListAction.name = action.name;
          }
          enrichedStep.what.push(enrichedListAction);
          enrichedStep.what.push(waitForNetworkIdle());
        }
      }

      if (selectors.length > 0) {
        enrichedStep.where.selectors = selectors;
      }

      enrichedWorkflow.push(enrichedStep);
    }

    if (errors.length > 0) {
      return { success: false, errors };
    }

    return { success: true, workflow: enrichedWorkflow, url };
  } catch (error: any) {
    failed = true;
    logger.error('Error enriching workflow:', error);
    return { success: false, errors: [error.message] };
  } finally {
    // Single cleanup path: runs exactly once on success and on failure, and a
    // cleanup problem never turns a successful validation into a failure.
    try {
      await validator.close();
    } catch (closeError) {
      logger.warn('Failed to close selector validator:', closeError);
    }

    if (browserId) {
      try {
        await destroyRemoteBrowser(browserId, userId);
        logger.info(failed ? 'RemoteBrowser cleaned up after error' : 'RemoteBrowser cleaned up successfully');
      } catch (cleanupError) {
        logger.warn('Failed to cleanup RemoteBrowser:', cleanupError);
      }
    }
  }
}
string, + userId: string, + llmConfig?: { + provider?: 'anthropic' | 'openai' | 'ollama'; + model?: string; + apiKey?: string; + baseUrl?: string; + }, + ): Promise<{ success: boolean; workflow?: any[]; url?: string; errors?: string[] }> { + let browserId: string | null = null; + const validator = new SelectorValidator(); + + try { + logger.info(`Generating workflow from prompt for URL: ${url}`); + logger.info(`Prompt: ${prompt}`); + + logger.info('Creating RemoteBrowser for LLM workflow generation'); + const { browserId: id, page } = await createRemoteBrowserForValidation(userId); + browserId = id; + + await validator.initialize(page as any, url); + + const validatorPage = (validator as any).page; + // Use JPEG with quality 85 for faster processing and smaller file size + // Vision models handle this compression well while maintaining accuracy + const screenshotBuffer = await page.screenshot({ + fullPage: true, + type: 'jpeg', + quality: 85 + }); + const screenshotBase64 = screenshotBuffer.toString('base64'); + + const elementGroups = await this.analyzePageGroups(validator); + logger.info(`Found ${elementGroups.length} element groups`); + + const pageHTML = await validatorPage.content(); + + const llmDecision = await this.getLLMDecisionWithVision( + prompt, + screenshotBase64, + elementGroups, + pageHTML, + llmConfig + ); + logger.info(`LLM decided action type: ${llmDecision.actionType}`); + + const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator, prompt, llmConfig); + + await validator.close(); + + if (browserId) { + await destroyRemoteBrowser(browserId, userId); + logger.info('RemoteBrowser cleaned up after LLM workflow generation'); + } + + return { success: true, workflow, url }; + } catch (error: any) { + await validator.close(); + + if (browserId) { + try { + await destroyRemoteBrowser(browserId, userId); + logger.info('RemoteBrowser cleaned up after LLM generation error'); + } catch (cleanupError) { + logger.warn('Failed to 
/**
 * Ask the configured LLM (Ollama by default, Anthropic or OpenAI) which of the
 * detected element groups best matches the user's list-extraction request.
 *
 * The model receives a textual summary of every group plus the page
 * screenshot and must answer with JSON containing `selectedGroupIndex` and an
 * optional `limit`. Any failure (network, unparsable or invalid answer) falls
 * back to `fallbackHeuristicDecision`.
 *
 * @param prompt Natural-language extraction request.
 * @param screenshotBase64 Base64-encoded page screenshot (JPEG or PNG).
 * @param elementGroups Candidate groups; each is read for `sampleTexts`,
 *   `fingerprint.tagName`, `count` and `xpath`.
 * @param pageHTML Accepted for interface compatibility; not used here.
 * @param llmConfig Optional provider, model, API key and base URL overrides.
 * @returns `{ actionType: 'captureList', selectedGroup, itemSelector, reasoning, limit }`
 *   on the LLM path, or whatever the heuristic fallback returns.
 * @throws Whatever `fallbackHeuristicDecision` throws (e.g. when there are no groups).
 */
private static async getLLMDecisionWithVision(
  prompt: string,
  screenshotBase64: string,
  elementGroups: any[],
  pageHTML: string,
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<any> {
  // With no candidates there is nothing for the model to choose from (the
  // prompt would even say "between 0 and -1"); skip the request and let the
  // fallback report the condition.
  if (elementGroups.length === 0) {
    return this.fallbackHeuristicDecision(prompt, elementGroups);
  }

  try {
    const provider = llmConfig?.provider || 'ollama';
    const axios = require('axios');

    // Label the image with its real format: JPEG data starts with bytes
    // FF D8 FF, which is "/9j/" in base64. A label that disagrees with the
    // payload can be rejected by the provider.
    const imageMediaType: 'image/jpeg' | 'image/png' =
      screenshotBase64.startsWith('/9j/') ? 'image/jpeg' : 'image/png';

    const groupsDescription = elementGroups.map((group, index) => {
      const sampleText = group.sampleTexts.slice(0, 2).filter((t: string) => t && t.trim().length > 0).join(' | ');
      const hasContent = sampleText.length > 0;
      const contentPreview = hasContent ? sampleText : '(no text content - likely images/icons)';

      return `Group ${index}:
- Tag: ${group.fingerprint.tagName}
- Count: ${group.count} similar elements
- Has text content: ${hasContent ? 'YES' : 'NO'}
- Sample content: ${contentPreview.substring(0, 300)}`;
    }).join('\n\n');

    const systemPrompt = `You are a request classifier for list extraction. Your job is to:
1. Identify that the user wants to extract a list of items
2. Select the BEST element group that matches what they want
3. Extract any numeric limit from their request

CRITICAL GROUP SELECTION RULES:
- Match the sample content to what the user is asking for - this is the PRIMARY criterion
- Groups with text content are often easier to match, but image galleries, icon grids, or data-attribute based groups can also be correct
- Analyze the keywords in the user's request and find the group whose sample content or structure best matches
- Consider the context: product sites may have image grids, job sites have text listings, etc.
- The group with the most relevant content should be selected, NOT just the first group or the group with most text

LIMIT EXTRACTION:
- Look for numbers in the request that indicate quantity (e.g., "50", "25", "100", "first 30", "top 10")
- If no limit specified, use null

Must return valid JSON: {"actionType": "captureList", "reasoning": "...", "selectedGroupIndex": NUMBER, "limit": NUMBER_OR_NULL}`;

    const userPrompt = `User's request: "${prompt}"

Available element groups on page:
${groupsDescription}

TASK:
1. Identify the key terms from the user's request
2. Look through ALL the groups above
3. Find the group whose "Sample content" best matches the key terms from the request
4. Prefer groups with "Has text content: YES" over "NO"
5. Extract any numeric limit from the request if present

Return JSON:
{
  "actionType": "captureList",
  "reasoning": "Brief explanation of why this group was selected",
  "selectedGroupIndex": INDEX_NUMBER,
  "limit": NUMBER_OR_NULL
}

Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;

    let llmResponse: string;

    if (provider === 'ollama') {
      const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
      const ollamaModel = llmConfig?.model || 'llama3.2-vision';

      // Structured-output schema so the model answers with the expected JSON shape.
      const jsonSchema = {
        type: 'object',
        required: ['actionType', 'reasoning', 'selectedGroupIndex'],
        properties: {
          actionType: {
            type: 'string',
            enum: ['captureList']
          },
          reasoning: {
            type: 'string'
          },
          selectedGroupIndex: {
            type: 'integer'
          },
          limit: {
            type: ['integer', 'null']
          }
        }
      };

      const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
        model: ollamaModel,
        messages: [
          {
            role: 'system',
            content: systemPrompt
          },
          {
            role: 'user',
            content: userPrompt,
            images: [screenshotBase64]
          }
        ],
        stream: false,
        format: jsonSchema,
        options: {
          temperature: 0.1
        }
      });

      llmResponse = response.data.message.content;

    } else if (provider === 'anthropic') {
      const anthropic = new Anthropic({
        apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
      });
      const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';

      const response = await anthropic.messages.create({
        model: anthropicModel,
        max_tokens: 1024,
        messages: [{
          role: 'user',
          content: [
            {
              type: 'image',
              source: {
                type: 'base64',
                media_type: imageMediaType,
                data: screenshotBase64
              }
            },
            {
              type: 'text',
              text: userPrompt
            }
          ]
        }],
        system: systemPrompt
      });

      const textContent = response.content.find((c: any) => c.type === 'text');
      llmResponse = textContent?.type === 'text' ? textContent.text : '';

    } else if (provider === 'openai') {
      const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
      // NOTE(review): confirm this default model is still served by the API.
      const openaiModel = llmConfig?.model || 'gpt-4-vision-preview';

      const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
        model: openaiModel,
        messages: [
          {
            role: 'system',
            content: systemPrompt
          },
          {
            role: 'user',
            content: [
              {
                type: 'text',
                text: userPrompt
              },
              {
                type: 'image_url',
                image_url: {
                  url: `data:${imageMediaType};base64,${screenshotBase64}`
                }
              }
            ]
          }
        ],
        max_tokens: 1024,
        temperature: 0.1
      }, {
        headers: {
          'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
          'Content-Type': 'application/json'
        }
      });

      llmResponse = response.data.choices[0].message.content;

    } else {
      throw new Error(`Unsupported LLM provider: ${provider}`);
    }

    logger.info(`LLM Response: ${llmResponse}`);

    let jsonStr = llmResponse.trim();

    // Models often wrap JSON in a Markdown code fence; unwrap it first.
    const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
    if (jsonMatch) {
      jsonStr = jsonMatch[1].trim();
    }

    // Then isolate the object that carries "actionType" from any surrounding prose.
    const objectMatch = jsonStr.match(/\{[\s\S]*"actionType"[\s\S]*\}/);
    if (objectMatch) {
      jsonStr = objectMatch[0];
    }

    const decision = JSON.parse(jsonStr);

    if (!decision.actionType || decision.actionType !== 'captureList') {
      throw new Error('LLM response must have actionType: "captureList"');
    }

    // Accept a numeric string such as "2", but reject null, fractions and
    // anything else that cannot index elementGroups.
    const rawIndex = decision.selectedGroupIndex;
    const groupIndex = typeof rawIndex === 'string' && rawIndex.trim() !== '' ? Number(rawIndex) : rawIndex;
    if (!Number.isInteger(groupIndex) || groupIndex < 0 || groupIndex >= elementGroups.length) {
      throw new Error(`Invalid selectedGroupIndex: ${decision.selectedGroupIndex}. Must be between 0 and ${elementGroups.length - 1}`);
    }

    // Only a positive whole number is a usable limit; everything else means "no limit".
    const parsedLimit = Number(decision.limit);
    const limit = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : null;

    const selectedGroup = elementGroups[groupIndex];
    return {
      actionType: 'captureList',
      selectedGroup,
      itemSelector: selectedGroup.xpath,
      reasoning: decision.reasoning,
      limit
    };

  } catch (error: any) {
    logger.error('LLM decision error:', error);
    return this.fallbackHeuristicDecision(prompt, elementGroups);
  }
}
25 : 50; + + const fieldEntries = Object.entries(fieldSamples); + const totalFields = fieldEntries.length; + + logger.info(`Processing ${totalFields} fields in batches of ${BATCH_SIZE} for LLM labeling`); + + const allLabels: Record = {}; + + for (let i = 0; i < fieldEntries.length; i += BATCH_SIZE) { + const batch = fieldEntries.slice(i, i + BATCH_SIZE); + const batchNumber = Math.floor(i / BATCH_SIZE) + 1; + const totalBatches = Math.ceil(fieldEntries.length / BATCH_SIZE); + + logger.info(`Processing batch ${batchNumber}/${totalBatches} (${batch.length} fields)`); + + const batchLabels = await this.generateFieldLabelsBatch( + fields, + Object.fromEntries(batch), + prompt, + url, + llmConfig, + allLabels + ); + + Object.assign(allLabels, batchLabels); + } + + logger.info(`Completed labeling for ${Object.keys(allLabels).length}/${totalFields} fields`); + + return allLabels; + } catch (error: any) { + logger.error(`Error generating field labels with LLM: ${error.message}`); + logger.error(`Using fallback: keeping generic field labels`); + const fallbackLabels: Record = {}; + Object.keys(fields).forEach(label => { + fallbackLabels[label] = label; + }); + return fallbackLabels; + } + } + + private static async generateFieldLabelsBatch( + allFields: Record, + fieldSamplesBatch: Record, + prompt: string, + url: string, + llmConfig?: { + provider?: 'anthropic' | 'openai' | 'ollama'; + model?: string; + apiKey?: string; + baseUrl?: string; + }, + previousLabels?: Record + ): Promise> { + try { + const provider = llmConfig?.provider || 'ollama'; + const axios = require('axios'); + + const fieldDescriptions = Object.entries(fieldSamplesBatch).map(([genericLabel, samples]) => { + const fieldInfo = allFields[genericLabel]; + const tagType = fieldInfo?.tag?.toLowerCase() || 'unknown'; + const attribute = fieldInfo?.attribute || 'innerText'; + + let typeHint = ''; + if (attribute === 'href') typeHint = '(link/URL)'; + else if (attribute === 'src') typeHint = '(image)'; + else 
if (tagType === 'img') typeHint = '(image)'; + else if (tagType === 'a') typeHint = '(link)'; + + return `${genericLabel}: + Type: ${tagType} ${typeHint} + Attribute: ${attribute} + Sample values: +${samples.slice(0, 3).map((s, i) => ` ${i + 1}. "${s}"`).join('\n')}`; + }).join('\n\n'); + + const hasPreviousLabels = previousLabels && Object.keys(previousLabels).length > 0; + const previousLabelsText = hasPreviousLabels + ? `\n\nPREVIOUSLY ASSIGNED LABELS (from earlier batches):\n${Object.entries(previousLabels!).map(([orig, sem]) => `- "${sem}"`).join('\n')}\n\nIMPORTANT: DO NOT reuse these exact labels. Use them as context to maintain consistent naming patterns and avoid duplicates. Add qualifiers like "Secondary", "Alternative", numbers, or additional context to distinguish similar fields.` + : ''; + + const systemPrompt = `You are a data field labeling assistant. Your job is to generate clear, semantic field names for extracted data based on the user's request and the actual field content. + +RULES FOR FIELD NAMING: +1. Use clear, descriptive names that match the content and context +2. Keep names concise (2-4 words maximum) +3. Use Title Case for field names +4. Match the user's terminology when possible +5. Be specific - include context when needed (e.g., "Product Name", "Job Title", "Article Headline", "Company Name") +6. For images, include "Image" or "Photo" in the name (e.g., "Product Image", "Profile Photo", "Thumbnail") +7. For links/URLs, you can use "URL" or "Link" (e.g., "Details Link", "Company Website") +8. Avoid generic terms like "Text", "Field", "Data" unless absolutely necessary +9. If you can't determine the meaning, use a descriptive observation based on the content type +10. Adapt to the domain: e-commerce (Product, Price), jobs (Title, Company), articles (Headline, Author), etc. +11. 
CRITICAL: Check previously assigned labels to avoid duplicates and maintain consistent naming patterns${previousLabelsText} + +You must return a JSON object mapping each generic label to its semantic name.`; + + const userPrompt = `URL: ${url} + +User's extraction request: "${prompt}" + +Detected fields with sample data: +${fieldDescriptions} + +TASK: Generate a semantic name for each field that accurately describes what it contains. +Consider: +- What the user is trying to extract (from their request) +- The actual content in the sample values +- The HTML element type and attribute being extracted +- Common naming conventions for this type of data + +Return a JSON object with this exact structure: +{ + "Label 1": "Semantic Field Name 1", + "Label 2": "Semantic Field Name 2", + ... +}`; + + let llmResponse: string; + + if (provider === 'ollama') { + const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434'; + const ollamaModel = llmConfig?.model || 'llama3.2-vision'; + + logger.info(`Using Ollama at ${ollamaBaseUrl} with model ${ollamaModel}`); + + const jsonSchema = { + type: 'object', + required: ['fieldLabels'], + properties: { + fieldLabels: { + type: 'object', + description: 'Mapping of generic labels to semantic field names', + patternProperties: { + '^Label \\d+$': { + type: 'string', + description: 'Semantic field name in Title Case' + } + } + } + } + }; + + try { + const response = await axios.post(`${ollamaBaseUrl}/api/chat`, { + model: ollamaModel, + messages: [ + { + role: 'system', + content: systemPrompt + }, + { + role: 'user', + content: userPrompt + } + ], + stream: false, + format: jsonSchema, + options: { + temperature: 0.1, + top_p: 0.9 + } + }); + + llmResponse = response.data.message.content; + } catch (ollamaError: any) { + logger.error(`Ollama request failed: ${ollamaError.message}`); + if (ollamaError.response) { + logger.error(`Ollama response status: ${ollamaError.response.status}`); + 
logger.error(`Ollama response data: ${JSON.stringify(ollamaError.response.data)}`); + } + throw new Error(`Ollama API error: ${ollamaError.message}. Make sure Ollama is running at ${ollamaBaseUrl}`); + } + + } else if (provider === 'anthropic') { + const anthropic = new Anthropic({ + apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY + }); + const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022'; + + const response = await anthropic.messages.create({ + model: anthropicModel, + max_tokens: 2048, + temperature: 0.1, + messages: [{ + role: 'user', + content: userPrompt + }], + system: systemPrompt + }); + + const textContent = response.content.find((c: any) => c.type === 'text'); + llmResponse = textContent?.type === 'text' ? textContent.text : ''; + + } else if (provider === 'openai') { + const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1'; + const openaiModel = llmConfig?.model || 'gpt-4o-mini'; + + const response = await axios.post(`${openaiBaseUrl}/chat/completions`, { + model: openaiModel, + messages: [ + { + role: 'system', + content: systemPrompt + }, + { + role: 'user', + content: userPrompt + } + ], + max_tokens: 2048, + temperature: 0.1, + response_format: { type: 'json_object' } + }, { + headers: { + 'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`, + 'Content-Type': 'application/json' + } + }); + + llmResponse = response.data.choices[0].message.content; + + } else { + throw new Error(`Unsupported LLM provider: ${provider}`); + } + + let jsonStr = llmResponse.trim(); + + const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/); + if (jsonMatch) { + jsonStr = jsonMatch[1].trim(); + } + + const objectMatch = jsonStr.match(/\{[\s\S]*\}/); + if (objectMatch) { + jsonStr = objectMatch[0]; + } + + const parsedResponse = JSON.parse(jsonStr); + + let labelMapping: Record; + if (parsedResponse.fieldLabels) { + labelMapping = 
/**
 * Ask the configured LLM which of the already-labeled fields the user asked
 * for, and return that subset together with the model's confidence.
 *
 * NOTE(review): the type arguments on `Record` below were missing in the
 * text this was reconstructed from; they are inferred from how the values
 * are used in the body (`fieldInfo.tag`, `samples[0].substring`). Confirm
 * against the project's own field type.
 *
 * @param labeledFields Field definitions keyed by semantic field name.
 * @param fieldSamples  Sample values keyed by the same field names; only the
 *                      first sample (cut to 100 characters) is sent to the LLM.
 * @param prompt        The user's extraction request, embedded in the prompt.
 * @param llmConfig     Optional provider settings. Provider defaults to
 *                      `'ollama'`; model, key and base URL fall back to the
 *                      per-provider defaults / environment variables below.
 * @returns The selected fields, the LLM's confidence, its reasoning, and
 *          `needsUserConfirmation` (true when confidence is below 0.8 or no
 *          known field was selected). Never rejects: on any failure it logs
 *          the error and returns ALL `labeledFields` with confidence 0.5 and
 *          `needsUserConfirmation: true`.
 */
private static async filterFieldsByIntent(
  labeledFields: Record<string, any>,
  fieldSamples: Record<string, string[]>,
  prompt: string,
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<{
  selectedFields: Record<string, any>;
  confidence: number;
  reasoning: string;
  needsUserConfirmation: boolean;
}> {
  try {
    const provider = llmConfig?.provider || 'ollama';
    const axios = require('axios');

    // One line per field: "<name>: <tag> - <first sample, max 100 chars>".
    const fieldDescriptions = Object.entries(labeledFields)
      .map(([fieldName, fieldInfo]) => {
        const samples = fieldSamples[fieldName] || [];
        const sampleText = samples.length > 0
          ? `"${samples[0].substring(0, 100)}"`
          : '(no samples)';

        return `${fieldName}: ${fieldInfo.tag || 'unknown'} - ${sampleText}`;
      })
      .join('\n');

    const systemPrompt = `You are a field filter assistant. Your job is to analyze the user's extraction request and select ONLY the fields that match their intent.

CRITICAL RULES:
1. Only include fields explicitly mentioned or clearly implied by the user's request
2. Use semantic matching (e.g., "quotes" matches "Quote Text", "company names" matches "Company Name")
3. If the user specifies a count (e.g., "20 quotes"), note it but return the matching fields
4. Be strict: when in doubt, exclude the field rather than include it
5. Return high confidence (0.9-1.0) only if matches are exact or obvious
6. Return medium confidence (0.6-0.8) if matches are semantic/implied
7. Return low confidence (<0.6) if uncertain

You must return a JSON object with selectedFields, confidence, and reasoning.`;

    const userPrompt = `User's extraction request: "${prompt}"

Available labeled fields:
${fieldDescriptions}

TASK: Determine which fields the user wants to extract based on their request.

Return a JSON object with this exact structure:
{
  "selectedFields": ["Field Name 1", "Field Name 2"],
  "confidence": 0.95,
  "reasoning": "Brief explanation of why these fields were selected and confidence level"
}

Rules:
- selectedFields: Array of field names that match the user's intent
- confidence: Number between 0 and 1 (1.0 = exact match, 0.8+ = semantic match, <0.7 = uncertain)
- reasoning: Explain which keywords from the user's request matched which fields`;

    let llmResponse: string;

    if (provider === 'ollama') {
      const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
      const ollamaModel = llmConfig?.model || 'llama3.2-vision';

      // JSON schema passed as `format` so the reply is constrained to the
      // three keys validated further down.
      const jsonSchema = {
        type: 'object',
        required: ['selectedFields', 'confidence', 'reasoning'],
        properties: {
          selectedFields: {
            type: 'array',
            items: { type: 'string' },
            description: 'Array of field names that match user intent'
          },
          confidence: {
            type: 'number',
            minimum: 0,
            maximum: 1,
            description: 'Confidence score from 0 to 1'
          },
          reasoning: {
            type: 'string',
            description: 'Explanation of field selection and confidence'
          }
        }
      };

      const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
        model: ollamaModel,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userPrompt }
        ],
        stream: false,
        format: jsonSchema,
        options: {
          temperature: 0.1,
          top_p: 0.9
        }
      });

      llmResponse = response.data.message.content;

    } else if (provider === 'anthropic') {
      const anthropic = new Anthropic({
        apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
      });
      const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';

      const response = await anthropic.messages.create({
        model: anthropicModel,
        max_tokens: 1024,
        temperature: 0.1,
        messages: [{
          role: 'user',
          content: userPrompt
        }],
        system: systemPrompt
      });

      // An empty string here makes JSON.parse throw below, which routes to
      // the fallback in the catch block.
      const textContent = response.content.find((c: any) => c.type === 'text');
      llmResponse = textContent?.type === 'text' ? textContent.text : '';

    } else if (provider === 'openai') {
      const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
      const openaiModel = llmConfig?.model || 'gpt-4o-mini';

      const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
        model: openaiModel,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userPrompt }
        ],
        max_tokens: 1024,
        temperature: 0.1,
        response_format: { type: 'json_object' }
      }, {
        headers: {
          'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
          'Content-Type': 'application/json'
        }
      });

      llmResponse = response.data.choices[0].message.content;

    } else {
      throw new Error(`Unsupported LLM provider: ${provider}`);
    }

    logger.info(`LLM Field Filtering Response: ${llmResponse}`);

    // Parse JSON response: unwrap a Markdown code fence if present, then
    // take the outermost {...} span.
    let jsonStr = llmResponse.trim();

    const fencedMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
    if (fencedMatch) {
      jsonStr = fencedMatch[1].trim();
    }

    const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
    if (objectMatch) {
      jsonStr = objectMatch[0];
    }

    const filterResult = JSON.parse(jsonStr);

    if (!Array.isArray(filterResult.selectedFields)) {
      throw new Error('Invalid response: selectedFields must be an array');
    }

    if (typeof filterResult.confidence !== 'number' || filterResult.confidence < 0 || filterResult.confidence > 1) {
      throw new Error('Invalid response: confidence must be a number between 0 and 1');
    }

    const filteredFields: Record<string, any> = {};
    for (const fieldName of filterResult.selectedFields) {
      // Own-property check: names come from the LLM, and a plain truthiness
      // lookup would accept inherited keys such as "constructor".
      if (Object.prototype.hasOwnProperty.call(labeledFields, fieldName) && labeledFields[fieldName]) {
        filteredFields[fieldName] = labeledFields[fieldName];
      } else {
        logger.warn(`LLM selected field "${fieldName}" but it doesn't exist in labeled fields`);
      }
    }

    const needsUserConfirmation = filterResult.confidence < 0.8 || Object.keys(filteredFields).length === 0;

    // The declared return type promises a string; the parsed JSON may hold
    // anything under `reasoning`.
    const reasoning = typeof filterResult.reasoning === 'string' && filterResult.reasoning
      ? filterResult.reasoning
      : 'No reasoning provided';

    return {
      selectedFields: filteredFields,
      confidence: filterResult.confidence,
      reasoning,
      needsUserConfirmation
    };

  } catch (error: any) {
    logger.error(`Error filtering fields by intent: ${error.message}`);

    // Best-effort fallback: keep every field and ask the user to confirm.
    return {
      selectedFields: labeledFields,
      confidence: 0.5,
      reasoning: 'Error during filtering, returning all fields as fallback',
      needsUserConfirmation: true
    };
  }
}

/**
 * Collect a few sample values per field from the page, for LLM labeling.
 *
 * Reads `validator.page` and runs the sampling callback through
 * `page.evaluate`. The callback uses DOM globals (`document`, `XPathResult`,
 * `Node`), so it declares every helper it needs inside itself and receives
 * its inputs only through `args`.
 *
 * NOTE(review): the `Record`/`Promise` type arguments are reconstructed from
 * usage (see `fieldInfo.selector`, `fieldInfo.attribute`, `string[]` samples).
 *
 * @param fields       Field definitions keyed by label; each is read for
 *                     `selector` and optional `attribute` (default
 *                     `'innerText'`, which reads trimmed `textContent`).
 * @param listSelector CSS or XPath selector (XPath if it starts with `//` or
 *                     `(//`) for the list items; only the first 5 matches are
 *                     sampled.
 * @param validator    Object whose `page` property is used for evaluation.
 * @returns Up to one distinct, non-empty sample per list item for each
 *          field, each cut to 200 characters. Never rejects: on any failure
 *          the error is logged and every field maps to an empty array.
 */
private static async extractFieldSamples(
  fields: Record<string, any>,
  listSelector: string,
  validator: SelectorValidator
): Promise<Record<string, string[]>> {
  try {
    const page = (validator as any).page;
    if (!page) {
      throw new Error('Page not available');
    }

    const collected: Record<string, string[]> = await page.evaluate(
      (args: { fieldsData: any; listSel: string }) => {
        const results: Record<string, string[]> = {};

        // Resolve a CSS or XPath selector against the document.
        function evaluateSelector(selector: string, doc: Document): Element[] {
          const isXPath = selector.startsWith('//') || selector.startsWith('(//');
          if (!isXPath) {
            return Array.from(doc.querySelectorAll(selector));
          }

          const snapshot = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
          const elements: Element[] = [];
          for (let i = 0; i < snapshot.snapshotLength; i++) {
            const node = snapshot.snapshotItem(i);
            if (node && node.nodeType === Node.ELEMENT_NODE) {
              elements.push(node as Element);
            }
          }
          return elements;
        }

        const listItems = evaluateSelector(args.listSel, document).slice(0, 5);

        Object.entries(args.fieldsData).forEach(([fieldLabel, fieldInfo]: [string, any]) => {
          const fieldValues: string[] = [];
          const selector = fieldInfo.selector;
          const attribute = fieldInfo.attribute || 'innerText';

          // The selector result does not depend on the list item, so it is
          // resolved once per field. A selector that throws leaves the
          // field with no samples, as before.
          let candidates: Element[] = [];
          try {
            candidates = evaluateSelector(selector, document);
          } catch (e) {
            candidates = [];
          }

          listItems.forEach((listItem: Element) => {
            const matchingElement = candidates.find((el: Element) => listItem.contains(el));
            if (!matchingElement) {
              return;
            }

            const rawValue = attribute === 'innerText'
              ? (matchingElement.textContent || '').trim()
              : matchingElement.getAttribute(attribute) || '';

            // Truncate BEFORE the duplicate check: the stored form is the
            // truncated one, so comparing the full value would let long
            // repeated values through more than once.
            const sample = rawValue.substring(0, 200);
            if (sample.length > 0 && !fieldValues.includes(sample)) {
              fieldValues.push(sample);
            }
          });

          results[fieldLabel] = fieldValues;
        });

        return results;
      },
      { fieldsData: fields, listSel: listSelector }
    );

    return collected;
  } catch (error: any) {
    logger.error(`Error extracting field samples: ${error.message}`);
    logger.error(`Error stack: ${error.stack}`);

    // Fallback: every requested field is present, with no samples.
    const emptySamples: Record<string, string[]> = {};
    Object.keys(fields).forEach(label => {
      emptySamples[label] = [];
    });
    return emptySamples;
  }
}

/**
 * Build a one-step workflow (goto, wait, scrapeList, wait) from the LLM's
 * decision.
 *
 * Only `llmDecision.actionType === 'captureList'` is supported. Fields are
 * auto-detected from `llmDecision.itemSelector`, sampled, given semantic
 * names by the LLM, and then filtered by user intent. The filtered set is
 * used when the filter's confidence is at least 0.6 and it kept at least one
 * field; otherwise all detected fields are used.
 *
 * NOTE(review): the `Promise<any[]>` return annotation is reconstructed from
 * the `workflow: any[]` value returned below.
 *
 * @param llmDecision Decision object; read for `actionType`, `itemSelector`
 *                    and `limit` (defaults to 100 when falsy).
 * @param url         Page URL, used for the `where.url` condition and `goto`.
 * @param validator   Provides `autoDetectListFields`, `autoDetectPagination`
 *                    and the page used for sampling.
 * @param prompt      User request; `'Extract list data'` is used when absent.
 * @param llmConfig   Optional provider settings forwarded to the LLM helpers.
 * @returns The workflow array containing a single where/what pair.
 * @throws Error if the action type is not `captureList`, or if field
 *         auto-detection fails or yields no fields.
 */
private static async buildWorkflowFromLLMDecision(
  llmDecision: any,
  url: string,
  validator: SelectorValidator,
  prompt?: string,
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<any[]> {
  if (llmDecision.actionType !== 'captureList') {
    throw new Error(`Unsupported action type: ${llmDecision.actionType}. Only captureList is supported.`);
  }

  const workflow: any[] = [
    {
      where: { url, selectors: [] },
      what: [
        { action: 'goto', args: [url] },
        { action: 'waitForLoadState', args: ['networkidle'] }
      ]
    }
  ];

  const autoDetectResult = await validator.autoDetectListFields(llmDecision.itemSelector);

  if (!autoDetectResult.success || !autoDetectResult.fields || Object.keys(autoDetectResult.fields).length === 0) {
    throw new Error('Failed to auto-detect fields from selected group');
  }

  const userRequest = prompt || 'Extract list data';

  logger.info('Extracting field samples and detecting pagination in parallel...');
  const [fieldSamples, paginationResult] = await Promise.all([
    this.extractFieldSamples(
      autoDetectResult.fields,
      autoDetectResult.listSelector || '',
      validator
    ),
    // Pagination is optional: a detection failure downgrades to "none".
    validator.autoDetectPagination(llmDecision.itemSelector).catch((error: any) => {
      logger.warn(`Pagination auto-detection failed: ${error.message}`);
      return { success: false, type: 'none', selector: '' };
    })
  ]);

  logger.info('Generating semantic field labels with LLM...');
  const fieldLabels = await this.generateFieldLabels(
    autoDetectResult.fields,
    fieldSamples,
    userRequest,
    url,
    llmConfig
  );

  // Resolve each generic label to a UNIQUE semantic name. If the LLM hands
  // out the same name twice, the later field gets a numeric suffix rather
  // than overwriting the earlier one.
  const resolvedNames: Record<string, string> = {};
  const takenNames = new Set<string>();
  Object.keys(autoDetectResult.fields).forEach(genericLabel => {
    const baseName = fieldLabels[genericLabel] || genericLabel;
    let uniqueName = baseName;
    let suffix = 2;
    while (takenNames.has(uniqueName)) {
      uniqueName = `${baseName} ${suffix}`;
      suffix += 1;
    }
    if (uniqueName !== baseName) {
      logger.warn(`Duplicate field label "${baseName}" for ${genericLabel}; using "${uniqueName}" instead`);
    }
    takenNames.add(uniqueName);
    resolvedNames[genericLabel] = uniqueName;
  });

  const renamedFields: Record<string, any> = {};
  Object.entries(autoDetectResult.fields).forEach(([genericLabel, fieldInfo]) => {
    renamedFields[resolvedNames[genericLabel]] = fieldInfo;
  });

  // Samples must be keyed by the same names as the fields they describe.
  const renamedSamples: Record<string, string[]> = {};
  Object.entries(fieldSamples).forEach(([genericLabel, samples]) => {
    const semanticLabel = resolvedNames[genericLabel] || fieldLabels[genericLabel] || genericLabel;
    renamedSamples[semanticLabel] = samples;
  });

  const filterResult = await this.filterFieldsByIntent(
    renamedFields,
    renamedSamples,
    userRequest,
    llmConfig
  );

  let finalFields = renamedFields;
  if (filterResult.confidence >= 0.6 && Object.keys(filterResult.selectedFields).length > 0) {
    finalFields = filterResult.selectedFields;
  } else {
    logger.warn(`Low confidence (${filterResult.confidence}) or no fields selected. Using all detected fields as fallback.`);
  }

  let paginationType = 'none';
  let paginationSelector = '';

  if (paginationResult.success && paginationResult.type) {
    paginationType = paginationResult.type;
    paginationSelector = paginationResult.selector || '';
  }

  const limit = llmDecision.limit || 100;
  logger.info(`Using limit: ${limit}`);

  workflow[0].what.push({
    action: 'scrapeList',
    actionId: `list-${uuid()}`,
    name: 'List 1',
    args: [{
      fields: finalFields,
      listSelector: autoDetectResult.listSelector,
      pagination: {
        type: paginationType,
        selector: paginationSelector
      },
      limit: limit
    }]
  });

  workflow[0].what.push({
    action: 'waitForLoadState',
    args: ['networkidle']
  });

  return workflow;
}
cronExpressio { tz: timezone } ); + await registerWorkerForQueue(queueName); + logger.log('info', `Scheduled workflow job for robot ${id}`); } catch (error: unknown) { const errorMessage = error instanceof Error ? error.message : String(error);