(null);
@@ -487,7 +489,7 @@ function TasksTreeView({
scrollToNode,
virtualizer,
} = useTree({
- tree: events,
+ tree: showDebug ? events : events.filter((event) => !event.data.isDebug),
selectedId,
// collapsedIds,
onSelectedIdChanged,
@@ -512,6 +514,15 @@ function TasksTreeView({
+ {isAdmin && (
+ <Switch
+ variant="small"
+ label="Debug"
+ checked={showDebug}
+ onCheckedChange={(e) => setShowDebug(e.valueOf())}
+ />
+ )}
{
);
}
- logger.error("Failed to start a test run", { error: e });
+ logger.error("Failed to start a test run", { error: e instanceof Error ? e.message : e });
return redirectBackWithErrorMessage(
request,
diff --git a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts
index 471481a169..3bb2ecf664 100644
--- a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts
+++ b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts
@@ -3,6 +3,7 @@ import { z } from "zod";
import { prisma } from "~/db.server";
import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server";
import { marqs } from "~/v3/marqs/index.server";
+import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server";
const ParamsSchema = z.object({
environmentId: z.string(),
@@ -60,7 +61,7 @@ export async function action({ request, params }: ActionFunctionArgs) {
},
});
- await marqs?.updateEnvConcurrencyLimits(environment);
+ await updateEnvConcurrencyLimits(environment);
return json({ success: true });
}
diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts
index 51d292eb05..d6491bcc45 100644
--- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts
+++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts
@@ -3,6 +3,7 @@ import { z } from "zod";
import { prisma } from "~/db.server";
import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server";
import { marqs } from "~/v3/marqs/index.server";
+import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server";
const ParamsSchema = z.object({
organizationId: z.string(),
@@ -97,7 +98,7 @@ export async function action({ request, params }: ActionFunctionArgs) {
},
});
- await marqs?.updateEnvConcurrencyLimits({ ...modifiedEnvironment, organization });
+ await updateEnvConcurrencyLimits({ ...modifiedEnvironment, organization });
}
return json({ success: true });
diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.environments.staging.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.environments.staging.ts
index c4088257af..8483058f32 100644
--- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.environments.staging.ts
+++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.environments.staging.ts
@@ -4,6 +4,7 @@ import { prisma } from "~/db.server";
import { createEnvironment } from "~/models/organization.server";
import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server";
import { marqs } from "~/v3/marqs/index.server";
+import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server";
const ParamsSchema = z.object({
organizationId: z.string(),
@@ -58,10 +59,10 @@ export async function action({ request, params }: ActionFunctionArgs) {
if (!stagingEnvironment) {
const staging = await createEnvironment(organization, project, "STAGING");
- await marqs?.updateEnvConcurrencyLimits({ ...staging, organization, project });
+ await updateEnvConcurrencyLimits({ ...staging, organization, project });
created++;
} else {
- await marqs?.updateEnvConcurrencyLimits({ ...stagingEnvironment, organization, project });
+ await updateEnvConcurrencyLimits({ ...stagingEnvironment, organization, project });
}
}
diff --git a/apps/webapp/app/routes/admin.api.v1.workers.ts b/apps/webapp/app/routes/admin.api.v1.workers.ts
new file mode 100644
index 0000000000..185c9cc4d0
--- /dev/null
+++ b/apps/webapp/app/routes/admin.api.v1.workers.ts
@@ -0,0 +1,65 @@
+import { ActionFunctionArgs, json } from "@remix-run/server-runtime";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server";
+import { WorkerGroupService } from "~/v3/services/worker/workerGroupService.server";
+
+const RequestBodySchema = z.object({
+ name: z.string().optional(),
+ description: z.string().optional(),
+ projectId: z.string().optional(),
+ makeDefault: z.boolean().optional(),
+});
+
+export async function action({ request }: ActionFunctionArgs) {
+ // Next authenticate the request
+ const authenticationResult = await authenticateApiRequestWithPersonalAccessToken(request);
+
+ if (!authenticationResult) {
+ return json({ error: "Invalid or Missing API key" }, { status: 401 });
+ }
+
+ const user = await prisma.user.findUnique({
+ where: {
+ id: authenticationResult.userId,
+ },
+ });
+
+ if (!user) {
+ return json({ error: "Invalid or Missing API key" }, { status: 401 });
+ }
+
+ if (!user.admin) {
+ return json({ error: "You must be an admin to perform this action" }, { status: 403 });
+ }
+
+ try {
+ const rawBody = await request.json();
+ const { name, description, projectId, makeDefault } = RequestBodySchema.parse(rawBody ?? {});
+
+ const service = new WorkerGroupService();
+ const { workerGroup, token } = await service.createWorkerGroup({
+ name,
+ description,
+ });
+
+ if (makeDefault && projectId) {
+ await prisma.project.update({
+ where: {
+ id: projectId,
+ },
+ data: {
+ defaultWorkerGroupId: workerGroup.id,
+ engine: "V2",
+ },
+ });
+ }
+
+ return json({
+ token,
+ workerGroup,
+ });
+ } catch (error) {
+ return json({ error: error instanceof Error ? error.message : error }, { status: 400 });
+ }
+}
diff --git a/apps/webapp/app/routes/api.v1.batches.$batchId.ts b/apps/webapp/app/routes/api.v1.batches.$batchId.ts
index 365f9caa22..150978331e 100644
--- a/apps/webapp/app/routes/api.v1.batches.$batchId.ts
+++ b/apps/webapp/app/routes/api.v1.batches.$batchId.ts
@@ -34,6 +34,7 @@ export const loader = createLoaderApiRoute(
createdAt: batch.createdAt,
updatedAt: batch.updatedAt,
runCount: batch.runCount,
+ runs: batch.runIds,
});
}
);
diff --git a/apps/webapp/app/routes/api.v1.deployments.latest.ts b/apps/webapp/app/routes/api.v1.deployments.latest.ts
new file mode 100644
index 0000000000..6f31f58fcc
--- /dev/null
+++ b/apps/webapp/app/routes/api.v1.deployments.latest.ts
@@ -0,0 +1,41 @@
+import { LoaderFunctionArgs, json } from "@remix-run/server-runtime";
+import { WorkerInstanceGroupType } from "@trigger.dev/database";
+import { prisma } from "~/db.server";
+import { authenticateApiRequest } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
+
+export async function loader({ request }: LoaderFunctionArgs) {
+ // Next authenticate the request
+ const authenticationResult = await authenticateApiRequest(request);
+
+ if (!authenticationResult) {
+ logger.info("Invalid or missing api key", { url: request.url });
+ return json({ error: "Invalid or Missing API key" }, { status: 401 });
+ }
+
+ const authenticatedEnv = authenticationResult.environment;
+
+ const deployment = await prisma.workerDeployment.findFirst({
+ where: {
+ type: WorkerInstanceGroupType.UNMANAGED,
+ environmentId: authenticatedEnv.id,
+ },
+ orderBy: {
+ createdAt: "desc",
+ },
+ });
+
+ if (!deployment) {
+ return json({ error: "Deployment not found" }, { status: 404 });
+ }
+
+ return json({
+ id: deployment.friendlyId,
+ status: deployment.status,
+ contentHash: deployment.contentHash,
+ shortCode: deployment.shortCode,
+ version: deployment.version,
+ imageReference: deployment.imageReference,
+ errorData: deployment.errorData,
+ });
+}
diff --git a/apps/webapp/app/routes/api.v1.deployments.ts b/apps/webapp/app/routes/api.v1.deployments.ts
index 2f4b9bdd54..c3dcfb13d0 100644
--- a/apps/webapp/app/routes/api.v1.deployments.ts
+++ b/apps/webapp/app/routes/api.v1.deployments.ts
@@ -6,6 +6,7 @@ import {
import { env } from "~/env.server";
import { authenticateApiRequest } from "~/services/apiAuth.server";
import { logger } from "~/services/logger.server";
+import { ServiceValidationError } from "~/v3/services/baseService.server";
import { InitializeDeploymentService } from "~/v3/services/initializeDeployment.server";
export async function action({ request, params }: ActionFunctionArgs) {
@@ -33,18 +34,30 @@ export async function action({ request, params }: ActionFunctionArgs) {
const service = new InitializeDeploymentService();
- const { deployment, imageTag } = await service.call(authenticatedEnv, body.data);
-
- const responseBody: InitializeDeploymentResponseBody = {
- id: deployment.friendlyId,
- contentHash: deployment.contentHash,
- shortCode: deployment.shortCode,
- version: deployment.version,
- externalBuildData:
- deployment.externalBuildData as InitializeDeploymentResponseBody["externalBuildData"],
- imageTag,
- registryHost: body.data.registryHost ?? env.DEPLOY_REGISTRY_HOST,
- };
-
- return json(responseBody, { status: 200 });
+ try {
+ const { deployment, imageTag } = await service.call(authenticatedEnv, body.data);
+
+ const responseBody: InitializeDeploymentResponseBody = {
+ id: deployment.friendlyId,
+ contentHash: deployment.contentHash,
+ shortCode: deployment.shortCode,
+ version: deployment.version,
+ externalBuildData:
+ deployment.externalBuildData as InitializeDeploymentResponseBody["externalBuildData"],
+ imageTag,
+ registryHost: body.data.registryHost ?? env.DEPLOY_REGISTRY_HOST,
+ };
+
+ return json(responseBody, { status: 200 });
+ } catch (error) {
+ if (error instanceof ServiceValidationError) {
+ return json({ error: error.message }, { status: 400 });
+ } else if (error instanceof Error) {
+ logger.error("Error initializing deployment", { error: error.message });
+ return json({ error: `Internal server error: ${error.message}` }, { status: 500 });
+ } else {
+ logger.error("Error initializing deployment", { error: String(error) });
+ return json({ error: "Internal server error" }, { status: 500 });
+ }
+ }
}
diff --git a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts
index a557243442..cc364f3437 100644
--- a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts
+++ b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts
@@ -1,5 +1,9 @@
import { json } from "@remix-run/server-runtime";
-import { generateJWT as internal_generateJWT, TriggerTaskRequestBody } from "@trigger.dev/core/v3";
+import {
+ generateJWT as internal_generateJWT,
+ RunEngineVersionSchema,
+ TriggerTaskRequestBody,
+} from "@trigger.dev/core/v3";
import { TaskRun } from "@trigger.dev/database";
import { z } from "zod";
import { env } from "~/env.server";
@@ -21,6 +25,7 @@ export const HeadersSchema = z.object({
"x-trigger-span-parent-as-link": z.coerce.number().nullish(),
"x-trigger-worker": z.string().nullish(),
"x-trigger-client": z.string().nullish(),
+ "x-trigger-engine-version": RunEngineVersionSchema.nullish(),
traceparent: z.string().optional(),
tracestate: z.string().optional(),
});
@@ -49,6 +54,7 @@ const { action, loader } = createActionApiRoute(
tracestate,
"x-trigger-worker": isFromWorker,
"x-trigger-client": triggerClient,
+ "x-trigger-engine-version": engineVersion,
} = headers;
const service = new TriggerTaskService();
@@ -74,14 +80,20 @@ const { action, loader } = createActionApiRoute(
const idempotencyKeyExpiresAt = resolveIdempotencyKeyTTL(idempotencyKeyTTL);
- const result = await service.call(params.taskId, authentication.environment, body, {
- idempotencyKey: idempotencyKey ?? undefined,
- idempotencyKeyExpiresAt: idempotencyKeyExpiresAt,
- triggerVersion: triggerVersion ?? undefined,
- traceContext,
- spanParentAsLink: spanParentAsLink === 1,
- oneTimeUseToken,
- });
+ const result = await service.call(
+ params.taskId,
+ authentication.environment,
+ body,
+ {
+ idempotencyKey: idempotencyKey ?? undefined,
+ idempotencyKeyExpiresAt: idempotencyKeyExpiresAt,
+ triggerVersion: triggerVersion ?? undefined,
+ traceContext,
+ spanParentAsLink: spanParentAsLink === 1,
+ oneTimeUseToken,
+ },
+ engineVersion ?? undefined
+ );
if (!result) {
return json({ error: "Task not found" }, { status: 404 });
diff --git a/apps/webapp/app/routes/api.v1.tasks.batch.ts b/apps/webapp/app/routes/api.v1.tasks.batch.ts
index 5504427986..d26c55f1cb 100644
--- a/apps/webapp/app/routes/api.v1.tasks.batch.ts
+++ b/apps/webapp/app/routes/api.v1.tasks.batch.ts
@@ -67,6 +67,7 @@ const { action, loader } = createActionApiRoute(
"x-trigger-span-parent-as-link": spanParentAsLink,
"x-trigger-worker": isFromWorker,
"x-trigger-client": triggerClient,
+ "x-trigger-engine-version": engineVersion,
"batch-processing-strategy": batchProcessingStrategy,
traceparent,
tracestate,
diff --git a/apps/webapp/app/routes/api.v1.waitpoints.tokens.$waitpointFriendlyId.complete.ts b/apps/webapp/app/routes/api.v1.waitpoints.tokens.$waitpointFriendlyId.complete.ts
new file mode 100644
index 0000000000..e7109f67b2
--- /dev/null
+++ b/apps/webapp/app/routes/api.v1.waitpoints.tokens.$waitpointFriendlyId.complete.ts
@@ -0,0 +1,68 @@
+import { json } from "@remix-run/server-runtime";
+import {
+ CompleteWaitpointTokenRequestBody,
+ CompleteWaitpointTokenResponseBody,
+ conditionallyExportPacket,
+ stringifyIO,
+} from "@trigger.dev/core/v3";
+import { WaitpointId } from "@trigger.dev/core/v3/apps";
+import { z } from "zod";
+import { $replica } from "~/db.server";
+import { env } from "~/env.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ params: z.object({
+ waitpointFriendlyId: z.string(),
+ }),
+ body: CompleteWaitpointTokenRequestBody,
+ maxContentLength: env.TASK_PAYLOAD_MAXIMUM_SIZE,
+ method: "POST",
+ },
+ async ({ authentication, body, params }) => {
+ // Resume tokens are actually just waitpoints
+ const waitpointId = WaitpointId.toId(params.waitpointFriendlyId);
+
+ try {
+ //check permissions
+ const waitpoint = await $replica.waitpoint.findFirst({
+ where: {
+ id: waitpointId,
+ environmentId: authentication.environment.id,
+ },
+ });
+
+ if (!waitpoint) {
+ throw json({ error: "Waitpoint not found" }, { status: 404 });
+ }
+
+ const stringifiedData = await stringifyIO(body.data);
+ const finalData = await conditionallyExportPacket(
+ stringifiedData,
+ `${waitpointId}/waitpoint/token`
+ );
+
+ const result = await engine.completeWaitpoint({
+ id: waitpointId,
+ output: finalData.data
+ ? { type: finalData.dataType, value: finalData.data, isError: false }
+ : undefined,
+ });
+
+ return json(
+ {
+ success: true,
+ },
+ { status: 200 }
+ );
+ } catch (error) {
+ logger.error("Failed to complete waitpoint token", { error });
+ throw json({ error: "Failed to complete waitpoint token" }, { status: 500 });
+ }
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/api.v1.waitpoints.tokens.ts b/apps/webapp/app/routes/api.v1.waitpoints.tokens.ts
new file mode 100644
index 0000000000..7603551fe1
--- /dev/null
+++ b/apps/webapp/app/routes/api.v1.waitpoints.tokens.ts
@@ -0,0 +1,43 @@
+import { json } from "@remix-run/server-runtime";
+import {
+ CreateWaitpointTokenRequestBody,
+ CreateWaitpointTokenResponseBody,
+} from "@trigger.dev/core/v3";
+import { WaitpointId } from "@trigger.dev/core/v3/apps";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { parseDelay } from "~/utils/delays";
+import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ body: CreateWaitpointTokenRequestBody,
+ maxContentLength: 1024 * 10, // 10KB
+ method: "POST",
+ },
+ async ({ authentication, body }) => {
+ const idempotencyKeyExpiresAt = body.idempotencyKeyTTL
+ ? resolveIdempotencyKeyTTL(body.idempotencyKeyTTL)
+ : undefined;
+
+ const timeout = await parseDelay(body.timeout);
+
+ const result = await engine.createManualWaitpoint({
+ environmentId: authentication.environment.id,
+ projectId: authentication.environment.projectId,
+ idempotencyKey: body.idempotencyKey,
+ idempotencyKeyExpiresAt,
+ timeout,
+ });
+
+ return json(
+ {
+ id: WaitpointId.toFriendlyId(result.waitpoint.id),
+ isCached: result.isCached,
+ },
+ { status: 200 }
+ );
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/api.v1.workers.ts b/apps/webapp/app/routes/api.v1.workers.ts
new file mode 100644
index 0000000000..4008d64f1a
--- /dev/null
+++ b/apps/webapp/app/routes/api.v1.workers.ts
@@ -0,0 +1,73 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import {
+ WorkersCreateRequestBody,
+ WorkersCreateResponseBody,
+ WorkersListResponseBody,
+} from "@trigger.dev/core/v3";
+import {
+ createActionApiRoute,
+ createLoaderApiRoute,
+} from "~/services/routeBuilders/apiBuilder.server";
+import { WorkerGroupService } from "~/v3/services/worker/workerGroupService.server";
+
+export const loader = createLoaderApiRoute(
+ {
+ corsStrategy: "all",
+ findResource: async () => 1, // This is a dummy function, we don't need to find a resource
+ },
+ async ({
+ authentication,
+ }): Promise<TypedResponse<WorkersListResponseBody>> => {
+ if (authentication.environment.project.engine !== "V2") {
+ return json({ error: "Not supported for V1 projects" }, { status: 400 });
+ }
+
+ const service = new WorkerGroupService();
+ const workers = await service.listWorkerGroups({
+ projectId: authentication.environment.projectId,
+ });
+
+ return json(
+ workers.map((w) => ({
+ type: w.type,
+ name: w.name,
+ description: w.description,
+ isDefault: w.id === authentication.environment.project.defaultWorkerGroupId,
+ updatedAt: w.updatedAt,
+ }))
+ );
+ }
+);
+
+export const { action } = createActionApiRoute(
+ {
+ corsStrategy: "all",
+ body: WorkersCreateRequestBody,
+ },
+ async ({
+ authentication,
+ body,
+ }): Promise<TypedResponse<WorkersCreateResponseBody>> => {
+ if (authentication.environment.project.engine !== "V2") {
+ return json({ error: "Not supported" }, { status: 400 });
+ }
+
+ const service = new WorkerGroupService();
+ const { workerGroup, token } = await service.createWorkerGroup({
+ projectId: authentication.environment.projectId,
+ organizationId: authentication.environment.organizationId,
+ name: body.name,
+ description: body.description,
+ });
+
+ return json({
+ token: {
+ plaintext: token.plaintext,
+ },
+ workerGroup: {
+ name: workerGroup.name,
+ description: workerGroup.description,
+ },
+ });
+ }
+);
diff --git a/apps/webapp/app/routes/api.v2.batches.$batchId.ts b/apps/webapp/app/routes/api.v2.batches.$batchId.ts
new file mode 100644
index 0000000000..150978331e
--- /dev/null
+++ b/apps/webapp/app/routes/api.v2.batches.$batchId.ts
@@ -0,0 +1,40 @@
+import { json } from "@remix-run/server-runtime";
+import { z } from "zod";
+import { $replica } from "~/db.server";
+import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+const ParamsSchema = z.object({
+ batchId: z.string(),
+});
+
+export const loader = createLoaderApiRoute(
+ {
+ params: ParamsSchema,
+ allowJWT: true,
+ corsStrategy: "all",
+ findResource: (params, auth) => {
+ return $replica.batchTaskRun.findFirst({
+ where: {
+ friendlyId: params.batchId,
+ runtimeEnvironmentId: auth.environment.id,
+ },
+ });
+ },
+ authorization: {
+ action: "read",
+ resource: (batch) => ({ batch: batch.friendlyId }),
+ superScopes: ["read:runs", "read:all", "admin"],
+ },
+ },
+ async ({ resource: batch }) => {
+ return json({
+ id: batch.friendlyId,
+ status: batch.status,
+ idempotencyKey: batch.idempotencyKey ?? undefined,
+ createdAt: batch.createdAt,
+ updatedAt: batch.updatedAt,
+ runCount: batch.runCount,
+ runs: batch.runIds,
+ });
+ }
+);
diff --git a/apps/webapp/app/routes/api.v2.tasks.batch.ts b/apps/webapp/app/routes/api.v2.tasks.batch.ts
new file mode 100644
index 0000000000..e2c13e1aa9
--- /dev/null
+++ b/apps/webapp/app/routes/api.v2.tasks.batch.ts
@@ -0,0 +1,152 @@
+import { json } from "@remix-run/server-runtime";
+import {
+ BatchTriggerTaskV3RequestBody,
+ BatchTriggerTaskV3Response,
+ generateJWT,
+} from "@trigger.dev/core/v3";
+import { env } from "~/env.server";
+import { AuthenticatedEnvironment, getOneTimeUseToken } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { ServiceValidationError } from "~/v3/services/baseService.server";
+import { BatchProcessingStrategy } from "~/v3/services/batchTriggerV3.server";
+import { BatchTriggerV4Service } from "~/v3/services/batchTriggerV4.server";
+import { OutOfEntitlementError } from "~/v3/services/triggerTask.server";
+import { HeadersSchema } from "./api.v1.tasks.$taskId.trigger";
+
+const { action, loader } = createActionApiRoute(
+ {
+ headers: HeadersSchema.extend({
+ "batch-processing-strategy": BatchProcessingStrategy.nullish(),
+ }),
+ body: BatchTriggerTaskV3RequestBody,
+ allowJWT: true,
+ maxContentLength: env.BATCH_TASK_PAYLOAD_MAXIMUM_SIZE,
+ authorization: {
+ action: "batchTrigger",
+ resource: (_, __, ___, body) => ({
+ tasks: Array.from(new Set(body.items.map((i) => i.task))),
+ }),
+ superScopes: ["write:tasks", "admin"],
+ },
+ corsStrategy: "all",
+ },
+ async ({ body, headers, params, authentication }) => {
+ if (!body.items.length) {
+ return json({ error: "Batch cannot be triggered with no items" }, { status: 400 });
+ }
+
+ // Check that there are fewer than MAX_BATCH_V2_TRIGGER_ITEMS items
+ if (body.items.length > env.MAX_BATCH_V2_TRIGGER_ITEMS) {
+ return json(
+ {
+ error: `Batch size of ${body.items.length} is too large. Maximum allowed batch size is ${env.MAX_BATCH_V2_TRIGGER_ITEMS}.`,
+ },
+ { status: 400 }
+ );
+ }
+
+ const {
+ "trigger-version": triggerVersion,
+ "x-trigger-span-parent-as-link": spanParentAsLink,
+ "x-trigger-worker": isFromWorker,
+ "x-trigger-client": triggerClient,
+ "x-trigger-engine-version": engineVersion,
+ "batch-processing-strategy": batchProcessingStrategy,
+ traceparent,
+ tracestate,
+ } = headers;
+
+ const oneTimeUseToken = await getOneTimeUseToken(authentication);
+
+ logger.debug("Batch trigger request", {
+ triggerVersion,
+ spanParentAsLink,
+ isFromWorker,
+ triggerClient,
+ traceparent,
+ tracestate,
+ batchProcessingStrategy,
+ });
+
+ const traceContext =
+ traceparent && isFromWorker // If the request is from a worker, we should pass the trace context
+ ? { traceparent, tracestate }
+ : undefined;
+
+ const service = new BatchTriggerV4Service(batchProcessingStrategy ?? undefined);
+
+ try {
+ const batch = await service.call(authentication.environment, body, {
+ triggerVersion: triggerVersion ?? undefined,
+ traceContext,
+ spanParentAsLink: spanParentAsLink === 1,
+ oneTimeUseToken,
+ });
+
+ const $responseHeaders = await responseHeaders(
+ batch,
+ authentication.environment,
+ triggerClient
+ );
+
+ return json(batch, { status: 202, headers: $responseHeaders });
+ } catch (error) {
+ logger.error("Batch trigger error", {
+ error: {
+ message: (error as Error).message,
+ stack: (error as Error).stack,
+ },
+ });
+
+ if (error instanceof ServiceValidationError) {
+ return json({ error: error.message }, { status: 422 });
+ } else if (error instanceof OutOfEntitlementError) {
+ return json({ error: error.message }, { status: 422 });
+ } else if (error instanceof Error) {
+ return json(
+ { error: error.message },
+ { status: 500, headers: { "x-should-retry": "false" } }
+ );
+ }
+
+ return json({ error: "Something went wrong" }, { status: 500 });
+ }
+ }
+);
+
+async function responseHeaders(
+ batch: BatchTriggerTaskV3Response,
+ environment: AuthenticatedEnvironment,
+ triggerClient?: string | null
+): Promise<Record<string, string>> {
+ const claimsHeader = JSON.stringify({
+ sub: environment.id,
+ pub: true,
+ });
+
+ if (triggerClient === "browser") {
+ const claims = {
+ sub: environment.id,
+ pub: true,
+ scopes: [`read:batch:${batch.id}`],
+ };
+
+ const jwt = await generateJWT({
+ secretKey: environment.apiKey,
+ payload: claims,
+ expirationTime: "1h",
+ });
+
+ return {
+ "x-trigger-jwt-claims": claimsHeader,
+ "x-trigger-jwt": jwt,
+ };
+ }
+
+ return {
+ "x-trigger-jwt-claims": claimsHeader,
+ };
+}
+
+export { action, loader };
diff --git a/apps/webapp/app/routes/engine.v1.dev.config.ts b/apps/webapp/app/routes/engine.v1.dev.config.ts
new file mode 100644
index 0000000000..501b6e80de
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.config.ts
@@ -0,0 +1,32 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { DevConfigResponseBody } from "@trigger.dev/core/v3/schemas";
+import { z } from "zod";
+import { env } from "~/env.server";
+import { logger } from "~/services/logger.server";
+import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const loader = createLoaderApiRoute(
+ {
+ findResource: async () => 1,
+ headers: z.object({
+ "x-forwarded-for": z.string().optional(),
+ }),
+ },
+ async ({ authentication }): Promise<TypedResponse<DevConfigResponseBody>> => {
+ logger.debug("Get dev settings", { environmentId: authentication.environment.id });
+
+ try {
+ return json({
+ environmentId: authentication.environment.id,
+ dequeueIntervalWithRun: env.DEV_DEQUEUE_INTERVAL_WITH_RUN,
+ dequeueIntervalWithoutRun: env.DEV_DEQUEUE_INTERVAL_WITHOUT_RUN,
+ });
+ } catch (error) {
+ logger.error("Failed to get dev settings", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.dev.dequeue.ts b/apps/webapp/app/routes/engine.v1.dev.dequeue.ts
new file mode 100644
index 0000000000..0f10c3dee1
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.dequeue.ts
@@ -0,0 +1,92 @@
+import { json } from "@remix-run/server-runtime";
+import { DequeuedMessage, DevDequeueRequestBody, MachineResources } from "@trigger.dev/core/v3";
+import { BackgroundWorkerId } from "@trigger.dev/core/v3/apps";
+import { env } from "~/env.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ body: DevDequeueRequestBody,
+ maxContentLength: 1024 * 10, // 10KB
+ method: "POST",
+ },
+ async ({ authentication, body }) => {
+ //we won't return more runs than this in one API call
+ let maxDequeueCount = env.DEV_DEQUEUE_MAX_RUNS_PER_PULL;
+
+ //we can't use more than the max resources
+ const availableResources = body.maxResources ?? {
+ cpu: 8,
+ memory: 16,
+ };
+
+ let dequeuedMessages: DequeuedMessage[] = [];
+
+ //we need to check the current worker, because a run might have been locked to it
+ const workers = body.oldWorkers.concat(body.currentWorker);
+
+ //first we want to clear out old runs
+ for (const worker of workers) {
+ //dequeue
+ const latestResult = await engine.dequeueFromBackgroundWorkerMasterQueue({
+ consumerId: authentication.environment.id,
+ //specific version
+ backgroundWorkerId: BackgroundWorkerId.toId(worker),
+ maxRunCount: maxDequeueCount,
+ maxResources: availableResources,
+ });
+
+ //add runs to the array
+ dequeuedMessages.push(...latestResult);
+
+ //update availableResources
+ const consumedResources = latestResult.reduce(
+ (acc, r) => {
+ return {
+ cpu: acc.cpu + r.run.machine.cpu,
+ memory: acc.memory + r.run.machine.memory,
+ };
+ },
+ { cpu: 0, memory: 0 }
+ );
+ updateAvailableResources(availableResources, consumedResources);
+
+ //update maxDequeueCount
+ maxDequeueCount -= latestResult.length;
+
+ //if we have no resources left, we exit the loop
+ if (!hasAvailableResources(availableResources)) break;
+ //we've already dequeued the max number of runs
+ if (maxDequeueCount <= 0) break;
+ }
+
+ //dequeue from the current version if we still have space
+ if (hasAvailableResources(availableResources) && maxDequeueCount > 0) {
+ const latestResult = await engine.dequeueFromEnvironmentMasterQueue({
+ consumerId: authentication.environment.id,
+ //current dev version (no specific version specified)
+ environmentId: authentication.environment.id,
+ maxRunCount: maxDequeueCount,
+ maxResources: availableResources,
+ });
+ dequeuedMessages.push(...latestResult);
+ }
+
+ return json({ dequeuedMessages }, { status: 200 });
+ }
+);
+
+function updateAvailableResources(
+ availableResources: MachineResources,
+ resources: MachineResources
+) {
+ availableResources.cpu -= resources.cpu;
+ availableResources.memory -= resources.memory;
+}
+
+function hasAvailableResources(availableResources: MachineResources) {
+ return availableResources.cpu > 0 && availableResources.memory > 0;
+}
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.dev.presence.ts b/apps/webapp/app/routes/engine.v1.dev.presence.ts
new file mode 100644
index 0000000000..9e3add8c78
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.presence.ts
@@ -0,0 +1,81 @@
+import { json } from "@remix-run/server-runtime";
+import { Redis } from "ioredis";
+import { env } from "~/env.server";
+import { DevPresenceStream } from "~/presenters/v3/DevPresenceStream.server";
+import { authenticateApiRequestWithFailure } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
+import { createSSELoader } from "~/utils/sse";
+
+const redis = new Redis({
+ port: env.RUN_ENGINE_DEV_PRESENCE_REDIS_PORT ?? undefined,
+ host: env.RUN_ENGINE_DEV_PRESENCE_REDIS_HOST ?? undefined,
+ username: env.RUN_ENGINE_DEV_PRESENCE_REDIS_USERNAME ?? undefined,
+ password: env.RUN_ENGINE_DEV_PRESENCE_REDIS_PASSWORD ?? undefined,
+ enableAutoPipelining: true,
+ ...(env.RUN_ENGINE_DEV_PRESENCE_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+});
+
+export const loader = createSSELoader({
+ timeout: env.DEV_PRESENCE_TTL_MS,
+ interval: env.DEV_PRESENCE_POLL_INTERVAL_MS,
+ debug: true,
+ handler: async ({ id, controller, debug, request }) => {
+ const authentication = await authenticateApiRequestWithFailure(request);
+
+ if (!authentication.ok) {
+ throw json({ error: "Invalid or Missing API key" }, { status: 401 });
+ }
+
+ const environmentId = authentication.environment.id;
+
+ const presenceKey = DevPresenceStream.getPresenceKey(environmentId);
+ const presenceChannel = DevPresenceStream.getPresenceChannel(environmentId);
+
+ return {
+ beforeStream: async () => {
+ logger.debug("Start dev presence SSE session", {
+ environmentId,
+ presenceKey,
+ presenceChannel,
+ });
+ },
+ initStream: async ({ send }) => {
+ //todo set a string instead, with the expire on the same call
+ //won't need multi
+
+ // Set initial presence with more context
+ await redis.setex(presenceKey, env.DEV_PRESENCE_TTL_MS / 1000, Date.now().toString());
+
+ // Publish presence update
+ await redis.publish(
+ presenceChannel,
+ JSON.stringify({
+ type: "connected",
+ environmentId,
+ timestamp: Date.now(),
+ })
+ );
+
+ send({ event: "start", data: `Started ${id}` });
+ },
+ iterator: async ({ send, date }) => {
+ await redis.setex(presenceKey, env.DEV_PRESENCE_TTL_MS / 1000, date.toISOString());
+
+ send({ event: "time", data: new Date().toISOString() });
+ },
+ cleanup: async () => {
+ await redis.del(presenceKey);
+
+ // Publish disconnect event
+ await redis.publish(
+ presenceChannel,
+ JSON.stringify({
+ type: "disconnected",
+ environmentId,
+ timestamp: Date.now(),
+ })
+ );
+ },
+ };
+ },
+});
diff --git a/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.logs.debug.ts b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.logs.debug.ts
new file mode 100644
index 0000000000..d804f49b79
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.logs.debug.ts
@@ -0,0 +1,74 @@
+import { TypedResponse } from "@remix-run/server-runtime";
+import { assertExhaustive } from "@trigger.dev/core";
+import { RunId } from "@trigger.dev/core/v3/apps";
+import {
+ WorkerApiDebugLogBody,
+ WorkerApiRunAttemptStartResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { recordRunDebugLog } from "~/v3/eventRepository.server";
+
+const { action } = createActionApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ }),
+ body: WorkerApiDebugLogBody,
+ method: "POST",
+ },
+ async ({
+ authentication,
+ body,
+ params,
+ }): Promise<TypedResponse<WorkerApiRunAttemptStartResponseBody>> => {
+ const { runFriendlyId } = params;
+
+ try {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ friendlyId: params.runFriendlyId,
+ runtimeEnvironmentId: authentication.environment.id,
+ },
+ });
+
+ if (!run) {
+ throw new Response("You don't have permissions for this run", { status: 401 });
+ }
+
+ const eventResult = await recordRunDebugLog(
+ RunId.fromFriendlyId(runFriendlyId),
+ body.message,
+ {
+ attributes: {
+ properties: body.properties,
+ },
+ startTime: body.time,
+ }
+ );
+
+ if (eventResult.success) {
+ return new Response(null, { status: 204 });
+ }
+
+ switch (eventResult.code) {
+ case "FAILED_TO_RECORD_EVENT":
+ return new Response(null, { status: 400 }); // send a 400 to prevent retries
+ case "RUN_NOT_FOUND":
+ return new Response(null, { status: 404 });
+ default:
+ return assertExhaustive(eventResult.code);
+ }
+ } catch (error) {
+ logger.error("Failed to record dev log", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.complete.ts b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.complete.ts
new file mode 100644
index 0000000000..13752e36fa
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.complete.ts
@@ -0,0 +1,64 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { assertExhaustive } from "@trigger.dev/core";
+import { RunId, SnapshotId } from "@trigger.dev/core/v3/apps";
+import {
+ WorkerApiDebugLogBody,
+ WorkerApiRunAttemptCompleteRequestBody,
+ WorkerApiRunAttemptCompleteResponseBody,
+ WorkerApiRunAttemptStartResponseBody,
+ WorkloadHeartbeatResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { recordRunDebugLog } from "~/v3/eventRepository.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ body: WorkerApiRunAttemptCompleteRequestBody,
+ method: "POST",
+ },
+ async ({
+ authentication,
+ body,
+ params,
+ }): Promise<TypedResponse<WorkerApiRunAttemptCompleteResponseBody>> => {
+ const { completion } = body;
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ try {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ friendlyId: params.runFriendlyId,
+ runtimeEnvironmentId: authentication.environment.id,
+ },
+ });
+
+ if (!run) {
+ throw new Response("You don't have permissions for this run", { status: 401 });
+ }
+
+ const completeResult = await engine.completeRunAttempt({
+ runId: RunId.toId(runFriendlyId),
+ snapshotId: SnapshotId.toId(snapshotFriendlyId),
+ completion,
+ });
+
+ return json({ result: completeResult });
+ } catch (error) {
+ logger.error("Failed to complete dev attempt", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.start.ts b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.start.ts
new file mode 100644
index 0000000000..25787eee05
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.start.ts
@@ -0,0 +1,103 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { MachinePreset } from "@trigger.dev/core/v3";
+import { RunId, SnapshotId } from "@trigger.dev/core/v3/apps";
+import {
+ WorkerApiRunAttemptStartRequestBody,
+ WorkerApiRunAttemptStartResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { RuntimeEnvironment } from "@trigger.dev/database";
+import { defaultMachine } from "@trigger.dev/platform/v3";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { generateJWTTokenForEnvironment } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
+import {
+ createActionApiRoute,
+ createActionWorkerApiRoute,
+} from "~/services/routeBuilders/apiBuilder.server";
+import { resolveVariablesForEnvironment } from "~/v3/environmentVariables/environmentVariablesRepository.server";
+import { machinePresetFromName } from "~/v3/machinePresets.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ body: WorkerApiRunAttemptStartRequestBody,
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ method: "POST",
+ },
+ async ({
+ authentication,
+ body,
+ params,
+ }): Promise<TypedResponse<WorkerApiRunAttemptStartResponseBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ try {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ friendlyId: params.runFriendlyId,
+ runtimeEnvironmentId: authentication.environment.id,
+ },
+ });
+
+ if (!run) {
+ throw new Response("You don't have permissions for this run", { status: 401 });
+ }
+
+ const engineResult = await engine.startRunAttempt({
+ runId: RunId.toId(runFriendlyId),
+ snapshotId: SnapshotId.toId(snapshotFriendlyId),
+ });
+
+ const defaultMachinePreset = machinePresetFromName(defaultMachine);
+
+ const envVars = await getEnvVars(
+ authentication.environment,
+ engineResult.run.id,
+ engineResult.execution.machine ?? defaultMachinePreset
+ );
+
+ return json({
+ ...engineResult,
+ envVars,
+ });
+ } catch (error) {
+ logger.error("Failed to record dev log", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
+
+async function getEnvVars(
+ environment: RuntimeEnvironment,
+ runId: string,
+ machinePreset: MachinePreset
+): Promise<Record<string, string>> {
+ const variables = await resolveVariablesForEnvironment(environment);
+
+ const jwt = await generateJWTTokenForEnvironment(environment, {
+ run_id: runId,
+ machine_preset: machinePreset.name,
+ });
+
+ variables.push(
+ ...[
+ { key: "TRIGGER_JWT", value: jwt },
+ { key: "TRIGGER_RUN_ID", value: runId },
+ { key: "TRIGGER_MACHINE_PRESET", value: machinePreset.name },
+ ]
+ );
+
+ return variables.reduce((acc: Record<string, string>, curr) => {
+ acc[curr.key] = curr.value;
+ return acc;
+ }, {});
+}
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.heartbeat.ts b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.heartbeat.ts
new file mode 100644
index 0000000000..2bf88ff1ac
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.heartbeat.ts
@@ -0,0 +1,59 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { assertExhaustive } from "@trigger.dev/core";
+import { RunId, SnapshotId } from "@trigger.dev/core/v3/apps";
+import {
+ WorkerApiDebugLogBody,
+ WorkerApiRunAttemptStartResponseBody,
+ WorkloadHeartbeatResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { recordRunDebugLog } from "~/v3/eventRepository.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ method: "POST",
+ },
+ async ({
+ authentication,
+ body,
+ params,
+ }): Promise<TypedResponse<WorkloadHeartbeatResponseBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ try {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ friendlyId: params.runFriendlyId,
+ runtimeEnvironmentId: authentication.environment.id,
+ },
+ });
+
+ if (!run) {
+ throw new Response("You don't have permissions for this run", { status: 401 });
+ }
+
+ await engine.heartbeatRun({
+ runId: RunId.toId(runFriendlyId),
+ snapshotId: SnapshotId.toId(snapshotFriendlyId),
+ });
+
+ return json({ ok: true });
+ } catch (error) {
+ logger.error("Failed to heartbeat dev run", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.latest.ts b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.latest.ts
new file mode 100644
index 0000000000..91664888a0
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.dev.runs.$runFriendlyId.snapshots.latest.ts
@@ -0,0 +1,55 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { RunId } from "@trigger.dev/core/v3/apps";
+import { WorkerApiRunLatestSnapshotResponseBody } from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { engine } from "~/v3/runEngine.server";
+
+export const loader = createLoaderApiRoute(
+ {
+ findResource: async () => 1,
+ params: z.object({
+ runFriendlyId: z.string(),
+ }),
+ },
+ async ({
+ authentication,
+ params,
+ }): Promise<TypedResponse<WorkerApiRunLatestSnapshotResponseBody>> => {
+ logger.debug("dev: Get latest snapshot", {
+ environmentId: authentication.environment.id,
+ params,
+ });
+
+ try {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ friendlyId: params.runFriendlyId,
+ runtimeEnvironmentId: authentication.environment.id,
+ },
+ });
+
+ if (!run) {
+ throw new Response("You don't have permissions for this run", { status: 401 });
+ }
+
+ const executionData = await engine.getRunExecutionData({
+ runId: RunId.toId(params.runFriendlyId),
+ });
+
+ if (!executionData) {
+ throw new Error("Failed to retrieve latest snapshot");
+ }
+
+ return json({ execution: executionData });
+ } catch (error) {
+ logger.error("Failed to get latest snapshot", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts
new file mode 100644
index 0000000000..04ef8c5aae
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts
@@ -0,0 +1,75 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WaitForDurationRequestBody, WaitForDurationResponseBody } from "@trigger.dev/core/v3";
+import { RunId } from "@trigger.dev/core/v3/apps";
+
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ body: WaitForDurationRequestBody,
+ params: z.object({
+ runFriendlyId: z.string(),
+ }),
+ method: "POST",
+ },
+ async ({ authentication, body, params }): Promise<TypedResponse<WaitForDurationResponseBody>> => {
+ const { runFriendlyId } = params;
+ const runId = RunId.toId(runFriendlyId);
+
+ try {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ id: runId,
+ runtimeEnvironmentId: authentication.environment.id,
+ },
+ });
+
+ if (!run) {
+ throw new Response("You don't have permissions for this run", { status: 401 });
+ }
+
+ const idempotencyKeyExpiresAt = body.idempotencyKeyTTL
+ ? resolveIdempotencyKeyTTL(body.idempotencyKeyTTL)
+ : undefined;
+
+ const { waitpoint } = await engine.createDateTimeWaitpoint({
+ projectId: authentication.environment.project.id,
+ environmentId: authentication.environment.id,
+ completedAfter: body.date,
+ idempotencyKey: body.idempotencyKey,
+ idempotencyKeyExpiresAt: idempotencyKeyExpiresAt,
+ });
+
+ const waitResult = await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: waitpoint.id,
+ environmentId: authentication.environment.id,
+ projectId: authentication.environment.project.id,
+ organizationId: authentication.environment.organization.id,
+ releaseConcurrency: {
+ releaseQueue: true,
+ },
+ });
+
+ return json({
+ waitUntil: body.date,
+ waitpoint: {
+ id: waitpoint.friendlyId,
+ },
+ });
+ } catch (error) {
+ logger.error("Failed to wait for duration dev", {
+ environmentId: authentication.environment.id,
+ error,
+ });
+ throw error;
+ }
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts
new file mode 100644
index 0000000000..e34e25529f
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts
@@ -0,0 +1,58 @@
+import { json } from "@remix-run/server-runtime";
+import { WaitForWaitpointTokenResponseBody } from "@trigger.dev/core/v3";
+import { RunId, WaitpointId } from "@trigger.dev/core/v3/apps";
+import { z } from "zod";
+import { $replica } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { engine } from "~/v3/runEngine.server";
+
+const { action } = createActionApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ waitpointFriendlyId: z.string(),
+ }),
+ maxContentLength: 1024 * 10, // 10KB
+ method: "POST",
+ },
+ async ({ authentication, body, params }) => {
+ // Resume tokens are actually just waitpoints
+ const waitpointId = WaitpointId.toId(params.waitpointFriendlyId);
+ const runId = RunId.toId(params.runFriendlyId);
+
+ try {
+ //check permissions
+ const waitpoint = await $replica.waitpoint.findFirst({
+ where: {
+ id: waitpointId,
+ environmentId: authentication.environment.id,
+ },
+ });
+
+ if (!waitpoint) {
+ throw json({ error: "Waitpoint not found" }, { status: 404 });
+ }
+
+ const result = await engine.blockRunWithWaitpoint({
+ runId,
+ waitpoints: [waitpointId],
+ environmentId: authentication.environment.id,
+ projectId: authentication.environment.project.id,
+ organizationId: authentication.environment.organization.id,
+ });
+
+ return json(
+ {
+ success: true,
+ },
+ { status: 200 }
+ );
+ } catch (error) {
+ logger.error("Failed to wait for waitpoint", { runId, waitpointId, error });
+ throw json({ error: "Failed to wait for waitpoint token" }, { status: 500 });
+ }
+ }
+);
+
+export { action };
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.connect.ts b/apps/webapp/app/routes/engine.v1.worker-actions.connect.ts
new file mode 100644
index 0000000000..247984d454
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.connect.ts
@@ -0,0 +1,19 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WorkerApiConnectRequestBody, WorkerApiConnectResponseBody } from "@trigger.dev/core/v3/workers";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ body: WorkerApiConnectRequestBody,
+ },
+ async ({ authenticatedWorker, body }): Promise<TypedResponse<WorkerApiConnectResponseBody>> => {
+ await authenticatedWorker.connect(body.metadata);
+ return json({
+ ok: true,
+ workerGroup: {
+ type: authenticatedWorker.type,
+ name: authenticatedWorker.name,
+ },
+ });
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts
new file mode 100644
index 0000000000..fbfa194662
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts
@@ -0,0 +1,68 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/apps";
+import { WorkerApiDequeueResponseBody } from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { $replica, prisma } from "~/db.server";
+import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const loader = createLoaderWorkerApiRoute(
+ {
+ params: z.object({
+ deploymentFriendlyId: z.string(),
+ }),
+ searchParams: z.object({
+ maxRunCount: z.coerce.number().optional(),
+ }),
+ },
+ async ({
+ authenticatedWorker,
+ params,
+ searchParams,
+ }): Promise<TypedResponse<WorkerApiDequeueResponseBody>> => {
+ const deployment = await $replica.workerDeployment.findUnique({
+ where: {
+ friendlyId: params.deploymentFriendlyId,
+ },
+ include: {
+ worker: true,
+ },
+ });
+
+ if (!deployment) {
+ throw new Error("Deployment not found");
+ }
+
+ if (!deployment.worker) {
+ throw new Error("Worker not found");
+ }
+
+ const dequeuedMessages = (await isCurrentDeployment(deployment.id, deployment.environmentId))
+ ? await authenticatedWorker.dequeueFromEnvironment(
+ deployment.worker.id,
+ deployment.environmentId
+ )
+ : await authenticatedWorker.dequeueFromVersion(
+ deployment.worker.id,
+ searchParams.maxRunCount
+ );
+
+ return json(dequeuedMessages);
+ }
+);
+
+async function isCurrentDeployment(deploymentId: string, environmentId: string): Promise<boolean> {
+ const promotion = await prisma.workerDeploymentPromotion.findUnique({
+ where: {
+ environmentId_label: {
+ environmentId,
+ label: CURRENT_DEPLOYMENT_LABEL,
+ },
+ },
+ });
+
+ if (!promotion) {
+ return false;
+ }
+
+ return promotion.deploymentId === deploymentId;
+}
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts
new file mode 100644
index 0000000000..4dd0798ad3
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts
@@ -0,0 +1,16 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WorkerApiDequeueRequestBody, WorkerApiDequeueResponseBody } from "@trigger.dev/core/v3/workers";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ body: WorkerApiDequeueRequestBody,
+ },
+ async ({ authenticatedWorker, body }): Promise<TypedResponse<WorkerApiDequeueResponseBody>> => {
+ return json(
+ await authenticatedWorker.dequeue({
+ maxResources: body.maxResources,
+ })
+ );
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.heartbeat.ts b/apps/webapp/app/routes/engine.v1.worker-actions.heartbeat.ts
new file mode 100644
index 0000000000..111cab6011
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.heartbeat.ts
@@ -0,0 +1,13 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WorkerApiHeartbeatResponseBody, WorkerApiHeartbeatRequestBody } from "@trigger.dev/core/v3/workers";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ body: WorkerApiHeartbeatRequestBody,
+ },
+ async ({ authenticatedWorker }): Promise<TypedResponse<WorkerApiHeartbeatResponseBody>> => {
+ await authenticatedWorker.heartbeatWorkerInstance();
+ return json({ ok: true });
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.logs.debug.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.logs.debug.ts
new file mode 100644
index 0000000000..2ec26906b7
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.logs.debug.ts
@@ -0,0 +1,38 @@
+import { assertExhaustive } from "@trigger.dev/core";
+import { RunId } from "@trigger.dev/core/v3/apps";
+import { WorkerApiDebugLogBody } from "@trigger.dev/core/v3/runEngineWorker";
+import { z } from "zod";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { recordRunDebugLog } from "~/v3/eventRepository.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ }),
+ body: WorkerApiDebugLogBody,
+ },
+ async ({ body, params }): Promise<Response> => {
+ const { runFriendlyId } = params;
+
+ const eventResult = await recordRunDebugLog(RunId.fromFriendlyId(runFriendlyId), body.message, {
+ attributes: {
+ properties: body.properties,
+ },
+ startTime: body.time,
+ });
+
+ if (eventResult.success) {
+ return new Response(null, { status: 204 });
+ }
+
+ switch (eventResult.code) {
+ case "FAILED_TO_RECORD_EVENT":
+ return new Response(null, { status: 400 }); // send a 400 to prevent retries
+ case "RUN_NOT_FOUND":
+ return new Response(null, { status: 404 });
+ default:
+ return assertExhaustive(eventResult.code);
+ }
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.complete.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.complete.ts
new file mode 100644
index 0000000000..81f53280e1
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.complete.ts
@@ -0,0 +1,33 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import {
+ WorkerApiRunAttemptCompleteRequestBody,
+ WorkerApiRunAttemptCompleteResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ body: WorkerApiRunAttemptCompleteRequestBody,
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ },
+ async ({
+ authenticatedWorker,
+ body,
+ params,
+ }): Promise<TypedResponse<WorkerApiRunAttemptCompleteResponseBody>> => {
+ const { completion } = body;
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ const completeResult = await authenticatedWorker.completeRunAttempt({
+ runFriendlyId,
+ snapshotFriendlyId,
+ completion,
+ });
+
+ return json({ result: completeResult });
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.start.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.start.ts
new file mode 100644
index 0000000000..f7a8e874ec
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.attempts.start.ts
@@ -0,0 +1,32 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import {
+ WorkerApiRunAttemptStartRequestBody,
+ WorkerApiRunAttemptStartResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ body: WorkerApiRunAttemptStartRequestBody,
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ },
+ async ({
+ authenticatedWorker,
+ body,
+ params,
+ }): Promise<TypedResponse<WorkerApiRunAttemptStartResponseBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ const runExecutionData = await authenticatedWorker.startRunAttempt({
+ runFriendlyId,
+ snapshotFriendlyId,
+ isWarmStart: body.isWarmStart,
+ });
+
+ return json(runExecutionData);
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts
new file mode 100644
index 0000000000..b35f26a10e
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.continue.ts
@@ -0,0 +1,34 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WorkerApiContinueRunExecutionRequestBody } from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { logger } from "~/services/logger.server";
+import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const loader = createLoaderWorkerApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ },
+ async ({
+ authenticatedWorker,
+ params,
+ }): Promise<TypedResponse<WorkerApiContinueRunExecutionRequestBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ logger.debug("Continuing run execution", { runFriendlyId, snapshotFriendlyId });
+
+ try {
+ const continuationResult = await authenticatedWorker.continueRunExecution({
+ runFriendlyId,
+ snapshotFriendlyId,
+ });
+
+ return json(continuationResult);
+ } catch (error) {
+      logger.error("Failed to continue run execution", { runFriendlyId, snapshotFriendlyId, error });
+ throw error;
+ }
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.heartbeat.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.heartbeat.ts
new file mode 100644
index 0000000000..0942bcde8b
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.heartbeat.ts
@@ -0,0 +1,26 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WorkloadHeartbeatResponseBody } from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ },
+ async ({
+ authenticatedWorker,
+ params,
+  }): Promise<TypedResponse<WorkloadHeartbeatResponseBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ await authenticatedWorker.heartbeatRun({
+ runFriendlyId,
+ snapshotFriendlyId,
+ });
+
+ return json({ ok: true });
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.restore.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.restore.ts
new file mode 100644
index 0000000000..8f892288e6
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.restore.ts
@@ -0,0 +1,51 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import {
+ WorkerApiSuspendRunRequestBody,
+ WorkerApiSuspendRunResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { logger } from "~/services/logger.server";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ body: WorkerApiSuspendRunRequestBody,
+ },
+ async ({
+ authenticatedWorker,
+ params,
+ body,
+  }): Promise<TypedResponse<WorkerApiSuspendRunResponseBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ logger.debug("Restoring run", { runFriendlyId, snapshotFriendlyId, body });
+
+ if (!body.success) {
+ // TODO: we could create a debug span here
+ logger.error("Failed to restore run", {
+ runFriendlyId,
+ snapshotFriendlyId,
+ error: body.error,
+ });
+
+ return json({ ok: true });
+ }
+
+ try {
+ await authenticatedWorker.createCheckpoint({
+ runFriendlyId,
+ snapshotFriendlyId,
+ checkpoint: body.checkpoint,
+ });
+
+ return json({ ok: true });
+ } catch (error) {
+ logger.error("Failed to restore run", { runFriendlyId, snapshotFriendlyId, error });
+ throw error;
+ }
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.suspend.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.suspend.ts
new file mode 100644
index 0000000000..323c98405f
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.$snapshotFriendlyId.suspend.ts
@@ -0,0 +1,51 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import {
+ WorkerApiSuspendRunRequestBody,
+ WorkerApiSuspendRunResponseBody,
+} from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { logger } from "~/services/logger.server";
+import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const action = createActionWorkerApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ snapshotFriendlyId: z.string(),
+ }),
+ body: WorkerApiSuspendRunRequestBody,
+ },
+ async ({
+ authenticatedWorker,
+ params,
+ body,
+  }): Promise<TypedResponse<WorkerApiSuspendRunResponseBody>> => {
+ const { runFriendlyId, snapshotFriendlyId } = params;
+
+ logger.debug("Suspending run", { runFriendlyId, snapshotFriendlyId, body });
+
+ if (!body.success) {
+ // TODO: we could create a debug span here
+ logger.error("Failed to suspend run", {
+ runFriendlyId,
+ snapshotFriendlyId,
+ error: body.error,
+ });
+
+ return json({ ok: true });
+ }
+
+ try {
+ await authenticatedWorker.createCheckpoint({
+ runFriendlyId,
+ snapshotFriendlyId,
+ checkpoint: body.checkpoint,
+ });
+
+ return json({ ok: true });
+ } catch (error) {
+ logger.error("Failed to suspend run", { runFriendlyId, snapshotFriendlyId, error });
+ throw error;
+ }
+ }
+);
diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.latest.ts b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.latest.ts
new file mode 100644
index 0000000000..d480acf01e
--- /dev/null
+++ b/apps/webapp/app/routes/engine.v1.worker-actions.runs.$runFriendlyId.snapshots.latest.ts
@@ -0,0 +1,28 @@
+import { json, TypedResponse } from "@remix-run/server-runtime";
+import { WorkerApiRunLatestSnapshotResponseBody } from "@trigger.dev/core/v3/workers";
+import { z } from "zod";
+import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+
+export const loader = createLoaderWorkerApiRoute(
+ {
+ params: z.object({
+ runFriendlyId: z.string(),
+ }),
+ },
+ async ({
+ authenticatedWorker,
+ params,
+  }): Promise<TypedResponse<WorkerApiRunLatestSnapshotResponseBody>> => {
+ const { runFriendlyId } = params;
+
+ const executionData = await authenticatedWorker.getLatestSnapshot({
+ runFriendlyId,
+ });
+
+ if (!executionData) {
+ throw new Error("Failed to retrieve latest snapshot");
+ }
+
+ return json({ execution: executionData });
+ }
+);
diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.waitpoints.$waitpointFriendlyId.complete/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.waitpoints.$waitpointFriendlyId.complete/route.tsx
new file mode 100644
index 0000000000..56ebef0726
--- /dev/null
+++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.waitpoints.$waitpointFriendlyId.complete/route.tsx
@@ -0,0 +1,410 @@
+import { env } from "~/env.server";
+import { parse } from "@conform-to/zod";
+import { Form, useLocation, useNavigation, useSubmit } from "@remix-run/react";
+import { ActionFunctionArgs, json } from "@remix-run/server-runtime";
+import {
+ conditionallyExportPacket,
+ IOPacket,
+ stringifyIO,
+ timeoutError,
+} from "@trigger.dev/core/v3";
+import { WaitpointId } from "@trigger.dev/core/v3/apps";
+import { Waitpoint } from "@trigger.dev/database";
+import { useCallback, useRef } from "react";
+import { z } from "zod";
+import { AnimatedHourglassIcon } from "~/assets/icons/AnimatedHourglassIcon";
+import { JSONEditor } from "~/components/code/JSONEditor";
+import { Button } from "~/components/primitives/Buttons";
+import { DateTime } from "~/components/primitives/DateTime";
+import { Paragraph } from "~/components/primitives/Paragraph";
+import { InfoIconTooltip } from "~/components/primitives/Tooltip";
+import { LiveCountdown } from "~/components/runs/v3/LiveTimer";
+import { $replica } from "~/db.server";
+import { useOrganization } from "~/hooks/useOrganizations";
+import { useProject } from "~/hooks/useProject";
+import { redirectWithErrorMessage, redirectWithSuccessMessage } from "~/models/message.server";
+import { logger } from "~/services/logger.server";
+import { requireUserId } from "~/services/session.server";
+import { ProjectParamSchema, v3RunsPath } from "~/utils/pathBuilder";
+import { engine } from "~/v3/runEngine.server";
+
+const CompleteWaitpointFormData = z.discriminatedUnion("type", [
+ z.object({
+ type: z.literal("MANUAL"),
+ payload: z.string().optional(),
+ isTimeout: z.string().optional(),
+ successRedirect: z.string(),
+ failureRedirect: z.string(),
+ }),
+ z.object({
+ type: z.literal("DATETIME"),
+ successRedirect: z.string(),
+ failureRedirect: z.string(),
+ }),
+]);
+
+const Params = ProjectParamSchema.extend({
+ waitpointFriendlyId: z.string(),
+});
+
+export const action = async ({ request, params }: ActionFunctionArgs) => {
+ const userId = await requireUserId(request);
+ const { organizationSlug, projectParam, waitpointFriendlyId } = Params.parse(params);
+
+ const formData = await request.formData();
+ const submission = parse(formData, { schema: CompleteWaitpointFormData });
+
+ if (!submission.value) {
+ return json(submission);
+ }
+
+ try {
+ //first check that the user has access to the project
+ const project = await $replica.project.findUnique({
+ where: {
+ slug: projectParam,
+ organization: {
+ members: {
+ some: {
+ userId,
+ },
+ },
+ },
+ },
+ select: { id: true },
+ });
+
+ if (!project) {
+ throw new Error("Project not found");
+ }
+
+ const waitpointId = WaitpointId.toId(waitpointFriendlyId);
+
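+    // Confirm the waitpoint belongs to this project before completing it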
+ const waitpoint = await $replica.waitpoint.findFirst({
+ select: {
+ projectId: true,
+ },
+ where: {
+ id: waitpointId,
+ },
+ });
+
+ if (waitpoint?.projectId !== project.id) {
+ return redirectWithErrorMessage(
+ submission.value.failureRedirect,
+ request,
+ "No waitpoint found"
+ );
+ }
+
+ switch (submission.value.type) {
+ case "DATETIME": {
+ const result = await engine.completeWaitpoint({
+ id: waitpointId,
+ });
+
+ return redirectWithSuccessMessage(
+ submission.value.successRedirect,
+ request,
+ "Waitpoint skipped"
+ );
+ }
+ case "MANUAL": {
+ if (submission.value.isTimeout) {
+ try {
+ const result = await engine.completeWaitpoint({
+ id: waitpointId,
+ output: {
+ type: "application/json",
+ value: JSON.stringify(timeoutError(new Date())),
+ isError: true,
+ },
+ });
+
+ return redirectWithSuccessMessage(
+ submission.value.successRedirect,
+ request,
+ "Waitpoint timed out"
+ );
+ } catch (e) {
+ return redirectWithErrorMessage(
+ submission.value.failureRedirect,
+ request,
+              "Failed to time out the waitpoint"
+ );
+ }
+ }
+
+ try {
+ if (
+ submission.value.payload &&
+ submission.value.payload.length > env.TASK_PAYLOAD_MAXIMUM_SIZE
+ ) {
+ return redirectWithErrorMessage(
+ submission.value.failureRedirect,
+ request,
+ "Payload is too large"
+ );
+ }
+
+ const data = submission.value.payload ? JSON.parse(submission.value.payload) : {};
+ const stringifiedData = await stringifyIO(data);
+ const finalData = await conditionallyExportPacket(
+ stringifiedData,
+ `${waitpointId}/waitpoint/token`
+ );
+
+ const result = await engine.completeWaitpoint({
+ id: waitpointId,
+ output: finalData.data
+ ? { type: finalData.dataType, value: finalData.data, isError: false }
+ : undefined,
+ });
+
+ return redirectWithSuccessMessage(
+ submission.value.successRedirect,
+ request,
+ "Waitpoint completed"
+ );
+ } catch (e) {
+ return redirectWithErrorMessage(
+ submission.value.failureRedirect,
+ request,
+ "Invalid payload, must be valid JSON"
+ );
+ }
+ }
+ }
+ } catch (error: any) {
+ logger.error("Failed to complete waitpoint", error);
+
+ const errorMessage = `Something went wrong. Please try again.`;
+ return redirectWithErrorMessage(
+ v3RunsPath({ slug: organizationSlug }, { slug: projectParam }),
+ request,
+ errorMessage
+ );
+ }
+};
+
+type FormWaitpoint = Pick<Waitpoint, "friendlyId" | "type" | "completedAfter">;
+
+export function CompleteWaitpointForm({ waitpoint }: { waitpoint: FormWaitpoint }) {
+ const navigation = useNavigation();
+ const submit = useSubmit();
+ const isLoading = navigation.state !== "idle";
+ const organization = useOrganization();
+ const project = useProject();
+
+ return (
+
+ {waitpoint.type === "DATETIME" ? (
+ waitpoint.completedAfter ? (
+
+ ) : (
+ <>Waitpoint doesn't have a complete date>
+ )
+ ) : (
+
+ )}
+
+ );
+}
+
+function CompleteDateTimeWaitpointForm({
+ waitpoint,
+}: {
+ waitpoint: { friendlyId: string; completedAfter: Date };
+}) {
+ const location = useLocation();
+ const navigation = useNavigation();
+ const isLoading = navigation.state !== "idle";
+ const organization = useOrganization();
+ const project = useProject();
+
+ const timeToComplete = waitpoint.completedAfter.getTime() - Date.now();
+ if (timeToComplete < 0) {
+ return (
+
+ );
+ }
+
+ return (
+
+ );
+}
+
+function CompleteManualWaitpointForm({ waitpoint }: { waitpoint: { friendlyId: string } }) {
+ const location = useLocation();
+ const navigation = useNavigation();
+ const submit = useSubmit();
+ const isLoading = navigation.state !== "idle";
+ const organization = useOrganization();
+ const project = useProject();
+ const currentJson = useRef("{\n\n}");
+ const formAction = `/resources/orgs/${organization.slug}/projects/${project.slug}/waitpoints/${waitpoint.friendlyId}/complete`;
+
+ const submitForm = useCallback(
+    (e: React.FormEvent<HTMLFormElement>) => {
+ const formData = new FormData(e.currentTarget);
+      const data: Record<string, string> = {
+ type: formData.get("type") as string,
+ failureRedirect: formData.get("failureRedirect") as string,
+ successRedirect: formData.get("successRedirect") as string,
+ };
+
+ data.payload = currentJson.current;
+
+ submit(data, {
+ action: formAction,
+ method: "post",
+ });
+ e.preventDefault();
+ },
+ [currentJson]
+ );
+
+ return (
+ <>
+
+ >
+ );
+}
+
+export function ForceTimeout({ waitpoint }: { waitpoint: { friendlyId: string } }) {
+ const location = useLocation();
+ const navigation = useNavigation();
+ const isLoading = navigation.state !== "idle";
+ const organization = useOrganization();
+ const project = useProject();
+ const formAction = `/resources/orgs/${organization.slug}/projects/${project.slug}/waitpoints/${waitpoint.friendlyId}/complete`;
+
+ return (
+
+ );
+}
diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx
index 04afd2c37c..805cf89b96 100644
--- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx
+++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.v3.$projectParam.runs.$runParam.spans.$spanParam/route.tsx
@@ -59,6 +59,7 @@ import { requireUserId } from "~/services/session.server";
import { cn } from "~/utils/cn";
import { formatCurrencyAccurate } from "~/utils/numberFormatter";
import {
+ docsPath,
v3BatchPath,
v3RunDownloadLogsPath,
v3RunPath,
@@ -67,6 +68,11 @@ import {
v3SchedulePath,
v3SpanParamsSchema,
} from "~/utils/pathBuilder";
+import { SpanLink } from "~/v3/eventRepository.server";
+import {
+ CompleteWaitpointForm,
+ ForceTimeout,
+} from "../resources.orgs.$organizationSlug.projects.$projectParam.waitpoints.$waitpointFriendlyId.complete/route";
export const loader = async ({ request, params }: LoaderFunctionArgs) => {
const userId = await requireUserId(request);
@@ -170,7 +176,6 @@ function SpanBody({
runParam?: string;
closePanel?: () => void;
}) {
- const isAdmin = useHasAdminAccess();
const organization = useOrganization();
const project = useProject();
const { value, replace } = useSearchParams();
@@ -190,7 +195,7 @@ function SpanBody({
className="h-4 min-h-4 w-4 min-w-4"
/>
-
+
{runParam && closePanel && (
@@ -226,179 +231,67 @@ function SpanBody({
-
-
- {tab === "detail" ? (
-
-
-
- Status
-
-
-
-
-
- Task
-
-
- {span.taskSlug}
-
- }
- content={`Filter runs by ${span.taskSlug}`}
- />
-
-
- {span.idempotencyKey && (
-
- Idempotency key
- {span.idempotencyKey}
-
- )}
-
- Version
-
- {span.workerVersion ? (
- span.workerVersion
- ) : (
-
- Never started
-
-
- )}
-
-
-
-
- ) : (
-
- {span.level === "TRACE" ? (
- <>
-
-
-
-
+ {tab === "detail" ? (
+
+
+
+ Status
+
+
- >
- ) : (
-
-
}
- variant="dot-solid"
+
+
+
+ Task
+
+
+ {span.taskSlug}
+
+ }
+ content={`Filter runs by ${span.taskSlug}`}
/>
-
- )}
-
+
+
+ {span.idempotencyKey && (
- Message
- {span.message}
+ Idempotency key
+ {span.idempotencyKey}
- {span.triggeredRuns.length > 0 && (
-
-
-
Triggered runs
-
-
-
- Run #
- Task
- Version
- Created at
-
-
-
- {span.triggeredRuns.map((run) => {
- const path = v3RunSpanPath(
- organization,
- project,
- { friendlyId: run.friendlyId },
- { spanId: run.spanId }
- );
- return (
-
-
- {run.number}
-
-
- {run.taskIdentifier}
-
-
- {run.lockedToVersion?.version ?? "–"}
-
-
-
-
-
- );
- })}
-
-
-
-
- )}
-
- {span.events.length > 0 &&
}
- {span.properties !== undefined && (
-
)}
-
- )}
-
+
+ Version
+
+ {span.workerVersion ? (
+ span.workerVersion
+ ) : (
+
+ Never started
+
+
+ )}
+
+
+
+
+ ) : (
+
+ )}
);
@@ -417,6 +310,7 @@ function RunBody({
}) {
const organization = useOrganization();
const project = useProject();
+ const isAdmin = useHasAdminAccess();
const { value, replace } = useSearchParams();
const tab = value("tab");
@@ -427,12 +321,15 @@ function RunBody({
- {run.taskIdentifier}
+
+ {run.taskIdentifier}
+ {run.isCached ? " (cached)" : null}
+
{runParam && closePanel && (
@@ -602,6 +499,22 @@ function RunBody({
)}
+
+ Idempotency
+
+ {run.idempotencyKey ? run.idempotencyKey : "–"}
+ {run.idempotencyKey && (
+
+ Expires:{" "}
+ {run.idempotencyKeyExpiresAt ? (
+
+ ) : (
+ "–"
+ )}
+
+ )}
+
+
Version
@@ -634,6 +547,22 @@ function RunBody({
)}
+
+ Engine version
+ {run.engine}
+
+ {isAdmin && (
+ <>
+
+ Primary master queue
+ {run.masterQueue}
+
+
+ Secondary master queue
+ {run.secondaryMasterQueue}
+
+ >
+ )}
Test run
@@ -771,12 +700,13 @@ function RunBody({
+ {run.error && }
+
{run.payload !== undefined && (
)}
- {run.error !== undefined ? (
-
- ) : run.output !== undefined ? (
+
+ {run.error === undefined && run.output !== undefined ? (
) : null}
@@ -787,12 +717,17 @@ function RunBody({
{run.friendlyId !== runParam && (
- Focus on run
+ {run.isCached ? "Jump to original run" : "Focus on run"}
)}
@@ -925,3 +860,216 @@ function PacketDisplay({
}
}
}
+
+function SpanEntity({ span }: { span: Span }) {
+ const isAdmin = useHasAdminAccess();
+
+ const organization = useOrganization();
+ const project = useProject();
+
+ if (!span.entity) {
+ //normal span
+ return (
+
+ {span.level === "TRACE" ? (
+ <>
+
+
+
+
+ >
+ ) : (
+
+ }
+ variant="dot-solid"
+ />
+
+ )}
+
+
+ Message
+ {span.message}
+
+ {span.triggeredRuns.length > 0 && (
+
+
+
Triggered runs
+
+
+
+ Run #
+ Task
+ Version
+ Created at
+
+
+
+ {span.triggeredRuns.map((run) => {
+ const path = v3RunSpanPath(
+ organization,
+ project,
+ { friendlyId: run.friendlyId },
+ { spanId: run.spanId }
+ );
+ return (
+
+
+ {run.number}
+
+
+ {run.taskIdentifier}
+
+
+ {run.lockedToVersion?.version ?? "–"}
+
+
+
+
+
+ );
+ })}
+
+
+
+
+ )}
+
+ {span.events.length > 0 &&
}
+ {span.properties !== undefined ? (
+
+ ) : null}
+
+ );
+ }
+
+ switch (span.entity.type) {
+ case "waitpoint": {
+ return (
+
+
+
+
Waitpoint
+
+ A waitpoint pauses your code from continuing until the conditions are met.{" "}
+ View docs.
+
+
+
+
+ Status
+
+
+
+
+
+ ID
+
+ {span.entity.object.friendlyId}
+
+
+
+ Idempotency key
+
+
+
+ {span.entity.object.userProvidedIdempotencyKey
+ ? span.entity.object.idempotencyKey
+ : "–"}
+
+
+ {span.entity.object.idempotencyKeyExpiresAt ? (
+ <>
+ TTL:
+ >
+ ) : null}
+
+
+
+
+ {span.entity.object.type === "MANUAL" && (
+ <>
+
+ Timeout at
+
+
+ {span.entity.object.completedAfter ? (
+
+ ) : (
+ "–"
+ )}
+ {span.entity.object.status === "PENDING" && (
+
+ )}
+
+
+
+ >
+ )}
+ {span.entity.object.status === "PENDING" ? null : span.entity.object.isTimeout ? (
+ <>>
+ ) : span.entity.object.output ? (
+
+ ) : span.entity.object.completedAfter ? (
+
+ Completed at
+
+
+
+
+ ) : (
+ "Completed with no output"
+ )}
+
+
+ {span.entity.object.status === "PENDING" && (
+
+
+
+ )}
+
+ );
+ }
+ default: {
+ return No span for {span.entity.type};
+ }
+ }
+}
diff --git a/apps/webapp/app/services/engineRateLimit.server.ts b/apps/webapp/app/services/engineRateLimit.server.ts
new file mode 100644
index 0000000000..f34ed0ef44
--- /dev/null
+++ b/apps/webapp/app/services/engineRateLimit.server.ts
@@ -0,0 +1,34 @@
+import { env } from "~/env.server";
+import { authenticateAuthorizationHeader } from "./apiAuth.server";
+import { authorizationRateLimitMiddleware } from "./authorizationRateLimitMiddleware.server";
+import { Duration } from "./rateLimiter.server";
+
+export const engineRateLimiter = authorizationRateLimitMiddleware({
+ redis: {
+ port: env.RATE_LIMIT_REDIS_PORT,
+ host: env.RATE_LIMIT_REDIS_HOST,
+ username: env.RATE_LIMIT_REDIS_USERNAME,
+ password: env.RATE_LIMIT_REDIS_PASSWORD,
+ tlsDisabled: env.RATE_LIMIT_REDIS_TLS_DISABLED === "true",
+ clusterMode: env.RATE_LIMIT_REDIS_CLUSTER_MODE_ENABLED === "1",
+ },
+ keyPrefix: "engine",
+ defaultLimiter: {
+ type: "tokenBucket",
+ refillRate: env.RUN_ENGINE_RATE_LIMIT_REFILL_RATE,
+ interval: env.RUN_ENGINE_RATE_LIMIT_REFILL_INTERVAL as Duration,
+ maxTokens: env.RUN_ENGINE_RATE_LIMIT_MAX,
+ },
+ limiterCache: {
+ fresh: 60_000 * 10, // Data is fresh for 10 minutes
+    stale: 60_000 * 20, // Data is stale after 20 minutes
+ },
+ pathMatchers: [/^\/engine/],
+  pathWhiteList: [],
+ log: {
+ rejections: env.RUN_ENGINE_RATE_LIMIT_REJECTION_LOGS_ENABLED === "1",
+ requests: env.RUN_ENGINE_RATE_LIMIT_REQUEST_LOGS_ENABLED === "1",
+ limiter: env.RUN_ENGINE_RATE_LIMIT_LIMITER_LOGS_ENABLED === "1",
+ },
+});
diff --git a/apps/webapp/app/services/personalAccessToken.server.ts b/apps/webapp/app/services/personalAccessToken.server.ts
index c349b08ef6..f48582ec9b 100644
--- a/apps/webapp/app/services/personalAccessToken.server.ts
+++ b/apps/webapp/app/services/personalAccessToken.server.ts
@@ -129,6 +129,7 @@ export async function authenticatePersonalAccessToken(
token: string
): Promise {
if (!token.startsWith(tokenPrefix)) {
+ logger.warn(`PAT doesn't start with ${tokenPrefix}`);
return;
}
diff --git a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts
index e505994034..d28945218f 100644
--- a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts
+++ b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts
@@ -17,11 +17,17 @@ import {
PersonalAccessTokenAuthenticationResult,
} from "../personalAccessToken.server";
import { safeJsonParse } from "~/utils/json";
+import {
+ AuthenticatedWorkerInstance,
+ WorkerGroupTokenService,
+} from "~/v3/services/worker/workerGroupTokenService.server";
+
+type AnyZodSchema = z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>;
type ApiKeyRouteBuilderOptions<
- TParamsSchema extends z.AnyZodObject | undefined = undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined = undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined,
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
TResource = never
> = {
params?: TParamsSchema;
@@ -30,7 +36,9 @@ type ApiKeyRouteBuilderOptions<
allowJWT?: boolean;
corsStrategy?: "all" | "none";
findResource: (
-    params: TParamsSchema extends z.AnyZodObject ? z.infer<TParamsSchema> : undefined,
+    params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+      ? z.infer<TParamsSchema>
+      : undefined,
authentication: ApiAuthenticationResultSuccess
) => Promise;
shouldRetryNotFound?: boolean;
@@ -38,36 +46,48 @@ type ApiKeyRouteBuilderOptions<
action: AuthorizationAction;
resource: (
resource: NonNullable,
-      params: TParamsSchema extends z.AnyZodObject ? z.infer<TParamsSchema> : undefined,
-      searchParams: TSearchParamsSchema extends z.AnyZodObject
+      params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+        ? z.infer<TParamsSchema>
+        : undefined,
+      searchParams: TSearchParamsSchema extends
+        | z.ZodFirstPartySchemaTypes
+        | z.ZodDiscriminatedUnion<any, any>
         ? z.infer<TSearchParamsSchema>
         : undefined,
-      headers: THeadersSchema extends z.AnyZodObject ? z.infer<THeadersSchema> : undefined
+      headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+        ? z.infer<THeadersSchema>
+        : undefined
) => AuthorizationResources;
superScopes?: string[];
};
};
type ApiKeyHandlerFunction<
- TParamsSchema extends z.AnyZodObject | undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined,
+ TParamsSchema extends AnyZodSchema | undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
TResource = never
> = (args: {
-  params: TParamsSchema extends z.AnyZodObject ? z.infer<TParamsSchema> : undefined;
-  searchParams: TSearchParamsSchema extends z.AnyZodObject
+  params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TParamsSchema>
+    : undefined;
+  searchParams: TSearchParamsSchema extends
+    | z.ZodFirstPartySchemaTypes
+    | z.ZodDiscriminatedUnion<any, any>
     ? z.infer<TSearchParamsSchema>
     : undefined;
-  headers: THeadersSchema extends z.AnyZodObject ? z.infer<THeadersSchema> : undefined;
+  headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<THeadersSchema>
+    : undefined;
authentication: ApiAuthenticationResultSuccess;
request: Request;
resource: NonNullable;
}) => Promise;
export function createLoaderApiRoute<
- TParamsSchema extends z.AnyZodObject | undefined = undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined = undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined,
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
TResource = never
>(
   options: ApiKeyRouteBuilderOptions<TParamsSchema, TSearchParamsSchema, THeadersSchema, TResource>,
@@ -226,6 +246,19 @@ export function createLoaderApiRoute<
if (error instanceof Response) {
return await wrapResponse(request, error, corsStrategy !== "none");
}
+
+ logger.error("Error in loader", {
+ error:
+ error instanceof Error
+ ? {
+ name: error.name,
+ message: error.message,
+ stack: error.stack,
+ }
+ : String(error),
+ url: request.url,
+ });
+
return await wrapResponse(
request,
json({ error: "Internal Server Error" }, { status: 500 }),
@@ -241,9 +274,9 @@ export function createLoaderApiRoute<
}
type PATRouteBuilderOptions<
- TParamsSchema extends z.AnyZodObject | undefined = undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined = undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined
> = {
params?: TParamsSchema;
searchParams?: TSearchParamsSchema;
@@ -252,23 +285,29 @@ type PATRouteBuilderOptions<
};
type PATHandlerFunction<
- TParamsSchema extends z.AnyZodObject | undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined
+ TParamsSchema extends AnyZodSchema | undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined
> = (args: {
-  params: TParamsSchema extends z.AnyZodObject ? z.infer<TParamsSchema> : undefined;
-  searchParams: TSearchParamsSchema extends z.AnyZodObject
+  params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TParamsSchema>
+    : undefined;
+  searchParams: TSearchParamsSchema extends
+    | z.ZodFirstPartySchemaTypes
+    | z.ZodDiscriminatedUnion<any, any>
     ? z.infer<TSearchParamsSchema>
     : undefined;
-  headers: THeadersSchema extends z.AnyZodObject ? z.infer<THeadersSchema> : undefined;
+  headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<THeadersSchema>
+    : undefined;
authentication: PersonalAccessTokenAuthenticationResult;
request: Request;
}) => Promise;
export function createLoaderPATApiRoute<
- TParamsSchema extends z.AnyZodObject | undefined = undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined = undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined
>(
   options: PATRouteBuilderOptions<TParamsSchema, TSearchParamsSchema, THeadersSchema>,
   handler: PATHandlerFunction<TParamsSchema, TSearchParamsSchema, THeadersSchema>
@@ -374,10 +413,10 @@ export function createLoaderPATApiRoute<
}
type ApiKeyActionRouteBuilderOptions<
- TParamsSchema extends z.AnyZodObject | undefined = undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined = undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined,
- TBodySchema extends z.AnyZodObject | undefined = undefined
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
+ TBodySchema extends AnyZodSchema | undefined = undefined
> = {
params?: TParamsSchema;
searchParams?: TSearchParamsSchema;
@@ -388,12 +427,20 @@ type ApiKeyActionRouteBuilderOptions<
authorization?: {
action: AuthorizationAction;
resource: (
-      params: TParamsSchema extends z.AnyZodObject ? z.infer<TParamsSchema> : undefined,
-      searchParams: TSearchParamsSchema extends z.AnyZodObject
+      params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+        ? z.infer<TParamsSchema>
+        : undefined,
+      searchParams: TSearchParamsSchema extends
+        | z.ZodFirstPartySchemaTypes
+        | z.ZodDiscriminatedUnion<any, any>
         ? z.infer<TSearchParamsSchema>
         : undefined,
-      headers: THeadersSchema extends z.AnyZodObject ? z.infer<THeadersSchema> : undefined,
-      body: TBodySchema extends z.AnyZodObject ? z.infer<TBodySchema> : undefined
+      headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+        ? z.infer<THeadersSchema>
+        : undefined,
+      body: TBodySchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+        ? z.infer<TBodySchema>
+        : undefined
) => AuthorizationResources;
superScopes?: string[];
};
@@ -402,26 +449,34 @@ type ApiKeyActionRouteBuilderOptions<
};
type ApiKeyActionHandlerFunction<
- TParamsSchema extends z.AnyZodObject | undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined,
- TBodySchema extends z.AnyZodObject | undefined = undefined
+ TParamsSchema extends AnyZodSchema | undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
+ TBodySchema extends AnyZodSchema | undefined = undefined
> = (args: {
-  params: TParamsSchema extends z.AnyZodObject ? z.infer<TParamsSchema> : undefined;
-  searchParams: TSearchParamsSchema extends z.AnyZodObject
+  params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TParamsSchema>
+    : undefined;
+  searchParams: TSearchParamsSchema extends
+    | z.ZodFirstPartySchemaTypes
+    | z.ZodDiscriminatedUnion<any, any>
     ? z.infer<TSearchParamsSchema>
     : undefined;
-  headers: THeadersSchema extends z.AnyZodObject ? z.infer<THeadersSchema> : undefined;
-  body: TBodySchema extends z.AnyZodObject ? z.infer<TBodySchema> : undefined;
+  headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<THeadersSchema>
+    : undefined;
+  body: TBodySchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TBodySchema>
+    : undefined;
authentication: ApiAuthenticationResultSuccess;
request: Request;
}) => Promise;
export function createActionApiRoute<
- TParamsSchema extends z.AnyZodObject | undefined = undefined,
- TSearchParamsSchema extends z.AnyZodObject | undefined = undefined,
- THeadersSchema extends z.AnyZodObject | undefined = undefined,
- TBodySchema extends z.AnyZodObject | undefined = undefined
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
+ TBodySchema extends AnyZodSchema | undefined = undefined
>(
options: ApiKeyActionRouteBuilderOptions<
TParamsSchema,
@@ -667,3 +722,289 @@ async function wrapResponse(
})
: response;
}
+
+type WorkerLoaderRouteBuilderOptions<
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined
+> = {
+ params?: TParamsSchema;
+ searchParams?: TSearchParamsSchema;
+ headers?: THeadersSchema;
+};
+
+type WorkerLoaderHandlerFunction<
+ TParamsSchema extends AnyZodSchema | undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined
+> = (args: {
+  params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TParamsSchema>
+    : undefined;
+  searchParams: TSearchParamsSchema extends
+    | z.ZodFirstPartySchemaTypes
+    | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TSearchParamsSchema>
+    : undefined;
+  authenticatedWorker: AuthenticatedWorkerInstance;
+  request: Request;
+  headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<THeadersSchema>
+    : undefined;
+}) => Promise;
+
+export function createLoaderWorkerApiRoute<
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined
+>(
+  options: WorkerLoaderRouteBuilderOptions<TParamsSchema, TSearchParamsSchema, THeadersSchema>,
+  handler: WorkerLoaderHandlerFunction<TParamsSchema, TSearchParamsSchema, THeadersSchema>
+) {
+ return async function loader({ request, params }: LoaderFunctionArgs) {
+ const {
+ params: paramsSchema,
+ searchParams: searchParamsSchema,
+ headers: headersSchema,
+ } = options;
+
+ try {
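+      // Worker requests authenticate with a worker group token via WorkerGroupTokenService, not an API key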
+ const service = new WorkerGroupTokenService();
+ const authenticationResult = await service.authenticate(request);
+
+ if (!authenticationResult) {
+ return json({ error: "Invalid or missing worker token" }, { status: 401 });
+ }
+
+ let parsedParams: any = undefined;
+ if (paramsSchema) {
+ const parsed = paramsSchema.safeParse(params);
+ if (!parsed.success) {
+ return json(
+ { error: "Params Error", details: fromZodError(parsed.error).details },
+ { status: 400 }
+ );
+ }
+ parsedParams = parsed.data;
+ }
+
+ let parsedSearchParams: any = undefined;
+ if (searchParamsSchema) {
+ const searchParams = Object.fromEntries(new URL(request.url).searchParams);
+ const parsed = searchParamsSchema.safeParse(searchParams);
+ if (!parsed.success) {
+ return json(
+ { error: "Query Error", details: fromZodError(parsed.error).details },
+ { status: 400 }
+ );
+ }
+ parsedSearchParams = parsed.data;
+ }
+
+ let parsedHeaders: any = undefined;
+ if (headersSchema) {
+ const rawHeaders = Object.fromEntries(request.headers);
+ const headers = headersSchema.safeParse(rawHeaders);
+ if (!headers.success) {
+ return json(
+ { error: "Headers Error", details: fromZodError(headers.error).details },
+ { status: 400 }
+ );
+ }
+ parsedHeaders = headers.data;
+ }
+
+ const result = await handler({
+ params: parsedParams,
+ searchParams: parsedSearchParams,
+ authenticatedWorker: authenticationResult,
+ request,
+ headers: parsedHeaders,
+ });
+ return result;
+ } catch (error) {
+ console.error("Error in API route:", error);
+ if (error instanceof Response) {
+ return error;
+ }
+
+ logger.error("Error in loader", {
+ error:
+ error instanceof Error
+ ? {
+ name: error.name,
+ message: error.message,
+ stack: error.stack,
+ }
+ : String(error),
+ url: request.url,
+ });
+
+ return json({ error: "Internal Server Error" }, { status: 500 });
+ }
+ };
+}
+
+type WorkerActionRouteBuilderOptions<
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
+ TBodySchema extends AnyZodSchema | undefined = undefined
+> = {
+ params?: TParamsSchema;
+ searchParams?: TSearchParamsSchema;
+ headers?: THeadersSchema;
+ body?: TBodySchema;
+ method?: "POST" | "PUT" | "DELETE" | "PATCH";
+};
+
+type WorkerActionHandlerFunction<
+ TParamsSchema extends AnyZodSchema | undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
+ TBodySchema extends AnyZodSchema | undefined = undefined
+> = (args: {
+  params: TParamsSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TParamsSchema>
+    : undefined;
+  searchParams: TSearchParamsSchema extends
+    | z.ZodFirstPartySchemaTypes
+    | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TSearchParamsSchema>
+    : undefined;
+  authenticatedWorker: AuthenticatedWorkerInstance;
+  request: Request;
+  headers: THeadersSchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<THeadersSchema>
+    : undefined;
+  body: TBodySchema extends z.ZodFirstPartySchemaTypes | z.ZodDiscriminatedUnion<any, any>
+    ? z.infer<TBodySchema>
+    : undefined;
+}) => Promise;
+
+export function createActionWorkerApiRoute<
+ TParamsSchema extends AnyZodSchema | undefined = undefined,
+ TSearchParamsSchema extends AnyZodSchema | undefined = undefined,
+ THeadersSchema extends AnyZodSchema | undefined = undefined,
+ TBodySchema extends AnyZodSchema | undefined = undefined
+>(
+ options: WorkerActionRouteBuilderOptions<
+ TParamsSchema,
+ TSearchParamsSchema,
+ THeadersSchema,
+ TBodySchema
+ >,
+ handler: WorkerActionHandlerFunction<
+ TParamsSchema,
+ TSearchParamsSchema,
+ THeadersSchema,
+ TBodySchema
+ >
+) {
+ return async function action({ request, params }: ActionFunctionArgs) {
+ if (options.method) {
+ if (request.method.toUpperCase() !== options.method) {
+ return json(
+ { error: "Method not allowed" },
+ { status: 405, headers: { Allow: options.method } }
+ );
+ }
+ }
+
+ const {
+ params: paramsSchema,
+ searchParams: searchParamsSchema,
+ body: bodySchema,
+ headers: headersSchema,
+ } = options;
+
+ try {
+ const service = new WorkerGroupTokenService();
+ const authenticationResult = await service.authenticate(request);
+
+ if (!authenticationResult) {
+ return json({ error: "Invalid or missing worker token" }, { status: 401 });
+ }
+
+ let parsedParams: any = undefined;
+ if (paramsSchema) {
+ const parsed = paramsSchema.safeParse(params);
+ if (!parsed.success) {
+ return json(
+ { error: "Params Error", details: fromZodError(parsed.error).details },
+ { status: 400 }
+ );
+ }
+ parsedParams = parsed.data;
+ }
+
+ let parsedSearchParams: any = undefined;
+ if (searchParamsSchema) {
+ const searchParams = Object.fromEntries(new URL(request.url).searchParams);
+ const parsed = searchParamsSchema.safeParse(searchParams);
+ if (!parsed.success) {
+ return json(
+ { error: "Query Error", details: fromZodError(parsed.error).details },
+ { status: 400 }
+ );
+ }
+ parsedSearchParams = parsed.data;
+ }
+
+ let parsedHeaders: any = undefined;
+ if (headersSchema) {
+ const rawHeaders = Object.fromEntries(request.headers);
+ const headers = headersSchema.safeParse(rawHeaders);
+ if (!headers.success) {
+ return json(
+ { error: "Headers Error", details: fromZodError(headers.error).details },
+ { status: 400 }
+ );
+ }
+ parsedHeaders = headers.data;
+ }
+
+ let parsedBody: any = undefined;
+ if (bodySchema) {
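+        // Clone the request so the original body stream remains readable for the handler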
+ const body = await request.clone().json();
+ const parsed = bodySchema.safeParse(body);
+ if (!parsed.success) {
+ return json(
+ { error: "Body Error", details: fromZodError(parsed.error).details },
+ { status: 400 }
+ );
+ }
+ parsedBody = parsed.data;
+ }
+
+ const result = await handler({
+ params: parsedParams,
+ searchParams: parsedSearchParams,
+ authenticatedWorker: authenticationResult,
+ request,
+ body: parsedBody,
+ headers: parsedHeaders,
+ });
+ return result;
+ } catch (error) {
+ console.error("Error in API route:", error);
+ if (error instanceof Response) {
+ return error;
+ }
+
+ logger.error("Error in action", {
+ error:
+ error instanceof Error
+ ? {
+ name: error.name,
+ message: error.message,
+ stack: error.stack,
+ }
+ : String(error),
+ url: request.url,
+ });
+
+ return json({ error: "Internal Server Error" }, { status: 500 });
+ }
+ };
+}
diff --git a/apps/webapp/app/services/worker.server.ts b/apps/webapp/app/services/worker.server.ts
index 3c76907aa7..7d42dd7453 100644
--- a/apps/webapp/app/services/worker.server.ts
+++ b/apps/webapp/app/services/worker.server.ts
@@ -54,6 +54,10 @@ import {
} from "~/v3/services/cancelDevSessionRuns.server";
import { logger } from "./logger.server";
import { BatchProcessingOptions, BatchTriggerV3Service } from "~/v3/services/batchTriggerV3.server";
+import {
+ BatchProcessingOptions as BatchProcessingOptionsV4,
+ BatchTriggerV4Service,
+} from "~/v3/services/batchTriggerV4.server";
const workerCatalog = {
indexEndpoint: z.object({
@@ -194,6 +198,7 @@ const workerCatalog = {
}),
"v3.cancelDevSessionRuns": CancelDevSessionRunsServiceOptions,
"v3.processBatchTaskRun": BatchProcessingOptions,
+ "v3.processBatchTaskRunV3": BatchProcessingOptionsV4,
};
const executionWorkerCatalog = {
@@ -717,6 +722,15 @@ function getWorkerQueue() {
handler: async (payload, job) => {
const service = new BatchTriggerV3Service(payload.strategy);
+ await service.processBatchTaskRun(payload);
+ },
+ },
+ "v3.processBatchTaskRunV3": {
+ priority: 0,
+ maxAttempts: 5,
+ handler: async (payload, job) => {
+ const service = new BatchTriggerV4Service(payload.strategy);
+
await service.processBatchTaskRun(payload);
},
},
diff --git a/apps/webapp/app/utils/delays.ts b/apps/webapp/app/utils/delays.ts
index 6faa67c677..eaa296e11b 100644
--- a/apps/webapp/app/utils/delays.ts
+++ b/apps/webapp/app/utils/delays.ts
@@ -1,3 +1,5 @@
+import { parseNaturalLanguageDuration } from "@trigger.dev/core/v3/apps";
+
export const calculateDurationInMs = (options: {
seconds?: number;
minutes?: number;
@@ -11,3 +13,30 @@ export const calculateDurationInMs = (options: {
(options?.days ?? 0) * 24 * 60 * 60 * 1000
);
};
+
+export async function parseDelay(value?: string | Date): Promise<Date | undefined> {
+ if (!value) {
+ return;
+ }
+
+ if (value instanceof Date) {
+ return value;
+ }
+
+ try {
+ const date = new Date(value);
+
+ // Check if the date is valid
+ if (isNaN(date.getTime())) {
+ return parseNaturalLanguageDuration(value);
+ }
+
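+  // Delays that resolve to a time in the past are treated as no delay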
+ if (date.getTime() <= Date.now()) {
+ return;
+ }
+
+ return date;
+ } catch (error) {
+ return parseNaturalLanguageDuration(value);
+ }
+}
diff --git a/apps/webapp/app/utils/sse.server.ts b/apps/webapp/app/utils/sse.server.ts
index fced1fbaf4..56e7b191af 100644
--- a/apps/webapp/app/utils/sse.server.ts
+++ b/apps/webapp/app/utils/sse.server.ts
@@ -22,8 +22,8 @@ export function sse({ request, pingInterval = 1000, updateInterval = 348, run }:
return new Response("SSE disabled", { status: 200 });
}
- let pinger: NodeJS.Timer | undefined = undefined;
- let updater: NodeJS.Timer | undefined = undefined;
+ let pinger: NodeJS.Timeout | undefined = undefined;
+ let updater: NodeJS.Timeout | undefined = undefined;
let timeout: NodeJS.Timeout | undefined = undefined;
const abort = () => {
diff --git a/apps/webapp/app/utils/sse.ts b/apps/webapp/app/utils/sse.ts
new file mode 100644
index 0000000000..ef6135a866
--- /dev/null
+++ b/apps/webapp/app/utils/sse.ts
@@ -0,0 +1,183 @@
+import { LoaderFunctionArgs } from "@remix-run/node";
+import { eventStream } from "remix-utils/sse/server";
+import { setInterval } from "timers/promises";
+
+type SendFunction = Parameters<Parameters<typeof eventStream>[1]>[0];
+
+type HandlerParams = {
+ send: SendFunction;
+};
+
+type SSEHandlers = {
+ /** Return false to stop */
+  beforeStream?: () => Promise<boolean | void> | boolean | void;
+  /** Return false to stop */
+  initStream?: (params: HandlerParams) => Promise<boolean | void> | boolean | void;
+  /** Return false to stop */
+  iterator?: (params: HandlerParams & { date: Date }) => Promise<boolean | void> | boolean | void;
+ cleanup?: () => void;
+};
+
+type SSEContext = {
+ id: string;
+ request: Request;
+ controller: AbortController;
+ debug: (message: string) => void;
+};
+
+type SSEOptions = {
+ timeout: number;
+ interval?: number;
+ debug?: boolean;
+  handler: (context: SSEContext) => Promise<SSEHandlers>;
+};
+
+// This is used to track the open connections, for debugging
+const connections: Set<string> = new Set();
+
+export function createSSELoader(options: SSEOptions) {
+ const { timeout, interval = 500, debug = false, handler } = options;
+
+ return async function loader({ request }: LoaderFunctionArgs) {
+ const id = request.headers.get("x-request-id") || Math.random().toString(36).slice(2, 8);
+
+ const internalController = new AbortController();
+ const timeoutSignal = AbortSignal.timeout(timeout);
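+    // The stream stops when any of these fire: the client request aborting, the overall timeout, or a handler-requested abort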
+
+ const log = (message: string) => {
+ if (debug) console.log(`SSE: [${id}] ${message} (${connections.size} open connections)`);
+ };
+
+ const context: SSEContext = {
+ id,
+ request,
+ controller: internalController,
+ debug: log,
+ };
+
+ const handlers = await handler(context).catch((error) => {
+ if (error instanceof Response) {
+ throw error;
+ }
+
+ throw new Response("Internal Server Error", { status: 500 });
+ });
+
+ const combinedSignal = AbortSignal.any([
+ request.signal,
+ timeoutSignal,
+ internalController.signal,
+ ]);
+
+ log("Start");
+
+ request.signal.addEventListener(
+ "abort",
+ () => {
+ log(`request signal aborted`);
+ internalController.abort("Request aborted");
+ },
+ { once: true, signal: internalController.signal }
+ );
+
+ combinedSignal.addEventListener(
+ "abort",
+ () => {
+ log(`combinedSignal aborted: ${combinedSignal.reason}`);
+ },
+ { once: true, signal: internalController.signal }
+ );
+
+ timeoutSignal.addEventListener(
+ "abort",
+ () => {
+ if (internalController.signal.aborted) return;
+ log(`timeoutSignal aborted: ${timeoutSignal.reason}`);
+ internalController.abort("Timeout");
+ },
+ { once: true, signal: internalController.signal }
+ );
+
+ if (handlers.beforeStream) {
+ const shouldContinue = await handlers.beforeStream();
+ if (shouldContinue === false) {
+ log("beforeStream returned false, so we'll exit before creating the stream");
+ internalController.abort("Init requested stop");
+ return;
+ }
+ }
+
+ return eventStream(combinedSignal, function setup(send) {
+ connections.add(id);
+
+ async function run() {
+ try {
+ log("Initializing");
+ if (handlers.initStream) {
+ const shouldContinue = await handlers.initStream({ send });
+ if (shouldContinue === false) {
+ log("initStream returned false, so we'll stop the stream");
+ internalController.abort("Init requested stop");
+ return;
+ }
+ }
+
+ log("Starting interval");
+ for await (const _ of setInterval(interval, null, {
+ signal: combinedSignal,
+ })) {
+ log("PING");
+
+ const date = new Date();
+
+ if (handlers.iterator) {
+ try {
+ const shouldContinue = await handlers.iterator({ date, send });
+ if (shouldContinue === false) {
+                  log("iterator returned false, so we'll stop the stream");
+ internalController.abort("Iterator requested stop");
+ break;
+ }
+ } catch (error) {
+ log("iterator threw an error, aborting stream");
+ // Immediately abort to trigger cleanup
+ internalController.abort(error instanceof Error ? error.message : "Iterator error");
+ // No need to re-throw as we're handling it by aborting
+ return; // Exit the run function immediately
+ }
+ }
+ }
+ log("iterator finished all iterations");
+ } catch (error) {
+ if (error instanceof Error) {
+ if (error.name !== "AbortError") {
+ console.error(error);
+ }
+ }
+ } finally {
+ log("iterator finished");
+ }
+ }
+
+ run();
+
+ return () => {
+ connections.delete(id);
+
+ log("Cleanup called");
+ if (handlers.cleanup) {
+ try {
+ handlers.cleanup();
+ } catch (error) {
+ log(
+ `Error in cleanup handler: ${
+ error instanceof Error ? error.message : "Unknown error"
+ }`
+ );
+ console.error("SSE Cleanup Error:", error);
+ }
+ }
+ };
+ });
+ };
+}
diff --git a/apps/webapp/app/utils/string.ts b/apps/webapp/app/utils/string.ts
new file mode 100644
index 0000000000..d2dfdbb1d6
--- /dev/null
+++ b/apps/webapp/app/utils/string.ts
@@ -0,0 +1,3 @@
+export function capitalizeWord(word: string) {
+ return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
+}
diff --git a/apps/webapp/app/utils/taskEvent.ts b/apps/webapp/app/utils/taskEvent.ts
index 9e3ebfb90c..7c5be6a34e 100644
--- a/apps/webapp/app/utils/taskEvent.ts
+++ b/apps/webapp/app/utils/taskEvent.ts
@@ -65,7 +65,6 @@ export function prepareTrace(events: TaskEvent[]): TraceSummary | undefined {
id: event.spanId,
parentId: event.parentId ?? undefined,
runId: event.runId,
- idempotencyKey: event.idempotencyKey,
data: {
message: event.message,
style: event.style,
@@ -77,8 +76,9 @@ export function prepareTrace(events: TaskEvent[]): TraceSummary | undefined {
level: event.level,
events: event.events,
environmentType: event.environmentType,
+ isDebug: event.isDebug,
},
- };
+ } satisfies SpanSummary;
spansBySpanId.set(event.spanId, span);
diff --git a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts
index bf8658209c..a6de96b9c9 100644
--- a/apps/webapp/app/v3/authenticatedSocketConnection.server.ts
+++ b/apps/webapp/app/v3/authenticatedSocketConnection.server.ts
@@ -1,4 +1,8 @@
-import { clientWebsocketMessages, serverWebsocketMessages } from "@trigger.dev/core/v3";
+import {
+ clientWebsocketMessages,
+ HeartbeatService,
+ serverWebsocketMessages,
+} from "@trigger.dev/core/v3";
import { ZodMessageHandler, ZodMessageSender } from "@trigger.dev/core/v3/zodMessageHandler";
import { Evt } from "evt";
import { randomUUID } from "node:crypto";
@@ -7,7 +11,6 @@ import { WebSocket } from "ws";
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
import { logger } from "~/services/logger.server";
import { DevQueueConsumer } from "./marqs/devQueueConsumer.server";
-import { HeartbeatService } from "./services/heartbeatService.server";
export class AuthenticatedSocketConnection {
public id: string;
@@ -86,6 +89,7 @@ export class AuthenticatedSocketConnection {
ws.ping();
},
+ intervalMs: 45_000,
});
this._pingService.start();
diff --git a/apps/webapp/app/v3/engineVersion.server.ts b/apps/webapp/app/v3/engineVersion.server.ts
new file mode 100644
index 0000000000..1b514fc398
--- /dev/null
+++ b/apps/webapp/app/v3/engineVersion.server.ts
@@ -0,0 +1,66 @@
+import { RunEngineVersion, RuntimeEnvironmentType } from "@trigger.dev/database";
+import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import {
+ findCurrentWorkerDeploymentWithoutTasks,
+ findCurrentWorkerFromEnvironment,
+} from "./models/workerDeployment.server";
+import { $replica } from "~/db.server";
+
+export async function determineEngineVersion({
+ environment,
+ workerVersion,
+ engineVersion: version,
+}: {
+ environment: AuthenticatedEnvironment;
+ workerVersion?: string;
+ engineVersion?: RunEngineVersion;
+}): Promise<RunEngineVersion> {
+ if (version) {
+ return version;
+ }
+
+ // If the project is V1, then none of the background workers are running V2
+ if (environment.project.engine === RunEngineVersion.V1) {
+ return "V1";
+ }
+
+ /**
+ * The project has V2 enabled so it *could* be V2.
+ */
+
+ // A specific worker version is requested
+ if (workerVersion) {
+ const worker = await $replica.backgroundWorker.findUnique({
+ select: {
+ engine: true,
+ },
+ where: {
+ projectId_runtimeEnvironmentId_version: {
+ projectId: environment.projectId,
+ runtimeEnvironmentId: environment.id,
+ version: workerVersion,
+ },
+ },
+ });
+
+ if (!worker) {
+ throw new Error(`Worker not found: environment: ${environment.id} version: ${workerVersion}`);
+ }
+
+ return worker.engine;
+ }
+
+ // Dev: use the latest BackgroundWorker
+ if (environment.type === "DEVELOPMENT") {
+ const backgroundWorker = await findCurrentWorkerFromEnvironment(environment);
+ return backgroundWorker?.engine ?? "V1";
+ }
+
+ // Deployed: use the latest deployed BackgroundWorker
+ const currentDeployment = await findCurrentWorkerDeploymentWithoutTasks(environment.id);
+ if (currentDeployment?.type === "V1") {
+ return "V1";
+ }
+
+ return "V2";
+}
diff --git a/apps/webapp/app/v3/eventRepository.server.ts b/apps/webapp/app/v3/eventRepository.server.ts
index 911e7435d1..391b3be1b9 100644
--- a/apps/webapp/app/v3/eventRepository.server.ts
+++ b/apps/webapp/app/v3/eventRepository.server.ts
@@ -1,4 +1,4 @@
-import { Attributes, Link, trace, TraceFlags, Tracer } from "@opentelemetry/api";
+import { Attributes, AttributeValue, Link, trace, TraceFlags, Tracer } from "@opentelemetry/api";
import { RandomIdGenerator } from "@opentelemetry/sdk-trace-base";
import { SemanticResourceAttributes } from "@opentelemetry/semantic-conventions";
import {
@@ -10,6 +10,7 @@ import {
SpanEvent,
SpanEvents,
SpanMessagingEvent,
+ TaskEventEnvironment,
TaskEventStyle,
TaskRunError,
correctErrorStackTrace,
@@ -26,7 +27,6 @@ import { Gauge } from "prom-client";
import { $replica, PrismaClient, PrismaReplicaClient, prisma } from "~/db.server";
import { env } from "~/env.server";
import { metricsRegister } from "~/metrics.server";
-import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
import { logger } from "~/services/logger.server";
import { singleton } from "~/utils/singleton";
import { DynamicFlushScheduler } from "./dynamicFlushScheduler.server";
@@ -59,6 +59,7 @@ export type TraceAttributes = Partial<
| "attemptId"
| "isError"
| "isCancelled"
+ | "isDebug"
| "runId"
| "runIsTest"
| "output"
@@ -84,7 +85,7 @@ export type TraceEventOptions = {
parentAsLinkType?: "trigger" | "replay";
spanIdSeed?: string;
attributes: TraceAttributes;
- environment: AuthenticatedEnvironment;
+ environment: TaskEventEnvironment;
taskSlug: string;
startTime?: bigint;
endTime?: Date;
@@ -125,6 +126,7 @@ export type QueriedEvent = Prisma.TaskEventGetPayload<{
isError: true;
isPartial: true;
isCancelled: true;
+ isDebug: true;
level: true;
events: true;
environmentType: true;
@@ -169,6 +171,7 @@ export type SpanSummary = {
isError: boolean;
isPartial: boolean;
isCancelled: boolean;
+ isDebug: boolean;
level: NonNullable;
environmentType: CreatableEventEnvironmentType;
};
@@ -183,6 +186,26 @@ export type UpdateEventOptions = {
events?: SpanEvents;
};
+type TaskEventSummary = Pick<
+ TaskEvent,
+ | "id"
+ | "spanId"
+ | "parentId"
+ | "runId"
+ | "idempotencyKey"
+ | "message"
+ | "style"
+ | "startTime"
+ | "duration"
+ | "isError"
+ | "isPartial"
+ | "isCancelled"
+ | "level"
+ | "events"
+ | "environmentType"
+ | "isDebug"
+>;
+
export class EventRepository {
   private readonly _flushScheduler: DynamicFlushScheduler<CreatableEvent>;
private _randomIdGenerator = new RandomIdGenerator();
@@ -263,7 +286,7 @@ export class EventRepository {
eventId: event.id,
});
- await this.insert({
+ const completedEvent = {
...omit(event, "id"),
isPartial: false,
isError: options?.attributes.isError ?? false,
@@ -283,7 +306,11 @@ export class EventRepository {
: "application/json",
payload: event.payload as Attributes,
payloadType: event.payloadType,
- });
+ } satisfies CreatableEvent;
+
+ await this.insert(completedEvent);
+
+ return completedEvent;
}
async cancelEvent(event: TaskEventRecord, cancelledAt: Date, reason: string) {
@@ -485,6 +512,7 @@ export class EventRepository {
isError: event.isError,
isPartial: ancestorCancelled ? false : event.isPartial,
isCancelled: event.isCancelled === true ? true : event.isPartial && ancestorCancelled,
+ isDebug: event.isDebug,
startTime: getDateFromNanoseconds(event.startTime),
level: event.level,
events: event.events,
@@ -541,6 +569,7 @@ export class EventRepository {
isError: true,
isPartial: true,
isCancelled: true,
+ isDebug: true,
level: true,
events: true,
environmentType: true,
@@ -633,6 +662,19 @@ export class EventRepository {
spanEvent.environmentType === "DEVELOPMENT"
);
+ const originalRun = rehydrateAttribute(
+ spanEvent.properties,
+ SemanticInternalAttributes.ORIGINAL_RUN_ID
+ );
+
+ const entity = {
+ type: rehydrateAttribute(
+ spanEvent.properties,
+ SemanticInternalAttributes.ENTITY_TYPE
+ ),
+ id: rehydrateAttribute(spanEvent.properties, SemanticInternalAttributes.ENTITY_ID),
+ };
+
return {
...spanEvent,
...span.data,
@@ -642,6 +684,8 @@ export class EventRepository {
events: spanEvents,
show,
links,
+ originalRun,
+ entity,
};
});
}
@@ -788,14 +832,19 @@ export class EventRepository {
});
}
- public async recordEvent(message: string, options: TraceEventOptions) {
+ public async recordEvent(
+ message: string,
+ options: TraceEventOptions & { duration?: number; parentId?: string }
+ ) {
const propagatedContext = extractContextFromCarrier(options.context ?? {});
const startTime = options.startTime ?? getNowInNanoseconds();
- const duration = options.endTime ? calculateDurationFromStart(startTime, options.endTime) : 100;
+ const duration =
+ options.duration ??
+ (options.endTime ? calculateDurationFromStart(startTime, options.endTime) : 100);
const traceId = propagatedContext?.traceparent?.traceId ?? this.generateTraceId();
- const parentId = propagatedContext?.traceparent?.spanId;
+ const parentId = options.parentId ?? propagatedContext?.traceparent?.spanId;
const tracestate = propagatedContext?.tracestate;
const spanId = options.spanIdSeed
? this.#generateDeterministicSpanId(traceId, options.spanIdSeed)
@@ -816,8 +865,10 @@ export class EventRepository {
...options.attributes.metadata,
};
+ const isDebug = options.attributes.isDebug;
+
const style = {
- [SemanticInternalAttributes.STYLE_ICON]: "play",
+ [SemanticInternalAttributes.STYLE_ICON]: isDebug ? "warn" : "play",
};
if (!options.attributes.runId) {
@@ -832,11 +883,12 @@ export class EventRepository {
message: message,
serviceName: "api server",
serviceNamespace: "trigger.dev",
- level: "TRACE",
+ level: isDebug ? "WARN" : "TRACE",
kind: options.kind,
status: "OK",
startTime,
isPartial: false,
+ isDebug,
duration, // convert to nanoseconds
environmentId: options.environment.id,
environmentType: options.environment.type,
@@ -876,7 +928,7 @@ export class EventRepository {
public async traceEvent(
message: string,
- options: TraceEventOptions & { incomplete?: boolean },
+ options: TraceEventOptions & { incomplete?: boolean; isError?: boolean },
callback: (
e: EventBuilder,
traceContext: Record<string, string | undefined>,
@@ -1322,7 +1374,7 @@ function excludePartialEventsWithCorrespondingFullEvent(batch: CreatableEvent[])
);
}
-function extractContextFromCarrier(carrier: Record<string, string | undefined>) {
+export function extractContextFromCarrier(carrier: Record<string, string | undefined>) {
const traceparent = carrier["traceparent"];
const tracestate = carrier["tracestate"];
@@ -1608,3 +1660,139 @@ function rehydrateShow(properties: Prisma.JsonValue): { actions?: boolean } | undefined {
return;
}
+
+function rehydrateAttribute<T>(
+ properties: Prisma.JsonValue,
+ key: string
+): T | undefined {
+ if (properties === null || properties === undefined) {
+ return;
+ }
+
+ if (typeof properties !== "object") {
+ return;
+ }
+
+ if (Array.isArray(properties)) {
+ return;
+ }
+
+ const value = properties[key];
+
+ if (!value) return;
+
+ return value as T;
+}
+
+export async function findRunForEventCreation(runId: string) {
+ return prisma.taskRun.findFirst({
+ where: {
+ id: runId,
+ },
+ select: {
+ friendlyId: true,
+ taskIdentifier: true,
+ traceContext: true,
+ runtimeEnvironment: {
+ select: {
+ id: true,
+ type: true,
+ organizationId: true,
+ projectId: true,
+ project: {
+ select: {
+ externalRef: true,
+ },
+ },
+ },
+ },
+ },
+ });
+}
+
+export async function recordRunEvent(
+ runId: string,
+ message: string,
+ options: Omit<TraceEventOptions, "environment" | "taskSlug" | "startTime"> & {
+ duration?: number;
+ parentId?: string;
+ startTime?: Date;
+ }
+): Promise<
+ | {
+ success: true;
+ }
+ | {
+ success: false;
+ code: "RUN_NOT_FOUND" | "FAILED_TO_RECORD_EVENT";
+ error?: unknown;
+ }
+> {
+ try {
+ const foundRun = await findRunForEventCreation(runId);
+
+ if (!foundRun) {
+ logger.error("Failed to find run for event creation", { runId });
+ return {
+ success: false,
+ code: "RUN_NOT_FOUND",
+ };
+ }
+
+ const { attributes, startTime, ...optionsRest } = options;
+
+ await eventRepository.recordEvent(message, {
+ environment: foundRun.runtimeEnvironment,
+ taskSlug: foundRun.taskIdentifier,
+ context: foundRun.traceContext as Record<string, unknown>,
+ attributes: {
+ runId: foundRun.friendlyId,
+ ...attributes,
+ },
+ startTime: BigInt((startTime?.getTime() ?? Date.now()) * 1_000_000),
+ ...optionsRest,
+ });
+
+ return {
+ success: true,
+ };
+ } catch (error) {
+ logger.error("Failed to record event for run", {
+ error: error instanceof Error ? error.message : error,
+ runId,
+ });
+
+ return {
+ success: false,
+ code: "FAILED_TO_RECORD_EVENT",
+ error,
+ };
+ }
+}
+
+export async function recordRunDebugLog(
+ runId: string,
+ message: string,
+ options: Omit<TraceEventOptions, "environment" | "taskSlug" | "startTime"> & {
+ duration?: number;
+ parentId?: string;
+ startTime?: Date;
+ }
+): Promise<
+ | {
+ success: true;
+ }
+ | {
+ success: false;
+ code: "RUN_NOT_FOUND" | "FAILED_TO_RECORD_EVENT";
+ error?: unknown;
+ }
+> {
+ return recordRunEvent(runId, message, {
+ ...options,
+ attributes: {
+ ...options?.attributes,
+ isDebug: true,
+ },
+ });
+}
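Reviewer note: the new `recordRunEvent` / `recordRunDebugLog` helpers look up the run, reuse its trace context, and report failure via a result union instead of throwing. A minimal usage sketch, assuming a caller elsewhere in the webapp; the function name, message, and property names are illustrative only:

```ts
// Sketch only: attaching a debug event to a run via the new helper.
import { recordRunDebugLog } from "~/v3/eventRepository.server";

export async function noteSnapshotTransition(runId: string, snapshotId: string) {
  const result = await recordRunDebugLog(runId, "snapshot transition observed", {
    attributes: {
      // Arbitrary properties end up on the span, mirroring the event-bus handlers later in this PR.
      properties: { snapshotId },
    },
    startTime: new Date(), // a Date here; the helper converts to nanoseconds
  });

  // The result union lets callers decide whether RUN_NOT_FOUND or a failed write is fatal.
  if (!result.success) {
    console.warn(`debug log skipped: ${result.code}`);
  }
}
```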
diff --git a/apps/webapp/app/v3/featureFlags.server.ts b/apps/webapp/app/v3/featureFlags.server.ts
new file mode 100644
index 0000000000..1cc57ed48c
--- /dev/null
+++ b/apps/webapp/app/v3/featureFlags.server.ts
@@ -0,0 +1,54 @@
+import { z } from "zod";
+import { prisma, PrismaClientOrTransaction } from "~/db.server";
+
+const FeatureFlagCatalog = {
+ defaultWorkerInstanceGroupId: z.string(),
+};
+
+type FeatureFlagKey = keyof typeof FeatureFlagCatalog;
+
+export type FlagsOptions = {
+ key: FeatureFlagKey;
+};
+
+export function makeFlags(_prisma: PrismaClientOrTransaction = prisma) {
+ return async function flags<T extends FeatureFlagKey>(
+ opts: FlagsOptions
+ ): Promise<z.infer<(typeof FeatureFlagCatalog)[T]> | undefined> {
+ const value = await _prisma.featureFlag.findUnique({
+ where: {
+ key: opts.key,
+ },
+ });
+
+ const parsed = FeatureFlagCatalog[opts.key].safeParse(value?.value);
+
+ if (!parsed.success) {
+ return;
+ }
+
+ return parsed.data;
+ };
+}
+
+export function makeSetFlags(_prisma: PrismaClientOrTransaction = prisma) {
+ return async function setFlags<T extends FeatureFlagKey>(
+ opts: FlagsOptions & { value: z.infer<(typeof FeatureFlagCatalog)[T]> }
+ ): Promise<void> {
+ await _prisma.featureFlag.upsert({
+ where: {
+ key: opts.key,
+ },
+ create: {
+ key: opts.key,
+ value: opts.value,
+ },
+ update: {
+ value: opts.value,
+ },
+ });
+ };
+}
+
+export const flags = makeFlags();
+export const setFlags = makeSetFlags();
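A short sketch of how these flag helpers could be consumed, assuming the single catalogued key `defaultWorkerInstanceGroupId`; the surrounding bootstrap function is hypothetical:

```ts
// Sketch: read-then-set of the only flag currently in FeatureFlagCatalog.
import { flags, setFlags } from "~/v3/featureFlags.server";

export async function ensureDefaultWorkerGroupFlag(workerGroupId: string) {
  const current = await flags({ key: "defaultWorkerInstanceGroupId" });

  if (!current) {
    // Reads are validated against the zod schema in the catalog, so a missing
    // or malformed row simply comes back as undefined and we seed it here.
    await setFlags({ key: "defaultWorkerInstanceGroupId", value: workerGroupId });
  }

  return current ?? workerGroupId;
}
```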
diff --git a/apps/webapp/app/v3/handleSocketIo.server.ts b/apps/webapp/app/v3/handleSocketIo.server.ts
index d06e6240fe..3290a67ed8 100644
--- a/apps/webapp/app/v3/handleSocketIo.server.ts
+++ b/apps/webapp/app/v3/handleSocketIo.server.ts
@@ -1,3 +1,5 @@
+import { EventBusEventArgs } from "@internal/run-engine";
+import { createAdapter } from "@socket.io/redis-adapter";
import {
ClientToSharedQueueMessages,
CoordinatorSocketData,
@@ -7,24 +9,32 @@ import {
ProviderToPlatformMessages,
SharedQueueToClientMessages,
} from "@trigger.dev/core/v3";
+import { RunId } from "@trigger.dev/core/v3/apps";
+import type {
+ WorkerClientToServerEvents,
+ WorkerServerToClientEvents,
+} from "@trigger.dev/core/v3/workers";
import { ZodNamespace } from "@trigger.dev/core/v3/zodNamespace";
-import { Server } from "socket.io";
+import { Redis } from "ioredis";
+import { Namespace, Server, Socket } from "socket.io";
import { env } from "~/env.server";
+import { findEnvironmentById } from "~/models/runtimeEnvironment.server";
+import { authenticateApiRequestWithFailure } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
import { singleton } from "~/utils/singleton";
-import { SharedSocketConnection } from "./sharedSocketConnection";
-import { CreateCheckpointService } from "./services/createCheckpoint.server";
+import { recordRunDebugLog } from "./eventRepository.server";
import { sharedQueueTasks } from "./marqs/sharedQueueConsumer.server";
+import { engine } from "./runEngine.server";
import { CompleteAttemptService } from "./services/completeAttempt.server";
-import { logger } from "~/services/logger.server";
-import { findEnvironmentById } from "~/models/runtimeEnvironment.server";
-import { CreateDeployedBackgroundWorkerService } from "./services/createDeployedBackgroundWorker.server";
-import { ResumeAttemptService } from "./services/resumeAttempt.server";
-import { DeploymentIndexFailed } from "./services/deploymentIndexFailed.server";
-import { Redis } from "ioredis";
-import { createAdapter } from "@socket.io/redis-adapter";
import { CrashTaskRunService } from "./services/crashTaskRun.server";
+import { CreateCheckpointService } from "./services/createCheckpoint.server";
+import { CreateDeployedBackgroundWorkerService } from "./services/createDeployedBackgroundWorker.server";
import { CreateTaskRunAttemptService } from "./services/createTaskRunAttempt.server";
+import { DeploymentIndexFailed } from "./services/deploymentIndexFailed.server";
+import { ResumeAttemptService } from "./services/resumeAttempt.server";
import { UpdateFatalRunErrorService } from "./services/updateFatalRunError.server";
+import { WorkerGroupTokenService } from "./services/worker/workerGroupTokenService.server";
+import { SharedSocketConnection } from "./sharedSocketConnection";
export const socketIo = singleton("socketIo", initalizeIoServer);
@@ -38,12 +48,40 @@ function initalizeIoServer() {
const coordinatorNamespace = createCoordinatorNamespace(io);
const providerNamespace = createProviderNamespace(io);
const sharedQueueConsumerNamespace = createSharedQueueConsumerNamespace(io);
+ const workerNamespace = createWorkerNamespace({
+ io,
+ namespace: "/worker",
+ authenticate: async (request) => {
+ const tokenService = new WorkerGroupTokenService();
+ const authenticatedInstance = await tokenService.authenticate(request);
+ if (!authenticatedInstance) {
+ return false;
+ }
+ return true;
+ },
+ });
+ const devWorkerNamespace = createWorkerNamespace({
+ io,
+ namespace: "/dev-worker",
+ authenticate: async (request) => {
+ const authentication = await authenticateApiRequestWithFailure(request);
+ if (!authentication.ok) {
+ return false;
+ }
+ if (authentication.environment.type !== "DEVELOPMENT") {
+ return false;
+ }
+ return true;
+ },
+ });
return {
io,
coordinatorNamespace,
providerNamespace,
sharedQueueConsumerNamespace,
+ workerNamespace,
+ devWorkerNamespace,
};
}
@@ -405,3 +443,230 @@ function createSharedQueueConsumerNamespace(io: Server) {
return sharedQueue.namespace;
}
+
+function headersFromHandshake(handshake: Socket["handshake"]) {
+ const headers = new Headers();
+
+ for (const [key, value] of Object.entries(handshake.headers)) {
+ if (typeof value !== "string") continue;
+ headers.append(key, value);
+ }
+
+ return headers;
+}
+
+function createWorkerNamespace({
+ io,
+ namespace,
+ authenticate,
+}: {
+ io: Server;
+ namespace: string;
+ authenticate: (request: Request) => Promise<boolean>;
+}) {
+ const worker: Namespace<WorkerClientToServerEvents, WorkerServerToClientEvents> =
+ io.of(namespace);
+
+ worker.use(async (socket, next) => {
+ try {
+ const headers = headersFromHandshake(socket.handshake);
+
+ logger.debug("Worker authentication", {
+ namespace,
+ socketId: socket.id,
+ headers: Object.fromEntries(headers),
+ });
+
+ const request = new Request("https://example.com", {
+ headers,
+ });
+
+ const success = await authenticate(request);
+
+ if (!success) {
+ throw new Error("unauthorized");
+ }
+
+ next();
+ } catch (error) {
+ logger.error("Worker authentication failed", {
+ namespace,
+ error: error instanceof Error ? error.message : error,
+ });
+
+ socket.disconnect(true);
+ }
+ });
+
+ worker.on("connection", async (socket) => {
+ logger.debug("worker connected", { namespace, socketId: socket.id });
+
+ const rooms = new Set<string>();
+
+ async function onNotification({
+ time,
+ run,
+ snapshot,
+ }: EventBusEventArgs<"workerNotification">[0]) {
+ if (!env.RUN_ENGINE_DEBUG_WORKER_NOTIFICATIONS) {
+ return;
+ }
+
+ logger.debug("[handleSocketIo] Received worker notification", {
+ namespace,
+ time,
+ runId: run.id,
+ snapshot,
+ });
+
+ // Record notification event
+ await recordRunDebugLog(run.id, `run:notify workerNotification event`, {
+ attributes: {
+ properties: {
+ snapshotId: snapshot.id,
+ snapshotStatus: snapshot.executionStatus,
+ rooms: Array.from(rooms),
+ },
+ },
+ startTime: time,
+ });
+ }
+
+ engine.eventBus.on("workerNotification", onNotification);
+
+ const interval = setInterval(() => {
+ logger.debug("Rooms for socket", {
+ namespace,
+ socketId: socket.id,
+ rooms: Array.from(rooms),
+ });
+ }, 5000);
+
+ socket.on("disconnect", (reason, description) => {
+ logger.debug("worker disconnected", {
+ namespace,
+ socketId: socket.id,
+ reason,
+ description,
+ });
+ clearInterval(interval);
+
+ engine.eventBus.off("workerNotification", onNotification);
+ });
+
+ socket.on("disconnecting", (reason, description) => {
+ logger.debug("worker disconnecting", {
+ namespace,
+ socketId: socket.id,
+ reason,
+ description,
+ });
+ clearInterval(interval);
+ });
+
+ socket.on("error", (error) => {
+ logger.error("worker error", {
+ namespace,
+ socketId: socket.id,
+ error: JSON.parse(JSON.stringify(error)),
+ });
+ clearInterval(interval);
+ });
+
+ socket.on("run:subscribe", async ({ version, runFriendlyIds }) => {
+ logger.debug("run:subscribe", { namespace, version, runFriendlyIds });
+
+ const settledResult = await Promise.allSettled(
+ runFriendlyIds.map(async (friendlyId) => {
+ const room = roomFromFriendlyRunId(friendlyId);
+
+ logger.debug("Joining room", { namespace, room });
+
+ socket.join(room);
+ rooms.add(room);
+
+ await recordRunDebugLog(
+ RunId.fromFriendlyId(friendlyId),
+ "run:subscribe received by platform",
+ {
+ attributes: {
+ properties: {
+ friendlyId,
+ runFriendlyIds,
+ room,
+ },
+ },
+ }
+ );
+ })
+ );
+
+ for (const result of settledResult) {
+ if (result.status === "rejected") {
+ logger.error("Error joining room", {
+ namespace,
+ runFriendlyIds,
+ error: result.reason instanceof Error ? result.reason.message : result.reason,
+ });
+ }
+ }
+
+ logger.debug("Rooms for socket after subscribe", {
+ namespace,
+ socketId: socket.id,
+ rooms: Array.from(rooms),
+ });
+ });
+
+ socket.on("run:unsubscribe", async ({ version, runFriendlyIds }) => {
+ logger.debug("run:unsubscribe", { namespace, version, runFriendlyIds });
+
+ const settledResult = await Promise.allSettled(
+ runFriendlyIds.map(async (friendlyId) => {
+ const room = roomFromFriendlyRunId(friendlyId);
+
+ logger.debug("Leaving room", { namespace, room });
+
+ socket.leave(room);
+ rooms.delete(room);
+
+ await recordRunDebugLog(
+ RunId.fromFriendlyId(friendlyId),
+ "run:unsubscribe received by platform",
+ {
+ attributes: {
+ properties: {
+ friendlyId,
+ runFriendlyIds,
+ room,
+ },
+ },
+ }
+ );
+ })
+ );
+
+ for (const result of settledResult) {
+ if (result.status === "rejected") {
+ logger.error("Error leaving room", {
+ namespace,
+ runFriendlyIds,
+ error: result.reason instanceof Error ? result.reason.message : result.reason,
+ });
+ }
+ }
+
+ logger.debug("Rooms for socket after unsubscribe", {
+ namespace,
+ socketId: socket.id,
+ rooms: Array.from(rooms),
+ });
+ });
+ });
+
+ return worker;
+}
+
+export function roomFromFriendlyRunId(id: string) {
+ return `room:${id}`;
+}
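The `/worker` and `/dev-worker` namespaces authenticate from the handshake headers and group sockets into per-run rooms via `roomFromFriendlyRunId`. A sketch of the client side, assuming `socket.io-client`; the URL, env vars, and auth header are placeholders, but the namespace, event names, and payload shapes come from the server code above:

```ts
// Sketch of a supervisor/worker client subscribing to run notifications.
import { io, type Socket } from "socket.io-client";

const socket: Socket = io(`${process.env.PLATFORM_URL}/worker`, {
  transports: ["websocket"],
  extraHeaders: {
    // Hypothetical header; the server authenticates via WorkerGroupTokenService.
    authorization: `Bearer ${process.env.WORKER_GROUP_TOKEN}`,
  },
});

// Join the per-run rooms so the platform's "run:notify" emits reach this socket.
socket.emit("run:subscribe", { version: "1", runFriendlyIds: ["run_abc123"] });

socket.on("run:notify", ({ version, run }) => {
  // Emitted by the platform whenever a new execution snapshot needs attention.
  console.log(`notified (${version}) for run ${run.friendlyId}`);
});

// Leave the rooms once the runs are no longer owned by this worker.
socket.emit("run:unsubscribe", { version: "1", runFriendlyIds: ["run_abc123"] });
```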
diff --git a/apps/webapp/app/v3/machinePresets.server.ts b/apps/webapp/app/v3/machinePresets.server.ts
index 23b8bfa8fa..84aff460d8 100644
--- a/apps/webapp/app/v3/machinePresets.server.ts
+++ b/apps/webapp/app/v3/machinePresets.server.ts
@@ -51,3 +51,15 @@ function derivePresetNameFromValues(cpu: number, memory: number): MachinePresetName {
return defaultMachine;
}
+
+export function allMachines(): Record<string, MachinePreset> {
+ return Object.fromEntries(
+ Object.entries(machines).map(([name, preset]) => [
+ name,
+ {
+ name: name as MachinePresetName,
+ ...preset,
+ },
+ ])
+ );
+}
diff --git a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts
index cf42669e43..8d0b4db9d5 100644
--- a/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts
+++ b/apps/webapp/app/v3/marqs/devQueueConsumer.server.ts
@@ -19,7 +19,7 @@ import { FailedTaskRunService } from "../failedTaskRun.server";
import { CancelDevSessionRunsService } from "../services/cancelDevSessionRuns.server";
import { CompleteAttemptService } from "../services/completeAttempt.server";
import { attributesFromAuthenticatedEnv, tracer } from "../tracer.server";
-import { getMaxDuration } from "../utils/maxDuration";
+import { getMaxDuration } from "@trigger.dev/core/v3/apps";
import { DevSubscriber, devPubSub } from "./devPubSub.server";
import { findQueueInEnvironment, sanitizeQueueName } from "~/models/taskQueue.server";
import { createRedisClient, RedisClient } from "~/redis.server";
diff --git a/apps/webapp/app/v3/models/workerDeployment.server.ts b/apps/webapp/app/v3/models/workerDeployment.server.ts
index 096bab6372..37d5cae111 100644
--- a/apps/webapp/app/v3/models/workerDeployment.server.ts
+++ b/apps/webapp/app/v3/models/workerDeployment.server.ts
@@ -1,6 +1,9 @@
import type { Prettify } from "@trigger.dev/core";
-import { BackgroundWorker } from "@trigger.dev/database";
-import { CURRENT_DEPLOYMENT_LABEL } from "~/consts";
+import { BackgroundWorker, WorkerDeployment } from "@trigger.dev/database";
+import {
+ CURRENT_DEPLOYMENT_LABEL,
+ CURRENT_UNMANAGED_DEPLOYMENT_LABEL,
+} from "@trigger.dev/core/v3/apps";
import { Prisma, prisma } from "~/db.server";
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
@@ -34,6 +37,7 @@ type WorkerDeploymentWithWorkerTasks = Prisma.WorkerDeploymentGetPayload<{
sdkVersion: true;
cliVersion: true;
supportsLazyAttempts: true;
+ engine: true;
tasks: {
select: {
id: true;
@@ -53,12 +57,13 @@ type WorkerDeploymentWithWorkerTasks = Prisma.WorkerDeploymentGetPayload<{
}>;
export async function findCurrentWorkerDeployment(
- environmentId: string
+ environmentId: string,
+ label = CURRENT_DEPLOYMENT_LABEL
): Promise<WorkerDeploymentWithWorkerTasks | undefined> {
const promotion = await prisma.workerDeploymentPromotion.findFirst({
where: {
environmentId,
- label: CURRENT_DEPLOYMENT_LABEL,
+ label,
},
select: {
deployment: {
@@ -75,6 +80,7 @@ export async function findCurrentWorkerDeployment(
cliVersion: true,
supportsLazyAttempts: true,
tasks: true,
+ engine: true,
},
},
},
@@ -85,11 +91,37 @@ export async function findCurrentWorkerDeployment(
return promotion?.deployment;
}
+export async function findCurrentWorkerDeploymentWithoutTasks(
+ environmentId: string,
+ label = CURRENT_DEPLOYMENT_LABEL
+): Promise<WorkerDeployment | undefined> {
+ const promotion = await prisma.workerDeploymentPromotion.findUnique({
+ where: {
+ environmentId_label: {
+ environmentId,
+ label,
+ },
+ },
+ include: {
+ deployment: true,
+ },
+ });
+
+ return promotion?.deployment;
+}
+
+export async function findCurrentUnmanagedWorkerDeployment(
+ environmentId: string
+): Promise<WorkerDeploymentWithWorkerTasks | undefined> {
+ return await findCurrentWorkerDeployment(environmentId, CURRENT_UNMANAGED_DEPLOYMENT_LABEL);
+}
+
export async function findCurrentWorkerFromEnvironment(
- environment: Pick
+ environment: Pick,
+ label = CURRENT_DEPLOYMENT_LABEL
): Promise | null> {
if (environment.type === "DEVELOPMENT") {
const latestDevWorker = await prisma.backgroundWorker.findFirst({
@@ -102,11 +134,24 @@ export async function findCurrentWorkerFromEnvironment(
});
return latestDevWorker;
} else {
- const deployment = await findCurrentWorkerDeployment(environment.id);
+ const deployment = await findCurrentWorkerDeployment(environment.id, label);
return deployment?.worker ?? null;
}
}
+export async function findCurrentUnmanagedWorkerFromEnvironment(
+ environment: Pick
+): Promise | null> {
+ if (environment.type === "DEVELOPMENT") {
+ return null;
+ }
+
+ return await findCurrentWorkerFromEnvironment(environment, CURRENT_UNMANAGED_DEPLOYMENT_LABEL);
+}
+
export async function getWorkerDeploymentFromWorker(
workerId: string
): Promise {
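The new `label` parameter is what lets callers resolve either the managed or the unmanaged "current" deployment from the same promotion table. A minimal sketch, assuming an environment id supplied by the caller:

```ts
// Sketch: resolving both "current" promotions for one environment.
import { CURRENT_UNMANAGED_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/apps";
import {
  findCurrentWorkerDeployment,
  findCurrentWorkerDeploymentWithoutTasks,
} from "~/v3/models/workerDeployment.server";

export async function resolveCurrentDeployments(environmentId: string) {
  // Default label resolves the managed deployment, tasks included.
  const managed = await findCurrentWorkerDeployment(environmentId);

  // The unmanaged label reuses the same lookup; this variant skips the task select.
  const unmanaged = await findCurrentWorkerDeploymentWithoutTasks(
    environmentId,
    CURRENT_UNMANAGED_DEPLOYMENT_LABEL
  );

  return { managed, unmanaged };
}
```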
diff --git a/apps/webapp/app/v3/registryProxy.server.ts b/apps/webapp/app/v3/registryProxy.server.ts
index 2740e34c1c..f253c89195 100644
--- a/apps/webapp/app/v3/registryProxy.server.ts
+++ b/apps/webapp/app/v3/registryProxy.server.ts
@@ -13,6 +13,7 @@ import { mkdtemp } from "fs/promises";
import { createReadStream, createWriteStream } from "node:fs";
import { pipeline } from "node:stream/promises";
import { unlinkSync } from "fs";
+import { parseDockerImageReference, rebuildDockerImageReference } from "@trigger.dev/core/v3";
const TokenResponseBody = z.object({
token: z.string(),
@@ -466,70 +467,3 @@ async function streamRequestBodyToTempFile(request: IncomingMessage): Promise<string> {
- if (atSplit.length > 1) {
- parts.digest = atSplit[1];
- imageReference = atSplit[0];
- }
-
- // Splitting by ':' to separate the tag (if exists) and to ensure it's not part of a port
- let colonSplit = imageReference.split(":");
- if (colonSplit.length > 2 || (colonSplit.length === 2 && !colonSplit[1].includes("/"))) {
- // It's a tag if there's no '/' in the second part (after colon), or there are more than 2 parts (implying a port number in registry)
- parts.tag = colonSplit.pop(); // The last part is the tag
- imageReference = colonSplit.join(":"); // Join back in case it was a port number
- }
-
- // Check for registry
- let slashIndex = imageReference.indexOf("/");
- if (slashIndex !== -1) {
- let potentialRegistry = imageReference.substring(0, slashIndex);
- // Validate if the first part is a valid hostname-like string (registry), otherwise treat the entire string as the repo
- if (
- potentialRegistry.includes(".") ||
- potentialRegistry === "localhost" ||
- potentialRegistry.includes(":")
- ) {
- parts.registry = potentialRegistry;
- parts.repo = imageReference.substring(slashIndex + 1);
- } else {
- parts.repo = imageReference; // No valid registry found, treat as repo
- }
- } else {
- parts.repo = imageReference; // Only repo is present
- }
-
- return parts;
-}
-
-function rebuildDockerImageReference(parts: DockerImageParts): string {
- let imageReference = "";
-
- if (parts.registry) {
- imageReference += `${parts.registry}/`;
- }
-
- imageReference += parts.repo; // Repo is now guaranteed to be defined
-
- if (parts.tag) {
- imageReference += `:${parts.tag}`;
- }
-
- if (parts.digest) {
- imageReference += `@${parts.digest}`;
- }
-
- return imageReference;
-}
diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts
new file mode 100644
index 0000000000..2176925c53
--- /dev/null
+++ b/apps/webapp/app/v3/runEngine.server.ts
@@ -0,0 +1,68 @@
+import { RunEngine } from "@internal/run-engine";
+import { prisma } from "~/db.server";
+import { env } from "~/env.server";
+import { tracer } from "./tracer.server";
+import { singleton } from "~/utils/singleton";
+import { defaultMachine, machines } from "@trigger.dev/platform/v3";
+import { allMachines } from "./machinePresets.server";
+
+export const engine = singleton("RunEngine", createRunEngine);
+
+export type { RunEngine };
+
+function createRunEngine() {
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ workers: env.RUN_ENGINE_WORKER_COUNT,
+ tasksPerWorker: env.RUN_ENGINE_TASKS_PER_WORKER,
+ pollIntervalMs: env.RUN_ENGINE_WORKER_POLL_INTERVAL,
+ redis: {
+ keyPrefix: "engine:",
+ port: env.RUN_ENGINE_WORKER_REDIS_PORT ?? undefined,
+ host: env.RUN_ENGINE_WORKER_REDIS_HOST ?? undefined,
+ username: env.RUN_ENGINE_WORKER_REDIS_USERNAME ?? undefined,
+ password: env.RUN_ENGINE_WORKER_REDIS_PASSWORD ?? undefined,
+ enableAutoPipelining: true,
+ ...(env.RUN_ENGINE_WORKER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+ },
+ },
+ machines: {
+ defaultMachine,
+ machines: allMachines(),
+ baseCostInCents: env.CENTS_PER_RUN,
+ },
+ queue: {
+ defaultEnvConcurrency: env.DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT,
+ redis: {
+ keyPrefix: "engine:",
+ port: env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? undefined,
+ host: env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? undefined,
+ username: env.RUN_ENGINE_RUN_QUEUE_REDIS_USERNAME ?? undefined,
+ password: env.RUN_ENGINE_RUN_QUEUE_REDIS_PASSWORD ?? undefined,
+ enableAutoPipelining: true,
+ ...(env.RUN_ENGINE_RUN_QUEUE_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+ },
+ },
+ runLock: {
+ redis: {
+ keyPrefix: "engine:",
+ port: env.RUN_ENGINE_RUN_LOCK_REDIS_PORT ?? undefined,
+ host: env.RUN_ENGINE_RUN_LOCK_REDIS_HOST ?? undefined,
+ username: env.RUN_ENGINE_RUN_LOCK_REDIS_USERNAME ?? undefined,
+ password: env.RUN_ENGINE_RUN_LOCK_REDIS_PASSWORD ?? undefined,
+ enableAutoPipelining: true,
+ ...(env.RUN_ENGINE_RUN_LOCK_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+ },
+ },
+ tracer,
+ heartbeatTimeoutsMs: {
+ PENDING_EXECUTING: env.RUN_ENGINE_TIMEOUT_PENDING_EXECUTING,
+ PENDING_CANCEL: env.RUN_ENGINE_TIMEOUT_PENDING_CANCEL,
+ EXECUTING: env.RUN_ENGINE_TIMEOUT_EXECUTING,
+ EXECUTING_WITH_WAITPOINTS: env.RUN_ENGINE_TIMEOUT_EXECUTING_WITH_WAITPOINTS,
+ },
+ });
+
+ return engine;
+}
diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts
new file mode 100644
index 0000000000..4bd833974f
--- /dev/null
+++ b/apps/webapp/app/v3/runEngineHandlers.server.ts
@@ -0,0 +1,521 @@
+import { $replica, prisma } from "~/db.server";
+import {
+ createExceptionPropertiesFromError,
+ eventRepository,
+ recordRunDebugLog,
+} from "./eventRepository.server";
+import { createJsonErrorObject, sanitizeError } from "@trigger.dev/core/v3";
+import { logger } from "~/services/logger.server";
+import { safeJsonParse } from "~/utils/json";
+import type { Attributes } from "@opentelemetry/api";
+import { reportInvocationUsage } from "~/services/platform.v3.server";
+import { roomFromFriendlyRunId, socketIo } from "./handleSocketIo.server";
+import { engine } from "./runEngine.server";
+import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server";
+import { RunId } from "@trigger.dev/core/v3/apps";
+import { updateMetadataService } from "~/services/metadata/updateMetadata.server";
+import { findEnvironmentFromRun } from "~/models/runtimeEnvironment.server";
+import { env } from "~/env.server";
+import { getTaskEventStoreTableForRun } from "./taskEventStore.server";
+
+export function registerRunEngineEventBusHandlers() {
+ engine.eventBus.on("runSucceeded", async ({ time, run }) => {
+ try {
+ const completedEvent = await eventRepository.completeEvent(
+ getTaskEventStoreTableForRun(run),
+ run.spanId,
+ run.createdAt,
+ run.completedAt ?? undefined,
+ {
+ endTime: time,
+ attributes: {
+ isError: false,
+ output:
+ run.outputType === "application/store" || run.outputType === "text/plain"
+ ? run.output
+ : run.output
+ ? (safeJsonParse(run.output) as Attributes)
+ : undefined,
+ outputType: run.outputType,
+ },
+ }
+ );
+
+ if (!completedEvent) {
+ logger.error("[runSucceeded] Failed to complete event for unknown reason", {
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ return;
+ }
+ } catch (error) {
+ logger.error("[runSucceeded] Failed to complete event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ // Handle alerts
+ engine.eventBus.on("runFailed", async ({ time, run }) => {
+ try {
+ await PerformTaskRunAlertsService.enqueue(run.id);
+ } catch (error) {
+ logger.error("[runFailed] Failed to enqueue alerts", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ // Handle events
+ engine.eventBus.on("runFailed", async ({ time, run }) => {
+ try {
+ const sanitizedError = sanitizeError(run.error);
+ const exception = createExceptionPropertiesFromError(sanitizedError);
+
+ const eventStore = getTaskEventStoreTableForRun(run);
+
+ const completedEvent = await eventRepository.completeEvent(
+ eventStore,
+ run.spanId,
+ run.createdAt,
+ run.completedAt ?? undefined,
+ {
+ endTime: time,
+ attributes: {
+ isError: true,
+ },
+ events: [
+ {
+ name: "exception",
+ time,
+ properties: {
+ exception,
+ },
+ },
+ ],
+ }
+ );
+
+ if (!completedEvent) {
+ logger.error("[runFailed] Failed to complete event for unknown reason", {
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ return;
+ }
+
+ const inProgressEvents = await eventRepository.queryIncompleteEvents(
+ eventStore,
+ {
+ runId: completedEvent?.runId,
+ },
+ run.createdAt,
+ run.completedAt ?? undefined
+ );
+
+ await Promise.all(
+ inProgressEvents.map((event) => {
+ try {
+ const completedEvent = eventRepository.completeEvent(
+ eventStore,
+ run.spanId,
+ run.createdAt,
+ run.completedAt ?? undefined,
+ {
+ endTime: time,
+ attributes: {
+ isError: true,
+ },
+ events: [
+ {
+ name: "exception",
+ time,
+ properties: {
+ exception,
+ },
+ },
+ ],
+ }
+ );
+
+ if (!completedEvent) {
+ logger.error("[runFailed] Failed to complete in-progress event for unknown reason", {
+ runId: run.id,
+ spanId: run.spanId,
+ eventId: event.id,
+ });
+ return;
+ }
+ } catch (error) {
+ logger.error("[runFailed] Failed to complete in-progress event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ eventId: event.id,
+ });
+ }
+ })
+ );
+ } catch (error) {
+ logger.error("[runFailed] Failed to complete event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ engine.eventBus.on("runAttemptFailed", async ({ time, run }) => {
+ try {
+ const sanitizedError = sanitizeError(run.error);
+ const exception = createExceptionPropertiesFromError(sanitizedError);
+ const eventStore = getTaskEventStoreTableForRun(run);
+
+ const inProgressEvents = await eventRepository.queryIncompleteEvents(
+ eventStore,
+ {
+ runId: RunId.toFriendlyId(run.id),
+ spanId: {
+ not: run.spanId,
+ },
+ },
+ run.createdAt,
+ run.completedAt ?? undefined
+ );
+
+ await Promise.all(
+ inProgressEvents.map((event) => {
+ return eventRepository.crashEvent({
+ event: event,
+ crashedAt: time,
+ exception,
+ });
+ })
+ );
+ } catch (error) {
+ logger.error("[runAttemptFailed] Failed to complete event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ engine.eventBus.on("cachedRunCompleted", async ({ time, span, blockedRunId, hasError }) => {
+ try {
+ const blockedRun = await $replica.taskRun.findFirst({
+ select: {
+ taskEventStore: true,
+ },
+ where: {
+ id: blockedRunId,
+ },
+ });
+
+ if (!blockedRun) {
+ logger.error("[cachedRunCompleted] Blocked run not found", {
+ blockedRunId,
+ });
+ return;
+ }
+
+ const eventStore = getTaskEventStoreTableForRun(blockedRun);
+
+ const completedEvent = await eventRepository.completeEvent(
+ eventStore,
+ span.id,
+ span.createdAt,
+ time,
+ {
+ endTime: time,
+ attributes: {
+ isError: hasError,
+ },
+ }
+ );
+
+ if (!completedEvent) {
+ logger.error("[cachedRunCompleted] Failed to complete event for unknown reason", {
+ span,
+ });
+ return;
+ }
+ } catch (error) {
+ logger.error("[cachedRunCompleted] Failed to complete event for unknown reason", {
+ error: error instanceof Error ? error.message : error,
+ span,
+ });
+ }
+ });
+
+ engine.eventBus.on("runExpired", async ({ time, run }) => {
+ try {
+ const eventStore = getTaskEventStoreTableForRun(run);
+
+ const completedEvent = await eventRepository.completeEvent(
+ eventStore,
+ run.spanId,
+ run.createdAt,
+ run.completedAt ?? undefined,
+ {
+ endTime: time,
+ attributes: {
+ isError: true,
+ },
+ events: [
+ {
+ name: "exception",
+ time,
+ properties: {
+ exception: {
+ message: `Run expired because the TTL (${run.ttl}) was reached`,
+ },
+ },
+ },
+ ],
+ }
+ );
+
+ if (!completedEvent) {
+ logger.error("[runFailed] Failed to complete event for unknown reason", {
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ return;
+ }
+ } catch (error) {
+ logger.error("[runExpired] Failed to complete event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ engine.eventBus.on("runCancelled", async ({ time, run }) => {
+ try {
+ const eventStore = getTaskEventStoreTableForRun(run);
+
+ const inProgressEvents = await eventRepository.queryIncompleteEvents(
+ eventStore,
+ {
+ runId: run.friendlyId,
+ },
+ run.createdAt,
+ run.completedAt ?? undefined
+ );
+
+ await Promise.all(
+ inProgressEvents.map((event) => {
+ const error = createJsonErrorObject(run.error);
+ return eventRepository.cancelEvent(event, time, error.message);
+ })
+ );
+ } catch (error) {
+ logger.error("[runCancelled] Failed to cancel event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ engine.eventBus.on("runRetryScheduled", async ({ time, run, environment, retryAt }) => {
+ try {
+ await eventRepository.recordEvent(`Retry #${run.attemptNumber} delay`, {
+ taskSlug: run.taskIdentifier,
+ environment,
+ attributes: {
+ properties: {
+ retryAt: retryAt.toISOString(),
+ },
+ runId: run.friendlyId,
+ style: {
+ icon: "schedule-attempt",
+ },
+ queueName: run.queue,
+ },
+ context: run.traceContext as Record<string, unknown>,
+ spanIdSeed: `retry-${run.attemptNumber + 1}`,
+ endTime: retryAt,
+ });
+ } catch (error) {
+ logger.error("[runRetryScheduled] Failed to record retry event", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ spanId: run.spanId,
+ });
+ }
+ });
+
+ engine.eventBus.on("runAttemptStarted", async ({ time, run, organization }) => {
+ try {
+ if (run.attemptNumber === 1 && run.baseCostInCents > 0) {
+ await reportInvocationUsage(organization.id, run.baseCostInCents, { runId: run.id });
+ }
+ } catch (error) {
+ logger.error("[runAttemptStarted] Failed to report invocation usage", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ orgId: organization.id,
+ });
+ }
+ });
+
+ engine.eventBus.on("runMetadataUpdated", async ({ time, run }) => {
+ const env = await findEnvironmentFromRun(run.id);
+
+ if (!env) {
+ logger.error("[runMetadataUpdated] Failed to find environment", { runId: run.id });
+ return;
+ }
+
+ try {
+ await updateMetadataService.call(run.id, run.metadata, env);
+ } catch (e) {
+ logger.error("[runMetadataUpdated] Failed to update metadata", {
+ taskRun: run.id,
+ error:
+ e instanceof Error
+ ? {
+ name: e.name,
+ message: e.message,
+ stack: e.stack,
+ }
+ : e,
+ });
+ }
+ });
+
+ engine.eventBus.on("executionSnapshotCreated", async ({ time, run, snapshot }) => {
+ const eventResult = await recordRunDebugLog(
+ run.id,
+ `${snapshot.executionStatus} - ${snapshot.description}`,
+ {
+ attributes: {
+ properties: {
+ snapshotId: snapshot.id,
+ snapshotDescription: snapshot.description,
+ snapshotStatus: snapshot.executionStatus,
+ workerId: snapshot.workerId ?? undefined,
+ runnerId: snapshot.runnerId ?? undefined,
+ },
+ },
+ startTime: time,
+ }
+ );
+
+ if (!eventResult.success) {
+ logger.error("[executionSnapshotCreated] Failed to record event", {
+ runId: run.id,
+ snapshot,
+ error: eventResult.error,
+ });
+ }
+ });
+
+ engine.eventBus.on("workerNotification", async ({ time, run, snapshot }) => {
+ logger.debug("[workerNotification] Notifying worker", { time, runId: run.id, snapshot });
+
+ // Notify the worker
+ try {
+ const runFriendlyId = RunId.toFriendlyId(run.id);
+ const room = roomFromFriendlyRunId(runFriendlyId);
+
+ //send the notification to connected workers
+ socketIo.workerNamespace
+ .to(room)
+ .emit("run:notify", { version: "1", run: { friendlyId: runFriendlyId } });
+
+ //send the notification to connected dev workers
+ socketIo.devWorkerNamespace
+ .to(room)
+ .emit("run:notify", { version: "1", run: { friendlyId: runFriendlyId } });
+
+ if (!env.RUN_ENGINE_DEBUG_WORKER_NOTIFICATIONS) {
+ return;
+ }
+
+ // Record notification event
+ const eventResult = await recordRunDebugLog(
+ run.id,
+ `run:notify platform -> supervisor: ${snapshot.executionStatus}`,
+ {
+ attributes: {
+ properties: {
+ snapshotId: snapshot.id,
+ snapshotStatus: snapshot.executionStatus,
+ },
+ },
+ startTime: time,
+ }
+ );
+
+ if (!eventResult.success) {
+ logger.error("[workerNotification] Failed to record event", {
+ runId: run.id,
+ snapshot,
+ error: eventResult.error,
+ });
+ }
+ } catch (error) {
+ logger.error("[workerNotification] Failed to notify worker", {
+ error: error instanceof Error ? error.message : error,
+ runId: run.id,
+ snapshot,
+ });
+
+ // Record notification event
+ const eventResult = await recordRunDebugLog(
+ run.id,
+ `run:notify ERROR platform -> supervisor: ${snapshot.executionStatus}`,
+ {
+ attributes: {
+ properties: {
+ snapshotId: snapshot.id,
+ snapshotStatus: snapshot.executionStatus,
+ error: error instanceof Error ? error.message : String(error),
+ },
+ },
+ startTime: time,
+ }
+ );
+
+ if (!eventResult.success) {
+ logger.error("[workerNotification] Failed to record event", {
+ runId: run.id,
+ snapshot,
+ error: eventResult.error,
+ });
+ }
+ }
+ });
+
+ engine.eventBus.on("incomingCheckpointDiscarded", async ({ time, run, snapshot, checkpoint }) => {
+ const eventResult = await recordRunDebugLog(
+ run.id,
+ `Checkpoint discarded: ${checkpoint.discardReason}`,
+ {
+ attributes: {
+ properties: {
+ snapshotId: snapshot.id,
+ ...checkpoint.metadata,
+ },
+ },
+ startTime: time,
+ }
+ );
+
+ if (!eventResult.success) {
+ logger.error("[incomingCheckpointDiscarded] Failed to record event", {
+ runId: run.id,
+ snapshot,
+ error: eventResult.error,
+ });
+ }
+ });
+}
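Every handler registered above wraps its side effect in its own try/catch and logs instead of rethrowing, so one failing consumer cannot break event dispatch. A sketch of an additional handler following the same pattern; the handler body is a placeholder:

```ts
// Sketch: an extra event-bus listener using the same error-isolation pattern.
import { engine } from "~/v3/runEngine.server";
import { logger } from "~/services/logger.server";

export function registerExtraRunEngineHandlers() {
  engine.eventBus.on("runSucceeded", async ({ time, run }) => {
    try {
      // Placeholder side effect, e.g. pushing a metric or audit record.
      logger.debug("run succeeded", { runId: run.id, at: time.toISOString() });
    } catch (error) {
      // Never rethrow: event-bus handlers are fire-and-forget side effects.
      logger.error("extra runSucceeded handler failed", {
        error: error instanceof Error ? error.message : error,
        runId: run.id,
      });
    }
  });
}
```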
diff --git a/apps/webapp/app/v3/runQueue.server.ts b/apps/webapp/app/v3/runQueue.server.ts
new file mode 100644
index 0000000000..7198456d39
--- /dev/null
+++ b/apps/webapp/app/v3/runQueue.server.ts
@@ -0,0 +1,36 @@
+import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { marqs } from "./marqs/index.server";
+import { engine } from "./runEngine.server";
+
+//This allows us to update MARQS and the RunQueue
+
+/** Updates MARQS and the RunQueue limits */
+export async function updateEnvConcurrencyLimits(environment: AuthenticatedEnvironment) {
+ await Promise.allSettled([
+ marqs?.updateEnvConcurrencyLimits(environment),
+ engine.runQueue.updateEnvConcurrencyLimits(environment),
+ ]);
+}
+
+/** Updates MARQS and the RunQueue limits for a queue */
+export async function updateQueueConcurrencyLimits(
+ environment: AuthenticatedEnvironment,
+ queueName: string,
+ concurrency: number
+) {
+ await Promise.allSettled([
+ marqs?.updateQueueConcurrencyLimits(environment, queueName, concurrency),
+ engine.runQueue.updateQueueConcurrencyLimits(environment, queueName, concurrency),
+ ]);
+}
+
+/** Removes MARQS and the RunQueue limits for a queue */
+export async function removeQueueConcurrencyLimits(
+ environment: AuthenticatedEnvironment,
+ queueName: string
+) {
+ await Promise.allSettled([
+ marqs?.removeQueueConcurrencyLimits(environment, queueName),
+ engine.runQueue.removeQueueConcurrencyLimits(environment, queueName),
+ ]);
+}
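These wrappers fan each change out to both MARQS and the Run Engine's queue with `Promise.allSettled`, so one backend failing does not block the other. A sketch of a caller, assuming a settings route that can also clear an override; names are illustrative:

```ts
// Sketch: applying or clearing a per-queue concurrency override via the wrappers.
import {
  removeQueueConcurrencyLimits,
  updateQueueConcurrencyLimits,
} from "~/v3/runQueue.server";
import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";

export async function applyQueueLimit(
  environment: AuthenticatedEnvironment,
  queueName: string,
  limit: number | undefined
) {
  if (limit === undefined) {
    // Clearing removes the override from both MARQS and the Run Engine queue.
    await removeQueueConcurrencyLimits(environment, queueName);
    return;
  }

  await updateQueueConcurrencyLimits(environment, queueName, limit);
}
```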
diff --git a/apps/webapp/app/v3/services/baseService.server.ts b/apps/webapp/app/v3/services/baseService.server.ts
index 4e7c79d46e..7686f41b6f 100644
--- a/apps/webapp/app/v3/services/baseService.server.ts
+++ b/apps/webapp/app/v3/services/baseService.server.ts
@@ -2,6 +2,7 @@ import { Span, SpanKind } from "@opentelemetry/api";
import { PrismaClientOrTransaction, prisma } from "~/db.server";
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
import { attributesFromAuthenticatedEnv, tracer } from "../tracer.server";
+import { engine, RunEngine } from "../runEngine.server";
export abstract class BaseService {
constructor(protected readonly _prisma: PrismaClientOrTransaction = prisma) {}
@@ -37,6 +38,20 @@ export abstract class BaseService {
}
}
+export type WithRunEngineOptions<T> = T & {
+ prisma?: PrismaClientOrTransaction;
+ engine?: RunEngine;
+};
+
+export class WithRunEngine extends BaseService {
+ protected readonly _engine: RunEngine;
+
+ constructor(opts: { prisma?: PrismaClientOrTransaction; engine?: RunEngine } = {}) {
+ super(opts.prisma);
+ this._engine = opts.engine ?? engine;
+ }
+}
+
export class ServiceValidationError extends Error {
constructor(message: string, public status?: number) {
super(message);
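`WithRunEngine` gives services an injectable `RunEngine` (handy for tests) that defaults to the app singleton. A sketch of a subclass; the service name and body are hypothetical, but the constructor shape and `this._engine` usage follow the base class above:

```ts
// Sketch: a service built on the new WithRunEngine base class.
import { WithRunEngine, type WithRunEngineOptions } from "~/v3/services/baseService.server";

type ExampleOptions = WithRunEngineOptions<{ dryRun?: boolean }>;

export class ExampleRunEngineService extends WithRunEngine {
  constructor(private readonly opts: ExampleOptions = {}) {
    // Both prisma and engine are optional; omitted values fall back to the singletons.
    super({ prisma: opts.prisma, engine: opts.engine });
  }

  public async call(batchId: string) {
    if (this.opts.dryRun) return;

    // this._engine is the injected RunEngine or the shared one from runEngine.server.
    return await this._engine.tryCompleteBatch({ batchId });
  }
}
```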
diff --git a/apps/webapp/app/v3/services/batchTriggerV3.server.ts b/apps/webapp/app/v3/services/batchTriggerV3.server.ts
index 4ffd59aacc..467a81f7cf 100644
--- a/apps/webapp/app/v3/services/batchTriggerV3.server.ts
+++ b/apps/webapp/app/v3/services/batchTriggerV3.server.ts
@@ -858,7 +858,7 @@ export class BatchTriggerV3Service extends BaseService {
spanParentAsLink: options?.spanParentAsLink,
batchId: batch.friendlyId,
skipChecks: true,
- runId: task.runId,
+ runFriendlyId: task.runId,
}
);
diff --git a/apps/webapp/app/v3/services/batchTriggerV4.server.ts b/apps/webapp/app/v3/services/batchTriggerV4.server.ts
new file mode 100644
index 0000000000..aaa945f915
--- /dev/null
+++ b/apps/webapp/app/v3/services/batchTriggerV4.server.ts
@@ -0,0 +1,683 @@
+import {
+ BatchTriggerTaskV2RequestBody,
+ BatchTriggerTaskV3RequestBody,
+ BatchTriggerTaskV3Response,
+ IOPacket,
+ packetRequiresOffloading,
+ parsePacket,
+} from "@trigger.dev/core/v3";
+import { BatchId, RunId } from "@trigger.dev/core/v3/apps";
+import { BatchTaskRun, Prisma } from "@trigger.dev/database";
+import { z } from "zod";
+import { $transaction, prisma, PrismaClientOrTransaction } from "~/db.server";
+import { env } from "~/env.server";
+import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
+import { getEntitlement } from "~/services/platform.v3.server";
+import { workerQueue } from "~/services/worker.server";
+import { downloadPacketFromObjectStore, uploadPacketToObjectStore } from "../r2.server";
+import { startActiveSpan } from "../tracer.server";
+import { ServiceValidationError, WithRunEngine } from "./baseService.server";
+import { OutOfEntitlementError, TriggerTaskService } from "./triggerTask.server";
+
+const PROCESSING_BATCH_SIZE = 50;
+const ASYNC_BATCH_PROCESS_SIZE_THRESHOLD = 20;
+const MAX_ATTEMPTS = 10;
+
+export const BatchProcessingStrategy = z.enum(["sequential", "parallel"]);
+export type BatchProcessingStrategy = z.infer<typeof BatchProcessingStrategy>;
+
+export const BatchProcessingOptions = z.object({
+ batchId: z.string(),
+ processingId: z.string(),
+ range: z.object({ start: z.number().int(), count: z.number().int() }),
+ attemptCount: z.number().int(),
+ strategy: BatchProcessingStrategy,
+ parentRunId: z.string().optional(),
+ resumeParentOnCompletion: z.boolean().optional(),
+});
+
+export type BatchProcessingOptions = z.infer<typeof BatchProcessingOptions>;
+
+export type BatchTriggerTaskServiceOptions = {
+ triggerVersion?: string;
+ traceContext?: Record<string, string | undefined>;
+ spanParentAsLink?: boolean;
+ oneTimeUseToken?: string;
+};
+
+/**
+ * Larger batches, used in Run Engine v2
+ */
+export class BatchTriggerV4Service extends WithRunEngine {
+ private _batchProcessingStrategy: BatchProcessingStrategy;
+
+ constructor(
+ batchProcessingStrategy?: BatchProcessingStrategy,
+ protected readonly _prisma: PrismaClientOrTransaction = prisma
+ ) {
+ super({ prisma });
+
+ this._batchProcessingStrategy = batchProcessingStrategy ?? "parallel";
+ }
+
+ public async call(
+ environment: AuthenticatedEnvironment,
+ body: BatchTriggerTaskV3RequestBody,
+ options: BatchTriggerTaskServiceOptions = {}
+ ): Promise<BatchTriggerTaskV3Response> {
+ try {
+ return await this.traceWithEnv(
+ "call()",
+ environment,
+ async (span) => {
+ const { id, friendlyId } = BatchId.generate();
+
+ span.setAttribute("batchId", friendlyId);
+
+ if (environment.type !== "DEVELOPMENT") {
+ const result = await getEntitlement(environment.organizationId);
+ if (result && result.hasAccess === false) {
+ throw new OutOfEntitlementError();
+ }
+ }
+
+ // Upload to object store
+ const payloadPacket = await this.#handlePayloadPacket(
+ body.items,
+ `batch/${friendlyId}`,
+ environment
+ );
+
+ const batch = await this.#createAndProcessBatchTaskRun(
+ friendlyId,
+ payloadPacket,
+ environment,
+ body,
+ options
+ );
+
+ if (!batch) {
+ throw new Error("Failed to create batch");
+ }
+
+ return {
+ id: batch.friendlyId,
+ isCached: false,
+ idempotencyKey: batch.idempotencyKey ?? undefined,
+ runCount: body.items.length,
+ };
+ }
+ );
+ } catch (error) {
+ // Detect a prisma transaction Unique constraint violation
+ if (error instanceof Prisma.PrismaClientKnownRequestError) {
+ logger.debug("BatchTriggerV3: Prisma transaction error", {
+ code: error.code,
+ message: error.message,
+ meta: error.meta,
+ });
+
+ if (error.code === "P2002") {
+ const target = error.meta?.target;
+
+ if (
+ Array.isArray(target) &&
+ target.length > 0 &&
+ typeof target[0] === "string" &&
+ target[0].includes("oneTimeUseToken")
+ ) {
+ throw new ServiceValidationError(
+ "Cannot batch trigger with a one-time use token as it has already been used."
+ );
+ } else {
+ throw new ServiceValidationError(
+ "Cannot batch trigger as it has already been triggered with the same idempotency key."
+ );
+ }
+ }
+ }
+
+ throw error;
+ }
+ }
+
+ async #createAndProcessBatchTaskRun(
+ batchId: string,
+ payloadPacket: IOPacket,
+ environment: AuthenticatedEnvironment,
+ body: BatchTriggerTaskV2RequestBody,
+ options: BatchTriggerTaskServiceOptions = {}
+ ) {
+ if (body.items.length <= ASYNC_BATCH_PROCESS_SIZE_THRESHOLD) {
+ const batch = await this._prisma.batchTaskRun.create({
+ data: {
+ id: BatchId.fromFriendlyId(batchId),
+ friendlyId: batchId,
+ runtimeEnvironmentId: environment.id,
+ runCount: body.items.length,
+ runIds: [],
+ payload: payloadPacket.data,
+ payloadType: payloadPacket.dataType,
+ options,
+ batchVersion: "runengine:v1",
+ oneTimeUseToken: options.oneTimeUseToken,
+ },
+ });
+
+ if (body.parentRunId && body.resumeParentOnCompletion) {
+ await this._engine.blockRunWithCreatedBatch({
+ runId: RunId.fromFriendlyId(body.parentRunId),
+ batchId: batch.id,
+ environmentId: environment.id,
+ projectId: environment.projectId,
+ organizationId: environment.organizationId,
+ });
+ }
+
+ const result = await this.#processBatchTaskRunItems({
+ batch,
+ environment,
+ currentIndex: 0,
+ batchSize: PROCESSING_BATCH_SIZE,
+ items: body.items,
+ options,
+ parentRunId: body.parentRunId,
+ resumeParentOnCompletion: body.resumeParentOnCompletion,
+ });
+
+ switch (result.status) {
+ case "COMPLETE": {
+ logger.debug("[BatchTriggerV3][call] Batch inline processing complete", {
+ batchId: batch.friendlyId,
+ currentIndex: 0,
+ });
+
+ return batch;
+ }
+ case "INCOMPLETE": {
+ logger.debug("[BatchTriggerV3][call] Batch inline processing incomplete", {
+ batchId: batch.friendlyId,
+ currentIndex: result.workingIndex,
+ });
+
+ // If processing inline does not finish for some reason, enqueue processing the rest of the batch
+ await this.#enqueueBatchTaskRun({
+ batchId: batch.id,
+ processingId: "0",
+ range: {
+ start: result.workingIndex,
+ count: PROCESSING_BATCH_SIZE,
+ },
+ attemptCount: 0,
+ strategy: "sequential",
+ parentRunId: body.parentRunId,
+ resumeParentOnCompletion: body.resumeParentOnCompletion,
+ });
+
+ return batch;
+ }
+ case "ERROR": {
+ logger.error("[BatchTriggerV3][call] Batch inline processing error", {
+ batchId: batch.friendlyId,
+ currentIndex: result.workingIndex,
+ error: result.error,
+ });
+
+ await this.#enqueueBatchTaskRun({
+ batchId: batch.id,
+ processingId: "0",
+ range: {
+ start: result.workingIndex,
+ count: PROCESSING_BATCH_SIZE,
+ },
+ attemptCount: 0,
+ strategy: "sequential",
+ parentRunId: body.parentRunId,
+ resumeParentOnCompletion: body.resumeParentOnCompletion,
+ });
+
+ return batch;
+ }
+ }
+ } else {
+ return await $transaction(this._prisma, async (tx) => {
+ const batch = await tx.batchTaskRun.create({
+ data: {
+ id: BatchId.fromFriendlyId(batchId),
+ friendlyId: batchId,
+ runtimeEnvironmentId: environment.id,
+ runCount: body.items.length,
+ runIds: [],
+ payload: payloadPacket.data,
+ payloadType: payloadPacket.dataType,
+ options,
+ batchVersion: "runengine:v1",
+ oneTimeUseToken: options.oneTimeUseToken,
+ },
+ });
+
+ if (body.parentRunId && body.resumeParentOnCompletion) {
+ await this._engine.blockRunWithCreatedBatch({
+ runId: RunId.fromFriendlyId(body.parentRunId),
+ batchId: batch.id,
+ environmentId: environment.id,
+ projectId: environment.projectId,
+ organizationId: environment.organizationId,
+ tx,
+ });
+ }
+
+ switch (this._batchProcessingStrategy) {
+ case "sequential": {
+ await this.#enqueueBatchTaskRun({
+ batchId: batch.id,
+ processingId: batchId,
+ range: { start: 0, count: PROCESSING_BATCH_SIZE },
+ attemptCount: 0,
+ strategy: this._batchProcessingStrategy,
+ parentRunId: body.parentRunId,
+ resumeParentOnCompletion: body.resumeParentOnCompletion,
+ });
+
+ break;
+ }
+ case "parallel": {
+ const ranges = Array.from({
+ length: Math.ceil(body.items.length / PROCESSING_BATCH_SIZE),
+ }).map((_, index) => ({
+ start: index * PROCESSING_BATCH_SIZE,
+ count: PROCESSING_BATCH_SIZE,
+ }));
+
+ await Promise.all(
+ ranges.map((range, index) =>
+ this.#enqueueBatchTaskRun(
+ {
+ batchId: batch.id,
+ processingId: `${index}`,
+ range,
+ attemptCount: 0,
+ strategy: this._batchProcessingStrategy,
+ parentRunId: body.parentRunId,
+ resumeParentOnCompletion: body.resumeParentOnCompletion,
+ },
+ tx
+ )
+ )
+ );
+
+ break;
+ }
+ }
+
+ return batch;
+ });
+ }
+ }
+
+ async processBatchTaskRun(options: BatchProcessingOptions) {
+ logger.debug("[BatchTriggerV3][processBatchTaskRun] Processing batch", {
+ options,
+ });
+
+ const $attemptCount = options.attemptCount + 1;
+
+ // Add early return if max attempts reached
+ if ($attemptCount > MAX_ATTEMPTS) {
+ logger.error("[BatchTriggerV3][processBatchTaskRun] Max attempts reached", {
+ options,
+ attemptCount: $attemptCount,
+ });
+ // You might want to update the batch status to failed here
+ return;
+ }
+
+ const batch = await this._prisma.batchTaskRun.findFirst({
+ where: { id: options.batchId },
+ include: {
+ runtimeEnvironment: {
+ include: {
+ project: true,
+ organization: true,
+ },
+ },
+ },
+ });
+
+ if (!batch) {
+ return;
+ }
+
+ // Check to make sure the currentIndex is not greater than the runCount
+ if (options.range.start >= batch.runCount) {
+ logger.debug("[BatchTriggerV3][processBatchTaskRun] currentIndex is greater than runCount", {
+ options,
+ batchId: batch.friendlyId,
+ runCount: batch.runCount,
+ attemptCount: $attemptCount,
+ });
+
+ return;
+ }
+
+ // Resolve the payload
+ const payloadPacket = await downloadPacketFromObjectStore(
+ {
+ data: batch.payload ?? undefined,
+ dataType: batch.payloadType,
+ },
+ batch.runtimeEnvironment
+ );
+
+ const payload = await parsePacket(payloadPacket);
+
+ if (!payload) {
+ logger.debug("[BatchTriggerV3][processBatchTaskRun] Failed to parse payload", {
+ options,
+ batchId: batch.friendlyId,
+ attemptCount: $attemptCount,
+ });
+
+ throw new Error("Failed to parse payload");
+ }
+
+ // Skip zod parsing
+ const $payload = payload as BatchTriggerTaskV2RequestBody["items"];
+ const $options = batch.options as BatchTriggerTaskServiceOptions;
+
+ const result = await this.#processBatchTaskRunItems({
+ batch,
+ environment: batch.runtimeEnvironment,
+ currentIndex: options.range.start,
+ batchSize: options.range.count,
+ items: $payload,
+ options: $options,
+ parentRunId: options.parentRunId,
+ resumeParentOnCompletion: options.resumeParentOnCompletion,
+ });
+
+ switch (result.status) {
+ case "COMPLETE": {
+ logger.debug("[BatchTriggerV3][processBatchTaskRun] Batch processing complete", {
+ options,
+ batchId: batch.friendlyId,
+ attemptCount: $attemptCount,
+ });
+
+ return;
+ }
+ case "INCOMPLETE": {
+ logger.debug("[BatchTriggerV3][processBatchTaskRun] Batch processing incomplete", {
+ batchId: batch.friendlyId,
+ currentIndex: result.workingIndex,
+ attemptCount: $attemptCount,
+ });
+
+ // Only enqueue the next batch task run if the strategy is sequential
+ // if the strategy is parallel, we will already have enqueued the next batch task run
+ if (options.strategy === "sequential") {
+ await this.#enqueueBatchTaskRun({
+ batchId: batch.id,
+ processingId: options.processingId,
+ range: {
+ start: result.workingIndex,
+ count: options.range.count,
+ },
+ attemptCount: 0,
+ strategy: options.strategy,
+ parentRunId: options.parentRunId,
+ resumeParentOnCompletion: options.resumeParentOnCompletion,
+ });
+ }
+
+ return;
+ }
+ case "ERROR": {
+ logger.error("[BatchTriggerV3][processBatchTaskRun] Batch processing error", {
+ batchId: batch.friendlyId,
+ currentIndex: result.workingIndex,
+ error: result.error,
+ attemptCount: $attemptCount,
+ });
+
+ // if the strategy is sequential, we will requeue processing with a count of the PROCESSING_BATCH_SIZE
+ // if the strategy is parallel, we will requeue processing with a range starting at the workingIndex and a count that is the remainder of this "slice" of the batch
+ if (options.strategy === "sequential") {
+ await this.#enqueueBatchTaskRun({
+ batchId: batch.id,
+ processingId: options.processingId,
+ range: {
+ start: result.workingIndex,
+ count: options.range.count, // This will be the same as the original count
+ },
+ attemptCount: $attemptCount,
+ strategy: options.strategy,
+ parentRunId: options.parentRunId,
+ resumeParentOnCompletion: options.resumeParentOnCompletion,
+ });
+ } else {
+ await this.#enqueueBatchTaskRun({
+ batchId: batch.id,
+ processingId: options.processingId,
+ range: {
+ start: result.workingIndex,
+ // This will be the remainder of the slice
+ // for example if the original range was 0-50 and the workingIndex is 25, the new range will be 25-25
+ // if the original range was 51-100 and the workingIndex is 75, the new range will be 75-25
+ count: options.range.count - result.workingIndex - options.range.start,
+ },
+ attemptCount: $attemptCount,
+ strategy: options.strategy,
+ parentRunId: options.parentRunId,
+ resumeParentOnCompletion: options.resumeParentOnCompletion,
+ });
+ }
+
+ return;
+ }
+ }
+ }
+
+ async #processBatchTaskRunItems({
+ batch,
+ environment,
+ currentIndex,
+ batchSize,
+ items,
+ options,
+ parentRunId,
+ resumeParentOnCompletion,
+ }: {
+ batch: BatchTaskRun;
+ environment: AuthenticatedEnvironment;
+ currentIndex: number;
+ batchSize: number;
+ items: BatchTriggerTaskV2RequestBody["items"];
+ options?: BatchTriggerTaskServiceOptions;
+ parentRunId?: string | undefined;
+ resumeParentOnCompletion?: boolean | undefined;
+ }): Promise<
+ | { status: "COMPLETE" }
+ | { status: "INCOMPLETE"; workingIndex: number }
+ | { status: "ERROR"; error: string; workingIndex: number }
+ > {
+ // Grab the next PROCESSING_BATCH_SIZE items
+ const itemsToProcess = items.slice(currentIndex, currentIndex + batchSize);
+
+ logger.debug("[BatchTriggerV3][processBatchTaskRun] Processing batch items", {
+ batchId: batch.friendlyId,
+ currentIndex,
+ runCount: batch.runCount,
+ });
+
+ let workingIndex = currentIndex;
+
+ let runIds: string[] = [];
+
+ for (const item of itemsToProcess) {
+ try {
+ const run = await this.#processBatchTaskRunItem({
+ batch,
+ environment,
+ item,
+ currentIndex: workingIndex,
+ options,
+ parentRunId,
+ resumeParentOnCompletion,
+ });
+
+ if (!run) {
+ logger.error("[BatchTriggerV3][processBatchTaskRun] Failed to process item", {
+ batchId: batch.friendlyId,
+ currentIndex: workingIndex,
+ });
+
+ throw new Error("[BatchTriggerV3][processBatchTaskRun] Failed to process item");
+ }
+
+ runIds.push(run.friendlyId);
+
+ workingIndex++;
+ } catch (error) {
+ logger.error("[BatchTriggerV3][processBatchTaskRun] Failed to process item", {
+ batchId: batch.friendlyId,
+ currentIndex: workingIndex,
+ error,
+ });
+
+ return {
+ status: "ERROR",
+ error: error instanceof Error ? error.message : String(error),
+ workingIndex,
+ };
+ }
+ }
+
+ //add the run ids to the batch
+ const updatedBatch = await this._prisma.batchTaskRun.update({
+ where: { id: batch.id },
+ data: {
+ runIds: {
+ push: runIds,
+ },
+ },
+ });
+
+ // if there are more items to process, requeue the batch
+ if (workingIndex < batch.runCount) {
+ return { status: "INCOMPLETE", workingIndex };
+ }
+
+ //triggered all the runs
+ if (updatedBatch.runIds.length === updatedBatch.runCount) {
+ //unblock the parent run from the batch
+ //this prevents the parent continuing before all the runs are created
+ if (parentRunId && resumeParentOnCompletion) {
+ await this._engine.unblockRunForCreatedBatch({
+ runId: RunId.fromFriendlyId(parentRunId),
+ batchId: batch.id,
+ environmentId: environment.id,
+ projectId: environment.projectId,
+ });
+ }
+
+ //if all the runs were idempotent, it's possible the batch is already completed
+ await this._engine.tryCompleteBatch({ batchId: batch.id });
+ }
+
+ return { status: "COMPLETE" };
+ }
+
+ async #processBatchTaskRunItem({
+ batch,
+ environment,
+ item,
+ currentIndex,
+ options,
+ parentRunId,
+ resumeParentOnCompletion,
+ }: {
+ batch: BatchTaskRun;
+ environment: AuthenticatedEnvironment;
+ item: BatchTriggerTaskV2RequestBody["items"][number];
+ currentIndex: number;
+ options?: BatchTriggerTaskServiceOptions;
+ parentRunId: string | undefined;
+ resumeParentOnCompletion: boolean | undefined;
+ }) {
+ logger.debug("[BatchTriggerV3][processBatchTaskRunItem] Processing item", {
+ batchId: batch.friendlyId,
+ currentIndex,
+ });
+
+ const triggerTaskService = new TriggerTaskService();
+
+ const result = await triggerTaskService.call(
+ item.task,
+ environment,
+ {
+ ...item,
+ options: {
+ ...item.options,
+ parentRunId,
+ resumeParentOnCompletion,
+ parentBatch: batch.id,
+ },
+ },
+ {
+ triggerVersion: options?.triggerVersion,
+ traceContext: options?.traceContext,
+ spanParentAsLink: options?.spanParentAsLink,
+ batchId: batch.id,
+ batchIndex: currentIndex,
+ },
+ "V2"
+ );
+
+ return result
+ ? {
+ friendlyId: result.run.friendlyId,
+ }
+ : undefined;
+ }
+
+ async #enqueueBatchTaskRun(options: BatchProcessingOptions, tx?: PrismaClientOrTransaction) {
+ await workerQueue.enqueue("v3.processBatchTaskRunV3", options, {
+ tx,
+ jobKey: `BatchTriggerV3Service.process:${options.batchId}:${options.processingId}`,
+ });
+ }
+
+ async #handlePayloadPacket(
+ payload: any,
+ pathPrefix: string,
+ environment: AuthenticatedEnvironment
+ ) {
+ return await startActiveSpan("handlePayloadPacket()", async (span) => {
+ const packet = { data: JSON.stringify(payload), dataType: "application/json" };
+
+ if (!packet.data) {
+ return packet;
+ }
+
+ const { needsOffloading } = packetRequiresOffloading(
+ packet,
+ env.TASK_PAYLOAD_OFFLOAD_THRESHOLD
+ );
+
+ if (!needsOffloading) {
+ return packet;
+ }
+
+ const filename = `${pathPrefix}/payload.json`;
+
+ await uploadPacketToObjectStore(filename, packet.data, packet.dataType, environment);
+
+ return {
+ data: filename,
+ dataType: "application/store",
+ };
+ });
+ }
+}
diff --git a/apps/webapp/app/v3/services/cancelTaskRun.server.ts b/apps/webapp/app/v3/services/cancelTaskRun.server.ts
index b78e75264b..811fd54d64 100644
--- a/apps/webapp/app/v3/services/cancelTaskRun.server.ts
+++ b/apps/webapp/app/v3/services/cancelTaskRun.server.ts
@@ -1,28 +1,10 @@
-import { type Prisma, type TaskRun } from "@trigger.dev/database";
-import assertNever from "assert-never";
+import { RunEngineVersion, type TaskRun } from "@trigger.dev/database";
import { logger } from "~/services/logger.server";
import { eventRepository } from "../eventRepository.server";
-import { socketIo } from "../handleSocketIo.server";
-import { devPubSub } from "../marqs/devPubSub.server";
-import { CANCELLABLE_ATTEMPT_STATUSES, isCancellableRunStatus } from "../taskStatus";
-import { BaseService } from "./baseService.server";
-import { CancelAttemptService } from "./cancelAttempt.server";
-import { CancelTaskAttemptDependenciesService } from "./cancelTaskAttemptDependencies.server";
-import { FinalizeTaskRunService } from "./finalizeTaskRun.server";
+import { engine } from "../runEngine.server";
import { getTaskEventStoreTableForRun } from "../taskEventStore.server";
-
-type ExtendedTaskRun = Prisma.TaskRunGetPayload<{
- include: {
- runtimeEnvironment: true;
- lockedToVersion: true;
- };
-}>;
-
-type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{
- include: {
- backgroundWorker: true;
- };
-}>;
+import { BaseService } from "./baseService.server";
+import { CancelTaskRunServiceV1 } from "./cancelTaskRunV1.server";
export type CancelTaskRunServiceOptions = {
reason?: string;
@@ -30,58 +12,39 @@ export type CancelTaskRunServiceOptions = {
cancelledAt?: Date;
};
-export class CancelTaskRunService extends BaseService {
- public async call(taskRun: TaskRun, options?: CancelTaskRunServiceOptions) {
- const opts = {
- reason: "Task run was cancelled by user",
- cancelAttempts: true,
- cancelledAt: new Date(),
- ...options,
- };
+type CancelTaskRunServiceResult = {
+ id: string;
+};
- // Make sure the task run is in a cancellable state
- if (!isCancellableRunStatus(taskRun.status)) {
- logger.error("Task run is not in a cancellable state", {
- runId: taskRun.id,
- status: taskRun.status,
- });
- return;
+export class CancelTaskRunService extends BaseService {
+ public async call(
+ taskRun: TaskRun,
+ options?: CancelTaskRunServiceOptions
+ ): Promise<CancelTaskRunServiceResult | undefined> {
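+ // Dispatch on the run's engine version: V1 runs use the legacy service, everything else goes through the run engine.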
+ if (taskRun.engine === RunEngineVersion.V1) {
+ return await this.callV1(taskRun, options);
+ } else {
+ return await this.callV2(taskRun, options);
}
+ }
- const finalizeService = new FinalizeTaskRunService();
- const cancelledTaskRun = await finalizeService.call({
- id: taskRun.id,
- status: "CANCELED",
- completedAt: opts.cancelledAt,
- include: {
- attempts: {
- where: {
- status: {
- in: CANCELLABLE_ATTEMPT_STATUSES,
- },
- },
- include: {
- backgroundWorker: true,
- dependencies: {
- include: {
- taskRun: true,
- },
- },
- batchTaskRunItems: {
- include: {
- taskRun: true,
- },
- },
- },
- },
- runtimeEnvironment: true,
- lockedToVersion: true,
- },
- attemptStatus: "CANCELED",
- error: {
- type: "STRING_ERROR",
- raw: opts.reason,
- },
+ private async callV1(
+ taskRun: TaskRun,
+ options?: CancelTaskRunServiceOptions
+ ): Promise<CancelTaskRunServiceResult | undefined> {
+ const service = new CancelTaskRunServiceV1(this._prisma);
+ return await service.call(taskRun, options);
+ }
+
+ private async callV2(
+ taskRun: TaskRun,
+ options?: CancelTaskRunServiceOptions
+ ): Promise<CancelTaskRunServiceResult | undefined> {
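+ // V2 runs are cancelled through the run engine rather than directly in the webapp.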
+ const result = await engine.cancelRun({
+ runId: taskRun.id,
+ completedAt: options?.cancelledAt,
+ reason: options?.reason,
+ tx: this._prisma,
});
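+ // In-progress trace events are still cancelled in the event repository, regardless of engine version.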
const inProgressEvents = await eventRepository.queryIncompleteEvents(
@@ -99,94 +62,16 @@ export class CancelTaskRunService extends BaseService {
await Promise.all(
inProgressEvents.map((event) => {
- return eventRepository.cancelEvent(event, opts.cancelledAt, opts.reason);
+ return eventRepository.cancelEvent(
+ event,
+ options?.cancelledAt ?? new Date(),
+ options?.reason ?? "Run cancelled"
+ );
})
);
- // Cancel any in progress attempts
- if (opts.cancelAttempts) {
- await this.#cancelPotentiallyRunningAttempts(cancelledTaskRun, cancelledTaskRun.attempts);
- await this.#cancelRemainingRunWorkers(cancelledTaskRun);
- }
-
return {
- id: cancelledTaskRun.id,
+ id: result.run.id,
};
}
-
- async #cancelPotentiallyRunningAttempts(
- run: ExtendedTaskRun,
- attempts: ExtendedTaskRunAttempt[]
- ) {
- for (const attempt of attempts) {
- await CancelTaskAttemptDependenciesService.enqueue(attempt.id, this._prisma);
-
- if (run.runtimeEnvironment.type === "DEVELOPMENT") {
- // Signal the task run attempt to stop
- await devPubSub.publish(
- `backgroundWorker:${attempt.backgroundWorkerId}:${attempt.id}`,
- "CANCEL_ATTEMPT",
- {
- attemptId: attempt.friendlyId,
- backgroundWorkerId: attempt.backgroundWorker.friendlyId,
- taskRunId: run.friendlyId,
- }
- );
- } else {
- switch (attempt.status) {
- case "EXECUTING": {
- // We need to send a cancel message to the coordinator
- socketIo.coordinatorNamespace.emit("REQUEST_ATTEMPT_CANCELLATION", {
- version: "v1",
- attemptId: attempt.id,
- attemptFriendlyId: attempt.friendlyId,
- });
-
- break;
- }
- case "PENDING":
- case "PAUSED": {
- logger.debug("Cancelling pending or paused attempt", {
- attempt,
- });
-
- const service = new CancelAttemptService();
-
- await service.call(
- attempt.friendlyId,
- run.id,
- new Date(),
- "Task run was cancelled by user"
- );
-
- break;
- }
- case "CANCELED":
- case "COMPLETED":
- case "FAILED": {
- // Do nothing
- break;
- }
- default: {
- assertNever(attempt.status);
- }
- }
- }
- }
- }
-
- async #cancelRemainingRunWorkers(run: ExtendedTaskRun) {
- if (run.runtimeEnvironment.type === "DEVELOPMENT") {
- // Nothing to do
- return;
- }
-
- // Broadcast cancel message to all coordinators
- socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
- version: "v1",
- runId: run.id,
- // Give the attempts some time to exit gracefully. If the runs supports lazy attempts, it also supports exit delays.
- delayInMs: run.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined,
- });
- }
}
diff --git a/apps/webapp/app/v3/services/cancelTaskRunV1.server.ts b/apps/webapp/app/v3/services/cancelTaskRunV1.server.ts
new file mode 100644
index 0000000000..78aec652f7
--- /dev/null
+++ b/apps/webapp/app/v3/services/cancelTaskRunV1.server.ts
@@ -0,0 +1,192 @@
+import { type Prisma, type TaskRun } from "@trigger.dev/database";
+import assertNever from "assert-never";
+import { logger } from "~/services/logger.server";
+import { eventRepository } from "../eventRepository.server";
+import { socketIo } from "../handleSocketIo.server";
+import { devPubSub } from "../marqs/devPubSub.server";
+import { CANCELLABLE_ATTEMPT_STATUSES, isCancellableRunStatus } from "../taskStatus";
+import { BaseService } from "./baseService.server";
+import { CancelAttemptService } from "./cancelAttempt.server";
+import { CancelTaskAttemptDependenciesService } from "./cancelTaskAttemptDependencies.server";
+import { FinalizeTaskRunService } from "./finalizeTaskRun.server";
+import { getTaskEventStoreTableForRun } from "../taskEventStore.server";
+
+type ExtendedTaskRun = Prisma.TaskRunGetPayload<{
+ include: {
+ runtimeEnvironment: true;
+ lockedToVersion: true;
+ };
+}>;
+
+type ExtendedTaskRunAttempt = Prisma.TaskRunAttemptGetPayload<{
+ include: {
+ backgroundWorker: true;
+ };
+}>;
+
+export type CancelTaskRunServiceOptions = {
+ reason?: string;
+ cancelAttempts?: boolean;
+ cancelledAt?: Date;
+};
+
+export class CancelTaskRunServiceV1 extends BaseService {
+ public async call(taskRun: TaskRun, options?: CancelTaskRunServiceOptions) {
+ const opts = {
+ reason: "Task run was cancelled by user",
+ cancelAttempts: true,
+ cancelledAt: new Date(),
+ ...options,
+ };
+
+ // Make sure the task run is in a cancellable state
+ if (!isCancellableRunStatus(taskRun.status)) {
+ logger.error("Task run is not in a cancellable state", {
+ runId: taskRun.id,
+ status: taskRun.status,
+ });
+ return;
+ }
+
+ const finalizeService = new FinalizeTaskRunService();
+ const cancelledTaskRun = await finalizeService.call({
+ id: taskRun.id,
+ status: "CANCELED",
+ completedAt: opts.cancelledAt,
+ include: {
+ attempts: {
+ where: {
+ status: {
+ in: CANCELLABLE_ATTEMPT_STATUSES,
+ },
+ },
+ include: {
+ backgroundWorker: true,
+ dependencies: {
+ include: {
+ taskRun: true,
+ },
+ },
+ batchTaskRunItems: {
+ include: {
+ taskRun: true,
+ },
+ },
+ },
+ },
+ runtimeEnvironment: true,
+ lockedToVersion: true,
+ },
+ attemptStatus: "CANCELED",
+ error: {
+ type: "STRING_ERROR",
+ raw: opts.reason,
+ },
+ });
+
+ const inProgressEvents = await eventRepository.queryIncompleteEvents(
+ getTaskEventStoreTableForRun(taskRun),
+ {
+ runId: taskRun.friendlyId,
+ },
+ taskRun.createdAt,
+ taskRun.completedAt ?? undefined
+ );
+
+ logger.debug("Cancelling in-progress events", {
+ inProgressEvents: inProgressEvents.map((event) => event.id),
+ });
+
+ await Promise.all(
+ inProgressEvents.map((event) => {
+ return eventRepository.cancelEvent(event, opts.cancelledAt, opts.reason);
+ })
+ );
+
+ // Cancel any in progress attempts
+ if (opts.cancelAttempts) {
+ await this.#cancelPotentiallyRunningAttempts(cancelledTaskRun, cancelledTaskRun.attempts);
+ await this.#cancelRemainingRunWorkers(cancelledTaskRun);
+ }
+
+ return {
+ id: cancelledTaskRun.id,
+ };
+ }
+
+ async #cancelPotentiallyRunningAttempts(
+ run: ExtendedTaskRun,
+ attempts: ExtendedTaskRunAttempt[]
+ ) {
+ for (const attempt of attempts) {
+ await CancelTaskAttemptDependenciesService.enqueue(attempt.id, this._prisma);
+
+ if (run.runtimeEnvironment.type === "DEVELOPMENT") {
+ // Signal the task run attempt to stop
+ await devPubSub.publish(
+ `backgroundWorker:${attempt.backgroundWorkerId}:${attempt.id}`,
+ "CANCEL_ATTEMPT",
+ {
+ attemptId: attempt.friendlyId,
+ backgroundWorkerId: attempt.backgroundWorker.friendlyId,
+ taskRunId: run.friendlyId,
+ }
+ );
+ } else {
+ switch (attempt.status) {
+ case "EXECUTING": {
+ // We need to send a cancel message to the coordinator
+ socketIo.coordinatorNamespace.emit("REQUEST_ATTEMPT_CANCELLATION", {
+ version: "v1",
+ attemptId: attempt.id,
+ attemptFriendlyId: attempt.friendlyId,
+ });
+
+ break;
+ }
+ case "PENDING":
+ case "PAUSED": {
+ logger.debug("Cancelling pending or paused attempt", {
+ attempt,
+ });
+
+ const service = new CancelAttemptService();
+
+ await service.call(
+ attempt.friendlyId,
+ run.id,
+ new Date(),
+ "Task run was cancelled by user"
+ );
+
+ break;
+ }
+ case "CANCELED":
+ case "COMPLETED":
+ case "FAILED": {
+ // Do nothing
+ break;
+ }
+ default: {
+ assertNever(attempt.status);
+ }
+ }
+ }
+ }
+ }
+
+ async #cancelRemainingRunWorkers(run: ExtendedTaskRun) {
+ if (run.runtimeEnvironment.type === "DEVELOPMENT") {
+ // Nothing to do
+ return;
+ }
+
+ // Broadcast cancel message to all coordinators
+ socketIo.coordinatorNamespace.emit("REQUEST_RUN_CANCELLATION", {
+ version: "v1",
+ runId: run.id,
+ // Give the attempts some time to exit gracefully. If the run supports lazy attempts, it also supports exit delays.
+ delayInMs: run.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined,
+ });
+ }
+}
diff --git a/apps/webapp/app/v3/services/changeCurrentDeployment.server.ts b/apps/webapp/app/v3/services/changeCurrentDeployment.server.ts
index 9a28fc503a..a5740bfe90 100644
--- a/apps/webapp/app/v3/services/changeCurrentDeployment.server.ts
+++ b/apps/webapp/app/v3/services/changeCurrentDeployment.server.ts
@@ -1,8 +1,8 @@
import { WorkerDeployment } from "@trigger.dev/database";
-import { CURRENT_DEPLOYMENT_LABEL } from "~/consts";
import { BaseService, ServiceValidationError } from "./baseService.server";
import { ExecuteTasksWaitingForDeployService } from "./executeTasksWaitingForDeploy";
import { compareDeploymentVersions } from "../utils/deploymentVersions";
+import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/apps";
export type ChangeCurrentDeploymentDirection = "promote" | "rollback";
diff --git a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts
index 58c37f463c..fb5e6eeef0 100644
--- a/apps/webapp/app/v3/services/createBackgroundWorker.server.ts
+++ b/apps/webapp/app/v3/services/createBackgroundWorker.server.ts
@@ -16,6 +16,12 @@ import { RegisterNextTaskScheduleInstanceService } from "./registerNextTaskSched
import cronstrue from "cronstrue";
import { CheckScheduleService } from "./checkSchedule.server";
import { clampMaxDuration } from "../utils/maxDuration";
+import {
+ removeQueueConcurrencyLimits,
+ updateEnvConcurrencyLimits,
+ updateQueueConcurrencyLimits,
+} from "../runQueue.server";
+import { BackgroundWorkerId } from "@trigger.dev/core/v3/apps";
import { sanitizeQueueName } from "~/models/taskQueue.server";
export class CreateBackgroundWorkerService extends BaseService {
@@ -64,7 +70,7 @@ export class CreateBackgroundWorkerService extends BaseService {
const backgroundWorker = await this._prisma.backgroundWorker.create({
data: {
- friendlyId: generateFriendlyId("worker"),
+ ...BackgroundWorkerId.generate(),
version: nextVersion,
runtimeEnvironmentId: environment.id,
projectId: project.id,
@@ -73,9 +79,22 @@ export class CreateBackgroundWorkerService extends BaseService {
cliVersion: body.metadata.cliPackageVersion,
sdkVersion: body.metadata.packageVersion,
supportsLazyAttempts: body.supportsLazyAttempts,
+ engine: body.engine,
},
});
+ //upgrade the project to engine "V2" if it's not already
+ if (project.engine === "V1" && body.engine === "V2") {
+ await this._prisma.project.update({
+ where: {
+ id: project.id,
+ },
+ data: {
+ engine: "V2",
+ },
+ });
+ }
+
const tasksToBackgroundFiles = await createBackgroundFiles(
body.metadata.sourceFiles,
backgroundWorker,
@@ -110,7 +129,7 @@ export class CreateBackgroundWorkerService extends BaseService {
}
);
- await marqs?.updateEnvConcurrencyLimits(environment);
+ await updateEnvConcurrencyLimits(environment);
} catch (err) {
logger.error(
"Error publishing WORKER_CREATED event or updating global concurrency limits",
@@ -212,7 +231,7 @@ export async function createBackgroundTasks(
concurrencyLimit,
taskidentifier: task.id,
});
- await marqs?.updateQueueConcurrencyLimits(environment, taskQueue.name, concurrencyLimit);
+ await updateQueueConcurrencyLimits(environment, taskQueue.name, concurrencyLimit);
} else {
logger.debug("CreateBackgroundWorkerService: removing concurrency limit", {
workerId: worker.id,
@@ -223,8 +242,7 @@ export async function createBackgroundTasks(
concurrencyLimit,
taskidentifier: task.id,
});
-
- await marqs?.removeQueueConcurrencyLimits(environment, taskQueue.name);
+ await removeQueueConcurrencyLimits(environment, taskQueue.name);
}
} catch (error) {
if (error instanceof Prisma.PrismaClientKnownRequestError) {
diff --git a/apps/webapp/app/v3/services/createCheckpoint.server.ts b/apps/webapp/app/v3/services/createCheckpoint.server.ts
index e95f6cedeb..85f6eb8192 100644
--- a/apps/webapp/app/v3/services/createCheckpoint.server.ts
+++ b/apps/webapp/app/v3/services/createCheckpoint.server.ts
@@ -3,12 +3,12 @@ import type { InferSocketMessageSchema } from "@trigger.dev/core/v3/zodSocket";
import type { Checkpoint, CheckpointRestoreEvent } from "@trigger.dev/database";
import { logger } from "~/services/logger.server";
import { marqs } from "~/v3/marqs/index.server";
-import { generateFriendlyId } from "../friendlyIdentifiers";
import { isFreezableAttemptStatus, isFreezableRunStatus } from "../taskStatus";
import { BaseService } from "./baseService.server";
import { CreateCheckpointRestoreEventService } from "./createCheckpointRestoreEvent.server";
import { ResumeBatchRunService } from "./resumeBatchRun.server";
import { ResumeDependentParentsService } from "./resumeDependentParents.server";
+import { CheckpointId } from "@trigger.dev/core/v3/apps";
export class CreateCheckpointService extends BaseService {
public async call(
@@ -209,7 +209,7 @@ export class CreateCheckpointService extends BaseService {
const checkpoint = await this._prisma.checkpoint.create({
data: {
- friendlyId: generateFriendlyId("checkpoint"),
+ ...CheckpointId.generate(),
runtimeEnvironmentId: attempt.taskRun.runtimeEnvironmentId,
projectId: attempt.taskRun.projectId,
attemptId: attempt.id,
diff --git a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts
index c46f5b0266..2f378ffdc1 100644
--- a/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts
+++ b/apps/webapp/app/v3/services/createDeployedBackgroundWorker.server.ts
@@ -1,17 +1,17 @@
import { CreateBackgroundWorkerRequestBody } from "@trigger.dev/core/v3";
import type { BackgroundWorker } from "@trigger.dev/database";
+import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/apps";
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
-import { generateFriendlyId } from "../friendlyIdentifiers";
+import { logger } from "~/services/logger.server";
+import { socketIo } from "../handleSocketIo.server";
+import { updateEnvConcurrencyLimits } from "../runQueue.server";
+import { PerformDeploymentAlertsService } from "./alerts/performDeploymentAlerts.server";
import { BaseService } from "./baseService.server";
import { createBackgroundTasks, syncDeclarativeSchedules } from "./createBackgroundWorker.server";
-import { CURRENT_DEPLOYMENT_LABEL } from "~/consts";
-import { projectPubSub } from "./projectPubSub.server";
-import { marqs } from "~/v3/marqs/index.server";
-import { logger } from "~/services/logger.server";
import { ExecuteTasksWaitingForDeployService } from "./executeTasksWaitingForDeploy";
-import { PerformDeploymentAlertsService } from "./alerts/performDeploymentAlerts.server";
+import { projectPubSub } from "./projectPubSub.server";
import { TimeoutDeploymentService } from "./timeoutDeployment.server";
-import { socketIo } from "../handleSocketIo.server";
+import { BackgroundWorkerId } from "@trigger.dev/core/v3/apps";
export class CreateDeployedBackgroundWorkerService extends BaseService {
public async call(
@@ -39,7 +39,7 @@ export class CreateDeployedBackgroundWorkerService extends BaseService {
const backgroundWorker = await this._prisma.backgroundWorker.create({
data: {
- friendlyId: generateFriendlyId("worker"),
+ ...BackgroundWorkerId.generate(),
version: deployment.version,
runtimeEnvironmentId: environment.id,
projectId: environment.projectId,
@@ -48,9 +48,22 @@ export class CreateDeployedBackgroundWorkerService extends BaseService {
cliVersion: body.metadata.cliPackageVersion,
sdkVersion: body.metadata.packageVersion,
supportsLazyAttempts: body.supportsLazyAttempts,
+ engine: body.engine,
},
});
+ //upgrade the project to engine "V2" if it's not already
+ if (environment.project.engine === "V1" && body.engine === "V2") {
+ await this._prisma.project.update({
+ where: {
+ id: environment.project.id,
+ },
+ data: {
+ engine: "V2",
+ },
+ });
+ }
+
try {
await createBackgroundTasks(
body.metadata.tasks,
@@ -128,7 +141,7 @@ export class CreateDeployedBackgroundWorkerService extends BaseService {
type: "deployed",
}
);
- await marqs?.updateEnvConcurrencyLimits(environment);
+ await updateEnvConcurrencyLimits(environment);
} catch (err) {
logger.error("Failed to publish WORKER_CREATED event", { err });
}
diff --git a/apps/webapp/app/v3/services/createDeploymentBackgroundWorker.server.ts b/apps/webapp/app/v3/services/createDeploymentBackgroundWorker.server.ts
index 826ac841b0..53f2dedd88 100644
--- a/apps/webapp/app/v3/services/createDeploymentBackgroundWorker.server.ts
+++ b/apps/webapp/app/v3/services/createDeploymentBackgroundWorker.server.ts
@@ -9,7 +9,7 @@ import {
syncDeclarativeSchedules,
} from "./createBackgroundWorker.server";
import { TimeoutDeploymentService } from "./timeoutDeployment.server";
-import { logger } from "~/services/logger.server";
+import { BackgroundWorkerId } from "@trigger.dev/core/v3/apps";
export class CreateDeploymentBackgroundWorkerService extends BaseService {
public async call(
@@ -36,7 +36,7 @@ export class CreateDeploymentBackgroundWorkerService extends BaseService {
const backgroundWorker = await this._prisma.backgroundWorker.create({
data: {
- friendlyId: generateFriendlyId("worker"),
+ ...BackgroundWorkerId.generate(),
version: deployment.version,
runtimeEnvironmentId: environment.id,
projectId: environment.projectId,
@@ -45,9 +45,22 @@ export class CreateDeploymentBackgroundWorkerService extends BaseService {
cliVersion: body.metadata.cliPackageVersion,
sdkVersion: body.metadata.packageVersion,
supportsLazyAttempts: body.supportsLazyAttempts,
+ engine: body.engine,
},
});
+ //upgrade the project to engine "V2" if it's not already
+ if (environment.project.engine === "V1" && body.engine === "V2") {
+ await this._prisma.project.update({
+ where: {
+ id: environment.project.id,
+ },
+ data: {
+ engine: "V2",
+ },
+ });
+ }
+
try {
const tasksToBackgroundFiles = await createBackgroundFiles(
body.metadata.sourceFiles,
@@ -97,7 +110,7 @@ export class CreateDeploymentBackgroundWorkerService extends BaseService {
data: {
status: "DEPLOYING",
workerId: backgroundWorker.id,
- deployedAt: new Date(),
+ builtAt: new Date(),
},
});
diff --git a/apps/webapp/app/v3/services/finalizeDeployment.server.ts b/apps/webapp/app/v3/services/finalizeDeployment.server.ts
index b25582dd44..a69b21c02e 100644
--- a/apps/webapp/app/v3/services/finalizeDeployment.server.ts
+++ b/apps/webapp/app/v3/services/finalizeDeployment.server.ts
@@ -1,9 +1,10 @@
import { FinalizeDeploymentRequestBody } from "@trigger.dev/core/v3/schemas";
+import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/apps";
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
import { logger } from "~/services/logger.server";
import { socketIo } from "../handleSocketIo.server";
-import { marqs } from "../marqs/index.server";
import { registryProxy } from "../registryProxy.server";
+import { updateEnvConcurrencyLimits } from "../runQueue.server";
import { PerformDeploymentAlertsService } from "./alerts/performDeploymentAlerts.server";
import { BaseService, ServiceValidationError } from "./baseService.server";
import { ChangeCurrentDeploymentService } from "./changeCurrentDeployment.server";
@@ -91,7 +92,7 @@ export class FinalizeDeploymentService extends BaseService {
}
);
- await marqs?.updateEnvConcurrencyLimits(authenticatedEnv);
+ await updateEnvConcurrencyLimits(authenticatedEnv);
} catch (err) {
logger.error("Failed to publish WORKER_CREATED event", { err });
}
diff --git a/apps/webapp/app/v3/services/heartbeatService.server.ts b/apps/webapp/app/v3/services/heartbeatService.server.ts
deleted file mode 100644
index 8db36b2beb..0000000000
--- a/apps/webapp/app/v3/services/heartbeatService.server.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-type HeartbeatServiceOptions = {
- heartbeat: () => Promise<void>;
- pingIntervalInMs?: number;
- leadingEdge?: boolean;
-};
-
-export class HeartbeatService {
- private _heartbeat: () => Promise<void>;
- private _heartbeatIntervalInMs: number;
- private _nextHeartbeat: NodeJS.Timeout | undefined;
- private _leadingEdge: boolean;
-
- constructor(opts: HeartbeatServiceOptions) {
- this._heartbeat = opts.heartbeat;
- this._heartbeatIntervalInMs = opts.pingIntervalInMs ?? 45_000;
- this._nextHeartbeat = undefined;
- this._leadingEdge = opts.leadingEdge ?? false;
- }
-
- start() {
- if (this._leadingEdge) {
- this.#doHeartbeat();
- } else {
- this.#scheduleNextHeartbeat();
- }
- }
-
- stop() {
- this.#clearNextHeartbeat();
- }
-
- #doHeartbeat = async () => {
- this.#clearNextHeartbeat();
-
- await this._heartbeat();
-
- this.#scheduleNextHeartbeat();
- };
-
- #clearNextHeartbeat() {
- if (this._nextHeartbeat) {
- clearTimeout(this._nextHeartbeat);
- }
- }
-
- #scheduleNextHeartbeat() {
- this._nextHeartbeat = setTimeout(this.#doHeartbeat, this._heartbeatIntervalInMs);
- }
-}
diff --git a/apps/webapp/app/v3/services/initializeDeployment.server.ts b/apps/webapp/app/v3/services/initializeDeployment.server.ts
index 12be3ee783..c5a375ba90 100644
--- a/apps/webapp/app/v3/services/initializeDeployment.server.ts
+++ b/apps/webapp/app/v3/services/initializeDeployment.server.ts
@@ -4,9 +4,11 @@ import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
import { generateFriendlyId } from "../friendlyIdentifiers";
import { createRemoteImageBuild } from "../remoteImageBuilder.server";
import { calculateNextBuildVersion } from "../utils/calculateNextBuildVersion";
-import { BaseService } from "./baseService.server";
+import { BaseService, ServiceValidationError } from "./baseService.server";
import { TimeoutDeploymentService } from "./timeoutDeployment.server";
import { env } from "~/env.server";
+import { WorkerDeploymentType } from "@trigger.dev/database";
+import { logger } from "~/services/logger.server";
const nanoid = customAlphabet("1234567890abcdefghijklmnopqrstuvwxyz", 8);
@@ -16,6 +18,10 @@ export class InitializeDeploymentService extends BaseService {
payload: InitializeDeploymentRequestBody
) {
return this.traceWithEnv("call", environment, async (span) => {
+ if (payload.type !== "V1" && environment.project.engine !== "V2") {
+ throw new ServiceValidationError("Only V1 deployments are supported for this project");
+ }
+
const latestDeployment = await this._prisma.workerDeployment.findFirst({
where: {
environmentId: environment.id,
@@ -46,6 +52,36 @@ export class InitializeDeploymentService extends BaseService {
})
: undefined;
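+ // Managed deployments get a shared image tag in the deploy registry namespace; unmanaged deployments assemble their image reference from the optional registry host and namespace.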
+ const sharedImageTag = `${payload.namespace ?? env.DEPLOY_REGISTRY_NAMESPACE}/${
+ environment.project.externalRef
+ }:${nextVersion}.${environment.slug}`;
+
+ const unmanagedImageParts = [];
+
+ if (payload.registryHost) {
+ unmanagedImageParts.push(payload.registryHost);
+ }
+ if (payload.namespace) {
+ unmanagedImageParts.push(payload.namespace);
+ }
+ unmanagedImageParts.push(
+ `${environment.project.externalRef}:${nextVersion}.${environment.slug}`
+ );
+
+ const unmanagedImageTag = unmanagedImageParts.join("/");
+
+ const isManaged = payload.type === WorkerDeploymentType.MANAGED;
+
+ logger.debug("Creating deployment", {
+ environmentId: environment.id,
+ projectId: environment.projectId,
+ version: nextVersion,
+ triggeredById: triggeredBy?.id,
+ type: payload.type,
+ imageTag: isManaged ? sharedImageTag : unmanagedImageTag,
+ imageReference: isManaged ? undefined : unmanagedImageTag,
+ });
+
const deployment = await this._prisma.workerDeployment.create({
data: {
friendlyId: generateFriendlyId("deployment"),
@@ -57,6 +93,8 @@ export class InitializeDeploymentService extends BaseService {
projectId: environment.projectId,
externalBuildData,
triggeredById: triggeredBy?.id,
+ type: payload.type,
+ imageReference: isManaged ? undefined : unmanagedImageTag,
},
});
@@ -67,11 +105,10 @@ export class InitializeDeploymentService extends BaseService {
new Date(Date.now() + env.DEPLOY_TIMEOUT_MS)
);
- const imageTag = `${payload.namespace ?? env.DEPLOY_REGISTRY_NAMESPACE}/${
- environment.project.externalRef
- }:${deployment.version}.${environment.slug}`;
-
- return { deployment, imageTag };
+ return {
+ deployment,
+ imageTag: isManaged ? sharedImageTag : unmanagedImageTag,
+ };
});
}
}
diff --git a/apps/webapp/app/v3/services/replayTaskRun.server.ts b/apps/webapp/app/v3/services/replayTaskRun.server.ts
index 26058e1e6a..a521c4f435 100644
--- a/apps/webapp/app/v3/services/replayTaskRun.server.ts
+++ b/apps/webapp/app/v3/services/replayTaskRun.server.ts
@@ -120,7 +120,9 @@ export class ReplayTaskRunService extends BaseService {
return;
}
- logger.error("Failed to replay a run", { error: error });
+ logger.error("Failed to replay a run", {
+ error: error instanceof Error ? error.message : error,
+ });
return;
}
diff --git a/apps/webapp/app/v3/services/rescheduleTaskRun.server.ts b/apps/webapp/app/v3/services/rescheduleTaskRun.server.ts
index 4a26bca94e..122fcc2c59 100644
--- a/apps/webapp/app/v3/services/rescheduleTaskRun.server.ts
+++ b/apps/webapp/app/v3/services/rescheduleTaskRun.server.ts
@@ -1,8 +1,8 @@
import { RescheduleRunRequestBody } from "@trigger.dev/core/v3";
import { TaskRun } from "@trigger.dev/database";
+import { parseDelay } from "~/utils/delays";
import { BaseService, ServiceValidationError } from "./baseService.server";
import { EnqueueDelayedRunService } from "./enqueueDelayedRun.server";
-import { parseDelay } from "./triggerTask.server";
export class RescheduleTaskRunService extends BaseService {
public async call(taskRun: TaskRun, body: RescheduleRunRequestBody) {
diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts
index 91a201e9cf..b7aef6450f 100644
--- a/apps/webapp/app/v3/services/triggerTask.server.ts
+++ b/apps/webapp/app/v3/services/triggerTask.server.ts
@@ -1,37 +1,10 @@
-import {
- IOPacket,
- packetRequiresOffloading,
- QueueOptions,
- SemanticInternalAttributes,
- taskRunErrorEnhancer,
- taskRunErrorToString,
- TriggerTaskRequestBody,
-} from "@trigger.dev/core/v3";
-import { parseNaturalLanguageDuration } from "@trigger.dev/core/v3/apps";
-import { Prisma, TaskRun } from "@trigger.dev/database";
-import { env } from "~/env.server";
-import { sanitizeQueueName } from "~/models/taskQueue.server";
-import { createTag, MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server";
+import { TriggerTaskRequestBody } from "@trigger.dev/core/v3";
+import { RunEngineVersion, TaskRun } from "@trigger.dev/database";
import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
-import { autoIncrementCounter } from "~/services/autoIncrementCounter.server";
-import { logger } from "~/services/logger.server";
-import { getEntitlement } from "~/services/platform.v3.server";
-import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server";
-import { handleMetadataPacket } from "~/utils/packets";
-import { marqs } from "~/v3/marqs/index.server";
-import { eventRepository } from "../eventRepository.server";
-import { generateFriendlyId } from "../friendlyIdentifiers";
-import { findCurrentWorkerFromEnvironment } from "../models/workerDeployment.server";
-import { guardQueueSizeLimitsForEnv } from "../queueSizeLimits.server";
-import { uploadPacketToObjectStore } from "../r2.server";
-import { isFinalAttemptStatus, isFinalRunStatus } from "../taskStatus";
-import { startActiveSpan } from "../tracer.server";
-import { clampMaxDuration } from "../utils/maxDuration";
-import { BaseService, ServiceValidationError } from "./baseService.server";
-import { EnqueueDelayedRunService } from "./enqueueDelayedRun.server";
-import { enqueueRun } from "./enqueueRun.server";
-import { ExpireEnqueuedRunService } from "./expireEnqueuedRun.server";
-import { getTaskEventStore } from "../taskEventStore.server";
+import { determineEngineVersion } from "../engineVersion.server";
+import { WithRunEngine } from "./baseService.server";
+import { TriggerTaskServiceV1 } from "./triggerTaskV1.server";
+import { TriggerTaskServiceV2 } from "./triggerTaskV2.server";
export type TriggerTaskServiceOptions = {
idempotencyKey?: string;
@@ -41,8 +14,9 @@ export type TriggerTaskServiceOptions = {
spanParentAsLink?: boolean;
parentAsLinkType?: "replay" | "trigger";
batchId?: string;
+ batchIndex?: number;
customIcon?: string;
- runId?: string;
+ runFriendlyId?: string;
skipChecks?: boolean;
oneTimeUseToken?: string;
};
@@ -58,766 +32,61 @@ export type TriggerTaskServiceResult = {
isCached: boolean;
};
-const MAX_ATTEMPTS = 2;
+export const MAX_ATTEMPTS = 2;
-export class TriggerTaskService extends BaseService {
+export class TriggerTaskService extends WithRunEngine {
public async call(
taskId: string,
environment: AuthenticatedEnvironment,
body: TriggerTaskRequestBody,
options: TriggerTaskServiceOptions = {},
- attempt: number = 0
+ version?: RunEngineVersion
): Promise<TriggerTaskServiceResult | undefined> {
return await this.traceWithEnv("call()", environment, async (span) => {
span.setAttribute("taskId", taskId);
- span.setAttribute("attempt", attempt);
- if (attempt > MAX_ATTEMPTS) {
- throw new ServiceValidationError(
- `Failed to trigger ${taskId} after ${MAX_ATTEMPTS} attempts.`
- );
- }
-
- // TODO: Add idempotency key expiring here
- const idempotencyKey = options.idempotencyKey ?? body.options?.idempotencyKey;
- const idempotencyKeyExpiresAt =
- options.idempotencyKeyExpiresAt ??
- resolveIdempotencyKeyTTL(body.options?.idempotencyKeyTTL) ??
- new Date(Date.now() + 24 * 60 * 60 * 1000 * 30); // 30 days
-
- const delayUntil = await parseDelay(body.options?.delay);
-
- const ttl =
- typeof body.options?.ttl === "number"
- ? stringifyDuration(body.options?.ttl)
- : body.options?.ttl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined);
-
- const existingRun = idempotencyKey
- ? await this._prisma.taskRun.findFirst({
- where: {
- runtimeEnvironmentId: environment.id,
- idempotencyKey,
- taskIdentifier: taskId,
- },
- })
- : undefined;
-
- if (existingRun) {
- if (
- existingRun.idempotencyKeyExpiresAt &&
- existingRun.idempotencyKeyExpiresAt < new Date()
- ) {
- logger.debug("[TriggerTaskService][call] Idempotency key has expired", {
- idempotencyKey: options.idempotencyKey,
- run: existingRun,
- });
-
- // Update the existing batch to remove the idempotency key
- await this._prisma.taskRun.update({
- where: { id: existingRun.id },
- data: { idempotencyKey: null },
- });
- } else {
- span.setAttribute("runId", existingRun.friendlyId);
-
- return { run: existingRun, isCached: true };
- }
- }
-
- if (environment.type !== "DEVELOPMENT" && !options.skipChecks) {
- const result = await getEntitlement(environment.organizationId);
- if (result && result.hasAccess === false) {
- throw new OutOfEntitlementError();
- }
- }
-
- if (!options.skipChecks) {
- const queueSizeGuard = await guardQueueSizeLimitsForEnv(environment, marqs);
-
- logger.debug("Queue size guard result", {
- queueSizeGuard,
- environment: {
- id: environment.id,
- type: environment.type,
- organization: environment.organization,
- project: environment.project,
- },
- });
-
- if (!queueSizeGuard.isWithinLimits) {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`
- );
- }
- }
-
- if (
- body.options?.tags &&
- typeof body.options.tags !== "string" &&
- body.options.tags.length > MAX_TAGS_PER_RUN
- ) {
- throw new ServiceValidationError(
- `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${body.options.tags.length}.`
- );
- }
-
- const runFriendlyId = options?.runId ?? generateFriendlyId("run");
-
- const payloadPacket = await this.#handlePayloadPacket(
- body.payload,
- body.options?.payloadType ?? "application/json",
- runFriendlyId,
- environment
- );
-
- const metadataPacket = body.options?.metadata
- ? handleMetadataPacket(
- body.options?.metadata,
- body.options?.metadataType ?? "application/json"
- )
- : undefined;
-
- const dependentAttempt = body.options?.dependentAttempt
- ? await this._prisma.taskRunAttempt.findFirst({
- where: { friendlyId: body.options.dependentAttempt },
- include: {
- taskRun: {
- select: {
- id: true,
- status: true,
- taskIdentifier: true,
- rootTaskRunId: true,
- depth: true,
- queueTimestamp: true,
- queue: true,
- },
- },
- },
- })
- : undefined;
-
- if (
- dependentAttempt &&
- (isFinalAttemptStatus(dependentAttempt.status) ||
- isFinalRunStatus(dependentAttempt.taskRun.status))
- ) {
- logger.debug("Dependent attempt or run is in a terminal state", {
- dependentAttempt: dependentAttempt,
- });
-
- if (isFinalAttemptStatus(dependentAttempt.status)) {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} as the parent attempt has a status of ${dependentAttempt.status}`
- );
- } else {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} as the parent run has a status of ${dependentAttempt.taskRun.status}`
- );
- }
- }
-
- const parentAttempt = body.options?.parentAttempt
- ? await this._prisma.taskRunAttempt.findFirst({
- where: { friendlyId: body.options.parentAttempt },
- include: {
- taskRun: {
- select: {
- id: true,
- status: true,
- taskIdentifier: true,
- rootTaskRunId: true,
- depth: true,
- },
- },
- },
- })
- : undefined;
-
- const dependentBatchRun = body.options?.dependentBatch
- ? await this._prisma.batchTaskRun.findFirst({
- where: { friendlyId: body.options.dependentBatch },
- include: {
- dependentTaskAttempt: {
- include: {
- taskRun: {
- select: {
- id: true,
- status: true,
- taskIdentifier: true,
- rootTaskRunId: true,
- depth: true,
- queueTimestamp: true,
- queue: true,
- },
- },
- },
- },
- },
- })
- : undefined;
-
- if (
- dependentBatchRun &&
- dependentBatchRun.dependentTaskAttempt &&
- (isFinalAttemptStatus(dependentBatchRun.dependentTaskAttempt.status) ||
- isFinalRunStatus(dependentBatchRun.dependentTaskAttempt.taskRun.status))
- ) {
- logger.debug("Dependent batch run task attempt or run has been canceled", {
- dependentBatchRunId: dependentBatchRun.id,
- status: dependentBatchRun.status,
- attempt: dependentBatchRun.dependentTaskAttempt,
- });
-
- if (isFinalAttemptStatus(dependentBatchRun.dependentTaskAttempt.status)) {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} as the parent attempt has a status of ${dependentBatchRun.dependentTaskAttempt.status}`
- );
- } else {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} as the parent run has a status of ${dependentBatchRun.dependentTaskAttempt.taskRun.status}`
- );
- }
- }
-
- const parentBatchRun = body.options?.parentBatch
- ? await this._prisma.batchTaskRun.findFirst({
- where: { friendlyId: body.options.parentBatch },
- include: {
- dependentTaskAttempt: {
- include: {
- taskRun: {
- select: {
- id: true,
- status: true,
- taskIdentifier: true,
- rootTaskRunId: true,
- },
- },
- },
- },
- },
- })
- : undefined;
-
- try {
- const result = await eventRepository.traceEvent(
- taskId,
- {
- context: options.traceContext,
- spanParentAsLink: options.spanParentAsLink,
- parentAsLinkType: options.parentAsLinkType,
- kind: "SERVER",
- environment,
- taskSlug: taskId,
- attributes: {
- properties: {
- [SemanticInternalAttributes.SHOW_ACTIONS]: true,
- },
- style: {
- icon: options.customIcon ?? "task",
- },
- runIsTest: body.options?.test ?? false,
- batchId: options.batchId,
- idempotencyKey,
- },
- incomplete: true,
- immediate: true,
- },
- async (event, traceContext, traceparent) => {
- const run = await autoIncrementCounter.incrementInTransaction(
- `v3-run:${environment.id}:${taskId}`,
- async (num, tx) => {
- const lockedToBackgroundWorker = body.options?.lockToVersion
- ? await tx.backgroundWorker.findFirst({
- where: {
- projectId: environment.projectId,
- runtimeEnvironmentId: environment.id,
- version: body.options?.lockToVersion,
- },
- })
- : undefined;
-
- let queueName = sanitizeQueueName(
- await this.#getQueueName(taskId, environment, body.options?.queue?.name)
- );
-
- // Check that the queuename is not an empty string
- if (!queueName) {
- queueName = sanitizeQueueName(`task/${taskId}`);
- }
-
- event.setAttribute("queueName", queueName);
- span.setAttribute("queueName", queueName);
-
- //upsert tags
- let tagIds: string[] = [];
- const bodyTags =
- typeof body.options?.tags === "string" ? [body.options.tags] : body.options?.tags;
- if (bodyTags && bodyTags.length > 0) {
- for (const tag of bodyTags) {
- const tagRecord = await createTag({
- tag,
- projectId: environment.projectId,
- });
- if (tagRecord) {
- tagIds.push(tagRecord.id);
- }
- }
- }
-
- const depth = dependentAttempt
- ? dependentAttempt.taskRun.depth + 1
- : parentAttempt
- ? parentAttempt.taskRun.depth + 1
- : dependentBatchRun?.dependentTaskAttempt
- ? dependentBatchRun.dependentTaskAttempt.taskRun.depth + 1
- : 0;
-
- const queueTimestamp =
- dependentAttempt?.taskRun.queueTimestamp ??
- dependentBatchRun?.dependentTaskAttempt?.taskRun.queueTimestamp ??
- delayUntil ??
- new Date();
-
- const taskRun = await tx.taskRun.create({
- data: {
- status: delayUntil ? "DELAYED" : "PENDING",
- number: num,
- friendlyId: runFriendlyId,
- runtimeEnvironmentId: environment.id,
- projectId: environment.projectId,
- idempotencyKey,
- idempotencyKeyExpiresAt: idempotencyKey ? idempotencyKeyExpiresAt : undefined,
- taskIdentifier: taskId,
- payload: payloadPacket.data ?? "",
- payloadType: payloadPacket.dataType,
- context: body.context,
- traceContext: traceContext,
- traceId: event.traceId,
- spanId: event.spanId,
- parentSpanId:
- options.parentAsLinkType === "replay" ? undefined : traceparent?.spanId,
- lockedToVersionId: lockedToBackgroundWorker?.id,
- taskVersion: lockedToBackgroundWorker?.version,
- sdkVersion: lockedToBackgroundWorker?.sdkVersion,
- cliVersion: lockedToBackgroundWorker?.cliVersion,
- concurrencyKey: body.options?.concurrencyKey,
- queue: queueName,
- isTest: body.options?.test ?? false,
- delayUntil,
- queuedAt: delayUntil ? undefined : new Date(),
- queueTimestamp,
- maxAttempts: body.options?.maxAttempts,
- taskEventStore: getTaskEventStore(),
- ttl,
- tags:
- tagIds.length === 0
- ? undefined
- : {
- connect: tagIds.map((id) => ({ id })),
- },
- parentTaskRunId:
- dependentAttempt?.taskRun.id ??
- parentAttempt?.taskRun.id ??
- dependentBatchRun?.dependentTaskAttempt?.taskRun.id,
- parentTaskRunAttemptId:
- dependentAttempt?.id ??
- parentAttempt?.id ??
- dependentBatchRun?.dependentTaskAttempt?.id,
- rootTaskRunId:
- dependentAttempt?.taskRun.rootTaskRunId ??
- dependentAttempt?.taskRun.id ??
- parentAttempt?.taskRun.rootTaskRunId ??
- parentAttempt?.taskRun.id ??
- dependentBatchRun?.dependentTaskAttempt?.taskRun.rootTaskRunId ??
- dependentBatchRun?.dependentTaskAttempt?.taskRun.id,
- batchId: dependentBatchRun?.id ?? parentBatchRun?.id,
- resumeParentOnCompletion: !!(dependentAttempt ?? dependentBatchRun),
- depth,
- metadata: metadataPacket?.data,
- metadataType: metadataPacket?.dataType,
- seedMetadata: metadataPacket?.data,
- seedMetadataType: metadataPacket?.dataType,
- maxDurationInSeconds: body.options?.maxDuration
- ? clampMaxDuration(body.options.maxDuration)
- : undefined,
- runTags: bodyTags,
- oneTimeUseToken: options.oneTimeUseToken,
- machinePreset: body.options?.machine,
- },
- });
-
- event.setAttribute("runId", taskRun.friendlyId);
- span.setAttribute("runId", taskRun.friendlyId);
-
- if (dependentAttempt) {
- await tx.taskRunDependency.create({
- data: {
- taskRunId: taskRun.id,
- dependentAttemptId: dependentAttempt.id,
- },
- });
- } else if (dependentBatchRun) {
- await tx.taskRunDependency.create({
- data: {
- taskRunId: taskRun.id,
- dependentBatchRunId: dependentBatchRun.id,
- },
- });
- }
-
- if (body.options?.queue) {
- const concurrencyLimit =
- typeof body.options.queue?.concurrencyLimit === "number"
- ? Math.max(
- Math.min(
- body.options.queue.concurrencyLimit,
- environment.maximumConcurrencyLimit,
- environment.organization.maximumConcurrencyLimit
- ),
- 0
- )
- : body.options.queue?.concurrencyLimit;
-
- let taskQueue = await tx.taskQueue.findFirst({
- where: {
- runtimeEnvironmentId: environment.id,
- name: queueName,
- },
- });
-
- if (!taskQueue) {
- // handle conflicts with existing queues
- taskQueue = await tx.taskQueue.create({
- data: {
- friendlyId: generateFriendlyId("queue"),
- name: queueName,
- concurrencyLimit,
- runtimeEnvironmentId: environment.id,
- projectId: environment.projectId,
- type: "NAMED",
- },
- });
- }
-
- if (typeof concurrencyLimit === "number") {
- logger.debug("TriggerTaskService: updating concurrency limit", {
- runId: taskRun.id,
- friendlyId: taskRun.friendlyId,
- taskQueue,
- orgId: environment.organizationId,
- projectId: environment.projectId,
- concurrencyLimit,
- queueOptions: body.options?.queue,
- });
-
- await marqs?.updateQueueConcurrencyLimits(
- environment,
- taskQueue.name,
- concurrencyLimit
- );
- } else if (concurrencyLimit === null) {
- logger.debug("TriggerTaskService: removing concurrency limit", {
- runId: taskRun.id,
- friendlyId: taskRun.friendlyId,
- taskQueue,
- orgId: environment.organizationId,
- projectId: environment.projectId,
- queueOptions: body.options?.queue,
- });
-
- await marqs?.removeQueueConcurrencyLimits(environment, taskQueue.name);
- }
- }
-
- if (taskRun.delayUntil) {
- await EnqueueDelayedRunService.enqueue(taskRun.id, taskRun.delayUntil);
- }
-
- if (!taskRun.delayUntil && taskRun.ttl) {
- const expireAt = parseNaturalLanguageDuration(taskRun.ttl);
-
- if (expireAt) {
- await ExpireEnqueuedRunService.enqueue(taskRun.id, expireAt);
- }
- }
-
- return taskRun;
- },
- async (_, tx) => {
- const counter = await tx.taskRunNumberCounter.findUnique({
- where: {
- taskIdentifier_environmentId: {
- taskIdentifier: taskId,
- environmentId: environment.id,
- },
- },
- select: { lastNumber: true },
- });
-
- return counter?.lastNumber;
- },
- this._prisma
- );
-
- if (!run) {
- return;
- }
-
- // Now enqueue the run if it's not delayed
- if (run.status === "PENDING") {
- const enqueueResult = await enqueueRun({
- env: environment,
- run,
- dependentRun:
- dependentAttempt?.taskRun ?? dependentBatchRun?.dependentTaskAttempt?.taskRun,
- });
-
- if (!enqueueResult.ok) {
- // Now we need to fail the run with enqueueResult.error and make sure and
- // set the traced event to failed as well
- await this._prisma.taskRun.update({
- where: { id: run.id },
- data: {
- status: "SYSTEM_FAILURE",
- completedAt: new Date(),
- error: enqueueResult.error,
- },
- });
-
- event.failWithError(enqueueResult.error);
-
- return {
- run,
- isCached: false,
- error: enqueueResult.error,
- };
- }
- }
-
- return { run, isCached: false };
- }
- );
-
- if (result?.error) {
- throw new ServiceValidationError(
- taskRunErrorToString(taskRunErrorEnhancer(result.error))
- );
- }
-
- const run = result?.run;
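+ // Resolve which engine to use from the explicit version, the locked worker version and the environment, then dispatch to the matching implementation below.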
+ const v = await determineEngineVersion({
+ environment,
+ workerVersion: body.options?.lockToVersion,
+ engineVersion: version,
+ });
- if (!run) {
- return;
+ switch (v) {
+ case "V1": {
+ return await this.callV1(taskId, environment, body, options);
}
-
- return {
- run,
- isCached: result?.isCached,
- };
- } catch (error) {
- // Detect a prisma transaction Unique constraint violation
- if (error instanceof Prisma.PrismaClientKnownRequestError) {
- logger.debug("TriggerTask: Prisma transaction error", {
- code: error.code,
- message: error.message,
- meta: error.meta,
- });
-
- if (error.code === "P2002") {
- const target = error.meta?.target;
-
- if (
- Array.isArray(target) &&
- target.length > 0 &&
- typeof target[0] === "string" &&
- target[0].includes("oneTimeUseToken")
- ) {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} with a one-time use token as it has already been used.`
- );
- } else if (
- Array.isArray(target) &&
- target.length == 2 &&
- typeof target[0] === "string" &&
- typeof target[1] === "string" &&
- target[0] == "runtimeEnvironmentId" &&
- target[1] == "name" &&
- error.message.includes("prisma.taskQueue.create")
- ) {
- throw new Error(
- `Failed to trigger ${taskId} as the queue could not be created do to a unique constraint error, please try again.`
- );
- } else if (
- Array.isArray(target) &&
- target.length == 3 &&
- typeof target[0] === "string" &&
- typeof target[1] === "string" &&
- typeof target[2] === "string" &&
- target[0] == "runtimeEnvironmentId" &&
- target[1] == "taskIdentifier" &&
- target[2] == "idempotencyKey"
- ) {
- logger.debug("TriggerTask: Idempotency key violation, retrying...", {
- taskId,
- environmentId: environment.id,
- idempotencyKey,
- });
- // We need to retry the task run creation as the idempotency key has been used
- return await this.call(taskId, environment, body, options, attempt + 1);
- } else {
- throw new ServiceValidationError(
- `Cannot trigger ${taskId} as it has already been triggered with the same idempotency key.`
- );
- }
- }
+ case "V2": {
+ return await this.callV2(taskId, environment, body, options);
}
-
- throw error;
}
});
}
- async #getQueueName(taskId: string, environment: AuthenticatedEnvironment, queueName?: string) {
- if (queueName) {
- return queueName;
- }
-
- const defaultQueueName = `task/${taskId}`;
-
- const worker = await findCurrentWorkerFromEnvironment(environment);
-
- if (!worker) {
- logger.debug("Failed to get queue name: No worker found", {
- taskId,
- environmentId: environment.id,
- });
-
- return defaultQueueName;
- }
-
- const task = await this._prisma.backgroundWorkerTask.findFirst({
- where: {
- workerId: worker.id,
- slug: taskId,
- },
- });
-
- if (!task) {
- console.log("Failed to get queue name: No task found", {
- taskId,
- environmentId: environment.id,
- });
-
- return defaultQueueName;
- }
-
- const queueConfig = QueueOptions.optional().nullable().safeParse(task.queueConfig);
-
- if (!queueConfig.success) {
- console.log("Failed to get queue name: Invalid queue config", {
- taskId,
- environmentId: environment.id,
- queueConfig: task.queueConfig,
- });
-
- return defaultQueueName;
- }
-
- return queueConfig.data?.name ?? defaultQueueName;
+ private async callV1(
+ taskId: string,
+ environment: AuthenticatedEnvironment,
+ body: TriggerTaskRequestBody,
+ options: TriggerTaskServiceOptions = {}
+ ): Promise<TriggerTaskServiceResult | undefined> {
+ const service = new TriggerTaskServiceV1(this._prisma);
+ return await service.call(taskId, environment, body, options);
}
- async #handlePayloadPacket(
- payload: any,
- payloadType: string,
- pathPrefix: string,
- environment: AuthenticatedEnvironment
- ) {
- return await startActiveSpan("handlePayloadPacket()", async (span) => {
- const packet = this.#createPayloadPacket(payload, payloadType);
-
- if (!packet.data) {
- return packet;
- }
-
- const { needsOffloading, size } = packetRequiresOffloading(
- packet,
- env.TASK_PAYLOAD_OFFLOAD_THRESHOLD
- );
-
- if (!needsOffloading) {
- return packet;
- }
-
- const filename = `${pathPrefix}/payload.json`;
-
- await uploadPacketToObjectStore(filename, packet.data, packet.dataType, environment);
-
- return {
- data: filename,
- dataType: "application/store",
- };
+ private async callV2(
+ taskId: string,
+ environment: AuthenticatedEnvironment,
+ body: TriggerTaskRequestBody,
+ options: TriggerTaskServiceOptions = {}
+ ): Promise<TriggerTaskServiceResult | undefined> {
+ const service = new TriggerTaskServiceV2({
+ prisma: this._prisma,
+ engine: this._engine,
+ });
+ return await service.call({
+ taskId,
+ environment,
+ body,
+ options,
});
}
-
- #createPayloadPacket(payload: any, payloadType: string): IOPacket {
- if (payloadType === "application/json") {
- return { data: JSON.stringify(payload), dataType: "application/json" };
- }
-
- if (typeof payload === "string") {
- return { data: payload, dataType: payloadType };
- }
-
- return { dataType: payloadType };
- }
-}
-
-export async function parseDelay(value?: string | Date): Promise<Date | undefined> {
- if (!value) {
- return;
- }
-
- if (value instanceof Date) {
- return value;
- }
-
- try {
- const date = new Date(value);
-
- // Check if the date is valid
- if (isNaN(date.getTime())) {
- return parseNaturalLanguageDuration(value);
- }
-
- if (date.getTime() <= Date.now()) {
- return;
- }
-
- return date;
- } catch (error) {
- return parseNaturalLanguageDuration(value);
- }
-}
-
-function stringifyDuration(seconds: number): string | undefined {
- if (seconds <= 0) {
- return;
- }
-
- const units = {
- w: Math.floor(seconds / 604800),
- d: Math.floor((seconds % 604800) / 86400),
- h: Math.floor((seconds % 86400) / 3600),
- m: Math.floor((seconds % 3600) / 60),
- s: Math.floor(seconds % 60),
- };
-
- // Filter the units having non-zero values and join them
- const result: string = Object.entries(units)
- .filter(([unit, val]) => val != 0)
- .map(([unit, val]) => `${val}${unit}`)
- .join("");
-
- return result;
}
diff --git a/apps/webapp/app/v3/services/triggerTaskV1.server.ts b/apps/webapp/app/v3/services/triggerTaskV1.server.ts
new file mode 100644
index 0000000000..e8a89c00ec
--- /dev/null
+++ b/apps/webapp/app/v3/services/triggerTaskV1.server.ts
@@ -0,0 +1,758 @@
+import {
+ IOPacket,
+ packetRequiresOffloading,
+ QueueOptions,
+ SemanticInternalAttributes,
+ taskRunErrorToString,
+ taskRunErrorEnhancer,
+ TriggerTaskRequestBody,
+} from "@trigger.dev/core/v3";
+import {
+ parseNaturalLanguageDuration,
+ sanitizeQueueName,
+ stringifyDuration,
+} from "@trigger.dev/core/v3/apps";
+import { Prisma } from "@trigger.dev/database";
+import { env } from "~/env.server";
+import { createTag, MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server";
+import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { autoIncrementCounter } from "~/services/autoIncrementCounter.server";
+import { logger } from "~/services/logger.server";
+import { getEntitlement } from "~/services/platform.v3.server";
+import { parseDelay } from "~/utils/delays";
+import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server";
+import { handleMetadataPacket } from "~/utils/packets";
+import { marqs } from "~/v3/marqs/index.server";
+import { eventRepository } from "../eventRepository.server";
+import { generateFriendlyId } from "../friendlyIdentifiers";
+import { findCurrentWorkerFromEnvironment } from "../models/workerDeployment.server";
+import { guardQueueSizeLimitsForEnv } from "../queueSizeLimits.server";
+import { uploadPacketToObjectStore } from "../r2.server";
+import { removeQueueConcurrencyLimits, updateQueueConcurrencyLimits } from "../runQueue.server";
+import { isFinalAttemptStatus, isFinalRunStatus } from "../taskStatus";
+import { startActiveSpan } from "../tracer.server";
+import { clampMaxDuration } from "../utils/maxDuration";
+import { BaseService, ServiceValidationError } from "./baseService.server";
+import { EnqueueDelayedRunService } from "./enqueueDelayedRun.server";
+import { ExpireEnqueuedRunService } from "./expireEnqueuedRun.server";
+import {
+ MAX_ATTEMPTS,
+ OutOfEntitlementError,
+ TriggerTaskServiceOptions,
+ TriggerTaskServiceResult,
+} from "./triggerTask.server";
+import { getTaskEventStore } from "../taskEventStore.server";
+import { enqueueRun } from "./enqueueRun.server";
+
+/** @deprecated Use TriggerTaskService in `triggerTask.server.ts` instead. */
+export class TriggerTaskServiceV1 extends BaseService {
+ public async call(
+ taskId: string,
+ environment: AuthenticatedEnvironment,
+ body: TriggerTaskRequestBody,
+ options: TriggerTaskServiceOptions = {},
+ attempt: number = 0
+ ): Promise<TriggerTaskServiceResult | undefined> {
+ return await this.traceWithEnv("call()", environment, async (span) => {
+ span.setAttribute("taskId", taskId);
+ span.setAttribute("attempt", attempt);
+
+ if (attempt > MAX_ATTEMPTS) {
+ throw new ServiceValidationError(
+ `Failed to trigger ${taskId} after ${MAX_ATTEMPTS} attempts.`
+ );
+ }
+
+ const idempotencyKey = options.idempotencyKey ?? body.options?.idempotencyKey;
+ const idempotencyKeyExpiresAt =
+ options.idempotencyKeyExpiresAt ??
+ resolveIdempotencyKeyTTL(body.options?.idempotencyKeyTTL) ??
+ new Date(Date.now() + 24 * 60 * 60 * 1000 * 30); // 30 days
+
+ const delayUntil = await parseDelay(body.options?.delay);
+
+ const ttl =
+ typeof body.options?.ttl === "number"
+ ? stringifyDuration(body.options?.ttl)
+ : body.options?.ttl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined);
+
+ const existingRun = idempotencyKey
+ ? await this._prisma.taskRun.findFirst({
+ where: {
+ runtimeEnvironmentId: environment.id,
+ idempotencyKey,
+ taskIdentifier: taskId,
+ },
+ })
+ : undefined;
+
+ if (existingRun) {
+ if (
+ existingRun.idempotencyKeyExpiresAt &&
+ existingRun.idempotencyKeyExpiresAt < new Date()
+ ) {
+ logger.debug("[TriggerTaskService][call] Idempotency key has expired", {
+ idempotencyKey: options.idempotencyKey,
+ run: existingRun,
+ });
+
+ // Update the existing batch to remove the idempotency key
+ await this._prisma.taskRun.update({
+ where: { id: existingRun.id },
+ data: { idempotencyKey: null },
+ });
+ } else {
+ span.setAttribute("runId", existingRun.friendlyId);
+
+ return { run: existingRun, isCached: true };
+ }
+ }
+
+ if (environment.type !== "DEVELOPMENT" && !options.skipChecks) {
+ const result = await getEntitlement(environment.organizationId);
+ if (result && result.hasAccess === false) {
+ throw new OutOfEntitlementError();
+ }
+ }
+
+ if (!options.skipChecks) {
+ const queueSizeGuard = await guardQueueSizeLimitsForEnv(environment, marqs);
+
+ logger.debug("Queue size guard result", {
+ queueSizeGuard,
+ environment: {
+ id: environment.id,
+ type: environment.type,
+ organization: environment.organization,
+ project: environment.project,
+ },
+ });
+
+ if (!queueSizeGuard.isWithinLimits) {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`
+ );
+ }
+ }
+
+ if (
+ body.options?.tags &&
+ typeof body.options.tags !== "string" &&
+ body.options.tags.length > MAX_TAGS_PER_RUN
+ ) {
+ throw new ServiceValidationError(
+ `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${body.options.tags.length}.`
+ );
+ }
+
+ const runFriendlyId = options?.runFriendlyId ?? generateFriendlyId("run");
+
+ const payloadPacket = await this.#handlePayloadPacket(
+ body.payload,
+ body.options?.payloadType ?? "application/json",
+ runFriendlyId,
+ environment
+ );
+
+ const metadataPacket = body.options?.metadata
+ ? handleMetadataPacket(
+ body.options?.metadata,
+ body.options?.metadataType ?? "application/json"
+ )
+ : undefined;
+
+ const dependentAttempt = body.options?.dependentAttempt
+ ? await this._prisma.taskRunAttempt.findFirst({
+ where: { friendlyId: body.options.dependentAttempt },
+ include: {
+ taskRun: {
+ select: {
+ id: true,
+ status: true,
+ taskIdentifier: true,
+ rootTaskRunId: true,
+ depth: true,
+ queueTimestamp: true,
+ queue: true,
+ },
+ },
+ },
+ })
+ : undefined;
+
+ if (
+ dependentAttempt &&
+ (isFinalAttemptStatus(dependentAttempt.status) ||
+ isFinalRunStatus(dependentAttempt.taskRun.status))
+ ) {
+ logger.debug("Dependent attempt or run is in a terminal state", {
+ dependentAttempt: dependentAttempt,
+ });
+
+ if (isFinalAttemptStatus(dependentAttempt.status)) {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the parent attempt has a status of ${dependentAttempt.status}`
+ );
+ } else {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the parent run has a status of ${dependentAttempt.taskRun.status}`
+ );
+ }
+ }
+
+ const parentAttempt = body.options?.parentAttempt
+ ? await this._prisma.taskRunAttempt.findFirst({
+ where: { friendlyId: body.options.parentAttempt },
+ include: {
+ taskRun: {
+ select: {
+ id: true,
+ status: true,
+ taskIdentifier: true,
+ rootTaskRunId: true,
+ depth: true,
+ },
+ },
+ },
+ })
+ : undefined;
+
+ const dependentBatchRun = body.options?.dependentBatch
+ ? await this._prisma.batchTaskRun.findFirst({
+ where: { friendlyId: body.options.dependentBatch },
+ include: {
+ dependentTaskAttempt: {
+ include: {
+ taskRun: {
+ select: {
+ id: true,
+ status: true,
+ taskIdentifier: true,
+ rootTaskRunId: true,
+ depth: true,
+ queueTimestamp: true,
+ queue: true,
+ },
+ },
+ },
+ },
+ },
+ })
+ : undefined;
+
+ if (
+ dependentBatchRun &&
+ dependentBatchRun.dependentTaskAttempt &&
+ (isFinalAttemptStatus(dependentBatchRun.dependentTaskAttempt.status) ||
+ isFinalRunStatus(dependentBatchRun.dependentTaskAttempt.taskRun.status))
+ ) {
+ logger.debug("Dependent batch run task attempt or run has been canceled", {
+ dependentBatchRunId: dependentBatchRun.id,
+ status: dependentBatchRun.status,
+ attempt: dependentBatchRun.dependentTaskAttempt,
+ });
+
+ if (isFinalAttemptStatus(dependentBatchRun.dependentTaskAttempt.status)) {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the parent attempt has a status of ${dependentBatchRun.dependentTaskAttempt.status}`
+ );
+ } else {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the parent run has a status of ${dependentBatchRun.dependentTaskAttempt.taskRun.status}`
+ );
+ }
+ }
+
+ const parentBatchRun = body.options?.parentBatch
+ ? await this._prisma.batchTaskRun.findFirst({
+ where: { friendlyId: body.options.parentBatch },
+ include: {
+ dependentTaskAttempt: {
+ include: {
+ taskRun: {
+ select: {
+ id: true,
+ status: true,
+ taskIdentifier: true,
+ rootTaskRunId: true,
+ },
+ },
+ },
+ },
+ },
+ })
+ : undefined;
+
+ try {
+ const result = await eventRepository.traceEvent(
+ taskId,
+ {
+ context: options.traceContext,
+ spanParentAsLink: options.spanParentAsLink,
+ parentAsLinkType: options.parentAsLinkType,
+ kind: "SERVER",
+ environment,
+ taskSlug: taskId,
+ attributes: {
+ properties: {
+ [SemanticInternalAttributes.SHOW_ACTIONS]: true,
+ },
+ style: {
+ icon: options.customIcon ?? "task",
+ },
+ runIsTest: body.options?.test ?? false,
+ batchId: options.batchId,
+ idempotencyKey,
+ },
+ incomplete: true,
+ immediate: true,
+ },
+ async (event, traceContext, traceparent) => {
+ const run = await autoIncrementCounter.incrementInTransaction(
+ `v3-run:${environment.id}:${taskId}`,
+ async (num, tx) => {
+ const lockedToBackgroundWorker = body.options?.lockToVersion
+ ? await tx.backgroundWorker.findFirst({
+ where: {
+ projectId: environment.projectId,
+ runtimeEnvironmentId: environment.id,
+ version: body.options?.lockToVersion,
+ },
+ })
+ : undefined;
+
+ let queueName = sanitizeQueueName(
+ await this.#getQueueName(taskId, environment, body.options?.queue?.name)
+ );
+
+                // Check that the queue name is not an empty string
+ if (!queueName) {
+ queueName = sanitizeQueueName(`task/${taskId}`);
+ }
+
+ event.setAttribute("queueName", queueName);
+ span.setAttribute("queueName", queueName);
+
+ //upsert tags
+ let tagIds: string[] = [];
+ const bodyTags =
+ typeof body.options?.tags === "string" ? [body.options.tags] : body.options?.tags;
+ if (bodyTags && bodyTags.length > 0) {
+ for (const tag of bodyTags) {
+ const tagRecord = await createTag({
+ tag,
+ projectId: environment.projectId,
+ });
+ if (tagRecord) {
+ tagIds.push(tagRecord.id);
+ }
+ }
+ }
+
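+                // Nested runs sit one level deeper than whichever parent relationship triggered them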
+ const depth = dependentAttempt
+ ? dependentAttempt.taskRun.depth + 1
+ : parentAttempt
+ ? parentAttempt.taskRun.depth + 1
+ : dependentBatchRun?.dependentTaskAttempt
+ ? dependentBatchRun.dependentTaskAttempt.taskRun.depth + 1
+ : 0;
+
+ const queueTimestamp =
+ dependentAttempt?.taskRun.queueTimestamp ??
+ dependentBatchRun?.dependentTaskAttempt?.taskRun.queueTimestamp ??
+ delayUntil ??
+ new Date();
+
+ const taskRun = await tx.taskRun.create({
+ data: {
+ status: delayUntil ? "DELAYED" : "PENDING",
+ number: num,
+ friendlyId: runFriendlyId,
+ runtimeEnvironmentId: environment.id,
+ projectId: environment.projectId,
+ idempotencyKey,
+ idempotencyKeyExpiresAt: idempotencyKey ? idempotencyKeyExpiresAt : undefined,
+ taskIdentifier: taskId,
+ payload: payloadPacket.data ?? "",
+ payloadType: payloadPacket.dataType,
+ context: body.context,
+ traceContext: traceContext,
+ traceId: event.traceId,
+ spanId: event.spanId,
+ parentSpanId:
+ options.parentAsLinkType === "replay" ? undefined : traceparent?.spanId,
+ lockedToVersionId: lockedToBackgroundWorker?.id,
+ taskVersion: lockedToBackgroundWorker?.version,
+ sdkVersion: lockedToBackgroundWorker?.sdkVersion,
+ cliVersion: lockedToBackgroundWorker?.cliVersion,
+ concurrencyKey: body.options?.concurrencyKey,
+ queue: queueName,
+ isTest: body.options?.test ?? false,
+ delayUntil,
+ queuedAt: delayUntil ? undefined : new Date(),
+ queueTimestamp,
+ maxAttempts: body.options?.maxAttempts,
+ taskEventStore: getTaskEventStore(),
+ ttl,
+ tags:
+ tagIds.length === 0
+ ? undefined
+ : {
+ connect: tagIds.map((id) => ({ id })),
+ },
+ parentTaskRunId:
+ dependentAttempt?.taskRun.id ??
+ parentAttempt?.taskRun.id ??
+ dependentBatchRun?.dependentTaskAttempt?.taskRun.id,
+ parentTaskRunAttemptId:
+ dependentAttempt?.id ??
+ parentAttempt?.id ??
+ dependentBatchRun?.dependentTaskAttempt?.id,
+ rootTaskRunId:
+ dependentAttempt?.taskRun.rootTaskRunId ??
+ dependentAttempt?.taskRun.id ??
+ parentAttempt?.taskRun.rootTaskRunId ??
+ parentAttempt?.taskRun.id ??
+ dependentBatchRun?.dependentTaskAttempt?.taskRun.rootTaskRunId ??
+ dependentBatchRun?.dependentTaskAttempt?.taskRun.id,
+ batchId: dependentBatchRun?.id ?? parentBatchRun?.id,
+ resumeParentOnCompletion: !!(dependentAttempt ?? dependentBatchRun),
+ depth,
+ metadata: metadataPacket?.data,
+ metadataType: metadataPacket?.dataType,
+ seedMetadata: metadataPacket?.data,
+ seedMetadataType: metadataPacket?.dataType,
+ maxDurationInSeconds: body.options?.maxDuration
+ ? clampMaxDuration(body.options.maxDuration)
+ : undefined,
+ runTags: bodyTags,
+ oneTimeUseToken: options.oneTimeUseToken,
+ machinePreset: body.options?.machine,
+ },
+ });
+
+ event.setAttribute("runId", taskRun.friendlyId);
+ span.setAttribute("runId", taskRun.friendlyId);
+
+ if (dependentAttempt) {
+ await tx.taskRunDependency.create({
+ data: {
+ taskRunId: taskRun.id,
+ dependentAttemptId: dependentAttempt.id,
+ },
+ });
+ } else if (dependentBatchRun) {
+ await tx.taskRunDependency.create({
+ data: {
+ taskRunId: taskRun.id,
+ dependentBatchRunId: dependentBatchRun.id,
+ },
+ });
+ }
+
+ if (body.options?.queue) {
+ const concurrencyLimit =
+ typeof body.options.queue?.concurrencyLimit === "number"
+ ? Math.max(
+ Math.min(
+ body.options.queue.concurrencyLimit,
+ environment.maximumConcurrencyLimit,
+ environment.organization.maximumConcurrencyLimit
+ ),
+ 0
+ )
+ : body.options.queue?.concurrencyLimit;
+
+ let taskQueue = await tx.taskQueue.findFirst({
+ where: {
+ runtimeEnvironmentId: environment.id,
+ name: queueName,
+ },
+ });
+
+ if (!taskQueue) {
+ // handle conflicts with existing queues
+ taskQueue = await tx.taskQueue.create({
+ data: {
+ friendlyId: generateFriendlyId("queue"),
+ name: queueName,
+ concurrencyLimit,
+ runtimeEnvironmentId: environment.id,
+ projectId: environment.projectId,
+ type: "NAMED",
+ },
+ });
+ }
+
+ if (typeof concurrencyLimit === "number") {
+ logger.debug("TriggerTaskService: updating concurrency limit", {
+ runId: taskRun.id,
+ friendlyId: taskRun.friendlyId,
+ taskQueue,
+ orgId: environment.organizationId,
+ projectId: environment.projectId,
+ concurrencyLimit,
+ queueOptions: body.options?.queue,
+ });
+
+ await updateQueueConcurrencyLimits(
+ environment,
+ taskQueue.name,
+ concurrencyLimit
+ );
+ } else if (concurrencyLimit === null) {
+ logger.debug("TriggerTaskService: removing concurrency limit", {
+ runId: taskRun.id,
+ friendlyId: taskRun.friendlyId,
+ taskQueue,
+ orgId: environment.organizationId,
+ projectId: environment.projectId,
+ queueOptions: body.options?.queue,
+ });
+
+ await removeQueueConcurrencyLimits(environment, taskQueue.name);
+ }
+ }
+
+ if (taskRun.delayUntil) {
+ await EnqueueDelayedRunService.enqueue(taskRun.id, taskRun.delayUntil);
+ }
+
+ if (!taskRun.delayUntil && taskRun.ttl) {
+ const expireAt = parseNaturalLanguageDuration(taskRun.ttl);
+
+ if (expireAt) {
+ await ExpireEnqueuedRunService.enqueue(taskRun.id, expireAt);
+ }
+ }
+
+ return taskRun;
+ },
+ async (_, tx) => {
+ const counter = await tx.taskRunNumberCounter.findUnique({
+ where: {
+ taskIdentifier_environmentId: {
+ taskIdentifier: taskId,
+ environmentId: environment.id,
+ },
+ },
+ select: { lastNumber: true },
+ });
+
+ return counter?.lastNumber;
+ },
+ this._prisma
+ );
+
+ if (!run) {
+ return;
+ }
+
+ // Now enqueue the run if it's not delayed
+ if (run.status === "PENDING") {
+ const enqueueResult = await enqueueRun({
+ env: environment,
+ run,
+ dependentRun:
+ dependentAttempt?.taskRun ?? dependentBatchRun?.dependentTaskAttempt?.taskRun,
+ });
+
+ if (!enqueueResult.ok) {
+              // Now we need to fail the run with enqueueResult.error and make sure
+              // the traced event is marked as failed as well
+ await this._prisma.taskRun.update({
+ where: { id: run.id },
+ data: {
+ status: "SYSTEM_FAILURE",
+ completedAt: new Date(),
+ error: enqueueResult.error,
+ },
+ });
+
+ event.failWithError(enqueueResult.error);
+
+ return {
+ run,
+ isCached: false,
+ error: enqueueResult.error,
+ };
+ }
+ }
+
+ return { run, isCached: false };
+ }
+ );
+
+ if (result?.error) {
+ throw new ServiceValidationError(
+ taskRunErrorToString(taskRunErrorEnhancer(result.error))
+ );
+ }
+
+ const run = result?.run;
+
+ if (!run) {
+ return;
+ }
+
+ return {
+ run,
+ isCached: result?.isCached,
+ };
+ } catch (error) {
+ // Detect a prisma transaction Unique constraint violation
+ if (error instanceof Prisma.PrismaClientKnownRequestError) {
+ logger.debug("TriggerTask: Prisma transaction error", {
+ code: error.code,
+ message: error.message,
+ meta: error.meta,
+ });
+
+ if (error.code === "P2002") {
+ const target = error.meta?.target;
+
+ if (
+ Array.isArray(target) &&
+ target.length > 0 &&
+ typeof target[0] === "string" &&
+ target[0].includes("oneTimeUseToken")
+ ) {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} with a one-time use token as it has already been used.`
+ );
+ } else if (
+ Array.isArray(target) &&
+ target.length == 2 &&
+ typeof target[0] === "string" &&
+ typeof target[1] === "string" &&
+ target[0] == "runtimeEnvironmentId" &&
+ target[1] == "name" &&
+ error.message.includes("prisma.taskQueue.create")
+ ) {
+ throw new Error(
+              `Failed to trigger ${taskId} as the queue could not be created due to a unique constraint error, please try again.`
+ );
+ } else if (
+ Array.isArray(target) &&
+ target.length == 3 &&
+ typeof target[0] === "string" &&
+ typeof target[1] === "string" &&
+ typeof target[2] === "string" &&
+ target[0] == "runtimeEnvironmentId" &&
+ target[1] == "taskIdentifier" &&
+ target[2] == "idempotencyKey"
+ ) {
+ logger.debug("TriggerTask: Idempotency key violation, retrying...", {
+ taskId,
+ environmentId: environment.id,
+ idempotencyKey,
+ });
+ // We need to retry the task run creation as the idempotency key has been used
+ return await this.call(taskId, environment, body, options, attempt + 1);
+ } else {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as it has already been triggered with the same idempotency key.`
+ );
+ }
+ }
+ }
+
+ throw error;
+ }
+ });
+ }
+
+ async #getQueueName(taskId: string, environment: AuthenticatedEnvironment, queueName?: string) {
+ if (queueName) {
+ return queueName;
+ }
+
+ const defaultQueueName = `task/${taskId}`;
+
+ const worker = await findCurrentWorkerFromEnvironment(environment);
+
+ if (!worker) {
+ logger.debug("Failed to get queue name: No worker found", {
+ taskId,
+ environmentId: environment.id,
+ });
+
+ return defaultQueueName;
+ }
+
+ const task = await this._prisma.backgroundWorkerTask.findFirst({
+ where: {
+ workerId: worker.id,
+ slug: taskId,
+ },
+ });
+
+ if (!task) {
+ console.log("Failed to get queue name: No task found", {
+ taskId,
+ environmentId: environment.id,
+ });
+
+ return defaultQueueName;
+ }
+
+ const queueConfig = QueueOptions.optional().nullable().safeParse(task.queueConfig);
+
+ if (!queueConfig.success) {
+ console.log("Failed to get queue name: Invalid queue config", {
+ taskId,
+ environmentId: environment.id,
+ queueConfig: task.queueConfig,
+ });
+
+ return defaultQueueName;
+ }
+
+ return queueConfig.data?.name ?? defaultQueueName;
+ }
+
+ async #handlePayloadPacket(
+ payload: any,
+ payloadType: string,
+ pathPrefix: string,
+ environment: AuthenticatedEnvironment
+ ) {
+ return await startActiveSpan("handlePayloadPacket()", async (span) => {
+ const packet = this.#createPayloadPacket(payload, payloadType);
+
+ if (!packet.data) {
+ return packet;
+ }
+
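+      // If the payload exceeds the offload threshold, upload it to the object store and return a pointer packet instead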
+ const { needsOffloading, size } = packetRequiresOffloading(
+ packet,
+ env.TASK_PAYLOAD_OFFLOAD_THRESHOLD
+ );
+
+ if (!needsOffloading) {
+ return packet;
+ }
+
+ const filename = `${pathPrefix}/payload.json`;
+
+ await uploadPacketToObjectStore(filename, packet.data, packet.dataType, environment);
+
+ return {
+ data: filename,
+ dataType: "application/store",
+ };
+ });
+ }
+
+ #createPayloadPacket(payload: any, payloadType: string): IOPacket {
+ if (payloadType === "application/json") {
+ return { data: JSON.stringify(payload), dataType: "application/json" };
+ }
+
+ if (typeof payload === "string") {
+ return { data: payload, dataType: payloadType };
+ }
+
+ return { dataType: payloadType };
+ }
+}
diff --git a/apps/webapp/app/v3/services/triggerTaskV2.server.ts b/apps/webapp/app/v3/services/triggerTaskV2.server.ts
new file mode 100644
index 0000000000..dacb7e1dee
--- /dev/null
+++ b/apps/webapp/app/v3/services/triggerTaskV2.server.ts
@@ -0,0 +1,566 @@
+import { RunEngine, RunDuplicateIdempotencyKeyError } from "@internal/run-engine";
+import {
+ IOPacket,
+ packetRequiresOffloading,
+ QueueOptions,
+ SemanticInternalAttributes,
+ TriggerTaskRequestBody,
+} from "@trigger.dev/core/v3";
+import { BatchId, RunId, sanitizeQueueName, stringifyDuration } from "@trigger.dev/core/v3/apps";
+import { Prisma, TaskRun } from "@trigger.dev/database";
+import { env } from "~/env.server";
+import { createTag, MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server";
+import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { autoIncrementCounter } from "~/services/autoIncrementCounter.server";
+import { logger } from "~/services/logger.server";
+import { getEntitlement } from "~/services/platform.v3.server";
+import { parseDelay } from "~/utils/delays";
+import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server";
+import { handleMetadataPacket } from "~/utils/packets";
+import { eventRepository } from "../eventRepository.server";
+import { findCurrentWorkerFromEnvironment } from "../models/workerDeployment.server";
+import { uploadPacketToObjectStore } from "../r2.server";
+import { isFinalRunStatus } from "../taskStatus";
+import { startActiveSpan } from "../tracer.server";
+import { clampMaxDuration } from "../utils/maxDuration";
+import { ServiceValidationError, WithRunEngine } from "./baseService.server";
+import {
+ MAX_ATTEMPTS,
+ OutOfEntitlementError,
+ TriggerTaskServiceOptions,
+ TriggerTaskServiceResult,
+} from "./triggerTask.server";
+import { WorkerGroupService } from "./worker/workerGroupService.server";
+import { getTaskEventStore } from "../taskEventStore.server";
+
+/** @deprecated Use TriggerTaskService in `triggerTask.server.ts` instead. */
+export class TriggerTaskServiceV2 extends WithRunEngine {
+ public async call({
+ taskId,
+ environment,
+ body,
+ options = {},
+ attempt = 0,
+ }: {
+ taskId: string;
+ environment: AuthenticatedEnvironment;
+ body: TriggerTaskRequestBody;
+ options?: TriggerTaskServiceOptions;
+ attempt?: number;
+  }): Promise<TriggerTaskServiceResult | undefined> {
+ return await this.traceWithEnv("call()", environment, async (span) => {
+ span.setAttribute("taskId", taskId);
+ span.setAttribute("attempt", attempt);
+
+ if (attempt > MAX_ATTEMPTS) {
+ throw new ServiceValidationError(
+ `Failed to trigger ${taskId} after ${MAX_ATTEMPTS} attempts.`
+ );
+ }
+
+ const idempotencyKey = options.idempotencyKey ?? body.options?.idempotencyKey;
+ const idempotencyKeyExpiresAt =
+ options.idempotencyKeyExpiresAt ??
+ resolveIdempotencyKeyTTL(body.options?.idempotencyKeyTTL) ??
+ new Date(Date.now() + 24 * 60 * 60 * 1000 * 30); // 30 days
+
+ const delayUntil = await parseDelay(body.options?.delay);
+
+ const ttl =
+ typeof body.options?.ttl === "number"
+ ? stringifyDuration(body.options?.ttl)
+ : body.options?.ttl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined);
+
+ const existingRun = idempotencyKey
+ ? await this._prisma.taskRun.findFirst({
+ where: {
+ runtimeEnvironmentId: environment.id,
+ idempotencyKey,
+ taskIdentifier: taskId,
+ },
+ include: {
+ associatedWaitpoint: true,
+ },
+ })
+ : undefined;
+
+ if (existingRun) {
+ if (
+ existingRun.idempotencyKeyExpiresAt &&
+ existingRun.idempotencyKeyExpiresAt < new Date()
+ ) {
+ logger.debug("[TriggerTaskService][call] Idempotency key has expired", {
+ idempotencyKey: options.idempotencyKey,
+ run: existingRun,
+ });
+
+ // Update the existing run to remove the idempotency key
+ await this._prisma.taskRun.update({
+ where: { id: existingRun.id },
+ data: { idempotencyKey: null },
+ });
+ } else {
+ span.setAttribute("runId", existingRun.friendlyId);
+
+ //We're using `andWait` so we need to block the parent run with a waitpoint
+ if (
+ existingRun.associatedWaitpoint &&
+ body.options?.resumeParentOnCompletion &&
+ body.options?.parentRunId
+ ) {
+ await eventRepository.traceEvent(
+ `${taskId} (cached)`,
+ {
+ context: options.traceContext,
+ spanParentAsLink: options.spanParentAsLink,
+ parentAsLinkType: options.parentAsLinkType,
+ kind: "SERVER",
+ environment,
+ taskSlug: taskId,
+ attributes: {
+ properties: {
+ [SemanticInternalAttributes.SHOW_ACTIONS]: true,
+ [SemanticInternalAttributes.ORIGINAL_RUN_ID]: existingRun.friendlyId,
+ },
+ style: {
+ icon: "task-cached",
+ },
+ runIsTest: body.options?.test ?? false,
+ batchId: options.batchId ? BatchId.toFriendlyId(options.batchId) : undefined,
+ idempotencyKey,
+ runId: existingRun.friendlyId,
+ },
+ incomplete: existingRun.associatedWaitpoint.status === "PENDING",
+ isError: existingRun.associatedWaitpoint.outputIsError,
+ immediate: true,
+ },
+ async (event) => {
+ //log a message
+ await eventRepository.recordEvent(
+ `There's an existing run for idempotencyKey: ${idempotencyKey}`,
+ {
+ taskSlug: taskId,
+ environment,
+ attributes: {
+ runId: existingRun.friendlyId,
+ },
+ context: options.traceContext,
+ parentId: event.spanId,
+ }
+ );
+ //block run with waitpoint
+ await this._engine.blockRunWithWaitpoint({
+ runId: RunId.fromFriendlyId(body.options!.parentRunId!),
+ waitpoints: existingRun.associatedWaitpoint!.id,
+ spanIdToComplete: event.spanId,
+ batch: options?.batchId
+ ? {
+ id: options.batchId,
+ index: options.batchIndex ?? 0,
+ }
+ : undefined,
+ environmentId: environment.id,
+ projectId: environment.projectId,
+ organizationId: environment.organizationId,
+ tx: this._prisma,
+ });
+ }
+ );
+ }
+
+ return { run: existingRun, isCached: true };
+ }
+ }
+
+ if (environment.type !== "DEVELOPMENT") {
+ const result = await getEntitlement(environment.organizationId);
+ if (result && result.hasAccess === false) {
+ throw new OutOfEntitlementError();
+ }
+ }
+
+ if (!options.skipChecks) {
+ const queueSizeGuard = await guardQueueSizeLimitsForEnv(this._engine, environment);
+
+ logger.debug("Queue size guard result", {
+ queueSizeGuard,
+ environment: {
+ id: environment.id,
+ type: environment.type,
+ organization: environment.organization,
+ project: environment.project,
+ },
+ });
+
+ if (!queueSizeGuard.isWithinLimits) {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`
+ );
+ }
+ }
+
+ if (
+ body.options?.tags &&
+ typeof body.options.tags !== "string" &&
+ body.options.tags.length > MAX_TAGS_PER_RUN
+ ) {
+ throw new ServiceValidationError(
+ `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${body.options.tags.length}.`
+ );
+ }
+
+ const runFriendlyId = options?.runFriendlyId ?? RunId.generate().friendlyId;
+
+ const payloadPacket = await this.#handlePayloadPacket(
+ body.payload,
+ body.options?.payloadType ?? "application/json",
+ runFriendlyId,
+ environment
+ );
+
+ const metadataPacket = body.options?.metadata
+ ? handleMetadataPacket(
+ body.options?.metadata,
+ body.options?.metadataType ?? "application/json"
+ )
+ : undefined;
+
+ const parentRun = body.options?.parentRunId
+ ? await this._prisma.taskRun.findFirst({
+ where: { id: RunId.fromFriendlyId(body.options.parentRunId) },
+ })
+ : undefined;
+
+ if (parentRun && isFinalRunStatus(parentRun.status)) {
+ logger.debug("Parent run is in a terminal state", {
+ parentRun,
+ });
+
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as the parent run has a status of ${parentRun.status}`
+ );
+ }
+
+ try {
+ return await eventRepository.traceEvent(
+ taskId,
+ {
+ context: options.traceContext,
+ spanParentAsLink: options.spanParentAsLink,
+ parentAsLinkType: options.parentAsLinkType,
+ kind: "SERVER",
+ environment,
+ taskSlug: taskId,
+ attributes: {
+ properties: {
+ [SemanticInternalAttributes.SHOW_ACTIONS]: true,
+ },
+ style: {
+ icon: options.customIcon ?? "task",
+ },
+ runIsTest: body.options?.test ?? false,
+ batchId: options.batchId ? BatchId.toFriendlyId(options.batchId) : undefined,
+ idempotencyKey,
+ },
+ incomplete: true,
+ immediate: true,
+ },
+ async (event, traceContext, traceparent) => {
+ const run = await autoIncrementCounter.incrementInTransaction(
+ `v3-run:${environment.id}:${taskId}`,
+ async (num, tx) => {
+ const lockedToBackgroundWorker = body.options?.lockToVersion
+ ? await tx.backgroundWorker.findFirst({
+ where: {
+ projectId: environment.projectId,
+ runtimeEnvironmentId: environment.id,
+ version: body.options?.lockToVersion,
+ },
+ })
+ : undefined;
+
+ let queueName = sanitizeQueueName(
+ await this.#getQueueName(taskId, environment, body.options?.queue?.name)
+ );
+
+                  // Check that the queue name is not an empty string
+ if (!queueName) {
+ queueName = sanitizeQueueName(`task/${taskId}`);
+ }
+
+ event.setAttribute("queueName", queueName);
+ span.setAttribute("queueName", queueName);
+
+ //upsert tags
+ let tags: { id: string; name: string }[] = [];
+ const bodyTags =
+ typeof body.options?.tags === "string" ? [body.options.tags] : body.options?.tags;
+ if (bodyTags && bodyTags.length > 0) {
+ for (const tag of bodyTags) {
+ const tagRecord = await createTag({
+ tag,
+ projectId: environment.projectId,
+ });
+ if (tagRecord) {
+ tags.push(tagRecord);
+ }
+ }
+ }
+
+ const depth = parentRun ? parentRun.depth + 1 : 0;
+
+ event.setAttribute("runId", runFriendlyId);
+ span.setAttribute("runId", runFriendlyId);
+
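+                  // Resolve the default worker group for this project; its master queue is where the run will be enqueued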
+ const workerGroupService = new WorkerGroupService({
+ prisma: this._prisma,
+ engine: this._engine,
+ });
+ const workerGroup = await workerGroupService.getDefaultWorkerGroupForProject({
+ projectId: environment.projectId,
+ });
+
+ if (!workerGroup) {
+ logger.error("Default worker group not found", {
+ projectId: environment.projectId,
+ });
+
+ return;
+ }
+
+ const taskRun = await this._engine.trigger(
+ {
+ number: num,
+ friendlyId: runFriendlyId,
+ environment: environment,
+ idempotencyKey,
+ idempotencyKeyExpiresAt: idempotencyKey ? idempotencyKeyExpiresAt : undefined,
+ taskIdentifier: taskId,
+ payload: payloadPacket.data ?? "",
+ payloadType: payloadPacket.dataType,
+ context: body.context,
+ traceContext: traceContext,
+ traceId: event.traceId,
+ spanId: event.spanId,
+ parentSpanId:
+ options.parentAsLinkType === "replay" ? undefined : traceparent?.spanId,
+ lockedToVersionId: lockedToBackgroundWorker?.id,
+ taskVersion: lockedToBackgroundWorker?.version,
+ sdkVersion: lockedToBackgroundWorker?.sdkVersion,
+ cliVersion: lockedToBackgroundWorker?.cliVersion,
+ concurrencyKey: body.options?.concurrencyKey,
+ queueName,
+ queue: body.options?.queue,
+ masterQueue: workerGroup.masterQueue,
+ isTest: body.options?.test ?? false,
+ delayUntil,
+ queuedAt: delayUntil ? undefined : new Date(),
+ maxAttempts: body.options?.maxAttempts,
+ taskEventStore: getTaskEventStore(),
+ ttl,
+ tags,
+ oneTimeUseToken: options.oneTimeUseToken,
+ parentTaskRunId: parentRun?.id,
+ rootTaskRunId: parentRun?.rootTaskRunId ?? parentRun?.id,
+ batch: options?.batchId
+ ? {
+ id: options.batchId,
+ index: options.batchIndex ?? 0,
+ }
+ : undefined,
+ resumeParentOnCompletion: body.options?.resumeParentOnCompletion,
+ depth,
+ metadata: metadataPacket?.data,
+ metadataType: metadataPacket?.dataType,
+ seedMetadata: metadataPacket?.data,
+ seedMetadataType: metadataPacket?.dataType,
+ maxDurationInSeconds: body.options?.maxDuration
+ ? clampMaxDuration(body.options.maxDuration)
+ : undefined,
+ machine: body.options?.machine,
+ priorityMs: body.options?.priority ? body.options.priority * 1_000 : undefined,
+ },
+ this._prisma
+ );
+
+ return { run: taskRun, isCached: false };
+ },
+ async (_, tx) => {
+ const counter = await tx.taskRunNumberCounter.findFirst({
+ where: {
+ taskIdentifier: taskId,
+ environmentId: environment.id,
+ },
+ select: { lastNumber: true },
+ });
+
+ return counter?.lastNumber;
+ },
+ this._prisma
+ );
+
+ return run;
+ }
+ );
+ } catch (error) {
+ if (error instanceof RunDuplicateIdempotencyKeyError) {
+ //retry calling this function, because this time it will return the idempotent run
+ return await this.call({ taskId, environment, body, options, attempt: attempt + 1 });
+ }
+
+ // Detect a prisma transaction Unique constraint violation
+ if (error instanceof Prisma.PrismaClientKnownRequestError) {
+ logger.debug("TriggerTask: Prisma transaction error", {
+ code: error.code,
+ message: error.message,
+ meta: error.meta,
+ });
+
+ if (error.code === "P2002") {
+ const target = error.meta?.target;
+
+ if (
+ Array.isArray(target) &&
+ target.length > 0 &&
+ typeof target[0] === "string" &&
+ target[0].includes("oneTimeUseToken")
+ ) {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} with a one-time use token as it has already been used.`
+ );
+ } else {
+ throw new ServiceValidationError(
+ `Cannot trigger ${taskId} as it has already been triggered with the same idempotency key.`
+ );
+ }
+ }
+ }
+
+ throw error;
+ }
+ });
+ }
+
+ async #getQueueName(taskId: string, environment: AuthenticatedEnvironment, queueName?: string) {
+ if (queueName) {
+ return queueName;
+ }
+
+ const defaultQueueName = `task/${taskId}`;
+
+ const worker = await findCurrentWorkerFromEnvironment(environment);
+
+ if (!worker) {
+ logger.debug("Failed to get queue name: No worker found", {
+ taskId,
+ environmentId: environment.id,
+ });
+
+ return defaultQueueName;
+ }
+
+ const task = await this._prisma.backgroundWorkerTask.findFirst({
+ where: {
+ workerId: worker.id,
+ slug: taskId,
+ },
+ });
+
+ if (!task) {
+ console.log("Failed to get queue name: No task found", {
+ taskId,
+ environmentId: environment.id,
+ });
+
+ return defaultQueueName;
+ }
+
+ const queueConfig = QueueOptions.optional().nullable().safeParse(task.queueConfig);
+
+ if (!queueConfig.success) {
+ console.log("Failed to get queue name: Invalid queue config", {
+ taskId,
+ environmentId: environment.id,
+ queueConfig: task.queueConfig,
+ });
+
+ return defaultQueueName;
+ }
+
+ return queueConfig.data?.name ?? defaultQueueName;
+ }
+
+ async #handlePayloadPacket(
+ payload: any,
+ payloadType: string,
+ pathPrefix: string,
+ environment: AuthenticatedEnvironment
+ ) {
+ return await startActiveSpan("handlePayloadPacket()", async (span) => {
+ const packet = this.#createPayloadPacket(payload, payloadType);
+
+ if (!packet.data) {
+ return packet;
+ }
+
+ const { needsOffloading, size } = packetRequiresOffloading(
+ packet,
+ env.TASK_PAYLOAD_OFFLOAD_THRESHOLD
+ );
+
+ if (!needsOffloading) {
+ return packet;
+ }
+
+ const filename = `${pathPrefix}/payload.json`;
+
+ await uploadPacketToObjectStore(filename, packet.data, packet.dataType, environment);
+
+ return {
+ data: filename,
+ dataType: "application/store",
+ };
+ });
+ }
+
+ #createPayloadPacket(payload: any, payloadType: string): IOPacket {
+ if (payloadType === "application/json") {
+ return { data: JSON.stringify(payload), dataType: "application/json" };
+ }
+
+ if (typeof payload === "string") {
+ return { data: payload, dataType: payloadType };
+ }
+
+ return { dataType: payloadType };
+ }
+}
+
+function getMaximumSizeForEnvironment(environment: AuthenticatedEnvironment): number | undefined {
+ if (environment.type === "DEVELOPMENT") {
+ return environment.organization.maximumDevQueueSize ?? env.MAXIMUM_DEV_QUEUE_SIZE;
+ } else {
+ return environment.organization.maximumDeployedQueueSize ?? env.MAXIMUM_DEPLOYED_QUEUE_SIZE;
+ }
+}
+
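+/**
+ * Checks whether adding `itemsToAdd` runs would exceed the environment's queue size limit.
+ * Returns `isWithinLimits: true` when no limit is configured.
+ */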
+export async function guardQueueSizeLimitsForEnv(
+ engine: RunEngine,
+ environment: AuthenticatedEnvironment,
+ itemsToAdd: number = 1
+) {
+ const maximumSize = getMaximumSizeForEnvironment(environment);
+
+ if (typeof maximumSize === "undefined") {
+ return { isWithinLimits: true };
+ }
+
+ const queueSize = await engine.lengthOfEnvQueue(environment);
+ const projectedSize = queueSize + itemsToAdd;
+
+ return {
+ isWithinLimits: projectedSize <= maximumSize,
+ maximumSize,
+ queueSize,
+ };
+}
diff --git a/apps/webapp/app/v3/services/worker/workerGroupService.server.ts b/apps/webapp/app/v3/services/worker/workerGroupService.server.ts
new file mode 100644
index 0000000000..24d457a882
--- /dev/null
+++ b/apps/webapp/app/v3/services/worker/workerGroupService.server.ts
@@ -0,0 +1,263 @@
+import { WorkerInstanceGroup, WorkerInstanceGroupType } from "@trigger.dev/database";
+import { WithRunEngine } from "../baseService.server";
+import { WorkerGroupTokenService } from "./workerGroupTokenService.server";
+import { logger } from "~/services/logger.server";
+import { makeFlags, makeSetFlags } from "~/v3/featureFlags.server";
+
+export class WorkerGroupService extends WithRunEngine {
+ private readonly defaultNamePrefix = "worker_group";
+
+ async createWorkerGroup({
+ projectId,
+ organizationId,
+ name,
+ description,
+ }: {
+ projectId?: string;
+ organizationId?: string;
+ name?: string;
+ description?: string;
+ }) {
+ if (!name) {
+ name = await this.generateWorkerName({ projectId });
+ }
+
+ const tokenService = new WorkerGroupTokenService({
+ prisma: this._prisma,
+ engine: this._engine,
+ });
+ const token = await tokenService.createToken();
+
+ const workerGroup = await this._prisma.workerInstanceGroup.create({
+ data: {
+ projectId,
+ organizationId,
+ type: projectId ? WorkerInstanceGroupType.UNMANAGED : WorkerInstanceGroupType.MANAGED,
+ masterQueue: this.generateMasterQueueName({ projectId, name }),
+ tokenId: token.id,
+ description,
+ name,
+ },
+ });
+
+ if (workerGroup.type === WorkerInstanceGroupType.MANAGED) {
+ const managedCount = await this._prisma.workerInstanceGroup.count({
+ where: {
+ type: WorkerInstanceGroupType.MANAGED,
+ },
+ });
+
+ if (managedCount === 1) {
+ const setFlag = makeSetFlags(this._prisma);
+ await setFlag({
+ key: "defaultWorkerInstanceGroupId",
+ value: workerGroup.id,
+ });
+ }
+ }
+
+ return {
+ workerGroup,
+ token,
+ };
+ }
+
+ /**
+ This updates a single worker group.
+ The name should never be updated. This would mean changing the masterQueue name which can have unexpected consequences.
+ */
+ async updateWorkerGroup({
+ projectId,
+ workerGroupId,
+ description,
+ }: {
+ projectId: string;
+ workerGroupId: string;
+ description?: string;
+ }) {
+ const workerGroup = await this._prisma.workerInstanceGroup.findUnique({
+ where: {
+ id: workerGroupId,
+ projectId,
+ },
+ });
+
+ if (!workerGroup) {
+ logger.error("[WorkerGroupService] No worker group found for update", {
+ workerGroupId,
+ description,
+ });
+ return;
+ }
+
+ await this._prisma.workerInstanceGroup.update({
+ where: {
+ id: workerGroup.id,
+ },
+ data: {
+ description,
+ },
+ });
+ }
+
+ /**
+ This lists worker groups.
+ Without a project ID, only shared worker groups will be returned.
+ With a project ID, in addition to all shared worker groups, ones associated with the project will also be returned.
+ */
+ async listWorkerGroups({ projectId, listHidden }: { projectId?: string; listHidden?: boolean }) {
+ const workerGroups = await this._prisma.workerInstanceGroup.findMany({
+ where: {
+ OR: [
+ {
+ type: WorkerInstanceGroupType.MANAGED,
+ },
+ {
+ projectId,
+ },
+ ],
+ AND: listHidden ? [] : [{ hidden: false }],
+ },
+ });
+
+ return workerGroups;
+ }
+
+ async deleteWorkerGroup({
+ projectId,
+ workerGroupId,
+ }: {
+ projectId: string;
+ workerGroupId: string;
+ }) {
+ const workerGroup = await this._prisma.workerInstanceGroup.findUnique({
+ where: {
+ id: workerGroupId,
+ },
+ });
+
+ if (!workerGroup) {
+ logger.error("[WorkerGroupService] WorkerGroup not found for deletion", {
+ workerGroupId,
+ projectId,
+ });
+ return;
+ }
+
+ if (workerGroup.projectId !== projectId) {
+ logger.error("[WorkerGroupService] WorkerGroup does not belong to project", {
+ workerGroupId,
+ projectId,
+ });
+ return;
+ }
+
+ await this._prisma.workerInstanceGroup.delete({
+ where: {
+ id: workerGroupId,
+ },
+ });
+ }
+
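+  /** Resolves the instance-wide default worker group from the `defaultWorkerInstanceGroupId` feature flag. */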
+ async getGlobalDefaultWorkerGroup() {
+ const flags = makeFlags(this._prisma);
+
+ const defaultWorkerInstanceGroupId = await flags({
+ key: "defaultWorkerInstanceGroupId",
+ });
+
+ if (!defaultWorkerInstanceGroupId) {
+ logger.error("[WorkerGroupService] Default worker group not found in feature flags");
+ return;
+ }
+
+ const workerGroup = await this._prisma.workerInstanceGroup.findUnique({
+ where: {
+ id: defaultWorkerInstanceGroupId,
+ },
+ });
+
+ if (!workerGroup) {
+ logger.error("[WorkerGroupService] Default worker group not found", {
+ defaultWorkerInstanceGroupId,
+ });
+ return;
+ }
+
+ return workerGroup;
+ }
+
+ async getDefaultWorkerGroupForProject({
+ projectId,
+ }: {
+ projectId: string;
+  }): Promise<WorkerInstanceGroup | undefined> {
+ const project = await this._prisma.project.findUnique({
+ where: {
+ id: projectId,
+ },
+ include: {
+ defaultWorkerGroup: true,
+ },
+ });
+
+ if (!project) {
+ logger.error("[WorkerGroupService] Project not found", { projectId });
+ return;
+ }
+
+ if (project.defaultWorkerGroup) {
+ return project.defaultWorkerGroup;
+ }
+
+ return await this.getGlobalDefaultWorkerGroup();
+ }
+
+ async setDefaultWorkerGroupForProject({
+ projectId,
+ workerGroupId,
+ }: {
+ projectId: string;
+ workerGroupId: string;
+ }) {
+ const workerGroup = await this._prisma.workerInstanceGroup.findUnique({
+ where: {
+ id: workerGroupId,
+ },
+ });
+
+ if (!workerGroup) {
+ logger.error("[WorkerGroupService] WorkerGroup not found", {
+ workerGroupId,
+ });
+ return;
+ }
+
+ await this._prisma.project.update({
+ where: {
+ id: projectId,
+ },
+ data: {
+ defaultWorkerGroupId: workerGroupId,
+ },
+ });
+ }
+
+ private async generateWorkerName({ projectId }: { projectId?: string }) {
+ const workerGroups = await this._prisma.workerInstanceGroup.count({
+ where: {
+ projectId: projectId ?? null,
+ },
+ });
+
+ return `${this.defaultNamePrefix}_${workerGroups + 1}`;
+ }
+
+ private generateMasterQueueName({ projectId, name }: { projectId?: string; name: string }) {
+ if (!projectId) {
+ return name;
+ }
+
+ return `${projectId}-${name}`;
+ }
+}
diff --git a/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts b/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts
new file mode 100644
index 0000000000..1c58efcf6a
--- /dev/null
+++ b/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts
@@ -0,0 +1,824 @@
+import { customAlphabet } from "nanoid";
+import { WithRunEngine, WithRunEngineOptions } from "../baseService.server";
+import { createHash, timingSafeEqual } from "crypto";
+import { logger } from "~/services/logger.server";
+import {
+ Prisma,
+ RuntimeEnvironment,
+ WorkerInstanceGroup,
+ WorkerInstanceGroupType,
+} from "@trigger.dev/database";
+import { z } from "zod";
+import { WORKER_HEADERS } from "@trigger.dev/core/v3/workers";
+import {
+ TaskRunExecutionResult,
+ DequeuedMessage,
+ CompleteRunAttemptResult,
+ StartRunAttemptResult,
+ ExecutionResult,
+ MachinePreset,
+ MachineResources,
+ CheckpointInput,
+} from "@trigger.dev/core/v3";
+import { env } from "~/env.server";
+import { $transaction } from "~/db.server";
+import { resolveVariablesForEnvironment } from "~/v3/environmentVariables/environmentVariablesRepository.server";
+import { generateJWTTokenForEnvironment } from "~/services/apiAuth.server";
+import { CURRENT_UNMANAGED_DEPLOYMENT_LABEL, fromFriendlyId } from "@trigger.dev/core/v3/apps";
+import { machinePresetFromName } from "~/v3/machinePresets.server";
+import { defaultMachine } from "@trigger.dev/platform/v3";
+
+export class WorkerGroupTokenService extends WithRunEngine {
+ private readonly tokenPrefix = "tr_wgt_";
+ private readonly tokenLength = 40;
+ private readonly tokenChars = "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ private readonly tokenGenerator = customAlphabet(this.tokenChars, this.tokenLength);
+
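+  /** Creates a new worker group token. Only the SHA-256 hash is persisted; the plaintext is returned once and cannot be recovered later. */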
+ async createToken() {
+ const rawToken = await this.generateToken();
+
+ const workerGroupToken = await this._prisma.workerGroupToken.create({
+ data: {
+ tokenHash: rawToken.hash,
+ },
+ });
+
+ return {
+ id: workerGroupToken.id,
+ tokenHash: workerGroupToken.tokenHash,
+ plaintext: rawToken.plaintext,
+ };
+ }
+
+ async findWorkerGroup({ token }: { token: string }) {
+ const tokenHash = await this.hashToken(token);
+
+ const workerGroup = await this._prisma.workerInstanceGroup.findFirst({
+ where: {
+ token: {
+ tokenHash,
+ },
+ },
+ });
+
+ if (!workerGroup) {
+ logger.warn("[WorkerGroupTokenService] No matching worker group found", { token });
+ return null;
+ }
+
+ return workerGroup;
+ }
+
+ async rotateToken({ workerGroupId }: { workerGroupId: string }) {
+ const workerGroup = await this._prisma.workerInstanceGroup.findUnique({
+ where: {
+ id: workerGroupId,
+ },
+ });
+
+ if (!workerGroup) {
+ logger.error("[WorkerGroupTokenService] WorkerGroup not found", { workerGroupId });
+ return;
+ }
+
+ const rawToken = await this.generateToken();
+
+ const workerGroupToken = await this._prisma.workerGroupToken.update({
+ where: {
+ id: workerGroup.tokenId,
+ },
+ data: {
+ tokenHash: rawToken.hash,
+ },
+ });
+
+ if (!workerGroupToken) {
+ logger.error("[WorkerGroupTokenService] WorkerGroupToken not found", { workerGroupId });
+ return;
+ }
+
+ return {
+ id: workerGroupToken.id,
+ tokenHash: workerGroupToken.tokenHash,
+ plaintext: rawToken.plaintext,
+ };
+ }
+
+ private async hashToken(token: string) {
+ return createHash("sha256").update(token).digest("hex");
+ }
+
+ private async generateToken() {
+ const plaintext = `${this.tokenPrefix}${this.tokenGenerator()}`;
+ const hash = await this.hashToken(plaintext);
+
+ return {
+ plaintext,
+ hash,
+ };
+ }
+
+  async authenticate(request: Request): Promise<AuthenticatedWorkerInstance | undefined> {
+ const token = request.headers.get("Authorization")?.replace("Bearer ", "").trim();
+
+ if (!token) {
+ logger.error("[WorkerGroupTokenService] Token not found in request", {
+ headers: this.sanitizeHeaders(request),
+ });
+ return;
+ }
+
+ if (!token.startsWith(this.tokenPrefix)) {
+ logger.error("[WorkerGroupTokenService] Token does not start with expected prefix", {
+ token,
+ prefix: this.tokenPrefix,
+ });
+ return;
+ }
+
+ const instanceName = request.headers.get(WORKER_HEADERS.INSTANCE_NAME);
+
+ if (!instanceName) {
+ logger.error("[WorkerGroupTokenService] Instance name not found in request", {
+ headers: this.sanitizeHeaders(request),
+ });
+ return;
+ }
+
+ const workerGroup = await this.findWorkerGroup({ token });
+
+ if (!workerGroup) {
+ logger.warn("[WorkerGroupTokenService] Worker group not found", { token });
+ return;
+ }
+
+ if (workerGroup.type === WorkerInstanceGroupType.MANAGED) {
+ const managedWorkerSecret = request.headers.get(WORKER_HEADERS.MANAGED_SECRET);
+
+ if (!managedWorkerSecret) {
+ logger.error("[WorkerGroupTokenService] Managed secret not found in request", {
+ headers: this.sanitizeHeaders(request),
+ });
+ return;
+ }
+
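+      // Compare the provided secret against the configured managed worker secret in constant time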
+ const encoder = new TextEncoder();
+
+ const a = encoder.encode(managedWorkerSecret);
+ const b = encoder.encode(env.MANAGED_WORKER_SECRET);
+
+ if (a.byteLength !== b.byteLength) {
+ logger.error("[WorkerGroupTokenService] Managed secret length mismatch", {
+ managedWorkerSecret,
+ headers: this.sanitizeHeaders(request),
+ });
+ return;
+ }
+
+ if (!timingSafeEqual(a, b)) {
+ logger.error("[WorkerGroupTokenService] Managed secret mismatch", {
+ managedWorkerSecret,
+ headers: this.sanitizeHeaders(request),
+ });
+ return;
+ }
+ }
+
+ const workerInstance = await this.getOrCreateWorkerInstance({
+ workerGroup,
+ instanceName,
+ deploymentId: request.headers.get(WORKER_HEADERS.DEPLOYMENT_ID) ?? undefined,
+ });
+
+ if (!workerInstance) {
+ logger.error("[WorkerGroupTokenService] Unable to get or create worker instance", {
+ workerGroup,
+ instanceName,
+ });
+ return;
+ }
+
+ if (workerGroup.type === WorkerInstanceGroupType.MANAGED) {
+ return new AuthenticatedWorkerInstance({
+ prisma: this._prisma,
+ engine: this._engine,
+ type: WorkerInstanceGroupType.MANAGED,
+ name: workerGroup.name,
+ workerGroupId: workerGroup.id,
+ workerInstanceId: workerInstance.id,
+ masterQueue: workerGroup.masterQueue,
+ environment: null,
+ runnerId: request.headers.get(WORKER_HEADERS.RUNNER_ID) ?? undefined,
+ });
+ }
+
+ if (!workerInstance.environment) {
+ logger.error(
+ "[WorkerGroupTokenService] Unmanaged worker instance not linked to environment",
+ { workerGroup, workerInstance }
+ );
+ return;
+ }
+
+ if (!workerInstance.deployment) {
+ logger.error("[WorkerGroupTokenService] Unmanaged worker instance not linked to deployment", {
+ workerGroup,
+ workerInstance,
+ });
+ return;
+ }
+
+ if (!workerInstance.deployment.workerId) {
+ logger.error(
+ "[WorkerGroupTokenService] Unmanaged worker instance deployment not linked to background worker",
+ { workerGroup, workerInstance }
+ );
+ return;
+ }
+
+ return new AuthenticatedWorkerInstance({
+ prisma: this._prisma,
+ engine: this._engine,
+ type: WorkerInstanceGroupType.UNMANAGED,
+ name: workerGroup.name,
+ workerGroupId: workerGroup.id,
+ workerInstanceId: workerInstance.id,
+ masterQueue: workerGroup.masterQueue,
+ environmentId: workerInstance.environment.id,
+ deploymentId: workerInstance.deployment.id,
+ backgroundWorkerId: workerInstance.deployment.workerId,
+ environment: workerInstance.environment,
+ });
+ }
+
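+  /** Finds the worker instance for this group and resource identifier, creating it if necessary and handling create races on first connection. */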
+ private async getOrCreateWorkerInstance({
+ workerGroup,
+ instanceName,
+ deploymentId,
+ }: {
+ workerGroup: WorkerInstanceGroup;
+ instanceName: string;
+ deploymentId?: string;
+ }) {
+ return await $transaction(this._prisma, async (tx) => {
+ const resourceIdentifier = deploymentId ? `${deploymentId}:${instanceName}` : instanceName;
+
+ const workerInstance = await tx.workerInstance.findUnique({
+ where: {
+ workerGroupId_resourceIdentifier: {
+ workerGroupId: workerGroup.id,
+ resourceIdentifier,
+ },
+ },
+ include: {
+ deployment: true,
+ environment: true,
+ },
+ });
+
+ if (workerInstance) {
+ return workerInstance;
+ }
+
+ if (workerGroup.type === WorkerInstanceGroupType.MANAGED) {
+ if (deploymentId) {
+ logger.warn(
+ "[WorkerGroupTokenService] Shared worker group instances should not authenticate with a deployment ID",
+ {
+ workerGroup,
+ workerInstance,
+ deploymentId,
+ }
+ );
+ }
+
+ try {
+ const newWorkerInstance = await tx.workerInstance.create({
+ data: {
+ workerGroupId: workerGroup.id,
+ name: instanceName,
+ resourceIdentifier,
+ },
+ include: {
+ // This will always be empty for shared worker instances, but required for types
+ deployment: true,
+ environment: true,
+ },
+ });
+ return newWorkerInstance;
+ } catch (error) {
+ // Gracefully handle race conditions when connecting for the first time
+ if (error instanceof Prisma.PrismaClientKnownRequestError) {
+ // Unique constraint violation
+ if (error.code === "P2002") {
+ try {
+ const existingWorkerInstance = await tx.workerInstance.findUnique({
+ where: {
+ workerGroupId_resourceIdentifier: {
+ workerGroupId: workerGroup.id,
+ resourceIdentifier,
+ },
+ },
+ include: {
+ deployment: true,
+ environment: true,
+ },
+ });
+ return existingWorkerInstance;
+ } catch (error) {
+ logger.error("[WorkerGroupTokenService] Failed to find worker instance", {
+ workerGroup,
+ workerInstance,
+ deploymentId,
+ });
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ if (!workerGroup.projectId || !workerGroup.organizationId) {
+ logger.error(
+ "[WorkerGroupTokenService] Unmanaged worker group missing project or organization",
+ {
+ workerGroup,
+ workerInstance,
+ deploymentId,
+ }
+ );
+ return;
+ }
+
+ if (!deploymentId) {
+        logger.error("[WorkerGroupTokenService] Unmanaged worker group requires a deployment ID", {
+ workerGroup,
+ workerInstance,
+ });
+ return;
+ }
+
+ // Unmanaged workers instances are locked to a specific deployment version
+
+ const deployment = await tx.workerDeployment.findUnique({
+ where: {
+ ...(deploymentId.startsWith("deployment_")
+ ? {
+ friendlyId: deploymentId,
+ }
+ : {
+ id: deploymentId,
+ }),
+ },
+ });
+
+ if (!deployment) {
+ logger.error("[WorkerGroupTokenService] Deployment not found", {
+ workerGroup,
+ workerInstance,
+ deploymentId,
+ });
+ return;
+ }
+
+ if (deployment.projectId !== workerGroup.projectId) {
+ logger.error("[WorkerGroupTokenService] Deployment does not match worker group project", {
+ deployment,
+ workerGroup,
+ workerInstance,
+ });
+ return;
+ }
+
+ if (deployment.status === "DEPLOYING") {
+ // This is the first instance to be created for this deployment, so mark it as deployed
+ await tx.workerDeployment.update({
+ where: {
+ id: deployment.id,
+ },
+ data: {
+ status: "DEPLOYED",
+ deployedAt: new Date(),
+ },
+ });
+
+ // Check if the deployment should be promoted
+ const workerPromotion = await tx.workerDeploymentPromotion.findFirst({
+ where: {
+ label: CURRENT_UNMANAGED_DEPLOYMENT_LABEL,
+ environmentId: deployment.environmentId,
+ },
+ include: {
+ deployment: true,
+ },
+ });
+
+ const shouldPromote =
+ !workerPromotion || deployment.createdAt > workerPromotion.deployment.createdAt;
+
+ if (shouldPromote) {
+ // Promote the deployment
+ await tx.workerDeploymentPromotion.upsert({
+ where: {
+ environmentId_label: {
+ environmentId: deployment.environmentId,
+ label: CURRENT_UNMANAGED_DEPLOYMENT_LABEL,
+ },
+ },
+ create: {
+ deploymentId: deployment.id,
+ environmentId: deployment.environmentId,
+ label: CURRENT_UNMANAGED_DEPLOYMENT_LABEL,
+ },
+ update: {
+ deploymentId: deployment.id,
+ },
+ });
+ }
+ } else if (deployment.status !== "DEPLOYED") {
+        logger.error("[WorkerGroupTokenService] Deployment is neither deploying nor deployed", {
+ deployment,
+ workerGroup,
+ workerInstance,
+ });
+ return;
+ }
+
+ const nonSharedWorkerInstance = tx.workerInstance.create({
+ data: {
+ workerGroupId: workerGroup.id,
+ name: instanceName,
+ resourceIdentifier,
+ environmentId: deployment.environmentId,
+ deploymentId: deployment.id,
+ },
+ include: {
+ deployment: true,
+ environment: true,
+ },
+ });
+
+ return nonSharedWorkerInstance;
+ });
+ }
+
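+  /** Returns the request headers with sensitive entries (authorization by default) stripped, for safe logging. */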
+ private sanitizeHeaders(request: Request, skipHeaders = ["authorization"]) {
+    const sanitizedHeaders: Partial<Record<string, string>> = {};
+
+ for (const [key, value] of request.headers.entries()) {
+ if (!skipHeaders.includes(key.toLowerCase())) {
+ sanitizedHeaders[key] = value;
+ }
+ }
+
+ return sanitizedHeaders;
+ }
+}
+
+export const WorkerInstanceEnv = z.enum(["dev", "staging", "prod"]).default("prod");
+export type WorkerInstanceEnv = z.infer<typeof WorkerInstanceEnv>;
+
+export type AuthenticatedWorkerInstanceOptions = WithRunEngineOptions<{
+ type: WorkerInstanceGroupType;
+ name: string;
+ workerGroupId: string;
+ workerInstanceId: string;
+ masterQueue: string;
+ environmentId?: string;
+ deploymentId?: string;
+ backgroundWorkerId?: string;
+ runnerId?: string;
+ environment: RuntimeEnvironment | null;
+}>;
+
+export class AuthenticatedWorkerInstance extends WithRunEngine {
+ readonly type: WorkerInstanceGroupType;
+ readonly name: string;
+ readonly workerGroupId: string;
+ readonly workerInstanceId: string;
+ readonly runnerId?: string;
+ readonly masterQueue: string;
+ readonly environment: RuntimeEnvironment | null;
+ readonly deploymentId?: string;
+ readonly backgroundWorkerId?: string;
+
+ // FIXME: Required for unmanaged workers
+ readonly isLatestDeployment = true;
+
+ constructor(opts: AuthenticatedWorkerInstanceOptions) {
+ super({ prisma: opts.prisma, engine: opts.engine });
+
+ this.type = opts.type;
+ this.name = opts.name;
+ this.workerGroupId = opts.workerGroupId;
+ this.workerInstanceId = opts.workerInstanceId;
+ this.masterQueue = opts.masterQueue;
+ this.environment = opts.environment;
+ this.deploymentId = opts.deploymentId;
+ this.backgroundWorkerId = opts.backgroundWorkerId;
+ this.runnerId = opts.runnerId;
+ }
+
+  async connect(metadata: Record<string, any>): Promise<void> {
+ await this._prisma.workerInstance.update({
+ where: {
+ id: this.workerInstanceId,
+ },
+ data: {
+ metadata,
+ },
+ });
+ }
+
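+  /** Dequeues runs. Managed workers pull from the group's master queue; unmanaged workers pull from the queue for their linked environment and background worker. */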
+ async dequeue({
+ maxRunCount = 10,
+ maxResources,
+ }: {
+ maxRunCount?: number;
+ maxResources?: MachineResources;
+  } = {}): Promise<DequeuedMessage[]> {
+ if (this.type === WorkerInstanceGroupType.MANAGED) {
+ return await this._engine.dequeueFromMasterQueue({
+ consumerId: this.workerInstanceId,
+ masterQueue: this.masterQueue,
+ maxRunCount,
+ maxResources,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ if (!this.environment || !this.deploymentId || !this.backgroundWorkerId) {
+ logger.error("[AuthenticatedWorkerInstance] Missing environment or deployment", {
+ ...this.toJSON(),
+ });
+ return [];
+ }
+
+ await this._prisma.workerInstance.update({
+ where: {
+ id: this.workerInstanceId,
+ },
+ data: {
+ lastDequeueAt: new Date(),
+ },
+ });
+
+ if (this.isLatestDeployment) {
+ return await this._engine.dequeueFromEnvironmentMasterQueue({
+ consumerId: this.workerInstanceId,
+ environmentId: this.environment.id,
+ maxRunCount,
+ backgroundWorkerId: this.backgroundWorkerId,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ return await this._engine.dequeueFromBackgroundWorkerMasterQueue({
+ consumerId: this.workerInstanceId,
+ backgroundWorkerId: this.backgroundWorkerId,
+ maxRunCount,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ /** Allows managed workers to dequeue from a specific version */
+ async dequeueFromVersion(
+ backgroundWorkerId: string,
+ maxRunCount = 1
+  ): Promise<DequeuedMessage[]> {
+ if (this.type !== WorkerInstanceGroupType.MANAGED) {
+ logger.error("[AuthenticatedWorkerInstance] Worker instance is not managed", {
+ ...this.toJSON(),
+ });
+ return [];
+ }
+
+ return await this._engine.dequeueFromBackgroundWorkerMasterQueue({
+ consumerId: this.workerInstanceId,
+ backgroundWorkerId,
+ maxRunCount,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ /** Allows managed workers to dequeue from a specific environment */
+ async dequeueFromEnvironment(
+ backgroundWorkerId: string,
+ environmentId: string,
+ maxRunCount = 1
+  ): Promise<DequeuedMessage[]> {
+ if (this.type !== WorkerInstanceGroupType.MANAGED) {
+ logger.error("[AuthenticatedWorkerInstance] Worker instance is not managed", {
+ ...this.toJSON(),
+ });
+ return [];
+ }
+
+ return await this._engine.dequeueFromEnvironmentMasterQueue({
+ consumerId: this.workerInstanceId,
+ backgroundWorkerId,
+ environmentId,
+ maxRunCount,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ async heartbeatWorkerInstance() {
+ await this._prisma.workerInstance.update({
+ where: {
+ id: this.workerInstanceId,
+ },
+ data: {
+ lastHeartbeatAt: new Date(),
+ },
+ });
+ }
+
+ async heartbeatRun({
+ runFriendlyId,
+ snapshotFriendlyId,
+ }: {
+ runFriendlyId: string;
+ snapshotFriendlyId: string;
+  }): Promise<ExecutionResult> {
+ return await this._engine.heartbeatRun({
+ runId: fromFriendlyId(runFriendlyId),
+ snapshotId: fromFriendlyId(snapshotFriendlyId),
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ async startRunAttempt({
+ runFriendlyId,
+ snapshotFriendlyId,
+ isWarmStart,
+ }: {
+ runFriendlyId: string;
+ snapshotFriendlyId: string;
+ isWarmStart?: boolean;
+ }): Promise<
+ StartRunAttemptResult & {
+      envVars: Record<string, string>;
+ }
+ > {
+ const engineResult = await this._engine.startRunAttempt({
+ runId: fromFriendlyId(runFriendlyId),
+ snapshotId: fromFriendlyId(snapshotFriendlyId),
+ isWarmStart,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+
+ const defaultMachinePreset = machinePresetFromName(defaultMachine);
+
+ const environment =
+ this.environment ??
+ (await this._prisma.runtimeEnvironment.findUnique({
+ where: {
+ id: engineResult.execution.environment.id,
+ },
+ }));
+
+ const envVars = environment
+ ? await this.getEnvVars(
+ environment,
+ engineResult.run.id,
+ engineResult.execution.machine ?? defaultMachinePreset
+ )
+ : {};
+
+ return {
+ ...engineResult,
+ envVars,
+ };
+ }
+
+ async completeRunAttempt({
+ runFriendlyId,
+ snapshotFriendlyId,
+ completion,
+ }: {
+ runFriendlyId: string;
+ snapshotFriendlyId: string;
+ completion: TaskRunExecutionResult;
+  }): Promise<CompleteRunAttemptResult> {
+ return await this._engine.completeRunAttempt({
+ runId: fromFriendlyId(runFriendlyId),
+ snapshotId: fromFriendlyId(snapshotFriendlyId),
+ completion,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ async getLatestSnapshot({ runFriendlyId }: { runFriendlyId: string }) {
+ return await this._engine.getRunExecutionData({
+ runId: fromFriendlyId(runFriendlyId),
+ });
+ }
+
+ async createCheckpoint({
+ runFriendlyId,
+ snapshotFriendlyId,
+ checkpoint,
+ }: {
+ runFriendlyId: string;
+ snapshotFriendlyId: string;
+ checkpoint: CheckpointInput;
+ }) {
+ return await this._engine.createCheckpoint({
+ runId: fromFriendlyId(runFriendlyId),
+ snapshotId: fromFriendlyId(snapshotFriendlyId),
+ checkpoint,
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ async continueRunExecution({
+ runFriendlyId,
+ snapshotFriendlyId,
+ }: {
+ runFriendlyId: string;
+ snapshotFriendlyId: string;
+ }) {
+ return await this._engine.continueRunExecution({
+ runId: fromFriendlyId(runFriendlyId),
+ snapshotId: fromFriendlyId(snapshotFriendlyId),
+ workerId: this.workerInstanceId,
+ runnerId: this.runnerId,
+ });
+ }
+
+ toJSON(): WorkerGroupTokenAuthenticationResponse {
+ if (this.type === WorkerInstanceGroupType.MANAGED) {
+ return {
+ type: WorkerInstanceGroupType.MANAGED,
+ name: this.name,
+ workerGroupId: this.workerGroupId,
+ workerInstanceId: this.workerInstanceId,
+ masterQueue: this.masterQueue,
+ };
+ }
+
+ return {
+ type: WorkerInstanceGroupType.UNMANAGED,
+ name: this.name,
+ workerGroupId: this.workerGroupId,
+ workerInstanceId: this.workerInstanceId,
+ masterQueue: this.masterQueue,
+ environmentId: this.environment?.id!,
+ deploymentId: this.deploymentId!,
+ };
+ }
+
+ private async getEnvVars(
+ environment: RuntimeEnvironment,
+ runId: string,
+ machinePreset: MachinePreset
+ ): Promise<Record<string, string>> {
+ const variables = await resolveVariablesForEnvironment(environment);
+
+ const jwt = await generateJWTTokenForEnvironment(environment, {
+ run_id: runId,
+ machine_preset: machinePreset.name,
+ });
+
+ variables.push(
+ ...[
+ { key: "TRIGGER_JWT", value: jwt },
+ { key: "TRIGGER_RUN_ID", value: runId },
+ { key: "TRIGGER_MACHINE_PRESET", value: machinePreset.name },
+ ]
+ );
+
+ return variables.reduce((acc: Record<string, string>, curr) => {
+ acc[curr.key] = curr.value;
+ return acc;
+ }, {});
+ }
+}
+
+export type WorkerGroupTokenAuthenticationResponse =
+ | {
+ type: typeof WorkerInstanceGroupType.MANAGED;
+ name: string;
+ workerGroupId: string;
+ workerInstanceId: string;
+ masterQueue: string;
+ }
+ | {
+ type: typeof WorkerInstanceGroupType.UNMANAGED;
+ name: string;
+ workerGroupId: string;
+ workerInstanceId: string;
+ masterQueue: string;
+ environmentId: string;
+ deploymentId: string;
+ };
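
For context on how these worker group tokens are expected to be validated: the seed script further down stores only a sha256 hex digest of the raw token, and the migration adds a unique index on WorkerGroupToken.tokenHash, so the authentication side presumably hashes the presented token the same way before looking it up. A minimal sketch of that lookup-side hashing; hashToken is a hypothetical helper name, not part of this diff.

```ts
import { createHash } from "crypto";

// Hypothetical helper: derive the stored lookup key from a presented
// worker group token (matches the hashing used by the seed script below).
function hashToken(rawToken: string): string {
  return createHash("sha256").update(rawToken).digest("hex");
}

// e.g. hashToken("tr_wgt_...") would be compared against WorkerGroupToken.tokenHash
```
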
diff --git a/apps/webapp/app/v3/taskEventStore.server.ts b/apps/webapp/app/v3/taskEventStore.server.ts
index 2a677101c2..ec66fd8ce1 100644
--- a/apps/webapp/app/v3/taskEventStore.server.ts
+++ b/apps/webapp/app/v3/taskEventStore.server.ts
@@ -20,6 +20,7 @@ export type TraceEvent = Pick<
| "level"
| "events"
| "environmentType"
+ | "isDebug"
>;
export type TaskEventStoreTable = "taskEvent" | "taskEventPartitioned";
@@ -122,7 +123,7 @@ export class TaskEventStore {
) {
if (table === "taskEventPartitioned") {
return await this.readReplica.$queryRaw`
- SELECT
+ SELECT
"spanId",
"parentId",
"runId",
@@ -136,11 +137,12 @@ export class TaskEventStore {
"isCancelled",
level,
events,
- "environmentType"
+ "environmentType",
+ "isDebug"
FROM "TaskEventPartitioned"
- WHERE
- "traceId" = ${traceId}
- AND "createdAt" >= ${startCreatedAt.toISOString()}::timestamp
+ WHERE
+ "traceId" = ${traceId}
+ AND "createdAt" >= ${startCreatedAt.toISOString()}::timestamp
AND "createdAt" < ${(endCreatedAt
? new Date(endCreatedAt.getTime() + env.TASK_EVENT_PARTITIONED_WINDOW_IN_SECONDS * 1000)
: new Date()
@@ -150,7 +152,7 @@ export class TaskEventStore {
`;
} else {
return await this.readReplica.$queryRaw`
- SELECT
+ SELECT
id,
"spanId",
"parentId",
@@ -165,7 +167,8 @@ export class TaskEventStore {
"isCancelled",
level,
events,
- "environmentType"
+ "environmentType",
+ "isDebug"
FROM "TaskEvent"
WHERE "traceId" = ${traceId}
ORDER BY "startTime" ASC
diff --git a/apps/webapp/package.json b/apps/webapp/package.json
index e6f11259f9..6318adf4d7 100644
--- a/apps/webapp/package.json
+++ b/apps/webapp/package.json
@@ -50,6 +50,7 @@
"@electric-sql/react": "^0.3.5",
"@headlessui/react": "^1.7.8",
"@heroicons/react": "^2.0.12",
+ "@internal/run-engine": "workspace:*",
"@internal/zod-worker": "workspace:*",
"@internal/redis-worker": "workspace:*",
"@internationalized/date": "^3.5.1",
@@ -167,7 +168,7 @@
"remix-auth-email-link": "2.0.2",
"remix-auth-github": "^1.6.0",
"remix-typedjson": "0.3.1",
- "remix-utils": "^7.1.0",
+ "remix-utils": "^7.7.0",
"seedrandom": "^3.0.5",
"semver": "^7.5.0",
"simple-oauth2": "^5.0.0",
@@ -210,7 +211,6 @@
"@types/lodash.omit": "^4.5.7",
"@types/marked": "^4.0.3",
"@types/morgan": "^1.9.3",
- "@types/node": "^18.11.15",
"@types/node-fetch": "^2.6.2",
"@types/prismjs": "^1.26.0",
"@types/qs": "^6.9.7",
@@ -252,7 +252,6 @@
"tailwindcss": "3.4.1",
"ts-node": "^10.7.0",
"tsconfig-paths": "^3.14.1",
- "typescript": "^5.1.6",
"vite-tsconfig-paths": "^4.0.5",
"vitest": "^1.4.0"
},
diff --git a/apps/webapp/prisma/populate.ts b/apps/webapp/prisma/populate.ts
index 6b3f277d39..fb31a1e978 100644
--- a/apps/webapp/prisma/populate.ts
+++ b/apps/webapp/prisma/populate.ts
@@ -4,12 +4,212 @@
// 2. pnpm run db:populate -- --projectRef=proj_liazlkfgmfcusswwgohl --taskIdentifier=child-task --runCount=100000
import { generateFriendlyId } from "~/v3/friendlyIdentifiers";
import { prisma } from "../app/db.server";
+import { createHash } from "crypto";
+import {
+ BackgroundWorker,
+ BackgroundWorkerTask,
+ RuntimeEnvironmentType,
+ WorkerInstanceGroupType,
+} from "@trigger.dev/database";
+import { nanoid } from "nanoid";
async function populate() {
if (process.env.NODE_ENV !== "development") {
return;
}
+ const project = await getProject();
+
+ await generateRuns(project);
+ await createWorkerGroup(project);
+ const { worker, tasks } = await createBackgroundWorker(project, getEnvTypeFromArg());
+ await createWorkerDeployment(project, worker, getEnvTypeFromArg());
+}
+
+function getEnvironment(
+ project: ProjectWithEnvironment,
+ envType: RuntimeEnvironmentType = "PRODUCTION"
+) {
+ const env = project.environments.find((e) => e.type === envType);
+
+ if (!env) {
+ throw new Error(`No environment of type "${envType}" found for project ${project.id}`);
+ }
+
+ return env;
+}
+
+async function createWorkerDeployment(
+ project: ProjectWithEnvironment,
+ worker: BackgroundWorker,
+ envType: RuntimeEnvironmentType = "PRODUCTION"
+) {
+ const env = getEnvironment(project, envType);
+ const deploymentId = `cm3c821sk00032v6is7ufqy3d-${env.slug}`;
+
+ if (env.type === "DEVELOPMENT") {
+ console.warn("Skipping deployment creation for development environment");
+ return;
+ }
+
+ let deployment = await prisma.workerDeployment.findUnique({
+ where: {
+ id: deploymentId,
+ },
+ });
+
+ if (deployment) {
+ console.log(`Deployment "${deploymentId}" already exists`);
+ return deployment;
+ }
+
+ const firstOrgMember = project.organization.members[0];
+
+ deployment = await prisma.workerDeployment.create({
+ data: {
+ id: deploymentId,
+ friendlyId: generateFriendlyId("deployment"),
+ contentHash: worker.contentHash,
+ version: worker.version,
+ shortCode: nanoid(8),
+ imageReference: `trigger/${project.externalRef}:${worker.version}.${env.slug}`,
+ status: "DEPLOYING",
+ projectId: project.id,
+ environmentId: env.id,
+ workerId: worker.id,
+ triggeredById: firstOrgMember.userId,
+ },
+ });
+
+ console.log(`Created deployment "${deploymentId}"`);
+
+ return deployment;
+}
+
+async function createBackgroundWorker(
+ project: ProjectWithEnvironment,
+ envType: RuntimeEnvironmentType = "PRODUCTION"
+) {
+ const env = getEnvironment(project, envType);
+ const taskIdentifier = "seed-task";
+ const backgroundWorkerId = `cm3c8fmiv00042v6imoqwxst1-${env.slug}`;
+
+ let worker = await prisma.backgroundWorker.findUnique({
+ where: {
+ id: backgroundWorkerId,
+ },
+ include: {
+ tasks: true,
+ },
+ });
+
+ if (worker) {
+ console.log(`Worker "${backgroundWorkerId}" already exists`);
+
+ return {
+ worker,
+ tasks: worker.tasks,
+ };
+ }
+
+ worker = await prisma.backgroundWorker.create({
+ data: {
+ id: backgroundWorkerId,
+ friendlyId: generateFriendlyId("worker"),
+ contentHash: "hash",
+ projectId: project.id,
+ runtimeEnvironmentId: env.id,
+ version: "20241111.1",
+ metadata: {},
+ },
+ include: {
+ tasks: true,
+ },
+ });
+
+ console.log(`Created worker "${backgroundWorkerId}"`);
+
+ const taskIdentifiers = Array.isArray(taskIdentifier) ? taskIdentifier : [taskIdentifier];
+
+ const tasks: BackgroundWorkerTask[] = [];
+
+ for (const identifier of taskIdentifiers) {
+ const task = await prisma.backgroundWorkerTask.create({
+ data: {
+ friendlyId: generateFriendlyId("task"),
+ slug: identifier,
+ filePath: `/trigger/${identifier}.ts`,
+ exportName: identifier,
+ workerId: worker.id,
+ runtimeEnvironmentId: env.id,
+ projectId: project.id,
+ },
+ });
+
+ tasks.push(task);
+ }
+
+ return {
+ worker,
+ tasks,
+ };
+}
+
+async function createWorkerGroup(project: ProjectWithEnvironment) {
+ const workerGroupName = "seed-unmanaged";
+ const rawToken = "tr_wgt_15480aa1712cae4b8db8c7a49707d69d";
+
+ const existingWorkerGroup = await prisma.workerInstanceGroup.findFirst({
+ where: {
+ projectId: project.id,
+ name: workerGroupName,
+ },
+ });
+
+ if (existingWorkerGroup) {
+ console.log(`Worker group "${workerGroupName}" already exists`);
+
+ await setAsDefaultWorkerGroup(project, existingWorkerGroup.id);
+
+ return existingWorkerGroup;
+ }
+
+ const token = await prisma.workerGroupToken.create({
+ data: {
+ tokenHash: createHash("sha256").update(rawToken).digest("hex"),
+ },
+ });
+
+ const workerGroup = await prisma.workerInstanceGroup.create({
+ data: {
+ projectId: project.id,
+ organizationId: project.organizationId,
+ type: WorkerInstanceGroupType.UNMANAGED,
+ masterQueue: `${project.id}-${workerGroupName}`,
+ tokenId: token.id,
+ description: "Seeded worker group",
+ name: workerGroupName,
+ },
+ });
+
+ await setAsDefaultWorkerGroup(project, workerGroup.id);
+
+ return workerGroup;
+}
+
+async function setAsDefaultWorkerGroup(project: ProjectWithEnvironment, workerGroupId: string) {
+ // Set as default worker group
+ await prisma.project.update({
+ where: {
+ id: project.id,
+ },
+ data: {
+ defaultWorkerGroupId: workerGroupId,
+ },
+ });
+}
+
+async function getProject() {
const projectRef = getArg("projectRef");
if (!projectRef) {
throw new Error("projectRef is required");
@@ -18,15 +218,27 @@ async function populate() {
const project = await prisma.project.findUnique({
include: {
environments: true,
+ organization: {
+ include: {
+ members: true,
+ },
+ },
},
where: {
externalRef: projectRef,
},
});
+
if (!project) {
throw new Error("Project not found");
}
+ return project;
+}
+
+type ProjectWithEnvironment = Awaited<ReturnType<typeof getProject>>;
+
+async function generateRuns(project: ProjectWithEnvironment) {
const taskIdentifier = getArg("taskIdentifier");
if (!taskIdentifier) {
throw new Error("taskIdentifier is required");
@@ -74,6 +286,25 @@ async function populate() {
console.log(`Added ${runs.count} runs`);
}
+function getEnvTypeFromArg(): RuntimeEnvironmentType {
+ const env = getArg("env");
+
+ if (!env) {
+ return RuntimeEnvironmentType.PRODUCTION;
+ }
+
+ switch (env) {
+ case "dev":
+ return RuntimeEnvironmentType.DEVELOPMENT;
+ case "prod":
+ return RuntimeEnvironmentType.PRODUCTION;
+ case "stg":
+ return RuntimeEnvironmentType.STAGING;
+ default:
+ throw new Error(`Invalid environment: ${env}`);
+ }
+}
+
function getArg(name: string) {
const args = process.argv.slice(2);
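
The populate script now takes an optional env argument, mapped to a RuntimeEnvironmentType by getEnvTypeFromArg. Since only the first lines of getArg appear in this hunk, the parser below is an assumed stand-in for it, followed by an example invocation based on the usage comment at the top of the file (the project ref is illustrative).

```ts
// Hypothetical stand-in for getArg (the real implementation lives in populate.ts):
// read a --name=value flag from the CLI arguments.
function readArg(name: string): string | undefined {
  const prefix = `--${name}=`;
  const match = process.argv.slice(2).find((arg) => arg.startsWith(prefix));
  return match?.slice(prefix.length);
}

// Example invocation:
// pnpm run db:populate -- --projectRef=proj_xxx --taskIdentifier=seed-task --runCount=100 --env=stg
const envArg = readArg("env"); // "dev" | "prod" | "stg" | undefined
```
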
diff --git a/apps/webapp/remix.config.js b/apps/webapp/remix.config.js
index ffa62d14af..296921e575 100644
--- a/apps/webapp/remix.config.js
+++ b/apps/webapp/remix.config.js
@@ -11,6 +11,7 @@ module.exports = {
/^remix-utils.*/,
"marked",
"axios",
+ "@internal/redis-worker",
"p-limit",
"yocto-queue",
"@trigger.dev/core",
diff --git a/apps/webapp/server.ts b/apps/webapp/server.ts
index 26e30343a2..4b4a6a843e 100644
--- a/apps/webapp/server.ts
+++ b/apps/webapp/server.ts
@@ -43,6 +43,7 @@ if (process.env.HTTP_SERVER_DISABLED !== "true") {
const wss: WebSocketServer | undefined = build.entry.module.wss;
const registryProxy: RegistryProxy | undefined = build.entry.module.registryProxy;
const apiRateLimiter: RateLimitMiddleware = build.entry.module.apiRateLimiter;
+ const engineRateLimiter: RateLimitMiddleware = build.entry.module.engineRateLimiter;
const runWithHttpContext: RunWithHttpContextFunction = build.entry.module.runWithHttpContext;
if (registryProxy && process.env.ENABLE_REGISTRY_PROXY === "true") {
@@ -95,6 +96,7 @@ if (process.env.HTTP_SERVER_DISABLED !== "true") {
}
app.use(apiRateLimiter);
+ app.use(engineRateLimiter);
app.all(
"*",
diff --git a/apps/webapp/test/GCRARateLimiter.test.ts b/apps/webapp/test/GCRARateLimiter.test.ts
index 9c645310c0..95f0e6118b 100644
--- a/apps/webapp/test/GCRARateLimiter.test.ts
+++ b/apps/webapp/test/GCRARateLimiter.test.ts
@@ -2,12 +2,15 @@
import { redisTest } from "@internal/testcontainers";
import { describe, expect, vi } from "vitest";
import { GCRARateLimiter } from "../app/v3/GCRARateLimiter.server.js"; // adjust the import as needed
+import Redis from "ioredis";
// Extend the timeout to 30 seconds (as in your redis tests)
vi.setConfig({ testTimeout: 30_000 });
describe("GCRARateLimiter", () => {
- redisTest("should allow a single request when under the rate limit", async ({ redis }) => {
+ redisTest("should allow a single request when under the rate limit", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const limiter = new GCRARateLimiter({
redis,
emissionInterval: 1000, // 1 request per second on average
@@ -21,7 +24,9 @@ describe("GCRARateLimiter", () => {
redisTest(
"should allow bursts up to the configured limit and then reject further requests",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const limiter = new GCRARateLimiter({
redis,
emissionInterval: 1000,
@@ -45,55 +50,67 @@ describe("GCRARateLimiter", () => {
}
);
- redisTest("should allow a request after the required waiting period", async ({ redis }) => {
- const limiter = new GCRARateLimiter({
- redis,
- emissionInterval: 1000,
- burstTolerance: 3000,
- keyPrefix: "test:ratelimit:",
- });
+ redisTest(
+ "should allow a request after the required waiting period",
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
- // Exhaust burst capacity with 4 rapid calls.
- await limiter.check("user:wait");
- await limiter.check("user:wait");
- await limiter.check("user:wait");
- await limiter.check("user:wait");
+ const limiter = new GCRARateLimiter({
+ redis,
+ emissionInterval: 1000,
+ burstTolerance: 3000,
+ keyPrefix: "test:ratelimit:",
+ });
- // The 5th call should be rejected.
- const rejection = await limiter.check("user:wait");
- expect(rejection.allowed).toBe(false);
- expect(rejection.retryAfter).toBeGreaterThan(0);
+ // Exhaust burst capacity with 4 rapid calls.
+ await limiter.check("user:wait");
+ await limiter.check("user:wait");
+ await limiter.check("user:wait");
+ await limiter.check("user:wait");
- // Wait for the period specified in retryAfter (plus a small buffer)
- await new Promise((resolve) => setTimeout(resolve, rejection.retryAfter! + 50));
+ // The 5th call should be rejected.
+ const rejection = await limiter.check("user:wait");
+ expect(rejection.allowed).toBe(false);
+ expect(rejection.retryAfter).toBeGreaterThan(0);
- // Now the next call should be allowed.
- const allowedAfterWait = await limiter.check("user:wait");
- expect(allowedAfterWait.allowed).toBe(true);
- });
+ // Wait for the period specified in retryAfter (plus a small buffer)
+ await new Promise((resolve) => setTimeout(resolve, rejection.retryAfter! + 50));
- redisTest("should rate limit independently for different identifiers", async ({ redis }) => {
- const limiter = new GCRARateLimiter({
- redis,
- emissionInterval: 1000,
- burstTolerance: 3000,
- keyPrefix: "test:ratelimit:",
- });
+ // Now the next call should be allowed.
+ const allowedAfterWait = await limiter.check("user:wait");
+ expect(allowedAfterWait.allowed).toBe(true);
+ }
+ );
- // For "user:independent", exhaust burst capacity.
- await limiter.check("user:independent");
- await limiter.check("user:independent");
- await limiter.check("user:independent");
- await limiter.check("user:independent");
- const rejected = await limiter.check("user:independent");
- expect(rejected.allowed).toBe(false);
-
- // A different identifier should start fresh.
- const fresh = await limiter.check("user:different");
- expect(fresh.allowed).toBe(true);
- });
+ redisTest(
+ "should rate limit independently for different identifiers",
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
+ const limiter = new GCRARateLimiter({
+ redis,
+ emissionInterval: 1000,
+ burstTolerance: 3000,
+ keyPrefix: "test:ratelimit:",
+ });
+
+ // For "user:independent", exhaust burst capacity.
+ await limiter.check("user:independent");
+ await limiter.check("user:independent");
+ await limiter.check("user:independent");
+ await limiter.check("user:independent");
+ const rejected = await limiter.check("user:independent");
+ expect(rejected.allowed).toBe(false);
+
+ // A different identifier should start fresh.
+ const fresh = await limiter.check("user:different");
+ expect(fresh.allowed).toBe(true);
+ }
+ );
+
+ redisTest("should gradually reduce retryAfter with time", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
- redisTest("should gradually reduce retryAfter with time", async ({ redis }) => {
const limiter = new GCRARateLimiter({
redis,
emissionInterval: 1000,
@@ -120,7 +137,9 @@ describe("GCRARateLimiter", () => {
expect(secondRetry).toBeLessThan(firstRetry);
});
- redisTest("should expire the key after the TTL", async ({ redis }) => {
+ redisTest("should expire the key after the TTL", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
// For this test, override keyExpiration to a short value.
const keyExpiration = 1500; // 1.5 seconds TTL
const limiter = new GCRARateLimiter({
@@ -147,7 +166,9 @@ describe("GCRARateLimiter", () => {
expect(stored).toBeNull();
});
- redisTest("should not share state across different key prefixes", async ({ redis }) => {
+ redisTest("should not share state across different key prefixes", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const limiter1 = new GCRARateLimiter({
redis,
emissionInterval: 1000,
@@ -174,25 +195,32 @@ describe("GCRARateLimiter", () => {
expect(result2.allowed).toBe(true);
});
- redisTest("should increment TAT correctly on sequential allowed requests", async ({ redis }) => {
- const limiter = new GCRARateLimiter({
- redis,
- emissionInterval: 1000,
- burstTolerance: 3000,
- keyPrefix: "test:ratelimit:",
- });
+ redisTest(
+ "should increment TAT correctly on sequential allowed requests",
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
- // The first request should be allowed.
- const r1 = await limiter.check("user:sequential");
- expect(r1.allowed).toBe(true);
+ const limiter = new GCRARateLimiter({
+ redis,
+ emissionInterval: 1000,
+ burstTolerance: 3000,
+ keyPrefix: "test:ratelimit:",
+ });
- // Wait a bit longer than the emission interval.
- await new Promise((resolve) => setTimeout(resolve, 1100));
- const r2 = await limiter.check("user:sequential");
- expect(r2.allowed).toBe(true);
- });
+ // The first request should be allowed.
+ const r1 = await limiter.check("user:sequential");
+ expect(r1.allowed).toBe(true);
+
+ // Wait a bit longer than the emission interval.
+ await new Promise((resolve) => setTimeout(resolve, 1100));
+ const r2 = await limiter.check("user:sequential");
+ expect(r2.allowed).toBe(true);
+ }
+ );
+
+ redisTest("should throw an error if redis command fails", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
- redisTest("should throw an error if redis command fails", async ({ redis }) => {
const limiter = new GCRARateLimiter({
redis,
emissionInterval: 1000,
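
The rate limiter tests now receive redisOptions from the testcontainers fixture and construct their own ioredis client, rather than being handed a client directly. A minimal sketch of that pattern in isolation; the explicit quit() is illustrative cleanup, not something these diffs add.

```ts
import Redis from "ioredis";
import { redisTest } from "@internal/testcontainers";
import { expect } from "vitest";

// Illustrative only: the fixture injects connection options, and each test
// builds (and ideally disposes of) its own client.
redisTest("constructs a client from the injected options", async ({ redisOptions }) => {
  const redis = new Redis(redisOptions);
  try {
    await redis.set("example:key", "value");
    expect(await redis.get("example:key")).toBe("value");
  } finally {
    await redis.quit();
  }
});
```
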
diff --git a/apps/webapp/test/authorizationRateLimitMiddleware.test.ts b/apps/webapp/test/authorizationRateLimitMiddleware.test.ts
index c599a4ddea..af88d4c4af 100644
--- a/apps/webapp/test/authorizationRateLimitMiddleware.test.ts
+++ b/apps/webapp/test/authorizationRateLimitMiddleware.test.ts
@@ -22,9 +22,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
app = express();
});
- redisTest("should allow requests within the rate limit", async ({ redis }) => {
+ redisTest("should allow requests within the rate limit", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test",
defaultLimiter: {
type: "tokenBucket",
@@ -53,9 +53,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
expect(response.headers["x-ratelimit-reset"]).toBeDefined();
});
- redisTest("should reject requests without an Authorization header", async ({ redis }) => {
+ redisTest("should reject requests without an Authorization header", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test",
defaultLimiter: {
type: "tokenBucket",
@@ -77,9 +77,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
expect(response.body).toHaveProperty("title", "Unauthorized");
});
- redisTest("should reject requests that exceed the rate limit", async ({ redis }) => {
+ redisTest("should reject requests that exceed the rate limit", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test",
defaultLimiter: {
type: "tokenBucket",
@@ -105,9 +105,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
expect(response.body).toHaveProperty("title", "Rate Limit Exceeded");
});
- redisTest("should not apply rate limiting to whitelisted paths", async ({ redis }) => {
+ redisTest("should not apply rate limiting to whitelisted paths", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test",
defaultLimiter: {
type: "tokenBucket",
@@ -135,9 +135,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
redisTest(
"should apply different rate limits based on limiterConfigOverride",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test",
defaultLimiter: {
type: "tokenBucket",
@@ -185,9 +185,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
describe("Advanced Cases", () => {
// 1. Test different rate limit configurations
- redisTest("should enforce fixed window rate limiting", async ({ redis }) => {
+ redisTest("should enforce fixed window rate limiting", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test-fixed",
defaultLimiter: {
type: "fixedWindow",
@@ -221,9 +221,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
expect(newResponse.status).toBe(200);
});
- redisTest("should enforce sliding window rate limiting", async ({ redis }) => {
+ redisTest("should enforce sliding window rate limiting", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test-sliding",
defaultLimiter: {
type: "slidingWindow",
@@ -265,9 +265,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
});
// 2. Test edge cases around rate limit calculations
- redisTest("should handle token refill correctly", async ({ redis }) => {
+ redisTest("should handle token refill correctly", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test-refill",
defaultLimiter: {
type: "tokenBucket",
@@ -306,9 +306,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
expect(limitedAgainResponse.status).toBe(429);
});
- redisTest("should handle near-zero remaining tokens correctly", async ({ redis }) => {
+ redisTest("should handle near-zero remaining tokens correctly", async ({ redisOptions }) => {
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test-near-zero",
defaultLimiter: {
type: "tokenBucket",
@@ -353,10 +353,10 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("authorizationRateLimitMiddleware",
});
// 3. Test the limiterCache functionality
- redisTest("should use cached limiter configurations", async ({ redis }) => {
+ redisTest("should use cached limiter configurations", async ({ redisOptions }) => {
let configOverrideCalls = 0;
const rateLimitMiddleware = authorizationRateLimitMiddleware({
- redis: redis.options,
+ redis: redisOptions,
keyPrefix: "test-cache",
defaultLimiter: {
type: "tokenBucket",
diff --git a/apps/webapp/test/fairDequeuingStrategy.test.ts b/apps/webapp/test/fairDequeuingStrategy.test.ts
index 94f9f4a3e7..109e49168e 100644
--- a/apps/webapp/test/fairDequeuingStrategy.test.ts
+++ b/apps/webapp/test/fairDequeuingStrategy.test.ts
@@ -10,13 +10,16 @@ import {
import { trace } from "@opentelemetry/api";
import { EnvQueues } from "~/v3/marqs/types.js";
import { MARQS_RESUME_PRIORITY_TIMESTAMP_OFFSET } from "~/v3/marqs/constants.server.js";
+import Redis from "ioredis";
const tracer = trace.getTracer("test");
vi.setConfig({ testTimeout: 30_000 }); // 30 seconds timeout
describe("FairDequeuingStrategy", () => {
- redisTest("should distribute a single queue from a single env", async ({ redis }) => {
+ redisTest("should distribute a single queue from a single env", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
@@ -46,7 +49,9 @@ describe("FairDequeuingStrategy", () => {
});
});
- redisTest("should respect env concurrency limits", async ({ redis }) => {
+ redisTest("should respect env concurrency limits", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
@@ -79,7 +84,9 @@ describe("FairDequeuingStrategy", () => {
redisTest(
"should give extra concurrency when the env has reserve concurrency",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
@@ -118,7 +125,9 @@ describe("FairDequeuingStrategy", () => {
}
);
- redisTest("should respect parentQueueLimit", async ({ redis }) => {
+ redisTest("should respect parentQueueLimit", async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
@@ -173,255 +182,267 @@ describe("FairDequeuingStrategy", () => {
});
});
- redisTest("should reuse snapshots across calls for the same consumer", async ({ redis }) => {
- const keyProducer = createKeyProducer("test");
- const strategy = new FairDequeuingStrategy({
- tracer,
- redis,
- keys: keyProducer,
- defaultEnvConcurrency: 5,
- parentQueueLimit: 10,
- seed: "test-seed-reuse-1",
- reuseSnapshotCount: 1,
- });
+ redisTest(
+ "should reuse snapshots across calls for the same consumer",
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
- const now = Date.now();
+ const keyProducer = createKeyProducer("test");
+ const strategy = new FairDequeuingStrategy({
+ tracer,
+ redis,
+ keys: keyProducer,
+ defaultEnvConcurrency: 5,
+ parentQueueLimit: 10,
+ seed: "test-seed-reuse-1",
+ reuseSnapshotCount: 1,
+ });
- await setupQueue({
- redis,
- keyProducer,
- parentQueue: "parent-queue",
- score: now - 3000,
- queueId: "queue-1",
- orgId: "org-1",
- envId: "env-1",
- });
+ const now = Date.now();
- await setupQueue({
- redis,
- keyProducer,
- parentQueue: "parent-queue",
- score: now - 2000,
- queueId: "queue-2",
- orgId: "org-2",
- envId: "env-2",
- });
+ await setupQueue({
+ redis,
+ keyProducer,
+ parentQueue: "parent-queue",
+ score: now - 3000,
+ queueId: "queue-1",
+ orgId: "org-1",
+ envId: "env-1",
+ });
- await setupQueue({
- redis,
- keyProducer,
- parentQueue: "parent-queue",
- score: now - 1000,
- queueId: "queue-3",
- orgId: "org-3",
- envId: "env-3",
- });
+ await setupQueue({
+ redis,
+ keyProducer,
+ parentQueue: "parent-queue",
+ score: now - 2000,
+ queueId: "queue-2",
+ orgId: "org-2",
+ envId: "env-2",
+ });
- const startDistribute1 = performance.now();
+ await setupQueue({
+ redis,
+ keyProducer,
+ parentQueue: "parent-queue",
+ score: now - 1000,
+ queueId: "queue-3",
+ orgId: "org-3",
+ envId: "env-3",
+ });
- const envResult = await strategy.distributeFairQueuesFromParentQueue(
- "parent-queue",
- "consumer-1"
- );
- const result = flattenResults(envResult);
+ const startDistribute1 = performance.now();
- const distribute1Duration = performance.now() - startDistribute1;
+ const envResult = await strategy.distributeFairQueuesFromParentQueue(
+ "parent-queue",
+ "consumer-1"
+ );
+ const result = flattenResults(envResult);
- console.log("First distribution took", distribute1Duration, "ms");
+ const distribute1Duration = performance.now() - startDistribute1;
- expect(result).toHaveLength(3);
- // Should only get the two oldest queues
- const queue1 = keyProducer.queueKey("org-1", "env-1", "queue-1");
- const queue2 = keyProducer.queueKey("org-2", "env-2", "queue-2");
- const queue3 = keyProducer.queueKey("org-3", "env-3", "queue-3");
- expect(result).toEqual([queue2, queue1, queue3]);
+ console.log("First distribution took", distribute1Duration, "ms");
- const startDistribute2 = performance.now();
+ expect(result).toHaveLength(3);
+ // Should get all three queues in the strategy's fair order
+ const queue1 = keyProducer.queueKey("org-1", "env-1", "queue-1");
+ const queue2 = keyProducer.queueKey("org-2", "env-2", "queue-2");
+ const queue3 = keyProducer.queueKey("org-3", "env-3", "queue-3");
+ expect(result).toEqual([queue2, queue1, queue3]);
- const result2 = await strategy.distributeFairQueuesFromParentQueue(
- "parent-queue",
- "consumer-1"
- );
+ const startDistribute2 = performance.now();
- const distribute2Duration = performance.now() - startDistribute2;
+ const result2 = await strategy.distributeFairQueuesFromParentQueue(
+ "parent-queue",
+ "consumer-1"
+ );
- console.log("Second distribution took", distribute2Duration, "ms");
+ const distribute2Duration = performance.now() - startDistribute2;
- // Make sure the second call is more than 9 times faster than the first
- expect(distribute2Duration).toBeLessThan(distribute1Duration / 9);
+ console.log("Second distribution took", distribute2Duration, "ms");
- const startDistribute3 = performance.now();
+ // Make sure the second call is more than 9 times faster than the first
+ expect(distribute2Duration).toBeLessThan(distribute1Duration / 9);
- const result3 = await strategy.distributeFairQueuesFromParentQueue(
- "parent-queue",
- "consumer-1"
- );
+ const startDistribute3 = performance.now();
- const distribute3Duration = performance.now() - startDistribute3;
+ const result3 = await strategy.distributeFairQueuesFromParentQueue(
+ "parent-queue",
+ "consumer-1"
+ );
- console.log("Third distribution took", distribute3Duration, "ms");
+ const distribute3Duration = performance.now() - startDistribute3;
- // Make sure the third call is more than 4 times the second
- expect(distribute3Duration).toBeGreaterThan(distribute2Duration * 4);
- });
+ console.log("Third distribution took", distribute3Duration, "ms");
- redisTest("should fairly distribute queues across environments over time", async ({ redis }) => {
- const keyProducer = createKeyProducer("test");
- const strategy = new FairDequeuingStrategy({
- tracer,
- redis,
- keys: keyProducer,
- defaultEnvConcurrency: 5,
- parentQueueLimit: 100,
- seed: "test-seed-5",
- });
+ // Make sure the third call is more than 4 times the second
+ expect(distribute3Duration).toBeGreaterThan(distribute2Duration * 4);
+ }
+ );
- const now = Date.now();
+ redisTest(
+ "should fairly distribute queues across environments over time",
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
+ const keyProducer = createKeyProducer("test");
+ const strategy = new FairDequeuingStrategy({
+ tracer,
+ redis,
+ keys: keyProducer,
+ defaultEnvConcurrency: 5,
+ parentQueueLimit: 100,
+ seed: "test-seed-5",
+ });
- // Test configuration
- const orgs = ["org-1", "org-2", "org-3"];
- const envsPerOrg = 3; // Each org has 3 environments
- const queuesPerEnv = 5; // Each env has 5 queues
- const iterations = 1000;
+ const now = Date.now();
- // Setup queues
- for (const orgId of orgs) {
- for (let envNum = 1; envNum <= envsPerOrg; envNum++) {
- const envId = `env-${orgId}-${envNum}`;
+ // Test configuration
+ const orgs = ["org-1", "org-2", "org-3"];
+ const envsPerOrg = 3; // Each org has 3 environments
+ const queuesPerEnv = 5; // Each env has 5 queues
+ const iterations = 1000;
- for (let queueNum = 1; queueNum <= queuesPerEnv; queueNum++) {
- await setupQueue({
+ // Setup queues
+ for (const orgId of orgs) {
+ for (let envNum = 1; envNum <= envsPerOrg; envNum++) {
+ const envId = `env-${orgId}-${envNum}`;
+
+ for (let queueNum = 1; queueNum <= queuesPerEnv; queueNum++) {
+ await setupQueue({
+ redis,
+ keyProducer,
+ parentQueue: "parent-queue",
+ // Vary the ages slightly
+ score: now - Math.random() * 10000,
+ queueId: `queue-${orgId}-${envId}-${queueNum}`,
+ orgId,
+ envId,
+ });
+ }
+
+ // Setup reasonable concurrency limits
+ await setupConcurrency({
redis,
keyProducer,
- parentQueue: "parent-queue",
- // Vary the ages slightly
- score: now - Math.random() * 10000,
- queueId: `queue-${orgId}-${envId}-${queueNum}`,
- orgId,
- envId,
+ env: { id: envId, currentConcurrency: 1, limit: 5 },
});
}
-
- // Setup reasonable concurrency limits
- await setupConcurrency({
- redis,
- keyProducer,
- env: { id: envId, currentConcurrency: 1, limit: 5 },
- });
}
- }
- // Track distribution statistics
- type PositionStats = {
- firstPosition: number; // Count of times this env/org was first
- positionSums: number; // Sum of positions (for averaging)
- appearances: number; // Total number of appearances
- };
-
- const envStats: Record<string, PositionStats> = {};
- const orgStats: Record<string, PositionStats> = {};
-
- // Initialize stats objects
- for (const orgId of orgs) {
- orgStats[orgId] = { firstPosition: 0, positionSums: 0, appearances: 0 };
- for (let envNum = 1; envNum <= envsPerOrg; envNum++) {
- const envId = `env-${orgId}-${envNum}`;
- envStats[envId] = { firstPosition: 0, positionSums: 0, appearances: 0 };
- }
- }
+ // Track distribution statistics
+ type PositionStats = {
+ firstPosition: number; // Count of times this env/org was first
+ positionSums: number; // Sum of positions (for averaging)
+ appearances: number; // Total number of appearances
+ };
- // Run multiple iterations
- for (let i = 0; i < iterations; i++) {
- const envResult = await strategy.distributeFairQueuesFromParentQueue(
- "parent-queue",
- `consumer-${i % 3}` // Simulate 3 different consumers
- );
- const result = flattenResults(envResult);
+ const envStats: Record<string, PositionStats> = {};
+ const orgStats: Record<string, PositionStats> = {};
- // Track positions of queues
- result.forEach((queueId, position) => {
- const orgId = keyProducer.orgIdFromQueue(queueId);
- const envId = keyProducer.envIdFromQueue(queueId);
+ // Initialize stats objects
+ for (const orgId of orgs) {
+ orgStats[orgId] = { firstPosition: 0, positionSums: 0, appearances: 0 };
+ for (let envNum = 1; envNum <= envsPerOrg; envNum++) {
+ const envId = `env-${orgId}-${envNum}`;
+ envStats[envId] = { firstPosition: 0, positionSums: 0, appearances: 0 };
+ }
+ }
- // Update org stats
- orgStats[orgId].appearances++;
- orgStats[orgId].positionSums += position;
- if (position === 0) orgStats[orgId].firstPosition++;
+ // Run multiple iterations
+ for (let i = 0; i < iterations; i++) {
+ const envResult = await strategy.distributeFairQueuesFromParentQueue(
+ "parent-queue",
+ `consumer-${i % 3}` // Simulate 3 different consumers
+ );
+ const result = flattenResults(envResult);
- // Update env stats
- envStats[envId].appearances++;
- envStats[envId].positionSums += position;
- if (position === 0) envStats[envId].firstPosition++;
- });
- }
+ // Track positions of queues
+ result.forEach((queueId, position) => {
+ const orgId = keyProducer.orgIdFromQueue(queueId);
+ const envId = keyProducer.envIdFromQueue(queueId);
+
+ // Update org stats
+ orgStats[orgId].appearances++;
+ orgStats[orgId].positionSums += position;
+ if (position === 0) orgStats[orgId].firstPosition++;
+
+ // Update env stats
+ envStats[envId].appearances++;
+ envStats[envId].positionSums += position;
+ if (position === 0) envStats[envId].firstPosition++;
+ });
+ }
- // Calculate and log statistics
- console.log("\nOrganization Statistics:");
- for (const [orgId, stats] of Object.entries(orgStats)) {
- const avgPosition = stats.positionSums / stats.appearances;
- const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
- console.log(`${orgId}:
+ // Calculate and log statistics
+ console.log("\nOrganization Statistics:");
+ for (const [orgId, stats] of Object.entries(orgStats)) {
+ const avgPosition = stats.positionSums / stats.appearances;
+ const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
+ console.log(`${orgId}:
First Position: ${firstPositionPercentage.toFixed(2)}%
Average Position: ${avgPosition.toFixed(2)}
Total Appearances: ${stats.appearances}`);
- }
+ }
- console.log("\nEnvironment Statistics:");
- for (const [envId, stats] of Object.entries(envStats)) {
- const avgPosition = stats.positionSums / stats.appearances;
- const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
- console.log(`${envId}:
+ console.log("\nEnvironment Statistics:");
+ for (const [envId, stats] of Object.entries(envStats)) {
+ const avgPosition = stats.positionSums / stats.appearances;
+ const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
+ console.log(`${envId}:
First Position: ${firstPositionPercentage.toFixed(2)}%
Average Position: ${avgPosition.toFixed(2)}
Total Appearances: ${stats.appearances}`);
- }
+ }
- // Verify fairness of first position distribution
- const expectedFirstPositionPercentage = 100 / orgs.length;
- const firstPositionStdDevOrgs = calculateStandardDeviation(
- Object.values(orgStats).map((stats) => (stats.firstPosition / iterations) * 100)
- );
-
- const expectedEnvFirstPositionPercentage = 100 / (orgs.length * envsPerOrg);
- const firstPositionStdDevEnvs = calculateStandardDeviation(
- Object.values(envStats).map((stats) => (stats.firstPosition / iterations) * 100)
- );
-
- // Assert reasonable fairness for first position
- expect(firstPositionStdDevOrgs).toBeLessThan(5); // Allow 5% standard deviation for orgs
- expect(firstPositionStdDevEnvs).toBeLessThan(5); // Allow 5% standard deviation for envs
-
- // Verify that each org and env gets a fair chance at first position
- for (const [orgId, stats] of Object.entries(orgStats)) {
- const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
- expect(firstPositionPercentage).toBeGreaterThan(expectedFirstPositionPercentage * 0.7); // Within 30% of expected
- expect(firstPositionPercentage).toBeLessThan(expectedFirstPositionPercentage * 1.3);
- }
+ // Verify fairness of first position distribution
+ const expectedFirstPositionPercentage = 100 / orgs.length;
+ const firstPositionStdDevOrgs = calculateStandardDeviation(
+ Object.values(orgStats).map((stats) => (stats.firstPosition / iterations) * 100)
+ );
- for (const [envId, stats] of Object.entries(envStats)) {
- const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
- expect(firstPositionPercentage).toBeGreaterThan(expectedEnvFirstPositionPercentage * 0.7); // Within 30% of expected
- expect(firstPositionPercentage).toBeLessThan(expectedEnvFirstPositionPercentage * 1.3);
- }
+ const expectedEnvFirstPositionPercentage = 100 / (orgs.length * envsPerOrg);
+ const firstPositionStdDevEnvs = calculateStandardDeviation(
+ Object.values(envStats).map((stats) => (stats.firstPosition / iterations) * 100)
+ );
- // Verify average positions are reasonably distributed
- const avgPositionsOrgs = Object.values(orgStats).map(
- (stats) => stats.positionSums / stats.appearances
- );
- const avgPositionsEnvs = Object.values(envStats).map(
- (stats) => stats.positionSums / stats.appearances
- );
+ // Assert reasonable fairness for first position
+ expect(firstPositionStdDevOrgs).toBeLessThan(5); // Allow 5% standard deviation for orgs
+ expect(firstPositionStdDevEnvs).toBeLessThan(5); // Allow 5% standard deviation for envs
- const avgPositionStdDevOrgs = calculateStandardDeviation(avgPositionsOrgs);
- const avgPositionStdDevEnvs = calculateStandardDeviation(avgPositionsEnvs);
+ // Verify that each org and env gets a fair chance at first position
+ for (const [orgId, stats] of Object.entries(orgStats)) {
+ const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
+ expect(firstPositionPercentage).toBeGreaterThan(expectedFirstPositionPercentage * 0.7); // Within 30% of expected
+ expect(firstPositionPercentage).toBeLessThan(expectedFirstPositionPercentage * 1.3);
+ }
- expect(avgPositionStdDevOrgs).toBeLessThan(1); // Average positions should be fairly consistent
- expect(avgPositionStdDevEnvs).toBeLessThan(1);
- });
+ for (const [envId, stats] of Object.entries(envStats)) {
+ const firstPositionPercentage = (stats.firstPosition / iterations) * 100;
+ expect(firstPositionPercentage).toBeGreaterThan(expectedEnvFirstPositionPercentage * 0.7); // Within 30% of expected
+ expect(firstPositionPercentage).toBeLessThan(expectedEnvFirstPositionPercentage * 1.3);
+ }
+
+ // Verify average positions are reasonably distributed
+ const avgPositionsOrgs = Object.values(orgStats).map(
+ (stats) => stats.positionSums / stats.appearances
+ );
+ const avgPositionsEnvs = Object.values(envStats).map(
+ (stats) => stats.positionSums / stats.appearances
+ );
+
+ const avgPositionStdDevOrgs = calculateStandardDeviation(avgPositionsOrgs);
+ const avgPositionStdDevEnvs = calculateStandardDeviation(avgPositionsEnvs);
+
+ expect(avgPositionStdDevOrgs).toBeLessThan(1); // Average positions should be fairly consistent
+ expect(avgPositionStdDevEnvs).toBeLessThan(1);
+ }
+ );
redisTest(
"should shuffle environments while maintaining age order within environments",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
@@ -523,7 +544,9 @@ describe("FairDequeuingStrategy", () => {
redisTest(
"should bias shuffling based on concurrency limits and available capacity",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const now = Date.now();
@@ -650,102 +673,109 @@ describe("FairDequeuingStrategy", () => {
}
);
- redisTest("should respect ageInfluence parameter for queue ordering", async ({ redis }) => {
- const keyProducer = createKeyProducer("test");
- const now = Date.now();
+ redisTest(
+ "should respect ageInfluence parameter for queue ordering",
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
- // Setup queues with different ages in the same environment
- const queueAges = [
- { id: "queue-1", age: 5000 }, // oldest
- { id: "queue-2", age: 3000 },
- { id: "queue-3", age: 1000 }, // newest
- ];
+ const keyProducer = createKeyProducer("test");
+ const now = Date.now();
- // Helper function to run iterations with a specific age influence
- async function runWithQueueAgeRandomization(queueAgeRandomization: number) {
- const strategy = new FairDequeuingStrategy({
- tracer,
- redis,
- keys: keyProducer,
- defaultEnvConcurrency: 5,
- parentQueueLimit: 100,
- seed: "fixed-seed",
- biases: {
- concurrencyLimitBias: 0,
- availableCapacityBias: 0,
- queueAgeRandomization,
- },
- });
+ // Setup queues with different ages in the same environment
+ const queueAges = [
+ { id: "queue-1", age: 5000 }, // oldest
+ { id: "queue-2", age: 3000 },
+ { id: "queue-3", age: 1000 }, // newest
+ ];
- const positionCounts: Record<string, number[]> = {
- "queue-1": [0, 0, 0],
- "queue-2": [0, 0, 0],
- "queue-3": [0, 0, 0],
- };
+ // Helper function to run iterations with a specific age influence
+ async function runWithQueueAgeRandomization(queueAgeRandomization: number) {
+ const strategy = new FairDequeuingStrategy({
+ tracer,
+ redis,
+ keys: keyProducer,
+ defaultEnvConcurrency: 5,
+ parentQueueLimit: 100,
+ seed: "fixed-seed",
+ biases: {
+ concurrencyLimitBias: 0,
+ availableCapacityBias: 0,
+ queueAgeRandomization,
+ },
+ });
- const iterations = 1000;
- for (let i = 0; i < iterations; i++) {
- const envResult = await strategy.distributeFairQueuesFromParentQueue(
- "parent-queue",
- "consumer-1"
- );
- const result = flattenResults(envResult);
+ const positionCounts: Record<string, number[]> = {
+ "queue-1": [0, 0, 0],
+ "queue-2": [0, 0, 0],
+ "queue-3": [0, 0, 0],
+ };
- result.forEach((queueId, position) => {
- const baseQueueId = queueId.split(":").pop()!;
- positionCounts[baseQueueId][position]++;
- });
+ const iterations = 1000;
+ for (let i = 0; i < iterations; i++) {
+ const envResult = await strategy.distributeFairQueuesFromParentQueue(
+ "parent-queue",
+ "consumer-1"
+ );
+ const result = flattenResults(envResult);
+
+ result.forEach((queueId, position) => {
+ const baseQueueId = queueId.split(":").pop()!;
+ positionCounts[baseQueueId][position]++;
+ });
+ }
+
+ return positionCounts;
}
- return positionCounts;
- }
+ // Setup test data
+ for (const { id, age } of queueAges) {
+ await setupQueue({
+ redis,
+ keyProducer,
+ parentQueue: "parent-queue",
+ score: now - age,
+ queueId: id,
+ orgId: "org-1",
+ envId: "env-1",
+ });
+ }
- // Setup test data
- for (const { id, age } of queueAges) {
- await setupQueue({
+ await setupConcurrency({
redis,
keyProducer,
- parentQueue: "parent-queue",
- score: now - age,
- queueId: id,
- orgId: "org-1",
- envId: "env-1",
+ env: { id: "env-1", currentConcurrency: 0, limit: 5 },
});
- }
-
- await setupConcurrency({
- redis,
- keyProducer,
- env: { id: "env-1", currentConcurrency: 0, limit: 5 },
- });
- // Test with different age influence values
- const strictAge = await runWithQueueAgeRandomization(0); // Strict age-based ordering
- const mixed = await runWithQueueAgeRandomization(0.5); // Mix of age and random
- const fullyRandom = await runWithQueueAgeRandomization(1); // Completely random
+ // Test with different age influence values
+ const strictAge = await runWithQueueAgeRandomization(0); // Strict age-based ordering
+ const mixed = await runWithQueueAgeRandomization(0.5); // Mix of age and random
+ const fullyRandom = await runWithQueueAgeRandomization(1); // Completely random
- console.log("Distribution with strict age ordering (0.0):", strictAge);
- console.log("Distribution with mixed ordering (0.5):", mixed);
- console.log("Distribution with random ordering (1.0):", fullyRandom);
+ console.log("Distribution with strict age ordering (0.0):", strictAge);
+ console.log("Distribution with mixed ordering (0.5):", mixed);
+ console.log("Distribution with random ordering (1.0):", fullyRandom);
- // With strict age ordering (0.0), oldest should always be first
- expect(strictAge["queue-1"][0]).toBe(1000); // Always in first position
- expect(strictAge["queue-3"][0]).toBe(0); // Never in first position
+ // With strict age ordering (0.0), oldest should always be first
+ expect(strictAge["queue-1"][0]).toBe(1000); // Always in first position
+ expect(strictAge["queue-3"][0]).toBe(0); // Never in first position
- // With fully random (1.0), positions should still allow for some age bias
- const randomFirstPositionSpread = Math.abs(
- fullyRandom["queue-1"][0] - fullyRandom["queue-3"][0]
- );
- expect(randomFirstPositionSpread).toBeLessThan(200); // Allow for larger spread in distribution
+ // With fully random (1.0), positions should still allow for some age bias
+ const randomFirstPositionSpread = Math.abs(
+ fullyRandom["queue-1"][0] - fullyRandom["queue-3"][0]
+ );
+ expect(randomFirstPositionSpread).toBeLessThan(200); // Allow for larger spread in distribution
- // With mixed (0.5), should show preference for age but not absolute
- expect(mixed["queue-1"][0]).toBeGreaterThan(mixed["queue-3"][0]); // Older preferred
- expect(mixed["queue-3"][0]).toBeGreaterThan(0); // But newer still gets chances
- });
+ // With mixed (0.5), should show preference for age but not absolute
+ expect(mixed["queue-1"][0]).toBeGreaterThan(mixed["queue-3"][0]); // Older preferred
+ expect(mixed["queue-3"][0]).toBeGreaterThan(0); // But newer still gets chances
+ }
+ );
redisTest(
"should respect maximumEnvCount and select envs based on queue ages",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
@@ -874,7 +904,9 @@ describe("FairDequeuingStrategy", () => {
redisTest(
"should not overly bias picking environments when queue have priority offset ages",
- async ({ redis }) => {
+ async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+
const keyProducer = createKeyProducer("test");
const strategy = new FairDequeuingStrategy({
tracer,
diff --git a/apps/webapp/test/realtimeClient.test.ts b/apps/webapp/test/realtimeClient.test.ts
index f8aab54fd0..5cfa8c39d9 100644
--- a/apps/webapp/test/realtimeClient.test.ts
+++ b/apps/webapp/test/realtimeClient.test.ts
@@ -1,12 +1,15 @@
import { containerWithElectricAndRedisTest } from "@internal/testcontainers";
import { expect, describe } from "vitest";
import { RealtimeClient } from "../app/services/realtimeClient.server.js";
+import Redis from "ioredis";
describe.skipIf(process.env.GITHUB_ACTIONS)("RealtimeClient", () => {
containerWithElectricAndRedisTest(
"Should only track concurrency for live requests",
{ timeout: 30_000 },
- async ({ redis, electricOrigin, prisma }) => {
+ async ({ redisOptions, electricOrigin, prisma }) => {
+ const redis = new Redis(redisOptions);
+
const client = new RealtimeClient({
electricOrigin,
keyPrefix: "test:realtime",
@@ -146,7 +149,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("RealtimeClient", () => {
containerWithElectricAndRedisTest(
"Should support subscribing to a run tag",
{ timeout: 30_000 },
- async ({ redis, electricOrigin, prisma }) => {
+ async ({ redisOptions, electricOrigin, prisma }) => {
+ const redis = new Redis(redisOptions);
+
const client = new RealtimeClient({
electricOrigin,
keyPrefix: "test:realtime",
@@ -229,7 +234,9 @@ describe.skipIf(process.env.GITHUB_ACTIONS)("RealtimeClient", () => {
containerWithElectricAndRedisTest(
"Should adapt for older client versions",
{ timeout: 30_000 },
- async ({ redis, electricOrigin, prisma }) => {
+ async ({ redisOptions, electricOrigin, prisma }) => {
+ const redis = new Redis(redisOptions);
+
const client = new RealtimeClient({
electricOrigin,
keyPrefix: "test:realtime",
diff --git a/apps/webapp/tsconfig.json b/apps/webapp/tsconfig.json
index af02ef016b..0904fbfea5 100644
--- a/apps/webapp/tsconfig.json
+++ b/apps/webapp/tsconfig.json
@@ -3,7 +3,7 @@
"include": ["remix.env.d.ts", "global.d.ts", "**/*.ts", "**/*.tsx"],
"compilerOptions": {
"types": ["vitest/globals"],
- "lib": ["DOM", "DOM.Iterable", "ES2019"],
+ "lib": ["DOM", "DOM.Iterable", "DOM.AsyncIterable", "ES2019"],
"isolatedModules": true,
"esModuleInterop": true,
"jsx": "react-jsx",
@@ -35,6 +35,8 @@
"emails/*": ["../../internal-packages/emails/src/*"],
"@internal/zod-worker": ["../../internal-packages/zod-worker/src/index"],
"@internal/zod-worker/*": ["../../internal-packages/zod-worker/src/*"],
+ "@internal/run-engine": ["../../internal-packages/run-engine/src/index"],
+ "@internal/run-engine/*": ["../../internal-packages/run-engine/src/*"],
"@internal/redis-worker": ["../../internal-packages/redis-worker/src/index"],
"@internal/redis-worker/*": ["../../internal-packages/redis-worker/src/*"]
},
diff --git a/internal-packages/database/package.json b/internal-packages/database/package.json
index 41afe0e5ef..a170b10cec 100644
--- a/internal-packages/database/package.json
+++ b/internal-packages/database/package.json
@@ -5,15 +5,13 @@
"main": "./src/index.ts",
"types": "./src/index.ts",
"dependencies": {
- "@prisma/client": "5.4.1",
- "typescript": "^4.8.4"
+ "@prisma/client": "5.4.1"
},
"devDependencies": {
"prisma": "5.4.1"
},
"scripts": {
"generate": "prisma generate",
- "db:migrate:dev": "prisma migrate dev",
"db:migrate:dev:create": "prisma migrate dev --create-only",
"db:migrate:deploy": "prisma migrate deploy",
"db:push": "prisma db push",
@@ -21,4 +19,4 @@
"db:reset": "prisma migrate reset",
"typecheck": "tsc --noEmit"
}
-}
\ No newline at end of file
+}
diff --git a/internal-packages/database/prisma/migrations/20250103152909_add_run_engine_v2/migration.sql b/internal-packages/database/prisma/migrations/20250103152909_add_run_engine_v2/migration.sql
new file mode 100644
index 0000000000..5161459ff5
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250103152909_add_run_engine_v2/migration.sql
@@ -0,0 +1,288 @@
+-- CreateEnum
+CREATE TYPE "RunEngineVersion" AS ENUM ('V1', 'V2');
+
+-- CreateEnum
+CREATE TYPE "TaskRunExecutionStatus" AS ENUM ('RUN_CREATED', 'QUEUED', 'PENDING_EXECUTING', 'EXECUTING', 'EXECUTING_WITH_WAITPOINTS', 'BLOCKED_BY_WAITPOINTS', 'PENDING_CANCEL', 'FINISHED');
+
+-- CreateEnum
+CREATE TYPE "TaskRunCheckpointType" AS ENUM ('DOCKER', 'KUBERNETES');
+
+-- CreateEnum
+CREATE TYPE "WaitpointType" AS ENUM ('RUN', 'DATETIME', 'MANUAL');
+
+-- CreateEnum
+CREATE TYPE "WaitpointStatus" AS ENUM ('PENDING', 'COMPLETED');
+
+-- CreateEnum
+CREATE TYPE "WorkerInstanceGroupType" AS ENUM ('MANAGED', 'UNMANAGED');
+
+-- CreateEnum
+CREATE TYPE "WorkerDeploymentType" AS ENUM ('MANAGED', 'UNMANAGED', 'V1');
+
+-- AlterTable
+ALTER TABLE "BackgroundWorker" ADD COLUMN "workerGroupId" TEXT;
+
+-- AlterTable
+ALTER TABLE "Project" ADD COLUMN "defaultWorkerGroupId" TEXT,
+ADD COLUMN "engine" "RunEngineVersion" NOT NULL DEFAULT 'V1';
+
+-- AlterTable
+ALTER TABLE "TaskEvent" ADD COLUMN "isDebug" BOOLEAN NOT NULL DEFAULT false;
+
+-- AlterTable
+ALTER TABLE "TaskRun" ADD COLUMN "attemptNumber" INTEGER,
+ADD COLUMN "engine" "RunEngineVersion" NOT NULL DEFAULT 'V1',
+ADD COLUMN "firstAttemptStartedAt" TIMESTAMP(3),
+ADD COLUMN "masterQueue" TEXT NOT NULL DEFAULT 'main',
+ADD COLUMN "priorityMs" INTEGER NOT NULL DEFAULT 0,
+ADD COLUMN "secondaryMasterQueue" TEXT;
+
+-- AlterTable
+ALTER TABLE "WorkerDeployment" ADD COLUMN "type" "WorkerDeploymentType" NOT NULL DEFAULT 'V1';
+
+-- CreateTable
+CREATE TABLE "TaskRunExecutionSnapshot" (
+ "id" TEXT NOT NULL,
+ "engine" "RunEngineVersion" NOT NULL DEFAULT 'V2',
+ "executionStatus" "TaskRunExecutionStatus" NOT NULL,
+ "description" TEXT NOT NULL,
+ "isValid" BOOLEAN NOT NULL DEFAULT true,
+ "error" TEXT,
+ "runId" TEXT NOT NULL,
+ "runStatus" "TaskRunStatus" NOT NULL,
+ "attemptNumber" INTEGER,
+ "checkpointId" TEXT,
+ "workerId" TEXT,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+ "lastHeartbeatAt" TIMESTAMP(3),
+
+ CONSTRAINT "TaskRunExecutionSnapshot_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "TaskRunCheckpoint" (
+ "id" TEXT NOT NULL,
+ "friendlyId" TEXT NOT NULL,
+ "type" "TaskRunCheckpointType" NOT NULL,
+ "location" TEXT NOT NULL,
+ "imageRef" TEXT NOT NULL,
+ "reason" TEXT,
+ "metadata" TEXT,
+ "projectId" TEXT NOT NULL,
+ "runtimeEnvironmentId" TEXT NOT NULL,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+
+ CONSTRAINT "TaskRunCheckpoint_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "Waitpoint" (
+ "id" TEXT NOT NULL,
+ "friendlyId" TEXT NOT NULL,
+ "type" "WaitpointType" NOT NULL,
+ "status" "WaitpointStatus" NOT NULL DEFAULT 'PENDING',
+ "completedAt" TIMESTAMP(3),
+ "idempotencyKey" TEXT NOT NULL,
+ "userProvidedIdempotencyKey" BOOLEAN NOT NULL,
+ "inactiveIdempotencyKey" TEXT,
+ "completedByTaskRunId" TEXT,
+ "completedAfter" TIMESTAMP(3),
+ "output" TEXT,
+ "outputType" TEXT NOT NULL DEFAULT 'application/json',
+ "outputIsError" BOOLEAN NOT NULL DEFAULT false,
+ "projectId" TEXT NOT NULL,
+ "environmentId" TEXT NOT NULL,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+
+ CONSTRAINT "Waitpoint_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "TaskRunWaitpoint" (
+ "id" TEXT NOT NULL,
+ "taskRunId" TEXT NOT NULL,
+ "waitpointId" TEXT NOT NULL,
+ "projectId" TEXT NOT NULL,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+
+ CONSTRAINT "TaskRunWaitpoint_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "FeatureFlag" (
+ "id" TEXT NOT NULL,
+ "key" TEXT NOT NULL,
+ "value" JSONB,
+
+ CONSTRAINT "FeatureFlag_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "WorkerInstance" (
+ "id" TEXT NOT NULL,
+ "name" TEXT NOT NULL,
+ "resourceIdentifier" TEXT NOT NULL,
+ "metadata" JSONB,
+ "workerGroupId" TEXT NOT NULL,
+ "organizationId" TEXT,
+ "projectId" TEXT,
+ "environmentId" TEXT,
+ "deploymentId" TEXT,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+ "lastDequeueAt" TIMESTAMP(3),
+ "lastHeartbeatAt" TIMESTAMP(3),
+
+ CONSTRAINT "WorkerInstance_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "WorkerInstanceGroup" (
+ "id" TEXT NOT NULL,
+ "type" "WorkerInstanceGroupType" NOT NULL,
+ "name" TEXT NOT NULL,
+ "masterQueue" TEXT NOT NULL,
+ "description" TEXT,
+ "hidden" BOOLEAN NOT NULL DEFAULT false,
+ "tokenId" TEXT NOT NULL,
+ "organizationId" TEXT,
+ "projectId" TEXT,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+
+ CONSTRAINT "WorkerInstanceGroup_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "WorkerGroupToken" (
+ "id" TEXT NOT NULL,
+ "tokenHash" TEXT NOT NULL,
+ "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ "updatedAt" TIMESTAMP(3) NOT NULL,
+
+ CONSTRAINT "WorkerGroupToken_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable
+CREATE TABLE "_completedWaitpoints" (
+ "A" TEXT NOT NULL,
+ "B" TEXT NOT NULL
+);
+
+-- CreateIndex
+CREATE INDEX "TaskRunExecutionSnapshot_runId_isValid_createdAt_idx" ON "TaskRunExecutionSnapshot"("runId", "isValid", "createdAt" DESC);
+
+-- CreateIndex
+CREATE UNIQUE INDEX "TaskRunCheckpoint_friendlyId_key" ON "TaskRunCheckpoint"("friendlyId");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "Waitpoint_friendlyId_key" ON "Waitpoint"("friendlyId");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "Waitpoint_completedByTaskRunId_key" ON "Waitpoint"("completedByTaskRunId");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "Waitpoint_environmentId_idempotencyKey_key" ON "Waitpoint"("environmentId", "idempotencyKey");
+
+-- CreateIndex
+CREATE INDEX "TaskRunWaitpoint_taskRunId_idx" ON "TaskRunWaitpoint"("taskRunId");
+
+-- CreateIndex
+CREATE INDEX "TaskRunWaitpoint_waitpointId_idx" ON "TaskRunWaitpoint"("waitpointId");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "TaskRunWaitpoint_taskRunId_waitpointId_key" ON "TaskRunWaitpoint"("taskRunId", "waitpointId");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "FeatureFlag_key_key" ON "FeatureFlag"("key");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "WorkerInstance_workerGroupId_resourceIdentifier_key" ON "WorkerInstance"("workerGroupId", "resourceIdentifier");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "WorkerInstanceGroup_masterQueue_key" ON "WorkerInstanceGroup"("masterQueue");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "WorkerInstanceGroup_tokenId_key" ON "WorkerInstanceGroup"("tokenId");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "WorkerGroupToken_tokenHash_key" ON "WorkerGroupToken"("tokenHash");
+
+-- CreateIndex
+CREATE UNIQUE INDEX "_completedWaitpoints_AB_unique" ON "_completedWaitpoints"("A", "B");
+
+-- CreateIndex
+CREATE INDEX "_completedWaitpoints_B_index" ON "_completedWaitpoints"("B");
+
+-- AddForeignKey
+ALTER TABLE "Project" ADD CONSTRAINT "Project_defaultWorkerGroupId_fkey" FOREIGN KEY ("defaultWorkerGroupId") REFERENCES "WorkerInstanceGroup"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "BackgroundWorker" ADD CONSTRAINT "BackgroundWorker_workerGroupId_fkey" FOREIGN KEY ("workerGroupId") REFERENCES "WorkerInstanceGroup"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunExecutionSnapshot" ADD CONSTRAINT "TaskRunExecutionSnapshot_runId_fkey" FOREIGN KEY ("runId") REFERENCES "TaskRun"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunExecutionSnapshot" ADD CONSTRAINT "TaskRunExecutionSnapshot_checkpointId_fkey" FOREIGN KEY ("checkpointId") REFERENCES "TaskRunCheckpoint"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunExecutionSnapshot" ADD CONSTRAINT "TaskRunExecutionSnapshot_workerId_fkey" FOREIGN KEY ("workerId") REFERENCES "WorkerInstance"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunCheckpoint" ADD CONSTRAINT "TaskRunCheckpoint_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunCheckpoint" ADD CONSTRAINT "TaskRunCheckpoint_runtimeEnvironmentId_fkey" FOREIGN KEY ("runtimeEnvironmentId") REFERENCES "RuntimeEnvironment"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "Waitpoint" ADD CONSTRAINT "Waitpoint_completedByTaskRunId_fkey" FOREIGN KEY ("completedByTaskRunId") REFERENCES "TaskRun"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "Waitpoint" ADD CONSTRAINT "Waitpoint_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "Waitpoint" ADD CONSTRAINT "Waitpoint_environmentId_fkey" FOREIGN KEY ("environmentId") REFERENCES "RuntimeEnvironment"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunWaitpoint" ADD CONSTRAINT "TaskRunWaitpoint_taskRunId_fkey" FOREIGN KEY ("taskRunId") REFERENCES "TaskRun"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunWaitpoint" ADD CONSTRAINT "TaskRunWaitpoint_waitpointId_fkey" FOREIGN KEY ("waitpointId") REFERENCES "Waitpoint"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunWaitpoint" ADD CONSTRAINT "TaskRunWaitpoint_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstance" ADD CONSTRAINT "WorkerInstance_workerGroupId_fkey" FOREIGN KEY ("workerGroupId") REFERENCES "WorkerInstanceGroup"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstance" ADD CONSTRAINT "WorkerInstance_organizationId_fkey" FOREIGN KEY ("organizationId") REFERENCES "Organization"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstance" ADD CONSTRAINT "WorkerInstance_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstance" ADD CONSTRAINT "WorkerInstance_environmentId_fkey" FOREIGN KEY ("environmentId") REFERENCES "RuntimeEnvironment"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstance" ADD CONSTRAINT "WorkerInstance_deploymentId_fkey" FOREIGN KEY ("deploymentId") REFERENCES "WorkerDeployment"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstanceGroup" ADD CONSTRAINT "WorkerInstanceGroup_tokenId_fkey" FOREIGN KEY ("tokenId") REFERENCES "WorkerGroupToken"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstanceGroup" ADD CONSTRAINT "WorkerInstanceGroup_organizationId_fkey" FOREIGN KEY ("organizationId") REFERENCES "Organization"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "WorkerInstanceGroup" ADD CONSTRAINT "WorkerInstanceGroup_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "_completedWaitpoints" ADD CONSTRAINT "_completedWaitpoints_A_fkey" FOREIGN KEY ("A") REFERENCES "TaskRunExecutionSnapshot"("id") ON DELETE CASCADE ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "_completedWaitpoints" ADD CONSTRAINT "_completedWaitpoints_B_fkey" FOREIGN KEY ("B") REFERENCES "Waitpoint"("id") ON DELETE CASCADE ON UPDATE CASCADE;
diff --git a/internal-packages/database/prisma/migrations/20250106172943_added_span_id_to_complete_to_task_run_waitpoint/migration.sql b/internal-packages/database/prisma/migrations/20250106172943_added_span_id_to_complete_to_task_run_waitpoint/migration.sql
new file mode 100644
index 0000000000..8d624ba757
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250106172943_added_span_id_to_complete_to_task_run_waitpoint/migration.sql
@@ -0,0 +1,2 @@
+-- AlterTable
+ALTER TABLE "TaskRunWaitpoint" ADD COLUMN "spanIdToComplete" TEXT;
diff --git a/internal-packages/database/prisma/migrations/20250109131442_added_batch_and_index_to_task_run_waitpoint_and_task_run_execution_snapshot/migration.sql b/internal-packages/database/prisma/migrations/20250109131442_added_batch_and_index_to_task_run_waitpoint_and_task_run_execution_snapshot/migration.sql
new file mode 100644
index 0000000000..5756f7fa5d
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250109131442_added_batch_and_index_to_task_run_waitpoint_and_task_run_execution_snapshot/migration.sql
@@ -0,0 +1,13 @@
+-- AlterTable
+ALTER TABLE "TaskRunExecutionSnapshot" ADD COLUMN "batchId" TEXT,
+ADD COLUMN "completedWaitpointOrder" TEXT[];
+
+-- AlterTable
+ALTER TABLE "TaskRunWaitpoint" ADD COLUMN "batchId" TEXT,
+ADD COLUMN "batchIndex" INTEGER;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunExecutionSnapshot" ADD CONSTRAINT "TaskRunExecutionSnapshot_batchId_fkey" FOREIGN KEY ("batchId") REFERENCES "BatchTaskRun"("id") ON DELETE SET NULL ON UPDATE CASCADE;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunWaitpoint" ADD CONSTRAINT "TaskRunWaitpoint_batchId_fkey" FOREIGN KEY ("batchId") REFERENCES "BatchTaskRun"("id") ON DELETE SET NULL ON UPDATE CASCADE;
diff --git a/internal-packages/database/prisma/migrations/20250109173506_waitpoint_added_batch_type/migration.sql b/internal-packages/database/prisma/migrations/20250109173506_waitpoint_added_batch_type/migration.sql
new file mode 100644
index 0000000000..1e1fead5a5
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250109173506_waitpoint_added_batch_type/migration.sql
@@ -0,0 +1,8 @@
+-- AlterEnum
+ALTER TYPE "WaitpointType" ADD VALUE 'BATCH';
+
+-- AlterTable
+ALTER TABLE "Waitpoint" ADD COLUMN "completedByBatchId" TEXT;
+
+-- AddForeignKey
+ALTER TABLE "Waitpoint" ADD CONSTRAINT "Waitpoint_completedByBatchId_fkey" FOREIGN KEY ("completedByBatchId") REFERENCES "BatchTaskRun"("id") ON DELETE SET NULL ON UPDATE CASCADE;
diff --git a/internal-packages/database/prisma/migrations/20250109175955_waitpoint_added_completed_by_batch_id_index/migration.sql b/internal-packages/database/prisma/migrations/20250109175955_waitpoint_added_completed_by_batch_id_index/migration.sql
new file mode 100644
index 0000000000..7d691d17e1
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250109175955_waitpoint_added_completed_by_batch_id_index/migration.sql
@@ -0,0 +1,2 @@
+-- CreateIndex
+CREATE INDEX "Waitpoint_completedByBatchId_idx" ON "Waitpoint"("completedByBatchId");
diff --git a/internal-packages/database/prisma/migrations/20250114153223_task_run_waitpoint_unique_constraint_added_batch_index/migration.sql b/internal-packages/database/prisma/migrations/20250114153223_task_run_waitpoint_unique_constraint_added_batch_index/migration.sql
new file mode 100644
index 0000000000..22a41947d4
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250114153223_task_run_waitpoint_unique_constraint_added_batch_index/migration.sql
@@ -0,0 +1,14 @@
+/*
+ Warnings:
+
+ - A unique constraint covering the columns `[taskRunId,waitpointId,batchIndex]` on the table `TaskRunWaitpoint` will be added. If there are existing duplicate values, this will fail.
+
+*/
+-- DropIndex
+DROP INDEX "TaskRunWaitpoint_taskRunId_waitpointId_key";
+
+-- CreateIndex (multiple can have null batchIndex, so we need the other one below)
+CREATE UNIQUE INDEX "TaskRunWaitpoint_taskRunId_waitpointId_batchIndex_key" ON "TaskRunWaitpoint" ("taskRunId", "waitpointId", "batchIndex");
+
+-- CreateIndex (where batchIndex is null)
+CREATE UNIQUE INDEX "TaskRunWaitpoint_taskRunId_waitpointId_batchIndex_null_key" ON "TaskRunWaitpoint"("taskRunId", "waitpointId") WHERE "batchIndex" IS NULL;
diff --git a/internal-packages/database/prisma/migrations/20250116115746_rename_blocked_by_waitpoints_to_suspended/migration.sql b/internal-packages/database/prisma/migrations/20250116115746_rename_blocked_by_waitpoints_to_suspended/migration.sql
new file mode 100644
index 0000000000..003151ea34
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250116115746_rename_blocked_by_waitpoints_to_suspended/migration.sql
@@ -0,0 +1,14 @@
+/*
+ Warnings:
+
+ - The values [BLOCKED_BY_WAITPOINTS] on the enum `TaskRunExecutionStatus` will be removed. If these variants are still used in the database, this will fail.
+
+*/
+-- AlterEnum
+BEGIN;
+CREATE TYPE "TaskRunExecutionStatus_new" AS ENUM ('RUN_CREATED', 'QUEUED', 'PENDING_EXECUTING', 'EXECUTING', 'EXECUTING_WITH_WAITPOINTS', 'SUSPENDED', 'PENDING_CANCEL', 'FINISHED');
+ALTER TABLE "TaskRunExecutionSnapshot" ALTER COLUMN "executionStatus" TYPE "TaskRunExecutionStatus_new" USING ("executionStatus"::text::"TaskRunExecutionStatus_new");
+ALTER TYPE "TaskRunExecutionStatus" RENAME TO "TaskRunExecutionStatus_old";
+ALTER TYPE "TaskRunExecutionStatus_new" RENAME TO "TaskRunExecutionStatus";
+DROP TYPE "TaskRunExecutionStatus_old";
+COMMIT;
diff --git a/internal-packages/database/prisma/migrations/20250128160520_add_runner_id_to_execution_snapshots/migration.sql b/internal-packages/database/prisma/migrations/20250128160520_add_runner_id_to_execution_snapshots/migration.sql
new file mode 100644
index 0000000000..d44fcf43bc
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250128160520_add_runner_id_to_execution_snapshots/migration.sql
@@ -0,0 +1,2 @@
+-- AlterTable
+ALTER TABLE "TaskRunExecutionSnapshot" ADD COLUMN "runnerId" TEXT;
diff --git a/internal-packages/database/prisma/migrations/20250130173941_background_worker_added_engine_version_column/migration.sql b/internal-packages/database/prisma/migrations/20250130173941_background_worker_added_engine_version_column/migration.sql
new file mode 100644
index 0000000000..deb06543fd
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250130173941_background_worker_added_engine_version_column/migration.sql
@@ -0,0 +1,2 @@
+-- AlterTable
+ALTER TABLE "BackgroundWorker" ADD COLUMN "engine" "RunEngineVersion" NOT NULL DEFAULT 'V1';
diff --git a/internal-packages/database/prisma/migrations/20250207104914_added_environment_and_environment_type_to_task_run_execution_snapshot/migration.sql b/internal-packages/database/prisma/migrations/20250207104914_added_environment_and_environment_type_to_task_run_execution_snapshot/migration.sql
new file mode 100644
index 0000000000..c0b98dec6f
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250207104914_added_environment_and_environment_type_to_task_run_execution_snapshot/migration.sql
@@ -0,0 +1,13 @@
+/*
+ Warnings:
+
+ - Added the required column `environmentId` to the `TaskRunExecutionSnapshot` table without a default value. This is not possible if the table is not empty.
+ - Added the required column `environmentType` to the `TaskRunExecutionSnapshot` table without a default value. This is not possible if the table is not empty.
+
+*/
+-- AlterTable
+ALTER TABLE "TaskRunExecutionSnapshot" ADD COLUMN "environmentId" TEXT NOT NULL,
+ADD COLUMN "environmentType" "RuntimeEnvironmentType" NOT NULL;
+
+-- AddForeignKey
+ALTER TABLE "TaskRunExecutionSnapshot" ADD CONSTRAINT "TaskRunExecutionSnapshot_environmentId_fkey" FOREIGN KEY ("environmentId") REFERENCES "RuntimeEnvironment"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
diff --git a/internal-packages/database/prisma/migrations/20250219140441_waitpoint_added_idempotency_key_expires_at/migration.sql b/internal-packages/database/prisma/migrations/20250219140441_waitpoint_added_idempotency_key_expires_at/migration.sql
new file mode 100644
index 0000000000..6d6b7d49e4
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250219140441_waitpoint_added_idempotency_key_expires_at/migration.sql
@@ -0,0 +1,2 @@
+-- AlterTable
+ALTER TABLE "Waitpoint" ADD COLUMN "idempotencyKeyExpiresAt" TIMESTAMP(3);
diff --git a/internal-packages/database/prisma/migrations/20250304184614_remove_task_run_first_attempt_started_at_column/migration.sql b/internal-packages/database/prisma/migrations/20250304184614_remove_task_run_first_attempt_started_at_column/migration.sql
new file mode 100644
index 0000000000..fdd51378e7
--- /dev/null
+++ b/internal-packages/database/prisma/migrations/20250304184614_remove_task_run_first_attempt_started_at_column/migration.sql
@@ -0,0 +1,9 @@
+/*
+ Warnings:
+
+ - You are about to drop the column `firstAttemptStartedAt` on the `TaskRun` table. All the data in the column will be lost.
+
+*/
+
+-- AlterTable
+ALTER TABLE "TaskRun" DROP COLUMN "firstAttemptStartedAt";
diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma
index 2fa3ef4886..e38846e382 100644
--- a/internal-packages/database/prisma/schema.prisma
+++ b/internal-packages/database/prisma/schema.prisma
@@ -148,6 +148,8 @@ model Organization {
integrations Integration[]
sources TriggerSource[]
organizationIntegrations OrganizationIntegration[]
+ workerGroups WorkerInstanceGroup[]
+ workerInstances WorkerInstance[]
}
model ExternalAccount {
@@ -419,6 +421,10 @@ model RuntimeEnvironment {
currentSession RuntimeEnvironmentSession? @relation("currentSession", fields: [currentSessionId], references: [id], onDelete: SetNull, onUpdate: Cascade)
currentSessionId String?
taskRunNumberCounter TaskRunNumberCounter[]
+ taskRunCheckpoints TaskRunCheckpoint[]
+ waitpoints Waitpoint[]
+ workerInstances WorkerInstance[]
+ executionSnapshots TaskRunExecutionSnapshot[]
@@unique([projectId, slug, orgMemberId])
@@unique([projectId, shortcode])
@@ -445,10 +451,17 @@ model Project {
updatedAt DateTime @updatedAt
deletedAt DateTime?
- version ProjectVersion @default(V2)
+ version ProjectVersion @default(V2)
+ engine RunEngineVersion @default(V1)
builderProjectId String?
+ workerGroups WorkerInstanceGroup[]
+ workers WorkerInstance[]
+
+ defaultWorkerGroup WorkerInstanceGroup? @relation("ProjectDefaultWorkerGroup", fields: [defaultWorkerGroupId], references: [id])
+ defaultWorkerGroupId String?
+
environments RuntimeEnvironment[]
endpoints Endpoint[]
jobs Job[]
@@ -473,6 +486,9 @@ model Project {
alertStorages ProjectAlertStorage[]
bulkActionGroups BulkActionGroup[]
BackgroundWorkerFile BackgroundWorkerFile[]
+ waitpoints Waitpoint[]
+ taskRunWaitpoints TaskRunWaitpoint[]
+ taskRunCheckpoints TaskRunCheckpoint[]
}
enum ProjectVersion {
@@ -1559,6 +1575,8 @@ model BackgroundWorker {
friendlyId String @unique
+ engine RunEngineVersion @default(V1)
+
contentHash String
sdkVersion String @default("unknown")
cliVersion String @default("unknown")
@@ -1582,6 +1600,9 @@ model BackgroundWorker {
deployment WorkerDeployment?
+ workerGroup WorkerInstanceGroup? @relation(fields: [workerGroupId], references: [id], onDelete: SetNull, onUpdate: Cascade)
+ workerGroupId String?
+
supportsLazyAttempts Boolean @default(false)
@@unique([projectId, runtimeEnvironmentId, version])
@@ -1663,6 +1684,8 @@ model TaskRun {
number Int @default(0)
friendlyId String @unique
+ engine RunEngineVersion @default(V1)
+
status TaskRunStatus @default(PENDING)
idempotencyKey String?
@@ -1685,8 +1708,16 @@ model TaskRun {
project Project @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
projectId String
+ // The specific queue this run is in
queue String
+ /// The main queue that this run is part of
+ masterQueue String @default("main")
+ secondaryMasterQueue String?
+
+ /// From engine v2+ this will be defined after a run has been dequeued (starting at 1)
+ attemptNumber Int?
+
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
@@ -1721,6 +1752,10 @@ model TaskRun {
lockedToVersion BackgroundWorker? @relation(fields: [lockedToVersionId], references: [id])
lockedToVersionId String?
+ /// The "priority" of the run. This is just a negative offset in ms for the queue timestamp
+ /// E.g. a value of 60_000 would put the run into the queue 60s ago.
+ priorityMs Int @default(0)
+
concurrencyKey String?
delayUntil DateTime?
@@ -1732,6 +1767,13 @@ model TaskRun {
/// optional token that can be used to authenticate the task run
oneTimeUseToken String?
+ ///When this run is finished, the waitpoint will be marked as completed
+ associatedWaitpoint Waitpoint?
+
+ ///If there are any blocked waitpoints, the run won't be executed
+ blockedByWaitpoints TaskRunWaitpoint[]
+
+ /// Where the logs are stored
taskEventStore String @default("taskEvent")
queueTimestamp DateTime?
@@ -1739,6 +1781,7 @@ model TaskRun {
batchItems BatchTaskRunItem[]
dependency TaskRunDependency?
CheckpointRestoreEvent CheckpointRestoreEvent[]
+ executionSnapshots TaskRunExecutionSnapshot[]
alerts ProjectAlert[]
@@ -1881,6 +1924,324 @@ enum TaskRunStatus {
TIMED_OUT
}
+enum RunEngineVersion {
+ /// The original version that uses marqs v1 and Graphile
+ V1
+ V2
+}
+
+/// Used by the RunEngine during TaskRun execution
+/// It has the required information to transactionally progress a run through states,
+/// and prevent side effects like heartbeats failing a run that has progressed.
+/// It is optimised for performance and is designed to be cleared at some point,
+/// so there are no cascading relationships to other models.
+model TaskRunExecutionSnapshot {
+ id String @id @default(cuid())
+
+ /// This should always be 2+ (V1 didn't use the run engine or snapshots)
+ engine RunEngineVersion @default(V2)
+
+ /// The execution status
+ executionStatus TaskRunExecutionStatus
+ /// For debugging
+ description String
+
+ /// We store invalid snapshots as a record of the run state when we tried to move
+ isValid Boolean @default(true)
+ error String?
+
+ /// Run
+ runId String
+ run TaskRun @relation(fields: [runId], references: [id])
+ runStatus TaskRunStatus
+
+ // Batch
+ batchId String?
+ batch BatchTaskRun? @relation(fields: [batchId], references: [id])
+
+ /// This is the current run attempt number. Users can define how many attempts they want for a run.
+ attemptNumber Int?
+
+ /// Environment
+ environmentId String
+ environment RuntimeEnvironment @relation(fields: [environmentId], references: [id])
+ environmentType RuntimeEnvironmentType
+
+ /// Waitpoints that have been completed for this execution
+ completedWaitpoints Waitpoint[] @relation("completedWaitpoints")
+
+ /// An array of waitpoint IDs in the correct order, used for batches
+ completedWaitpointOrder String[]
+
+ /// Checkpoint
+ checkpointId String?
+ checkpoint TaskRunCheckpoint? @relation(fields: [checkpointId], references: [id])
+
+ /// Worker
+ workerId String?
+ worker WorkerInstance? @relation(fields: [workerId], references: [id])
+
+ runnerId String?
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+
+ lastHeartbeatAt DateTime?
+
+ /// Used to get the latest valid snapshot quickly
+ @@index([runId, isValid, createdAt(sort: Desc)])
+}
+
+enum TaskRunExecutionStatus {
+ /// Run has been created
+ RUN_CREATED
+ /// Run is in the RunQueue
+ QUEUED
+ /// Run has been pulled from the queue, but isn't executing yet
+ PENDING_EXECUTING
+ /// Run is executing on a worker
+ EXECUTING
+ /// Run is executing on a worker but is waiting for waitpoints to complete
+ EXECUTING_WITH_WAITPOINTS
+ /// Run has been suspended and may be waiting for waitpoints to complete before resuming
+ SUSPENDED
+ /// Run has been scheduled for cancellation
+ PENDING_CANCEL
+  /// Run is finished (success or failure)
+ FINISHED
+}
+
+model TaskRunCheckpoint {
+ id String @id @default(cuid())
+
+ friendlyId String @unique
+
+ type TaskRunCheckpointType
+ location String
+ imageRef String
+ reason String?
+ metadata String?
+
+ project Project @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ projectId String
+
+ runtimeEnvironment RuntimeEnvironment @relation(fields: [runtimeEnvironmentId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ runtimeEnvironmentId String
+
+ executionSnapshot TaskRunExecutionSnapshot[]
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+}
+
+enum TaskRunCheckpointType {
+ DOCKER
+ KUBERNETES
+}
+
+/// A Waitpoint blocks a run from continuing until it's completed
+/// If there's a waitpoint blocking a run, it shouldn't be in the queue
+model Waitpoint {
+ id String @id @default(cuid())
+
+ friendlyId String @unique
+
+ type WaitpointType
+ status WaitpointStatus @default(PENDING)
+
+ completedAt DateTime?
+
+ /// If it's an Event type waitpoint, this is the event. It can also be provided for the DATETIME type
+ idempotencyKey String
+ /// If this is true then we can show it in the dashboard/return it from the SDK
+ userProvidedIdempotencyKey Boolean
+
+ /// If there's a user provided idempotency key, this is the time it expires at
+ idempotencyKeyExpiresAt DateTime?
+
+ //todo
+ /// Will automatically deactivate the idempotencyKey when the waitpoint is completed
+ /// "Deactivating" means moving it to the inactiveIdempotencyKey field and generating a random new one for the main column
+ /// deactivateIdempotencyKeyWhenCompleted Boolean @default(false)
+
+ /// If an idempotencyKey is no longer active, we store it here and generate a new one for the idempotencyKey field.
+ /// Clearing an idempotencyKey is useful for debounce or cancelling child runs.
+ /// This is a workaround because Prisma doesn't support partial indexes.
+ inactiveIdempotencyKey String?
+
+ /// If it's a RUN type waitpoint, this is the associated run
+ completedByTaskRunId String? @unique
+ completedByTaskRun TaskRun? @relation(fields: [completedByTaskRunId], references: [id], onDelete: SetNull)
+
+ /// If it's a DATETIME type waitpoint, this is the date.
+ /// If it's a MANUAL waitpoint, this can be set as the `timeout`.
+ completedAfter DateTime?
+
+ /// If it's a BATCH type waitpoint, this is the associated batch
+ completedByBatchId String?
+ completedByBatch BatchTaskRun? @relation(fields: [completedByBatchId], references: [id], onDelete: SetNull)
+
+ /// The runs this waitpoint is blocking
+ blockingTaskRuns TaskRunWaitpoint[]
+
+ /// When a waitpoint is complete
+ completedExecutionSnapshots TaskRunExecutionSnapshot[] @relation("completedWaitpoints")
+
+ /// When completed, an output can be stored here
+ output String?
+ outputType String @default("application/json")
+ outputIsError Boolean @default(false)
+
+ project Project @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ projectId String
+
+ environment RuntimeEnvironment @relation(fields: [environmentId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ environmentId String
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+
+ @@unique([environmentId, idempotencyKey])
+ @@index([completedByBatchId])
+}
+
+enum WaitpointType {
+ RUN
+ DATETIME
+ MANUAL
+ BATCH
+}
+
+enum WaitpointStatus {
+ PENDING
+ COMPLETED
+}
+
+model TaskRunWaitpoint {
+ id String @id @default(cuid())
+
+ taskRun TaskRun @relation(fields: [taskRunId], references: [id])
+ taskRunId String
+
+ waitpoint Waitpoint @relation(fields: [waitpointId], references: [id])
+ waitpointId String
+
+ project Project @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ projectId String
+
+ /// This span id is completed when the waitpoint is completed. This is used with cached runs (idempotent)
+ spanIdToComplete String?
+
+ //associated batch
+ batchId String?
+ batch BatchTaskRun? @relation(fields: [batchId], references: [id])
+ //if there's an associated batch and this isn't set it's for the entire batch
+ //if it is set, it's a specific run in the batch
+ batchIndex Int?
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+
+ /// There are two constraints, the one below and also one that Prisma doesn't support
+ /// The second one implemented in SQL only prevents a TaskRun + Waitpoint with a null batchIndex
+ @@unique([taskRunId, waitpointId, batchIndex])
+ @@index([taskRunId])
+ @@index([waitpointId])
+}
+
+model FeatureFlag {
+ id String @id @default(cuid())
+
+ key String @unique
+ value Json?
+}
+
+model WorkerInstance {
+ id String @id @default(cuid())
+
+ /// For example "worker-1"
+ name String
+
+ /// If managed, it will default to the name, e.g. "worker-1"
+  /// If unmanaged, it will be prefixed with the deployment ID e.g. "deploy-123-worker-1"
+ resourceIdentifier String
+
+ metadata Json?
+
+ workerGroup WorkerInstanceGroup @relation(fields: [workerGroupId], references: [id])
+ workerGroupId String
+
+ TaskRunExecutionSnapshot TaskRunExecutionSnapshot[]
+
+ organization Organization? @relation(fields: [organizationId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ organizationId String?
+
+ project Project? @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ projectId String?
+
+ environment RuntimeEnvironment? @relation(fields: [environmentId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ environmentId String?
+
+ deployment WorkerDeployment? @relation(fields: [deploymentId], references: [id], onDelete: SetNull, onUpdate: Cascade)
+ deploymentId String?
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+
+ lastDequeueAt DateTime?
+ lastHeartbeatAt DateTime?
+
+ @@unique([workerGroupId, resourceIdentifier])
+}
+
+enum WorkerInstanceGroupType {
+ MANAGED
+ UNMANAGED
+}
+
+model WorkerInstanceGroup {
+ id String @id @default(cuid())
+ type WorkerInstanceGroupType
+
+ /// For example "us-east-1"
+ name String
+
+ /// If managed, it will default to the name, e.g. "us-east-1"
+  /// If unmanaged, it will be prefixed with the project ID e.g. "project_1-us-east-1"
+ masterQueue String @unique
+
+ description String?
+ hidden Boolean @default(false)
+
+ token WorkerGroupToken @relation(fields: [tokenId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ tokenId String @unique
+
+ workers WorkerInstance[]
+ backgroundWorkers BackgroundWorker[]
+
+ defaultForProjects Project[] @relation("ProjectDefaultWorkerGroup")
+
+ organization Organization? @relation(fields: [organizationId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ organizationId String?
+
+ project Project? @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
+ projectId String?
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+}
+
+model WorkerGroupToken {
+ id String @id @default(cuid())
+
+ tokenHash String @unique
+
+ createdAt DateTime @default(now())
+ updatedAt DateTime @updatedAt
+
+ workerGroup WorkerInstanceGroup?
+}
+
model TaskRunTag {
id String @id @default(cuid())
name String
@@ -1945,6 +2306,7 @@ model TaskRunNumberCounter {
@@unique([taskIdentifier, environmentId])
}
+/// This is not used from engine v2+, attempts use the TaskRunExecutionSnapshot and TaskRun
model TaskRunAttempt {
id String @id @default(cuid())
number Int @default(0)
@@ -2019,6 +2381,7 @@ model TaskEvent {
isError Boolean @default(false)
isPartial Boolean @default(false)
isCancelled Boolean @default(false)
+ isDebug Boolean @default(false)
serviceName String
serviceNamespace String
@@ -2166,11 +2529,13 @@ model BatchTaskRun {
runtimeEnvironment RuntimeEnvironment @relation(fields: [runtimeEnvironmentId], references: [id], onDelete: Cascade, onUpdate: Cascade)
status BatchTaskRunStatus @default(PENDING)
runtimeEnvironmentId String
+ /// This only includes new runs, not idempotent runs.
runs TaskRun[]
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
// new columns
+ /// Friendly IDs
runIds String[] @default([])
runCount Int @default(0)
payload String?
@@ -2178,6 +2543,15 @@ model BatchTaskRun {
options Json?
batchVersion String @default("v1")
+ //engine v2
+ /// Snapshots that reference this batch
+ executionSnapshots TaskRunExecutionSnapshot[]
+ /// Specific run blockers,
+ runsBlocked TaskRunWaitpoint[]
+ /// Waitpoints that are blocked by this batch.
+ /// When a Batch is created it blocks execution of the associated parent run (for andWait)
+ waitpoints Waitpoint[]
+
// This is for v3 batches
/// sealed is set to true once no more items can be added to the batch
sealed Boolean @default(false)
@@ -2217,6 +2591,7 @@ enum BatchTaskRunStatus {
ABORTED
}
+///Used in engine V1 only
model BatchTaskRunItem {
id String @id @default(cuid())
@@ -2349,6 +2724,12 @@ enum CheckpointRestoreEventType {
RESTORE
}
+enum WorkerDeploymentType {
+ MANAGED
+ UNMANAGED
+ V1
+}
+
model WorkerDeployment {
id String @id @default(cuid())
@@ -2362,6 +2743,7 @@ model WorkerDeployment {
externalBuildData Json?
status WorkerDeploymentStatus @default(PENDING)
+ type WorkerDeploymentType @default(V1)
project Project @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade)
projectId String
@@ -2384,8 +2766,9 @@ model WorkerDeployment {
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
- promotions WorkerDeploymentPromotion[]
- alerts ProjectAlert[]
+ promotions WorkerDeploymentPromotion[]
+ alerts ProjectAlert[]
+ workerInstance WorkerInstance[]
@@unique([projectId, shortCode])
@@unique([environmentId, version])
diff --git a/internal-packages/emails/package.json b/internal-packages/emails/package.json
index d7fa150fbb..85cb7abe01 100644
--- a/internal-packages/emails/package.json
+++ b/internal-packages/emails/package.json
@@ -20,10 +20,8 @@
"zod": "3.23.8"
},
"devDependencies": {
- "@types/node": "^18",
"@types/nodemailer": "^6.4.17",
- "@types/react": "18.2.69",
- "typescript": "^4.9.4"
+ "@types/react": "18.2.69"
},
"engines": {
"node": ">=18.0.0"
diff --git a/internal-packages/otlp-importer/package.json b/internal-packages/otlp-importer/package.json
index 49d2fdbae5..72e46c2f9d 100644
--- a/internal-packages/otlp-importer/package.json
+++ b/internal-packages/otlp-importer/package.json
@@ -29,8 +29,7 @@
"devDependencies": {
"@types/node": "^20",
"rimraf": "^3.0.2",
- "ts-proto": "^1.167.3",
- "typescript": "^5.5.0"
+ "ts-proto": "^1.167.3"
},
"engines": {
"node": ">=18.0.0"
diff --git a/internal-packages/redis-worker/src/queue.ts b/internal-packages/redis-worker/src/queue.ts
index bb7225f396..4ba08a01df 100644
--- a/internal-packages/redis-worker/src/queue.ts
+++ b/internal-packages/redis-worker/src/queue.ts
@@ -57,7 +57,7 @@ export class SimpleQueue {
const delay = Math.min(times * 50, 1000);
return delay;
},
- maxRetriesPerRequest: 3,
+ maxRetriesPerRequest: 20,
});
this.#registerCommands();
this.schema = schema;
diff --git a/internal-packages/run-engine/README.md b/internal-packages/run-engine/README.md
new file mode 100644
index 0000000000..a2ca8fda22
--- /dev/null
+++ b/internal-packages/run-engine/README.md
@@ -0,0 +1,189 @@
+# Trigger.dev Run Engine
+
+The Run Engine processes runs from triggering, through executing and retrying, to completing them.
+
+It is responsible for:
+
+- Creating, updating, and completing runs as they progress.
+- Operating the run queue, including handling concurrency.
+- Heartbeats, which detect stalled runs and attempt to automatically recover them.
+- Registering checkpoints, which enable pausing/resuming of runs.
+
+## Glossary
+
+- **Platform**: The main Trigger.dev API, dashboard, database. The Run Engine is part of the platform.
+- **Worker group**: A group of workers that all pull from the same queue, e.g. "us-east-1", "my-self-hosted-workers".
+ - **Worker**: A worker is a 'server' that connects to the platform and receives runs.
+ - **Supervisor**: Pulls new runs from the queue, communicates with the platform, spins up new Deploy executors.
+ - **Deploy container**: Container that comes from a specific deploy from a user's project.
+ - **Run controller**: The code that manages running the task.
+ - **Run executor**: The actual task running.
+
+## Run locking
+
+Many operations on the run are "atomic" in the sense that only a single operation can mutate them at a time. We use RedLock to create a distributed lock to ensure this. Postgres locking is not enough on its own because we have multiple API instances and Redis is used for the queue.
+
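+A minimal sketch of what this looks like with the `redlock` package (a dependency of this package); the lock key format, TTL, and retry settings below are illustrative assumptions rather than the engine's actual values:
+
+```ts
+import Redis from "ioredis";
+import Redlock from "redlock";
+
+const redis = new Redis(process.env.REDIS_URL!);
+const redlock = new Redlock([redis], { retryCount: 10, retryDelay: 200 });
+
+// Hypothetical helper: hold a distributed lock on the run while mutating it.
+async function withRunLock<T>(runId: string, fn: () => Promise<T>): Promise<T> {
+  // The 5s TTL is an assumption; redlock automatically extends the lock while `fn` runs.
+  return redlock.using([`lock:run:${runId}`], 5_000, async () => fn());
+}
+```
+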
+There are race conditions we need to deal with:
+- When checkpointing, the run continues to execute until the checkpoint has been stored. In that window the waitpoint may complete, which makes the checkpoint irrelevant. Because both can happen concurrently, we must lock the run and protect against outdated checkpoints.
+
+## Run execution
+
+The execution state of a run is stored in the `TaskRunExecutionSnapshot` table in Postgres. This is separate from the `TaskRun` status which is exposed to users via the dashboard and API.
+
+
+
+The `TaskRunExecutionSnapshot` `executionStatus` is used to determine the execution status and is internal to the run engine. It is a log of events that impact run execution – the data is used to execute the run.
+
+A common pattern we use is to read the current state and check that the passed in `snapshotId` matches the current `snapshotId`. If it doesn't, we know that the state has moved on. In the case of a checkpoint coming in, we know we can just ignore it.
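+
+As a hedged sketch (the handler below is hypothetical, but `getLatestExecutionSnapshot` is the helper this package uses to read the latest valid snapshot, and the import path is assumed):
+
+```ts
+import { PrismaClientOrTransaction } from "@trigger.dev/database";
+import { getLatestExecutionSnapshot } from "./executionSnapshots"; // path assumed
+
+// Sketch: only act on an incoming checkpoint if it still refers to the latest snapshot.
+async function handleCheckpoint(
+  prisma: PrismaClientOrTransaction,
+  runId: string,
+  snapshotId: string
+) {
+  const latest = await getLatestExecutionSnapshot(prisma, runId);
+  if (latest.id !== snapshotId) {
+    // The run has moved on (e.g. the waitpoint completed), so the checkpoint is outdated.
+    return { discarded: true, reason: "SNAPSHOT_OUTDATED" } as const;
+  }
+  // ...otherwise store the checkpoint and write a new snapshot.
+  return { discarded: false } as const;
+}
+```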
+
+We can also store invalid states by setting an error. These invalid states are purely used for debugging and are ignored for execution purposes.
+
+## Workers
+
+A worker is a server that runs tasks. There are two types of workers:
+- Hosted workers (serverless, managed and cloud-only)
+- Self-hosted workers
+
+In the dashboard, on the "Workers" page, you can see all worker groups, including the "main" group, which is the default and is not self-hosted. You can also see alternative worker groups that are available to you, such as "EU", "v3.2 (beta)", and any self-hosted worker groups you have created.
+
+You add a new self-hosted worker group by clicking "Add" and choosing an `id` that is unique to your project.
+
+Then when triggering runs, you can specify the `workerGroup` to use. It defaults to "main". The workerGroup is used internally to set the `masterQueue` that a run is placed in, which allows pulling runs only for that worker group.
+
+On the "Workers" page, you can see the status of each worker group, including the number of workers in the group, the number of runs that are queued.
+
+## Pulling from the queue
+
+A worker will call the Trigger.dev API with its `workerGroup`.
+
+For warm starts and for self-hosted workers, we will also pass the `BackgroundWorker` id and `environment` id. This allows pulling relevant runs.
+
+For dev environments, we will pass the `environment` id.
+
+If there's only a `workerGroup`, we can just `dequeueFromMasterQueue()` to get runs. If there's a `BackgroundWorker` id, we need to determine if that `BackgroundWorker` is the latest. If it's the latest we call `dequeueFromEnvironmentMasterQueue()` to get any runs that aren't locked to a version. If it's not the latest, we call `dequeueFromBackgroundWorkerMasterQueue()` to get runs that are locked to that version.
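+
+In pseudocode the routing looks roughly like this (a sketch: only the three dequeue methods are taken from the description above; `engine`, `isLatestBackgroundWorker`, and the argument shapes are assumptions):
+
+```ts
+// Sketch: route a worker's dequeue request to the right engine method.
+declare const engine: {
+  dequeueFromMasterQueue(args: { masterQueue: string }): Promise<unknown>;
+  dequeueFromEnvironmentMasterQueue(args: { backgroundWorkerId: string }): Promise<unknown>;
+  dequeueFromBackgroundWorkerMasterQueue(args: { backgroundWorkerId: string }): Promise<unknown>;
+};
+declare function isLatestBackgroundWorker(id: string): Promise<boolean>;
+
+async function dequeueForWorker(masterQueue: string, backgroundWorkerId?: string) {
+  if (!backgroundWorkerId) {
+    // Only a workerGroup was supplied: pull anything from that group's master queue.
+    return engine.dequeueFromMasterQueue({ masterQueue });
+  }
+  // A specific BackgroundWorker: the latest version can take runs not locked to a version,
+  // older versions only take runs locked to them.
+  return (await isLatestBackgroundWorker(backgroundWorkerId))
+    ? engine.dequeueFromEnvironmentMasterQueue({ backgroundWorkerId })
+    : engine.dequeueFromBackgroundWorkerMasterQueue({ backgroundWorkerId });
+}
+```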
+
+## Run Queue
+
+This is a fair multi-tenant queue. It is designed to fairly select runs, respect concurrency limits, and have high throughput. It provides visibility into the current concurrency for the env, org, etc.
+
+It has built-in reliability features:
+- When nacking we increment the `attempt` and if it continually fails we will move it to a Dead Letter Queue (DLQ).
+- If a run is in the DLQ you can redrive it.
+
+## Heartbeats
+
+Heartbeats are used to determine if a run has become stalled. Depending on the current execution status, we do different things. For example, if the run has been dequeued but the attempt hasn't been started, we requeue it.
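+
+A sketch of that decision (only the requeue case comes from the description above; the helper and the other actions are assumptions, while the statuses are the real `TaskRunExecutionStatus` values):
+
+```ts
+import { TaskRunExecutionStatus } from "@trigger.dev/database";
+
+// Sketch: what to do when a snapshot's heartbeat deadline passes.
+function onHeartbeatTimeout(status: TaskRunExecutionStatus) {
+  switch (status) {
+    case "PENDING_EXECUTING":
+      // Dequeued but the attempt never started: put the run back in the queue.
+      return "REQUEUE";
+    case "EXECUTING":
+    case "EXECUTING_WITH_WAITPOINTS":
+      // Assumed: treat the attempt as stalled and let the retry logic decide what happens next.
+      return "FAIL_ATTEMPT";
+    default:
+      return "IGNORE";
+  }
+}
+```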
+
+## Checkpoints
+
+Checkpoints allow pausing an executing run and then resuming it later. This is an optimization to avoid wasted compute and is especially useful with "Waitpoints".
+
+## Waitpoints
+
+A "Waitpoint" is something that can block a run from continuing:
+
+A single Waitpoint can block many runs, but the same waitpoint can only block a given run once (there's a unique constraint). Waitpoints block run execution from continuing until all of them are completed.
+
+They can have output data associated with them, e.g. the finished run's payload. That output can also be an error, e.g. from a failed run.
+
+There are currently three types:
+ - `RUN` which gets completed when the associated run completes. Every run has an `associatedWaitpoint` that matches the lifetime of the run.
+ - `DATETIME` which gets completed when the datetime is reached.
+ - `MANUAL` which gets completed when it is explicitly completed, e.g. via `completeWaitpoint()`.
+
+Waitpoints can have an idempotencyKey, which stops them from being created multiple times. This is especially useful for event waitpoints, where you don't want to create a new waitpoint for the same event twice.
+
+### `wait.for()` or `wait.until()`
+Wait for a future time, then continue. We should add the option to pass an `idempotencyKey` so a second attempt doesn't wait again. By default it would wait again.
+
+```ts
+//Note if the idempotency key is a string, it will get prefixed with the run id.
+//you can explicitly pass in an idempotency key created with the global scope.
+await wait.until(new Date('2022-01-01T00:00:00Z'), { idempotencyKey: "first-wait" });
+await wait.until(new Date('2022-01-01T00:00:00Z'), { idempotencyKey: "second-wait" });
+```
+
+### `triggerAndWait()` or `batchTriggerAndWait()`
+Trigger and then wait for the run(s) to finish. If a run fails, the parent will still continue, but it receives the errors so the developer can decide what to do.
+
+### The `trigger` `delay` option
+
+When triggering a run and passing the `delay` option, we use a `DATETIME` waitpoint to block the run from starting.
+
+### `wait.forRequest()`
+Wait until a request has been received at the URL that you are given. This is useful for pausing a run and then continuing it again when some external event occurs on another service. For example, Replicate has an API that will call back when their work is complete.
+
+### `wait.forWaitpoint(waitpointId)`
+
+A more advanced SDK function which would require users to explicitly create a waitpoint. We would also need `createWaitpoint()`, `completeWaitpoint()`, and `failWaitpoint()`.
+
+```ts
+const waitpoint = await waitpoints.create({ idempotencyKey: `purchase-${payload.cart.id}` });
+const retrieved = await waitpoints.retrieve(waitpoint.id);
+const completed = await waitpoints.complete(waitpoint.id, result);
+const failed = await waitpoints.fail(waitpoint.id, error);
+
+export const approvalFlow = task({
+ id: "approvalFlow",
+ run: async (payload) => {
+ //...do stuff
+
+ const result = await wait.forWaitpoint(waitpoint.id, { timeout: "1h" });
+ if (!result.ok) {
+ //...timeout
+ }
+
+ //...do more stuff
+ },
+});
+```
+
+### `wait.forRunToComplete(runId)`
+
+You could wait for another run (or runs) using their run ids. This would allow you to wait for runs that you haven't triggered inside that run.
+
+## Run flow control
+
+There are several ways to control when a run will execute (or not). Each of these should be configurable on a task, on a named queue that is shared between tasks, and at trigger time, including the ability to pass a `key` so you can have per-tenant controls.
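+
+Purely as a hypothetical sketch of that configuration surface (none of these option names are a committed API; they only illustrate the three levels):
+
+```ts
+// Hypothetical option shapes only – not the real SDK API.
+type FlowControl = {
+  concurrencyLimit?: number;
+  rateLimit?: { limit: number; period: string }; // e.g. { limit: 100, period: "1m" }
+  debounce?: { period: string };
+  throttle?: { limit: number; period: string };
+  batch?: { maxSize: number; maxWait: string };
+};
+
+// The same controls could be attached at three levels:
+const onTask: FlowControl = { concurrencyLimit: 10 };                    // per task
+const onQueue: FlowControl = { throttle: { limit: 50, period: "1m" } };  // per shared named queue
+const atTriggerTime = { key: "tenant_123", rateLimit: { limit: 5, period: "1h" } }; // per-tenant via a key
+```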
+
+### Concurrency limits
+
+When `trigger` is called the run is added to the queue. We only dequeue when the concurrency limit hasn't been exceeded for that task/queue.
+
+### Rate limiting
+
+When `trigger` is called, we check if the rate limit has been exceeded. If it has then we ignore the trigger. The run is thrown away and an appropriate error is returned.
+
+This is useful:
+- To prevent abuse.
+- To control how many executions a user can do (using a `key` with rate limiting).
+
+### Debouncing
+
+When `trigger` is called, we prevent too many runs happening in a period by collapsing them into a single run. This is done by discarding some runs in that period.
+
+This is useful:
+- To prevent too many runs happening in a short period.
+
+We should mark the run as `"DELAYED"` with the correct `delayUntil` time. This will allow the user to see that the run is delayed and why.
+
+### Throttling
+
+When `trigger` is called the run is added to the queue. We only run them when they don't exceed the limit in that time period, by controlling the timing of when they are dequeued.
+
+This is useful:
+- To prevent too many runs happening in a short period.
+- To control how many executions a user can do (using a `key` with throttling).
+- When you need to execute every run but not too many in a short period, e.g. avoiding rate limits.
+
+### Batching
+
+When `trigger` is called, we batch the runs together. This means the payload of the run is an array of items, each being a single payload.
+
+This is useful:
+- For performance, as it reduces the number of runs in the system.
+- It can be useful when using 3rd party APIs that support batching.
+
+## Emitting events
+
+The Run Engine emits events using its `eventBus`. This is used for runs completing, failing, and anything else that workers should be aware of.
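+
+For example, a consumer of the engine could subscribe like this (a sketch: the import path and the `engine` variable are assumptions, but the event names and payload fields come from `eventBus.ts`):
+
+```ts
+import type { EventBusEvents } from "./src/engine/eventBus"; // path assumed
+
+// Assume the engine exposes its bus with an EventEmitter-style `on`.
+declare const engine: {
+  eventBus: {
+    on<K extends keyof EventBusEvents>(
+      event: K,
+      listener: (...args: EventBusEvents[K]) => void
+    ): void;
+  };
+};
+
+engine.eventBus.on("runSucceeded", ({ time, run }) => {
+  console.log(`[${time.toISOString()}] run ${run.id} succeeded (${run.outputType})`);
+});
+
+engine.eventBus.on("runFailed", ({ run }) => {
+  console.error(`run ${run.id} failed: ${run.error.type}`);
+});
+```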
diff --git a/internal-packages/run-engine/execution-states.png b/internal-packages/run-engine/execution-states.png
new file mode 100644
index 0000000000..cc156dd7de
Binary files /dev/null and b/internal-packages/run-engine/execution-states.png differ
diff --git a/internal-packages/run-engine/package.json b/internal-packages/run-engine/package.json
new file mode 100644
index 0000000000..b0a2dc9eb6
--- /dev/null
+++ b/internal-packages/run-engine/package.json
@@ -0,0 +1,27 @@
+{
+ "name": "@internal/run-engine",
+ "private": true,
+ "version": "0.0.1",
+ "main": "./src/index.ts",
+ "types": "./src/index.ts",
+ "dependencies": {
+ "@internal/redis-worker": "workspace:*",
+ "@opentelemetry/api": "^1.9.0",
+ "@opentelemetry/semantic-conventions": "^1.27.0",
+ "@trigger.dev/core": "workspace:*",
+ "@trigger.dev/database": "workspace:*",
+ "assert-never": "^1.2.1",
+ "ioredis": "^5.3.2",
+ "nanoid": "^3.3.4",
+ "redlock": "5.0.0-beta.2",
+ "zod": "3.23.8"
+ },
+ "devDependencies": {
+ "@internal/testcontainers": "workspace:*",
+ "vitest": "^1.4.0"
+ },
+ "scripts": {
+ "typecheck": "tsc --noEmit",
+ "test": "vitest --sequence.concurrent=false"
+ }
+}
diff --git a/internal-packages/run-engine/src/engine/consts.ts b/internal-packages/run-engine/src/engine/consts.ts
new file mode 100644
index 0000000000..6ea6f54c38
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/consts.ts
@@ -0,0 +1 @@
+export const MAX_TASK_RUN_ATTEMPTS = 250;
diff --git a/internal-packages/run-engine/src/engine/db/worker.ts b/internal-packages/run-engine/src/engine/db/worker.ts
new file mode 100644
index 0000000000..8cccc9a4fa
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/db/worker.ts
@@ -0,0 +1,277 @@
+import {
+ BackgroundWorker,
+ BackgroundWorkerTask,
+ Prisma,
+ PrismaClientOrTransaction,
+ WorkerDeployment,
+} from "@trigger.dev/database";
+import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/apps";
+
+type RunWithMinimalEnvironment = Prisma.TaskRunGetPayload<{
+ include: {
+ runtimeEnvironment: {
+ select: {
+ id: true;
+ type: true;
+ };
+ };
+ };
+}>;
+
+type RunWithBackgroundWorkerTasksResult =
+ | {
+ success: false;
+ code: "NO_RUN";
+ message: string;
+ }
+ | {
+ success: false;
+ code:
+ | "NO_WORKER"
+ | "TASK_NOT_IN_LATEST"
+ | "TASK_NEVER_REGISTERED"
+ | "BACKGROUND_WORKER_MISMATCH";
+ message: string;
+      run: RunWithMinimalEnvironment;
+ }
+ | {
+ success: false;
+ code: "BACKGROUND_WORKER_MISMATCH";
+ message: string;
+ backgroundWorker: {
+ expected: string;
+ received: string;
+ };
+      run: RunWithMinimalEnvironment;
+ }
+ | {
+ success: true;
+      run: RunWithMinimalEnvironment;
+ worker: BackgroundWorker;
+ task: BackgroundWorkerTask;
+ deployment: WorkerDeployment | null;
+ };
+
+export async function getRunWithBackgroundWorkerTasks(
+ prisma: PrismaClientOrTransaction,
+ runId: string,
+ backgroundWorkerId?: string
+): Promise<RunWithBackgroundWorkerTasksResult> {
+ const run = await prisma.taskRun.findFirst({
+ where: {
+ id: runId,
+ },
+ include: {
+ runtimeEnvironment: {
+ select: {
+ id: true,
+ type: true,
+ },
+ },
+ lockedToVersion: {
+ include: {
+ deployment: true,
+ tasks: true,
+ },
+ },
+ },
+ });
+
+ if (!run) {
+ return {
+ success: false as const,
+ code: "NO_RUN",
+ message: `No run found with id: ${runId}`,
+ };
+ }
+
+ const workerId = run.lockedToVersionId ?? backgroundWorkerId;
+
+ //get the relevant BackgroundWorker with tasks and deployment (if not DEV)
+ let workerWithTasks: WorkerDeploymentWithWorkerTasks | null = null;
+
+ if (run.runtimeEnvironment.type === "DEVELOPMENT") {
+ workerWithTasks = workerId
+ ? await getWorkerById(prisma, workerId)
+ : await getMostRecentWorker(prisma, run.runtimeEnvironmentId);
+ } else {
+ workerWithTasks = workerId
+ ? await getWorkerDeploymentFromWorker(prisma, workerId)
+ : await getWorkerFromCurrentlyPromotedDeployment(prisma, run.runtimeEnvironmentId);
+ }
+
+ if (!workerWithTasks) {
+ return {
+ success: false as const,
+ code: "NO_WORKER",
+ message: `No worker found for run: ${run.id}`,
+ run,
+ };
+ }
+
+ if (backgroundWorkerId) {
+ if (backgroundWorkerId !== workerWithTasks.worker.id) {
+ return {
+ success: false as const,
+ code: "BACKGROUND_WORKER_MISMATCH",
+ message: `Background worker mismatch for run: ${run.id}`,
+ backgroundWorker: {
+ expected: backgroundWorkerId,
+ received: workerWithTasks.worker.id,
+ },
+ run,
+ };
+ }
+ }
+
+ const backgroundTask = workerWithTasks.tasks.find((task) => task.slug === run.taskIdentifier);
+
+ if (!backgroundTask) {
+ const nonCurrentTask = await prisma.backgroundWorkerTask.findFirst({
+ where: {
+ slug: run.taskIdentifier,
+ projectId: run.projectId,
+ runtimeEnvironmentId: run.runtimeEnvironmentId,
+ },
+ include: {
+ worker: true,
+ },
+ orderBy: {
+ createdAt: "desc",
+ },
+ });
+
+ if (nonCurrentTask) {
+ return {
+ success: false as const,
+ code: "TASK_NOT_IN_LATEST",
+ message: `Task not found in latest version: ${run.taskIdentifier}. Found in ${nonCurrentTask.worker.version}`,
+ run,
+ };
+ } else {
+ return {
+ success: false as const,
+ code: "TASK_NEVER_REGISTERED",
+ message: `Task has never been registered (in dev or deployed): ${run.taskIdentifier}`,
+ run,
+ };
+ }
+ }
+
+ return {
+ success: true as const,
+ run,
+ worker: workerWithTasks.worker,
+ task: backgroundTask,
+ deployment: workerWithTasks.deployment,
+ };
+}
+
+type WorkerDeploymentWithWorkerTasks = {
+ worker: BackgroundWorker;
+ tasks: BackgroundWorkerTask[];
+ deployment: WorkerDeployment | null;
+};
+
+export async function getWorkerDeploymentFromWorker(
+ prisma: PrismaClientOrTransaction,
+ workerId: string
+): Promise<WorkerDeploymentWithWorkerTasks | null> {
+ const worker = await prisma.backgroundWorker.findUnique({
+ where: {
+ id: workerId,
+ },
+ include: {
+ deployment: true,
+ tasks: true,
+ },
+ });
+
+ if (!worker) {
+ return null;
+ }
+
+ return { worker, tasks: worker.tasks, deployment: worker.deployment };
+}
+
+export async function getMostRecentWorker(
+ prisma: PrismaClientOrTransaction,
+ environmentId: string
+): Promise<WorkerDeploymentWithWorkerTasks | null> {
+ const worker = await prisma.backgroundWorker.findFirst({
+ where: {
+ runtimeEnvironmentId: environmentId,
+ },
+ include: {
+ tasks: true,
+ },
+ orderBy: {
+ id: "desc",
+ },
+ });
+
+ if (!worker) {
+ return null;
+ }
+
+ return { worker, tasks: worker.tasks, deployment: null };
+}
+
+export async function getWorkerById(
+ prisma: PrismaClientOrTransaction,
+ workerId: string
+): Promise<WorkerDeploymentWithWorkerTasks | null> {
+ const worker = await prisma.backgroundWorker.findFirst({
+ where: {
+ id: workerId,
+ },
+ include: {
+ deployment: true,
+ tasks: true,
+ },
+ orderBy: {
+ id: "desc",
+ },
+ });
+
+ if (!worker) {
+ return null;
+ }
+
+ return { worker, tasks: worker.tasks, deployment: worker.deployment };
+}
+
+export async function getWorkerFromCurrentlyPromotedDeployment(
+ prisma: PrismaClientOrTransaction,
+ environmentId: string
+): Promise<WorkerDeploymentWithWorkerTasks | null> {
+ const promotion = await prisma.workerDeploymentPromotion.findUnique({
+ where: {
+ environmentId_label: {
+ environmentId,
+ label: CURRENT_DEPLOYMENT_LABEL,
+ },
+ },
+ include: {
+ deployment: {
+ include: {
+ worker: {
+ include: {
+ tasks: true,
+ },
+ },
+ },
+ },
+ },
+ });
+
+ if (!promotion || !promotion.deployment.worker) {
+ return null;
+ }
+
+ return {
+ worker: promotion.deployment.worker,
+ tasks: promotion.deployment.worker.tasks,
+ deployment: promotion.deployment,
+ };
+}
diff --git a/internal-packages/run-engine/src/engine/errors.ts b/internal-packages/run-engine/src/engine/errors.ts
new file mode 100644
index 0000000000..33d9be6961
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/errors.ts
@@ -0,0 +1,58 @@
+import { assertExhaustive } from "@trigger.dev/core";
+import { TaskRunError } from "@trigger.dev/core/v3";
+import { TaskRunStatus } from "@trigger.dev/database";
+
+export function runStatusFromError(error: TaskRunError): TaskRunStatus {
+ if (error.type !== "INTERNAL_ERROR") {
+ return "COMPLETED_WITH_ERRORS";
+ }
+
+ //"CRASHED" should be used if it's a user-error or something they've misconfigured
+ //e.g. not enough memory
+ //"SYSTEM_FAILURE" should be used if it's an error from our system
+ //e.g. a bug
+ switch (error.code) {
+ case "RECURSIVE_WAIT_DEADLOCK":
+ return "COMPLETED_WITH_ERRORS";
+ case "TASK_RUN_CANCELLED":
+ return "CANCELED";
+ case "MAX_DURATION_EXCEEDED":
+ return "TIMED_OUT";
+ case "TASK_PROCESS_OOM_KILLED":
+ case "TASK_PROCESS_MAYBE_OOM_KILLED":
+ case "TASK_PROCESS_SIGSEGV":
+ case "DISK_SPACE_EXCEEDED":
+ case "OUTDATED_SDK_VERSION":
+ case "HANDLE_ERROR_ERROR":
+ case "TASK_RUN_CRASHED":
+ case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE":
+ return "CRASHED";
+ case "COULD_NOT_FIND_EXECUTOR":
+ case "COULD_NOT_FIND_TASK":
+ case "COULD_NOT_IMPORT_TASK":
+ case "CONFIGURED_INCORRECTLY":
+ case "TASK_ALREADY_RUNNING":
+ case "TASK_PROCESS_SIGKILL_TIMEOUT":
+ case "TASK_RUN_HEARTBEAT_TIMEOUT":
+ case "TASK_DEQUEUED_INVALID_STATE":
+ case "TASK_DEQUEUED_QUEUE_NOT_FOUND":
+ case "TASK_RUN_DEQUEUED_MAX_RETRIES":
+ case "TASK_RUN_STALLED_EXECUTING":
+ case "TASK_RUN_STALLED_EXECUTING_WITH_WAITPOINTS":
+ case "TASK_HAS_N0_EXECUTION_SNAPSHOT":
+ case "GRACEFUL_EXIT_TIMEOUT":
+ case "TASK_INPUT_ERROR":
+ case "TASK_OUTPUT_ERROR":
+ case "POD_EVICTED":
+ case "POD_UNKNOWN_ERROR":
+ case "TASK_EXECUTION_ABORTED":
+ case "TASK_EXECUTION_FAILED":
+ case "TASK_PROCESS_SIGTERM":
+ case "TASK_DEQUEUED_INVALID_RETRY_CONFIG":
+ case "TASK_DEQUEUED_NO_RETRY_CONFIG":
+ case "TASK_DID_CONCURRENT_WAIT":
+ return "SYSTEM_FAILURE";
+ default:
+ assertExhaustive(error.code);
+ }
+}
diff --git a/internal-packages/run-engine/src/engine/eventBus.ts b/internal-packages/run-engine/src/engine/eventBus.ts
new file mode 100644
index 0000000000..871115ed6a
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/eventBus.ts
@@ -0,0 +1,180 @@
+import { TaskRunExecutionStatus, TaskRunStatus } from "@trigger.dev/database";
+import { AuthenticatedEnvironment } from "../shared";
+import { FlushedRunMetadata, TaskRunError } from "@trigger.dev/core/v3";
+
+export type EventBusEvents = {
+ runAttemptStarted: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ attemptNumber: number;
+ baseCostInCents: number;
+ };
+ organization: {
+ id: string;
+ };
+ },
+ ];
+ runAttemptFailed: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ status: TaskRunStatus;
+ spanId: string;
+ error: TaskRunError;
+ attemptNumber: number;
+ taskEventStore: string;
+ createdAt: Date;
+ completedAt: Date | null;
+ };
+ },
+ ];
+ runExpired: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ spanId: string;
+ ttl: string | null;
+ taskEventStore: string;
+ createdAt: Date;
+ completedAt: Date | null;
+ };
+ },
+ ];
+ runSucceeded: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ spanId: string;
+ output: string | undefined;
+ outputType: string;
+ taskEventStore: string;
+ createdAt: Date;
+ completedAt: Date | null;
+ };
+ },
+ ];
+ runFailed: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ status: TaskRunStatus;
+ spanId: string;
+ error: TaskRunError;
+ taskEventStore: string;
+ createdAt: Date;
+ completedAt: Date | null;
+ };
+ },
+ ];
+ runRetryScheduled: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ friendlyId: string;
+ spanId: string;
+ attemptNumber: number;
+ queue: string;
+        traceContext: Record<string, unknown>;
+ taskIdentifier: string;
+ baseCostInCents: number;
+ };
+ organization: {
+ id: string;
+ };
+ environment: AuthenticatedEnvironment;
+ retryAt: Date;
+ },
+ ];
+ runCancelled: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ friendlyId: string;
+ spanId: string;
+ error: TaskRunError;
+ taskEventStore: string;
+ createdAt: Date;
+ completedAt: Date | null;
+ };
+ },
+ ];
+ cachedRunCompleted: [
+ {
+ time: Date;
+ span: {
+ id: string;
+ createdAt: Date;
+ };
+ hasError: boolean;
+ blockedRunId: string;
+ },
+ ];
+ runMetadataUpdated: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ metadata: FlushedRunMetadata;
+ };
+ },
+ ];
+ workerNotification: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ };
+ snapshot: {
+ id: string;
+ executionStatus: TaskRunExecutionStatus;
+ };
+ },
+ ];
+ executionSnapshotCreated: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ };
+ snapshot: {
+ id: string;
+ executionStatus: TaskRunExecutionStatus;
+ description: string;
+ runStatus: string;
+ attemptNumber: number | null;
+ checkpointId: string | null;
+ workerId: string | null;
+ runnerId: string | null;
+ completedWaitpointIds: string[];
+ isValid: boolean;
+ error: string | null;
+ };
+ },
+ ];
+ incomingCheckpointDiscarded: [
+ {
+ time: Date;
+ run: {
+ id: string;
+ };
+ snapshot: {
+ id: string;
+ executionStatus: TaskRunExecutionStatus;
+ };
+ checkpoint: {
+ metadata: Record<string, unknown>;
+ discardReason: string;
+ };
+ },
+ ];
+};
+
+export type EventBusEventArgs<T extends keyof EventBusEvents> = EventBusEvents[T];
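+
+// Illustrative sketch (not part of the original file): attaching a listener with arguments
+// typed via EventBusEventArgs. `engine` is an assumed RunEngine instance (its eventBus is a
+// plain EventEmitter).
+//
+//   engine.eventBus.on("runSucceeded", (...args: EventBusEventArgs<"runSucceeded">) => {
+//     const [event] = args;
+//     console.log(`Run ${event.run.id} succeeded at ${event.time.toISOString()}`);
+//   });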
diff --git a/internal-packages/run-engine/src/engine/executionSnapshots.ts b/internal-packages/run-engine/src/engine/executionSnapshots.ts
new file mode 100644
index 0000000000..5daca4f419
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/executionSnapshots.ts
@@ -0,0 +1,131 @@
+import { CompletedWaitpoint, ExecutionResult } from "@trigger.dev/core/v3";
+import { BatchId, RunId, SnapshotId } from "@trigger.dev/core/v3/apps";
+import {
+ PrismaClientOrTransaction,
+ TaskRunCheckpoint,
+ TaskRunExecutionSnapshot,
+} from "@trigger.dev/database";
+
+interface LatestExecutionSnapshot extends TaskRunExecutionSnapshot {
+ friendlyId: string;
+ runFriendlyId: string;
+ checkpoint: TaskRunCheckpoint | null;
+ completedWaitpoints: CompletedWaitpoint[];
+}
+
+/* Gets the most recent valid snapshot for a run */
+export async function getLatestExecutionSnapshot(
+ prisma: PrismaClientOrTransaction,
+ runId: string
+): Promise<LatestExecutionSnapshot> {
+ const snapshot = await prisma.taskRunExecutionSnapshot.findFirst({
+ where: { runId, isValid: true },
+ include: {
+ completedWaitpoints: true,
+ checkpoint: true,
+ },
+ orderBy: { createdAt: "desc" },
+ });
+
+ if (!snapshot) {
+ throw new Error(`No execution snapshot found for TaskRun ${runId}`);
+ }
+
+ return {
+ ...snapshot,
+ friendlyId: SnapshotId.toFriendlyId(snapshot.id),
+ runFriendlyId: RunId.toFriendlyId(snapshot.runId),
+ completedWaitpoints: snapshot.completedWaitpoints.flatMap((w) => {
+ //get all indexes of the waitpoint in the completedWaitpointOrder
+ //we do this because the same run can be in a batch multiple times (i.e. same idempotencyKey)
+ let indexes: (number | undefined)[] = [];
+ for (let i = 0; i < snapshot.completedWaitpointOrder.length; i++) {
+ if (snapshot.completedWaitpointOrder[i] === w.id) {
+ indexes.push(i);
+ }
+ }
+
+ if (indexes.length === 0) {
+ indexes.push(undefined);
+ }
+
+ return indexes.map((index) => {
+ return {
+ id: w.id,
+ index: index === -1 ? undefined : index,
+ friendlyId: w.friendlyId,
+ type: w.type,
+ completedAt: w.completedAt ?? new Date(),
+ idempotencyKey:
+ w.userProvidedIdempotencyKey && !w.inactiveIdempotencyKey
+ ? w.idempotencyKey
+ : undefined,
+ completedByTaskRun: w.completedByTaskRunId
+ ? {
+ id: w.completedByTaskRunId,
+ friendlyId: RunId.toFriendlyId(w.completedByTaskRunId),
+ batch: snapshot.batchId
+ ? {
+ id: snapshot.batchId,
+ friendlyId: BatchId.toFriendlyId(snapshot.batchId),
+ }
+ : undefined,
+ }
+ : undefined,
+ completedAfter: w.completedAfter ?? undefined,
+ completedByBatch: w.completedByBatchId
+ ? {
+ id: w.completedByBatchId,
+ friendlyId: BatchId.toFriendlyId(w.completedByBatchId),
+ }
+ : undefined,
+ output: w.output ?? undefined,
+ outputType: w.outputType,
+ outputIsError: w.outputIsError,
+ } satisfies CompletedWaitpoint;
+ });
+ }),
+ };
+}
+
+export async function getExecutionSnapshotCompletedWaitpoints(
+ prisma: PrismaClientOrTransaction,
+ snapshotId: string
+) {
+ const waitpoints = await prisma.taskRunExecutionSnapshot.findFirst({
+ where: { id: snapshotId },
+ include: {
+ completedWaitpoints: true,
+ },
+ });
+
+ //deduplicate waitpoints
+ const waitpointIds = new Set<string>();
+ return (
+ waitpoints?.completedWaitpoints.filter((waitpoint) => {
+ if (waitpointIds.has(waitpoint.id)) {
+ return false;
+ } else {
+ waitpointIds.add(waitpoint.id);
+ return true;
+ }
+ }) ?? []
+ );
+}
+
+export function executionResultFromSnapshot(snapshot: TaskRunExecutionSnapshot): ExecutionResult {
+ return {
+ snapshot: {
+ id: snapshot.id,
+ friendlyId: SnapshotId.toFriendlyId(snapshot.id),
+ executionStatus: snapshot.executionStatus,
+ description: snapshot.description,
+ },
+ run: {
+ id: snapshot.runId,
+ friendlyId: RunId.toFriendlyId(snapshot.runId),
+ status: snapshot.runStatus,
+ attemptNumber: snapshot.attemptNumber,
+ },
+ };
+}
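+
+// Illustrative sketch (not part of the original file): combining the helpers above to report
+// the current execution state of a run. `prisma` and `runId` are assumed inputs.
+//
+//   const latest = await getLatestExecutionSnapshot(prisma, runId);
+//   const result: ExecutionResult = executionResultFromSnapshot(latest);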
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
new file mode 100644
index 0000000000..1e8ada42a7
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/index.ts
@@ -0,0 +1,4045 @@
+import { Worker } from "@internal/redis-worker";
+import { Attributes, Span, SpanKind, trace, Tracer } from "@opentelemetry/api";
+import { assertExhaustive } from "@trigger.dev/core";
+import { Logger } from "@trigger.dev/core/logger";
+import {
+ CheckpointInput,
+ CompleteRunAttemptResult,
+ CreateCheckpointResult,
+ DequeuedMessage,
+ ExecutionResult,
+ MachineResources,
+ parsePacket,
+ RetryOptions,
+ RunExecutionData,
+ sanitizeError,
+ shouldRetryError,
+ StartRunAttemptResult,
+ TaskRunError,
+ taskRunErrorEnhancer,
+ TaskRunExecution,
+ TaskRunExecutionResult,
+ TaskRunFailedExecutionResult,
+ TaskRunInternalError,
+ TaskRunSuccessfulExecutionResult,
+ timeoutError,
+} from "@trigger.dev/core/v3";
+import {
+ BatchId,
+ CheckpointId,
+ getMaxDuration,
+ parseNaturalLanguageDuration,
+ QueueId,
+ RunId,
+ sanitizeQueueName,
+ SnapshotId,
+ WaitpointId,
+} from "@trigger.dev/core/v3/apps";
+import {
+ $transaction,
+ Prisma,
+ PrismaClient,
+ PrismaClientOrTransaction,
+ RuntimeEnvironmentType,
+ TaskRun,
+ TaskRunExecutionSnapshot,
+ TaskRunExecutionStatus,
+ TaskRunStatus,
+ Waitpoint,
+} from "@trigger.dev/database";
+import assertNever from "assert-never";
+import { Redis } from "ioredis";
+import { nanoid } from "nanoid";
+import { EventEmitter } from "node:events";
+import { z } from "zod";
+import { RunQueue } from "../run-queue";
+import { SimpleWeightedChoiceStrategy } from "../run-queue/simpleWeightedPriorityStrategy";
+import { MinimalAuthenticatedEnvironment } from "../shared";
+import { MAX_TASK_RUN_ATTEMPTS } from "./consts";
+import { getRunWithBackgroundWorkerTasks } from "./db/worker";
+import { runStatusFromError } from "./errors";
+import { EventBusEvents } from "./eventBus";
+import { executionResultFromSnapshot, getLatestExecutionSnapshot } from "./executionSnapshots";
+import { RunLocker } from "./locking";
+import { getMachinePreset } from "./machinePresets";
+import {
+ isCheckpointable,
+ isDequeueableExecutionStatus,
+ isExecuting,
+ isFinalRunStatus,
+ isPendingExecuting,
+} from "./statuses";
+import { HeartbeatTimeouts, RunEngineOptions, TriggerParams } from "./types";
+
+const workerCatalog = {
+ finishWaitpoint: {
+ schema: z.object({
+ waitpointId: z.string(),
+ error: z.string().optional(),
+ }),
+ visibilityTimeoutMs: 5000,
+ },
+ heartbeatSnapshot: {
+ schema: z.object({
+ runId: z.string(),
+ snapshotId: z.string(),
+ }),
+ visibilityTimeoutMs: 5000,
+ },
+ expireRun: {
+ schema: z.object({
+ runId: z.string(),
+ }),
+ visibilityTimeoutMs: 5000,
+ },
+ cancelRun: {
+ schema: z.object({
+ runId: z.string(),
+ completedAt: z.coerce.date(),
+ reason: z.string().optional(),
+ }),
+ visibilityTimeoutMs: 5000,
+ },
+ queueRunsWaitingForWorker: {
+ schema: z.object({
+ backgroundWorkerId: z.string(),
+ }),
+ visibilityTimeoutMs: 5000,
+ },
+ tryCompleteBatch: {
+ schema: z.object({
+ batchId: z.string(),
+ }),
+ visibilityTimeoutMs: 10_000,
+ },
+ continueRunIfUnblocked: {
+ schema: z.object({
+ runId: z.string(),
+ }),
+ visibilityTimeoutMs: 10_000,
+ },
+};
+
+type EngineWorker = Worker<typeof workerCatalog>;
+
+export class RunEngine {
+ private runLockRedis: Redis;
+ private prisma: PrismaClient;
+ private runLock: RunLocker;
+ runQueue: RunQueue;
+ private worker: EngineWorker;
+ private logger = new Logger("RunEngine", "debug");
+ private tracer: Tracer;
+ private heartbeatTimeouts: HeartbeatTimeouts;
+ eventBus = new EventEmitter();
+
+ constructor(private readonly options: RunEngineOptions) {
+ this.prisma = options.prisma;
+ this.runLockRedis = new Redis({
+ ...options.runLock.redis,
+ keyPrefix: `${options.runLock.redis.keyPrefix}runlock:`,
+ });
+ this.runLock = new RunLocker({ redis: this.runLockRedis });
+
+ this.runQueue = new RunQueue({
+ name: "rq",
+ tracer: trace.getTracer("rq"),
+ queuePriorityStrategy: new SimpleWeightedChoiceStrategy({ queueSelectionCount: 36 }),
+ envQueuePriorityStrategy: new SimpleWeightedChoiceStrategy({ queueSelectionCount: 12 }),
+ defaultEnvConcurrency: options.queue?.defaultEnvConcurrency ?? 10,
+ logger: new Logger("RunQueue", "debug"),
+ redis: { ...options.queue.redis, keyPrefix: `${options.queue.redis.keyPrefix}runqueue:` },
+ retryOptions: options.queue?.retryOptions,
+ });
+
+ this.worker = new Worker({
+ name: "worker",
+ redisOptions: {
+ ...options.worker.redis,
+ keyPrefix: `${options.worker.redis.keyPrefix}worker:`,
+ },
+ catalog: workerCatalog,
+ concurrency: options.worker,
+ pollIntervalMs: options.worker.pollIntervalMs,
+ immediatePollIntervalMs: options.worker.immediatePollIntervalMs,
+ logger: new Logger("RunEngineWorker", "debug"),
+ jobs: {
+ finishWaitpoint: async ({ payload }) => {
+ await this.completeWaitpoint({
+ id: payload.waitpointId,
+ output: payload.error
+ ? {
+ value: payload.error,
+ isError: true,
+ }
+ : undefined,
+ });
+ },
+ heartbeatSnapshot: async ({ payload }) => {
+ await this.#handleStalledSnapshot(payload);
+ },
+ expireRun: async ({ payload }) => {
+ await this.#expireRun({ runId: payload.runId });
+ },
+ cancelRun: async ({ payload }) => {
+ await this.cancelRun({
+ runId: payload.runId,
+ completedAt: payload.completedAt,
+ reason: payload.reason,
+ });
+ },
+ queueRunsWaitingForWorker: async ({ payload }) => {
+ await this.#queueRunsWaitingForWorker({ backgroundWorkerId: payload.backgroundWorkerId });
+ },
+ tryCompleteBatch: async ({ payload }) => {
+ await this.#tryCompleteBatch({ batchId: payload.batchId });
+ },
+ continueRunIfUnblocked: async ({ payload }) => {
+ await this.#continueRunIfUnblocked({
+ runId: payload.runId,
+ });
+ },
+ },
+ }).start();
+
+ this.tracer = options.tracer;
+
+ const defaultHeartbeatTimeouts: HeartbeatTimeouts = {
+ PENDING_EXECUTING: 60_000,
+ PENDING_CANCEL: 60_000,
+ EXECUTING: 60_000,
+ EXECUTING_WITH_WAITPOINTS: 60_000,
+ };
+ this.heartbeatTimeouts = {
+ ...defaultHeartbeatTimeouts,
+ ...(options.heartbeatTimeoutsMs ?? {}),
+ };
+ }
+
+ //MARK: - Run functions
+
+ /** "Triggers" one run. */
+ async trigger(
+ {
+ friendlyId,
+ number,
+ environment,
+ idempotencyKey,
+ idempotencyKeyExpiresAt,
+ taskIdentifier,
+ payload,
+ payloadType,
+ context,
+ traceContext,
+ traceId,
+ spanId,
+ parentSpanId,
+ lockedToVersionId,
+ taskVersion,
+ sdkVersion,
+ cliVersion,
+ concurrencyKey,
+ masterQueue,
+ queueName,
+ queue,
+ isTest,
+ delayUntil,
+ queuedAt,
+ maxAttempts,
+ taskEventStore,
+ priorityMs,
+ ttl,
+ tags,
+ parentTaskRunId,
+ rootTaskRunId,
+ batch,
+ resumeParentOnCompletion,
+ depth,
+ metadata,
+ metadataType,
+ seedMetadata,
+ seedMetadataType,
+ oneTimeUseToken,
+ maxDurationInSeconds,
+ machine,
+ workerId,
+ runnerId,
+ }: TriggerParams,
+ tx?: PrismaClientOrTransaction
+ ): Promise<TaskRun> {
+ const prisma = tx ?? this.prisma;
+
+ return this.#trace(
+ "trigger",
+ {
+ friendlyId,
+ environmentId: environment.id,
+ projectId: environment.project.id,
+ taskIdentifier,
+ },
+ async (span) => {
+ const status = delayUntil ? "DELAYED" : "PENDING";
+
+ let secondaryMasterQueue: string | undefined = undefined;
+
+ if (environment.type === "DEVELOPMENT") {
+ // In dev we use the environment id as the master queue, or the locked worker id
+ masterQueue = this.#environmentMasterQueueKey(environment.id);
+ if (lockedToVersionId) {
+ masterQueue = this.#backgroundWorkerQueueKey(lockedToVersionId);
+ }
+ } else {
+ // For deployed runs, we add the env/worker id as the secondary master queue
+ secondaryMasterQueue = this.#environmentMasterQueueKey(environment.id);
+ if (lockedToVersionId) {
+ secondaryMasterQueue = this.#backgroundWorkerQueueKey(lockedToVersionId);
+ }
+ }
+
+ //create run
+ let taskRun: TaskRun;
+ try {
+ taskRun = await prisma.taskRun.create({
+ data: {
+ id: RunId.fromFriendlyId(friendlyId),
+ engine: "V2",
+ status,
+ number,
+ friendlyId,
+ runtimeEnvironmentId: environment.id,
+ projectId: environment.project.id,
+ idempotencyKey,
+ idempotencyKeyExpiresAt,
+ taskIdentifier,
+ payload,
+ payloadType,
+ context,
+ traceContext,
+ traceId,
+ spanId,
+ parentSpanId,
+ lockedToVersionId,
+ taskVersion,
+ sdkVersion,
+ cliVersion,
+ concurrencyKey,
+ queue: queueName,
+ masterQueue,
+ secondaryMasterQueue,
+ isTest,
+ delayUntil,
+ queuedAt,
+ maxAttempts,
+ taskEventStore,
+ priorityMs,
+ ttl,
+ tags:
+ tags.length === 0
+ ? undefined
+ : {
+ connect: tags,
+ },
+ runTags: tags.length === 0 ? undefined : tags.map((tag) => tag.name),
+ oneTimeUseToken,
+ parentTaskRunId,
+ rootTaskRunId,
+ batchId: batch?.id,
+ resumeParentOnCompletion,
+ depth,
+ metadata,
+ metadataType,
+ seedMetadata,
+ seedMetadataType,
+ maxDurationInSeconds,
+ machinePreset: machine,
+ executionSnapshots: {
+ create: {
+ engine: "V2",
+ executionStatus: "RUN_CREATED",
+ description: "Run was created",
+ runStatus: status,
+ environmentId: environment.id,
+ environmentType: environment.type,
+ workerId,
+ runnerId,
+ },
+ },
+ },
+ });
+ } catch (error) {
+ if (error instanceof Prisma.PrismaClientKnownRequestError) {
+ this.logger.debug("engine.trigger(): Prisma transaction error", {
+ code: error.code,
+ message: error.message,
+ meta: error.meta,
+ idempotencyKey,
+ environmentId: environment.id,
+ });
+
+ if (error.code === "P2002") {
+ this.logger.debug("engine.trigger(): throwing RunDuplicateIdempotencyKeyError", {
+ code: error.code,
+ message: error.message,
+ meta: error.meta,
+ idempotencyKey,
+ environmentId: environment.id,
+ });
+
+ //this happens if a unique constraint failed, i.e. duplicate idempotency
+ throw new RunDuplicateIdempotencyKeyError(
+ `Run with idempotency key ${idempotencyKey} already exists`
+ );
+ }
+ }
+
+ throw error;
+ }
+
+ span.setAttribute("runId", taskRun.id);
+
+ await this.runLock.lock([taskRun.id], 5000, async (signal) => {
+ //create associated waitpoint (this completes when the run completes)
+ const associatedWaitpoint = await this.#createRunAssociatedWaitpoint(prisma, {
+ projectId: environment.project.id,
+ environmentId: environment.id,
+ completedByTaskRunId: taskRun.id,
+ });
+
+ //triggerAndWait or batchTriggerAndWait
+ if (resumeParentOnCompletion && parentTaskRunId) {
+ //this will block the parent run from continuing until this waitpoint is completed (and removed)
+ await this.blockRunWithWaitpoint({
+ runId: parentTaskRunId,
+ waitpoints: associatedWaitpoint.id,
+ environmentId: associatedWaitpoint.environmentId,
+ projectId: associatedWaitpoint.projectId,
+ organizationId: environment.organization.id,
+ batch,
+ workerId,
+ runnerId,
+ tx: prisma,
+ });
+
+ //release the concurrency
+ //if the queue is the same then it's recursive, and we need to release that too, otherwise we could deadlock
+ const parentRun = await prisma.taskRun.findUnique({
+ select: {
+ queue: true,
+ },
+ where: {
+ id: parentTaskRunId,
+ },
+ });
+ const releaseRunConcurrency = parentRun?.queue === taskRun.queue;
+ await this.runQueue.releaseConcurrency(
+ environment.organization.id,
+ parentTaskRunId,
+ releaseRunConcurrency
+ );
+ }
+
+ //Make sure lock extension succeeded
+ signal.throwIfAborted();
+
+ if (queue) {
+ const concurrencyLimit =
+ typeof queue.concurrencyLimit === "number"
+ ? Math.max(Math.min(queue.concurrencyLimit, environment.maximumConcurrencyLimit), 0)
+ : queue.concurrencyLimit;
+
+ let taskQueue = await prisma.taskQueue.findFirst({
+ where: {
+ runtimeEnvironmentId: environment.id,
+ name: queueName,
+ },
+ });
+
+ if (!taskQueue) {
+ // handle conflicts with existing queues
+ taskQueue = await prisma.taskQueue.create({
+ data: {
+ ...QueueId.generate(),
+ name: queueName,
+ concurrencyLimit,
+ runtimeEnvironmentId: environment.id,
+ projectId: environment.project.id,
+ type: "NAMED",
+ },
+ });
+ }
+
+ if (typeof concurrencyLimit === "number") {
+ this.logger.debug("TriggerTaskService: updating concurrency limit", {
+ runId: taskRun.id,
+ friendlyId: taskRun.friendlyId,
+ taskQueue,
+ orgId: environment.organization.id,
+ projectId: environment.project.id,
+ concurrencyLimit,
+ queueOptions: queue,
+ });
+
+ await this.runQueue.updateQueueConcurrencyLimits(
+ environment,
+ taskQueue.name,
+ concurrencyLimit
+ );
+ } else if (concurrencyLimit === null) {
+ this.logger.debug("TriggerTaskService: removing concurrency limit", {
+ runId: taskRun.id,
+ friendlyId: taskRun.friendlyId,
+ taskQueue,
+ orgId: environment.organization.id,
+ projectId: environment.project.id,
+ queueOptions: queue,
+ });
+
+ await this.runQueue.removeQueueConcurrencyLimits(environment, taskQueue.name);
+ }
+ }
+
+ if (taskRun.delayUntil) {
+ const delayWaitpointResult = await this.createDateTimeWaitpoint({
+ projectId: environment.project.id,
+ environmentId: environment.id,
+ completedAfter: taskRun.delayUntil,
+ tx: prisma,
+ });
+
+ await prisma.taskRunWaitpoint.create({
+ data: {
+ taskRunId: taskRun.id,
+ waitpointId: delayWaitpointResult.waitpoint.id,
+ projectId: delayWaitpointResult.waitpoint.projectId,
+ },
+ });
+ }
+
+ if (!taskRun.delayUntil && taskRun.ttl) {
+ const expireAt = parseNaturalLanguageDuration(taskRun.ttl);
+
+ if (expireAt) {
+ await this.worker.enqueue({
+ id: `expireRun:${taskRun.id}`,
+ job: "expireRun",
+ payload: { runId: taskRun.id },
+ availableAt: expireAt,
+ });
+ }
+ }
+
+ //Make sure lock extension succeeded
+ signal.throwIfAborted();
+
+ //enqueue the run if it's not delayed
+ if (!taskRun.delayUntil) {
+ await this.#enqueueRun({
+ run: taskRun,
+ env: environment,
+ timestamp: Date.now() - taskRun.priorityMs,
+ workerId,
+ runnerId,
+ tx: prisma,
+ });
+ }
+ });
+
+ return taskRun;
+ }
+ );
+ }
+
+ /**
+ * Gets a fairly selected run from the specified master queue, returning the information required to run it.
+ * @param consumerId: The consumer that is pulling, allows multiple consumers to pull from the same queue
+ * @param masterQueue: The shared queue to pull from, can be an individual environment (for dev)
+ * @returns
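+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the
+ * consumer/queue names are made up):
+ *
+ *   const dequeued = await engine.dequeueFromMasterQueue({
+ *     consumerId: "consumer-1",
+ *     masterQueue: "main",
+ *     maxRunCount: 10,
+ *   });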
+ */
+ async dequeueFromMasterQueue({
+ consumerId,
+ masterQueue,
+ maxRunCount,
+ maxResources,
+ backgroundWorkerId,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ consumerId: string;
+ masterQueue: string;
+ maxRunCount: number;
+ maxResources?: MachineResources;
+ backgroundWorkerId?: string;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<DequeuedMessage[]> {
+ const prisma = tx ?? this.prisma;
+ return this.#trace("dequeueFromMasterQueue", { consumerId, masterQueue }, async (span) => {
+ //gets multiple runs from the queue
+ const messages = await this.runQueue.dequeueMessageFromMasterQueue(
+ consumerId,
+ masterQueue,
+ maxRunCount
+ );
+ if (messages.length === 0) {
+ return [];
+ }
+
+ //we can't send more than the max resources
+ const consumedResources: MachineResources = {
+ cpu: 0,
+ memory: 0,
+ };
+
+ const dequeuedRuns: DequeuedMessage[] = [];
+
+ for (const message of messages) {
+ const orgId = message.message.orgId;
+ const runId = message.messageId;
+
+ span.setAttribute("runId", runId);
+
+ //lock the run so nothing else can modify it
+ try {
+ const dequeuedRun = await this.runLock.lock([runId], 5000, async (signal) => {
+ const snapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ if (!isDequeueableExecutionStatus(snapshot.executionStatus)) {
+ //create a failed snapshot
+ await this.#createExecutionSnapshot(prisma, {
+ run: {
+ id: snapshot.runId,
+ status: snapshot.runStatus,
+ },
+ snapshot: {
+ executionStatus: snapshot.executionStatus,
+ description:
+ "Tried to dequeue a run that is not in a valid state to be dequeued.",
+ },
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ checkpointId: snapshot.checkpointId ?? undefined,
+ completedWaitpoints: snapshot.completedWaitpoints,
+ error: `Tried to dequeue a run that is not in a valid state to be dequeued.`,
+ workerId,
+ runnerId,
+ });
+
+ //todo is there a way to recover this, so the run can be retried?
+ //for example should we update the status to a dequeuable status and nack it?
+ //then at least it has a chance of succeeding and we have the error log above
+ await this.#systemFailure({
+ runId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_DEQUEUED_INVALID_STATE",
+ message: `Task was in the ${snapshot.executionStatus} state when it was dequeued for execution.`,
+ },
+ tx: prisma,
+ });
+ this.logger.error(
+ `RunEngine.dequeueFromMasterQueue(): Run is not in a valid state to be dequeued: ${runId}\n ${snapshot.id}:${snapshot.executionStatus}`
+ );
+ return null;
+ }
+
+ const result = await getRunWithBackgroundWorkerTasks(prisma, runId, backgroundWorkerId);
+
+ if (!result.success) {
+ switch (result.code) {
+ case "NO_RUN": {
+ //this should not happen, the run is unrecoverable so we'll ack it
+ this.logger.error("RunEngine.dequeueFromMasterQueue(): No run found", {
+ runId,
+ latestSnapshot: snapshot.id,
+ });
+ await this.runQueue.acknowledgeMessage(orgId, runId);
+ return null;
+ }
+ case "NO_WORKER":
+ case "TASK_NEVER_REGISTERED":
+ case "TASK_NOT_IN_LATEST": {
+ this.logger.warn(`RunEngine.dequeueFromMasterQueue(): ${result.code}`, {
+ runId,
+ latestSnapshot: snapshot.id,
+ result,
+ });
+
+ //not deployed yet, so we'll wait for the deploy
+ await this.#waitingForDeploy({
+ orgId,
+ runId,
+ reason: result.message,
+ tx: prisma,
+ });
+ return null;
+ }
+ case "BACKGROUND_WORKER_MISMATCH": {
+ this.logger.warn(
+ "RunEngine.dequeueFromMasterQueue(): Background worker mismatch",
+ {
+ runId,
+ latestSnapshot: snapshot.id,
+ result,
+ }
+ );
+
+ //worker mismatch so put it back in the queue
+ await this.runQueue.nackMessage({ orgId, messageId: runId });
+
+ return null;
+ }
+ default: {
+ assertExhaustive(result);
+ }
+ }
+ }
+
+ //check for a valid deployment if it's not a development environment
+ if (result.run.runtimeEnvironment.type !== "DEVELOPMENT") {
+ if (!result.deployment || !result.deployment.imageReference) {
+ this.logger.warn("RunEngine.dequeueFromMasterQueue(): No deployment found", {
+ runId,
+ latestSnapshot: snapshot.id,
+ result,
+ });
+ //not deployed yet, so we'll wait for the deploy
+ await this.#waitingForDeploy({
+ orgId,
+ runId,
+ reason: "No deployment or deployment image reference found for deployed run",
+ tx: prisma,
+ });
+
+ return null;
+ }
+ }
+
+ const machinePreset = getMachinePreset({
+ machines: this.options.machines.machines,
+ defaultMachine: this.options.machines.defaultMachine,
+ config: result.task.machineConfig ?? {},
+ run: result.run,
+ });
+
+ //increment the consumed resources
+ consumedResources.cpu += machinePreset.cpu;
+ consumedResources.memory += machinePreset.memory;
+
+ //are we under the limit?
+ if (maxResources) {
+ if (
+ consumedResources.cpu > maxResources.cpu ||
+ consumedResources.memory > maxResources.memory
+ ) {
+ this.logger.debug(
+ "RunEngine.dequeueFromMasterQueue(): Consumed resources over limit, nacking",
+ {
+ runId,
+ consumedResources,
+ maxResources,
+ }
+ );
+
+ //put it back in the queue where it was
+ await this.runQueue.nackMessage({
+ orgId,
+ messageId: runId,
+ incrementAttemptCount: false,
+ retryAt: result.run.createdAt.getTime() - result.run.priorityMs,
+ });
+ return null;
+ }
+ }
+
+ // Check max attempts that can optionally be set when triggering a run
+ let maxAttempts: number | null | undefined = result.run.maxAttempts;
+
+ // If it's not set, we'll grab it from the task's retry config
+ if (!maxAttempts) {
+ const retryConfig = result.task.retryConfig;
+
+ this.logger.debug(
+ "RunEngine.dequeueFromMasterQueue(): maxAttempts not set, using task's retry config",
+ {
+ runId,
+ task: result.task.id,
+ rawRetryConfig: retryConfig,
+ }
+ );
+
+ const parsedConfig = RetryOptions.nullable().safeParse(retryConfig);
+
+ if (!parsedConfig.success) {
+ this.logger.error("RunEngine.dequeueFromMasterQueue(): Invalid retry config", {
+ runId,
+ task: result.task.id,
+ rawRetryConfig: retryConfig,
+ });
+
+ await this.#systemFailure({
+ runId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_DEQUEUED_INVALID_RETRY_CONFIG",
+ message: `Invalid retry config: ${retryConfig}`,
+ },
+ tx: prisma,
+ });
+
+ return null;
+ }
+
+ if (!parsedConfig.data) {
+ this.logger.error("RunEngine.dequeueFromMasterQueue(): No retry config", {
+ runId,
+ task: result.task.id,
+ rawRetryConfig: retryConfig,
+ });
+
+ await this.#systemFailure({
+ runId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_DEQUEUED_NO_RETRY_CONFIG",
+ message: `No retry config found`,
+ },
+ tx: prisma,
+ });
+
+ return null;
+ }
+
+ maxAttempts = parsedConfig.data.maxAttempts;
+ }
+
+ //update the run
+ const lockedTaskRun = await prisma.taskRun.update({
+ where: {
+ id: runId,
+ },
+ data: {
+ lockedAt: new Date(),
+ lockedById: result.task.id,
+ lockedToVersionId: result.worker.id,
+ startedAt: result.run.startedAt ?? new Date(),
+ baseCostInCents: this.options.machines.baseCostInCents,
+ machinePreset: machinePreset.name,
+ taskVersion: result.worker.version,
+ sdkVersion: result.worker.sdkVersion,
+ cliVersion: result.worker.cliVersion,
+ maxDurationInSeconds: getMaxDuration(
+ result.run.maxDurationInSeconds,
+ result.task.maxDurationInSeconds
+ ),
+ maxAttempts: maxAttempts ?? undefined,
+ },
+ include: {
+ runtimeEnvironment: true,
+ tags: true,
+ },
+ });
+
+ if (!lockedTaskRun) {
+ this.logger.error("RunEngine.dequeueFromMasterQueue(): Failed to lock task run", {
+ taskRun: result.run.id,
+ taskIdentifier: result.run.taskIdentifier,
+ deployment: result.deployment?.id,
+ worker: result.worker.id,
+ task: result.task.id,
+ runId,
+ });
+
+ await this.runQueue.acknowledgeMessage(orgId, runId);
+ return null;
+ }
+
+ const queue = await prisma.taskQueue.findUnique({
+ where: {
+ runtimeEnvironmentId_name: {
+ runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId,
+ name: sanitizeQueueName(lockedTaskRun.queue),
+ },
+ },
+ });
+
+ if (!queue) {
+ this.logger.debug(
+ "RunEngine.dequeueFromMasterQueue(): queue not found, so nacking message",
+ {
+ queueMessage: message,
+ taskRunQueue: lockedTaskRun.queue,
+ runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId,
+ }
+ );
+
+ //will auto-retry
+ const gotRequeued = await this.runQueue.nackMessage({ orgId, messageId: runId });
+ if (!gotRequeued) {
+ await this.#systemFailure({
+ runId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_DEQUEUED_QUEUE_NOT_FOUND",
+ message: `Tried to dequeue the run but the queue doesn't exist: ${lockedTaskRun.queue}`,
+ },
+ tx: prisma,
+ });
+ }
+
+ return null;
+ }
+
+ const currentAttemptNumber = lockedTaskRun.attemptNumber ?? 0;
+ const nextAttemptNumber = currentAttemptNumber + 1;
+
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run: {
+ id: runId,
+ status: snapshot.runStatus,
+ attemptNumber: lockedTaskRun.attemptNumber,
+ },
+ snapshot: {
+ executionStatus: "PENDING_EXECUTING",
+ description: "Run was dequeued for execution",
+ },
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ checkpointId: snapshot.checkpointId ?? undefined,
+ completedWaitpoints: snapshot.completedWaitpoints,
+ workerId,
+ runnerId,
+ });
+
+ return {
+ version: "1" as const,
+ snapshot: {
+ id: newSnapshot.id,
+ friendlyId: newSnapshot.friendlyId,
+ executionStatus: newSnapshot.executionStatus,
+ description: newSnapshot.description,
+ },
+ image: result.deployment?.imageReference ?? undefined,
+ checkpoint: newSnapshot.checkpoint ?? undefined,
+ completedWaitpoints: snapshot.completedWaitpoints,
+ backgroundWorker: {
+ id: result.worker.id,
+ friendlyId: result.worker.friendlyId,
+ version: result.worker.version,
+ },
+ deployment: {
+ id: result.deployment?.id,
+ friendlyId: result.deployment?.friendlyId,
+ },
+ run: {
+ id: lockedTaskRun.id,
+ friendlyId: lockedTaskRun.friendlyId,
+ isTest: lockedTaskRun.isTest,
+ machine: machinePreset,
+ attemptNumber: nextAttemptNumber,
+ masterQueue: lockedTaskRun.masterQueue,
+ traceContext: lockedTaskRun.traceContext as Record<string, unknown>,
+ },
+ environment: {
+ id: lockedTaskRun.runtimeEnvironment.id,
+ type: lockedTaskRun.runtimeEnvironment.type,
+ },
+ organization: {
+ id: orgId,
+ },
+ project: {
+ id: lockedTaskRun.projectId,
+ },
+ } satisfies DequeuedMessage;
+ });
+
+ if (dequeuedRun !== null) {
+ dequeuedRuns.push(dequeuedRun);
+ }
+ } catch (error) {
+ this.logger.error(
+ "RunEngine.dequeueFromMasterQueue(): Thrown error while preparing run to be run",
+ {
+ error,
+ runId,
+ }
+ );
+
+ const run = await prisma.taskRun.findFirst({
+ where: { id: runId },
+ include: {
+ runtimeEnvironment: true,
+ },
+ });
+
+ if (!run) {
+ //this isn't ideal because we're not creating a snapshot… but we can't do much else
+ this.logger.error(
+ "RunEngine.dequeueFromMasterQueue(): Thrown error, then run not found. Nacking.",
+ {
+ runId,
+ orgId,
+ }
+ );
+ await this.runQueue.nackMessage({ orgId, messageId: runId });
+ continue;
+ }
+
+ //this is an unknown error, we'll reattempt (with auto-backoff and eventually DLQ)
+ const gotRequeued = await this.#tryNackAndRequeue({
+ run,
+ environment: run.runtimeEnvironment,
+ orgId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_RUN_DEQUEUED_MAX_RETRIES",
+ message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`,
+ },
+ tx: prisma,
+ });
+ //we don't need this, but it makes it clear we're in a loop here
+ continue;
+ }
+ }
+
+ return dequeuedRuns;
+ });
+ }
+
+ async dequeueFromEnvironmentMasterQueue({
+ consumerId,
+ environmentId,
+ maxRunCount,
+ maxResources,
+ backgroundWorkerId,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ consumerId: string;
+ environmentId: string;
+ maxRunCount: number;
+ maxResources?: MachineResources;
+ backgroundWorkerId?: string;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<DequeuedMessage[]> {
+ return this.dequeueFromMasterQueue({
+ consumerId,
+ masterQueue: this.#environmentMasterQueueKey(environmentId),
+ maxRunCount,
+ maxResources,
+ backgroundWorkerId,
+ workerId,
+ runnerId,
+ tx,
+ });
+ }
+
+ async dequeueFromBackgroundWorkerMasterQueue({
+ consumerId,
+ backgroundWorkerId,
+ maxRunCount,
+ maxResources,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ consumerId: string;
+ backgroundWorkerId: string;
+ maxRunCount: number;
+ maxResources?: MachineResources;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<DequeuedMessage[]> {
+ return this.dequeueFromMasterQueue({
+ consumerId,
+ masterQueue: this.#backgroundWorkerQueueKey(backgroundWorkerId),
+ maxRunCount,
+ maxResources,
+ backgroundWorkerId,
+ workerId,
+ runnerId,
+ tx,
+ });
+ }
+
+ async startRunAttempt({
+ runId,
+ snapshotId,
+ workerId,
+ runnerId,
+ isWarmStart,
+ tx,
+ }: {
+ runId: string;
+ snapshotId: string;
+ workerId?: string;
+ runnerId?: string;
+ isWarmStart?: boolean;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<StartRunAttemptResult> {
+ const prisma = tx ?? this.prisma;
+
+ return this.#trace("startRunAttempt", { runId, snapshotId }, async (span) => {
+ return this.runLock.lock([runId], 5000, async (signal) => {
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ if (latestSnapshot.id !== snapshotId) {
+ //if there is a big delay between the snapshot and the attempt, the snapshot might have changed
+ //we just want to log because elsewhere it should have been put back into a state where it can be attempted
+ this.logger.warn(
+ "RunEngine.createRunAttempt(): snapshot has changed since the attempt was created, ignoring."
+ );
+ throw new ServiceValidationError("Snapshot changed", 409);
+ }
+
+ const environment = await this.#getAuthenticatedEnvironmentFromRun(runId, prisma);
+ if (!environment) {
+ throw new ServiceValidationError("Environment not found", 404);
+ }
+
+ const taskRun = await prisma.taskRun.findFirst({
+ where: {
+ id: runId,
+ },
+ include: {
+ tags: true,
+ lockedBy: {
+ include: {
+ worker: {
+ select: {
+ id: true,
+ version: true,
+ sdkVersion: true,
+ cliVersion: true,
+ supportsLazyAttempts: true,
+ },
+ },
+ },
+ },
+ batchItems: {
+ include: {
+ batchTaskRun: true,
+ },
+ },
+ },
+ });
+
+ this.logger.debug("Creating a task run attempt", { taskRun });
+
+ if (!taskRun) {
+ throw new ServiceValidationError("Task run not found", 404);
+ }
+
+ span.setAttribute("projectId", taskRun.projectId);
+ span.setAttribute("environmentId", taskRun.runtimeEnvironmentId);
+ span.setAttribute("taskRunId", taskRun.id);
+ span.setAttribute("taskRunFriendlyId", taskRun.friendlyId);
+
+ if (taskRun.status === "CANCELED") {
+ throw new ServiceValidationError("Task run is cancelled", 400);
+ }
+
+ if (!taskRun.lockedBy) {
+ throw new ServiceValidationError("Task run is not locked", 400);
+ }
+
+ const queue = await prisma.taskQueue.findUnique({
+ where: {
+ runtimeEnvironmentId_name: {
+ runtimeEnvironmentId: environment.id,
+ name: taskRun.queue,
+ },
+ },
+ });
+
+ if (!queue) {
+ throw new ServiceValidationError("Queue not found", 404);
+ }
+
+ //increment the attempt number (start at 1)
+ const nextAttemptNumber = (taskRun.attemptNumber ?? 0) + 1;
+
+ if (nextAttemptNumber > MAX_TASK_RUN_ATTEMPTS) {
+ await this.#attemptFailed({
+ runId: taskRun.id,
+ snapshotId,
+ completion: {
+ ok: false,
+ id: taskRun.id,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_RUN_CRASHED",
+ message: "Max attempts reached.",
+ },
+ },
+ tx: prisma,
+ });
+ throw new ServiceValidationError("Max attempts reached", 400);
+ }
+
+ this.eventBus.emit("runAttemptStarted", {
+ time: new Date(),
+ run: {
+ id: taskRun.id,
+ attemptNumber: nextAttemptNumber,
+ baseCostInCents: taskRun.baseCostInCents,
+ },
+ organization: {
+ id: environment.organization.id,
+ },
+ });
+
+ const result = await $transaction(
+ prisma,
+ async (tx) => {
+ const run = await tx.taskRun.update({
+ where: {
+ id: taskRun.id,
+ },
+ data: {
+ status: "EXECUTING",
+ attemptNumber: nextAttemptNumber,
+ executedAt: taskRun.attemptNumber === null ? new Date() : undefined,
+ },
+ include: {
+ tags: true,
+ lockedBy: {
+ include: { worker: true },
+ },
+ },
+ });
+
+ const newSnapshot = await this.#createExecutionSnapshot(tx, {
+ run,
+ snapshot: {
+ executionStatus: "EXECUTING",
+ description: `Attempt created, starting execution${
+ isWarmStart ? " (warm start)" : ""
+ }`,
+ },
+ environmentId: latestSnapshot.environmentId,
+ environmentType: latestSnapshot.environmentType,
+ workerId,
+ runnerId,
+ });
+
+ if (taskRun.ttl) {
+ //don't expire the run, it's going to execute
+ await this.worker.ack(`expireRun:${taskRun.id}`);
+ }
+
+ return { run, snapshot: newSnapshot };
+ },
+ (error) => {
+ this.logger.error("RunEngine.createRunAttempt(): prisma.$transaction error", {
+ code: error.code,
+ meta: error.meta,
+ stack: error.stack,
+ message: error.message,
+ name: error.name,
+ });
+ throw new ServiceValidationError(
+ "Failed to update task run and execution snapshot",
+ 500
+ );
+ }
+ );
+
+ if (!result) {
+ this.logger.error("RunEngine.createRunAttempt(): failed to create task run attempt", {
+ runId: taskRun.id,
+ nextAttemptNumber,
+ });
+ throw new ServiceValidationError("Failed to create task run attempt", 500);
+ }
+
+ const { run, snapshot } = result;
+
+ const machinePreset = getMachinePreset({
+ machines: this.options.machines.machines,
+ defaultMachine: this.options.machines.defaultMachine,
+ config: taskRun.lockedBy.machineConfig ?? {},
+ run: taskRun,
+ });
+
+ const metadata = await parsePacket({
+ data: taskRun.metadata ?? undefined,
+ dataType: taskRun.metadataType,
+ });
+
+ const execution: TaskRunExecution = {
+ task: {
+ id: run.lockedBy!.slug,
+ filePath: run.lockedBy!.filePath,
+ exportName: run.lockedBy!.exportName,
+ },
+ attempt: {
+ number: nextAttemptNumber,
+ startedAt: latestSnapshot.updatedAt,
+ /** @deprecated */
+ id: "deprecated",
+ /** @deprecated */
+ backgroundWorkerId: "deprecated",
+ /** @deprecated */
+ backgroundWorkerTaskId: "deprecated",
+ /** @deprecated */
+ status: "deprecated",
+ },
+ run: {
+ id: run.friendlyId,
+ payload: run.payload,
+ payloadType: run.payloadType,
+ createdAt: run.createdAt,
+ tags: run.tags.map((tag) => tag.name),
+ isTest: run.isTest,
+ idempotencyKey: run.idempotencyKey ?? undefined,
+ startedAt: run.startedAt ?? run.createdAt,
+ maxAttempts: run.maxAttempts ?? undefined,
+ version: run.lockedBy!.worker.version,
+ metadata,
+ maxDuration: run.maxDurationInSeconds ?? undefined,
+ /** @deprecated */
+ context: undefined,
+ /** @deprecated */
+ durationMs: run.usageDurationMs,
+ /** @deprecated */
+ costInCents: run.costInCents,
+ /** @deprecated */
+ baseCostInCents: run.baseCostInCents,
+ traceContext: run.traceContext as Record<string, unknown>,
+ priority: run.priorityMs === 0 ? undefined : run.priorityMs / 1_000,
+ },
+ queue: {
+ id: queue.friendlyId,
+ name: queue.name,
+ },
+ environment: {
+ id: environment.id,
+ slug: environment.slug,
+ type: environment.type,
+ },
+ organization: {
+ id: environment.organization.id,
+ slug: environment.organization.slug,
+ name: environment.organization.title,
+ },
+ project: {
+ id: environment.project.id,
+ ref: environment.project.externalRef,
+ slug: environment.project.slug,
+ name: environment.project.name,
+ },
+ batch:
+ taskRun.batchItems[0] && taskRun.batchItems[0].batchTaskRun
+ ? { id: taskRun.batchItems[0].batchTaskRun.friendlyId }
+ : undefined,
+ machine: machinePreset,
+ };
+
+ return { run, snapshot, execution };
+ });
+ });
+ }
+
+ /** How a run is completed */
+ async completeRunAttempt({
+ runId,
+ snapshotId,
+ completion,
+ workerId,
+ runnerId,
+ }: {
+ runId: string;
+ snapshotId: string;
+ completion: TaskRunExecutionResult;
+ workerId?: string;
+ runnerId?: string;
+ }): Promise<CompleteRunAttemptResult> {
+ if (completion.metadata) {
+ this.eventBus.emit("runMetadataUpdated", {
+ time: new Date(),
+ run: {
+ id: runId,
+ metadata: completion.metadata,
+ },
+ });
+ }
+
+ switch (completion.ok) {
+ case true: {
+ return this.#attemptSucceeded({
+ runId,
+ snapshotId,
+ completion,
+ tx: this.prisma,
+ workerId,
+ runnerId,
+ });
+ }
+ case false: {
+ return this.#attemptFailed({
+ runId,
+ snapshotId,
+ completion,
+ tx: this.prisma,
+ workerId,
+ runnerId,
+ });
+ }
+ }
+ }
+
+ /**
+ Call this to cancel a run.
+ If the run is in-progress it will change its state to PENDING_CANCEL and notify the worker.
+ If the run is not in-progress it will finish it.
+ You can pass `finalizeRun` in if you know it's no longer running, e.g. the worker has messaged to say it's done.
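+
+ Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the run id is made up):
+
+   await engine.cancelRun({ runId: "run_1234", reason: "Cancelled from the dashboard" });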
+ */
+ async cancelRun({
+ runId,
+ workerId,
+ runnerId,
+ completedAt,
+ reason,
+ finalizeRun,
+ tx,
+ }: {
+ runId: string;
+ workerId?: string;
+ runnerId?: string;
+ completedAt?: Date;
+ reason?: string;
+ finalizeRun?: boolean;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<ExecutionResult> {
+ const prisma = tx ?? this.prisma;
+ reason = reason ?? "Cancelled by user";
+
+ return this.#trace("cancelRun", { runId }, async (span) => {
+ return this.runLock.lock([runId], 5_000, async (signal) => {
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ //already finished, do nothing
+ if (latestSnapshot.executionStatus === "FINISHED") {
+ return executionResultFromSnapshot(latestSnapshot);
+ }
+
+ //is pending cancellation and we're not finalizing, alert the worker again
+ if (latestSnapshot.executionStatus === "PENDING_CANCEL" && !finalizeRun) {
+ await this.#sendNotificationToWorker({ runId, snapshot: latestSnapshot });
+ return executionResultFromSnapshot(latestSnapshot);
+ }
+
+ //set the run to cancelled immediately
+ const error: TaskRunError = {
+ type: "STRING_ERROR",
+ raw: reason,
+ };
+
+ const run = await prisma.taskRun.update({
+ where: { id: runId },
+ data: {
+ status: "CANCELED",
+ completedAt: finalizeRun ? completedAt ?? new Date() : completedAt,
+ error,
+ },
+ select: {
+ id: true,
+ friendlyId: true,
+ status: true,
+ attemptNumber: true,
+ spanId: true,
+ batchId: true,
+ createdAt: true,
+ completedAt: true,
+ taskEventStore: true,
+ runtimeEnvironment: {
+ select: {
+ organizationId: true,
+ },
+ },
+ associatedWaitpoint: {
+ select: {
+ id: true,
+ },
+ },
+ childRuns: {
+ select: {
+ id: true,
+ },
+ },
+ },
+ });
+
+ //remove it from the queue and release concurrency
+ await this.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId);
+
+ //if executing, we need to message the worker to cancel the run and put it into `PENDING_CANCEL` status
+ if (isExecuting(latestSnapshot.executionStatus)) {
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "PENDING_CANCEL",
+ description: "Run was cancelled",
+ },
+ environmentId: latestSnapshot.environmentId,
+ environmentType: latestSnapshot.environmentType,
+ workerId,
+ runnerId,
+ });
+
+ //the worker needs to be notified so it can kill the run and complete the attempt
+ await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot });
+ return executionResultFromSnapshot(newSnapshot);
+ }
+
+ //not executing, so we will actually finish the run
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "FINISHED",
+ description: "Run was cancelled, not finished",
+ },
+ environmentId: latestSnapshot.environmentId,
+ environmentType: latestSnapshot.environmentType,
+ workerId,
+ runnerId,
+ });
+
+ if (!run.associatedWaitpoint) {
+ throw new ServiceValidationError("No associated waitpoint found", 400);
+ }
+
+ //complete the waitpoint so the parent run can continue
+ await this.completeWaitpoint({
+ id: run.associatedWaitpoint.id,
+ output: { value: JSON.stringify(error), isError: true },
+ });
+
+ this.eventBus.emit("runCancelled", {
+ time: new Date(),
+ run: {
+ id: run.id,
+ friendlyId: run.friendlyId,
+ spanId: run.spanId,
+ taskEventStore: run.taskEventStore,
+ createdAt: run.createdAt,
+ completedAt: run.completedAt,
+ error,
+ },
+ });
+
+ //schedule the cancellation of all the child runs
+ //it will call this function for each child,
+ //which will recursively cancel all children if they need to be
+ if (run.childRuns.length > 0) {
+ for (const childRun of run.childRuns) {
+ await this.worker.enqueue({
+ id: `cancelRun:${childRun.id}`,
+ job: "cancelRun",
+ payload: { runId: childRun.id, completedAt: run.completedAt ?? new Date(), reason },
+ });
+ }
+ }
+
+ return executionResultFromSnapshot(newSnapshot);
+ });
+ });
+ }
+
+ async queueRunsWaitingForWorker({
+ backgroundWorkerId,
+ }: {
+ backgroundWorkerId: string;
+ }): Promise<void> {
+ //we want this to happen in the background
+ await this.worker.enqueue({
+ job: "queueRunsWaitingForWorker",
+ payload: { backgroundWorkerId },
+ });
+ }
+
+ /**
+ * Reschedules a delayed run where the run hasn't been queued yet
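+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the run id is made up):
+ *
+ *   await engine.rescheduleRun({
+ *     runId: "run_123",
+ *     delayUntil: new Date(Date.now() + 10 * 60_000),
+ *   });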
+ */
+ async rescheduleRun({
+ runId,
+ delayUntil,
+ tx,
+ }: {
+ runId: string;
+ delayUntil: Date;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<TaskRun> {
+ const prisma = tx ?? this.prisma;
+ return this.#trace("rescheduleRun", { runId }, async (span) => {
+ return await this.runLock.lock([runId], 5_000, async (signal) => {
+ const snapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ //if the run isn't just created then we can't reschedule it
+ if (snapshot.executionStatus !== "RUN_CREATED") {
+ throw new ServiceValidationError("Cannot reschedule a run that is not delayed");
+ }
+
+ const updatedRun = await prisma.taskRun.update({
+ where: {
+ id: runId,
+ },
+ data: {
+ delayUntil: delayUntil,
+ executionSnapshots: {
+ create: {
+ engine: "V2",
+ executionStatus: "RUN_CREATED",
+ description: "Delayed run was rescheduled to a future date",
+ runStatus: "EXPIRED",
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ },
+ },
+ },
+ include: {
+ blockedByWaitpoints: true,
+ },
+ });
+
+ if (updatedRun.blockedByWaitpoints.length === 0) {
+ throw new ServiceValidationError(
+ "Cannot reschedule a run that is not blocked by a waitpoint"
+ );
+ }
+
+ const result = await this.#rescheduleDateTimeWaitpoint(
+ prisma,
+ updatedRun.blockedByWaitpoints[0].waitpointId,
+ delayUntil
+ );
+
+ if (!result.success) {
+ throw new ServiceValidationError("Failed to reschedule waitpoint, too late.", 400);
+ }
+
+ return updatedRun;
+ });
+ });
+ }
+
+ async lengthOfEnvQueue(environment: MinimalAuthenticatedEnvironment): Promise<number> {
+ return this.runQueue.lengthOfEnvQueue(environment);
+ }
+
+ /**
+ * This creates a DATETIME waitpoint that will be completed automatically when the specified date is reached.
+ * If you pass an `idempotencyKey`, the waitpoint will be created only if it doesn't already exist.
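+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the ids are made up):
+ *
+ *   const { waitpoint, isCached } = await engine.createDateTimeWaitpoint({
+ *     projectId: "project_123",
+ *     environmentId: "env_123",
+ *     completedAfter: new Date(Date.now() + 60_000),
+ *     idempotencyKey: "wait-one-minute",
+ *   });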
+ */
+ async createDateTimeWaitpoint({
+ projectId,
+ environmentId,
+ completedAfter,
+ idempotencyKey,
+ idempotencyKeyExpiresAt,
+ tx,
+ }: {
+ projectId: string;
+ environmentId: string;
+ completedAfter: Date;
+ idempotencyKey?: string;
+ idempotencyKeyExpiresAt?: Date;
+ tx?: PrismaClientOrTransaction;
+ }) {
+ const prisma = tx ?? this.prisma;
+
+ const existingWaitpoint = idempotencyKey
+ ? await prisma.waitpoint.findUnique({
+ where: {
+ environmentId_idempotencyKey: {
+ environmentId,
+ idempotencyKey,
+ },
+ },
+ })
+ : undefined;
+
+ if (existingWaitpoint) {
+ if (
+ existingWaitpoint.idempotencyKeyExpiresAt &&
+ new Date() > existingWaitpoint.idempotencyKeyExpiresAt
+ ) {
+ //the idempotency key has expired
+ //remove the waitpoint idempotencyKey
+ await prisma.waitpoint.update({
+ where: {
+ id: existingWaitpoint.id,
+ },
+ data: {
+ idempotencyKey: nanoid(24),
+ inactiveIdempotencyKey: existingWaitpoint.idempotencyKey,
+ },
+ });
+
+ //let it fall through to create a new waitpoint
+ } else {
+ return { waitpoint: existingWaitpoint, isCached: true };
+ }
+ }
+
+ const waitpoint = await prisma.waitpoint.upsert({
+ where: {
+ environmentId_idempotencyKey: {
+ environmentId,
+ idempotencyKey: idempotencyKey ?? nanoid(24),
+ },
+ },
+ create: {
+ ...WaitpointId.generate(),
+ type: "DATETIME",
+ idempotencyKey: idempotencyKey ?? nanoid(24),
+ idempotencyKeyExpiresAt,
+ userProvidedIdempotencyKey: !!idempotencyKey,
+ environmentId,
+ projectId,
+ completedAfter,
+ },
+ update: {},
+ });
+
+ await this.worker.enqueue({
+ id: `finishWaitpoint.${waitpoint.id}`,
+ job: "finishWaitpoint",
+ payload: { waitpointId: waitpoint.id },
+ availableAt: completedAfter,
+ });
+
+ return { waitpoint, isCached: false };
+ }
+
+ /** This creates a MANUAL waitpoint that can be explicitly completed (or failed).
+ * If you pass an `idempotencyKey` and it already exists, it will return the existing waitpoint.
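+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the ids are made up):
+ *
+ *   const { waitpoint } = await engine.createManualWaitpoint({
+ *     environmentId: "env_123",
+ *     projectId: "project_123",
+ *     timeout: new Date(Date.now() + 5 * 60_000),
+ *   });
+ *   // later, something calls: await engine.completeWaitpoint({ id: waitpoint.id });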
+ */
+ async createManualWaitpoint({
+ environmentId,
+ projectId,
+ idempotencyKey,
+ idempotencyKeyExpiresAt,
+ timeout,
+ }: {
+ environmentId: string;
+ projectId: string;
+ idempotencyKey?: string;
+ idempotencyKeyExpiresAt?: Date;
+ timeout?: Date;
+ }): Promise<{ waitpoint: Waitpoint; isCached: boolean }> {
+ const existingWaitpoint = idempotencyKey
+ ? await this.prisma.waitpoint.findUnique({
+ where: {
+ environmentId_idempotencyKey: {
+ environmentId,
+ idempotencyKey,
+ },
+ },
+ })
+ : undefined;
+
+ if (existingWaitpoint) {
+ if (
+ existingWaitpoint.idempotencyKeyExpiresAt &&
+ new Date() > existingWaitpoint.idempotencyKeyExpiresAt
+ ) {
+ //the idempotency key has expired
+ //remove the waitpoint idempotencyKey
+ await this.prisma.waitpoint.update({
+ where: {
+ id: existingWaitpoint.id,
+ },
+ data: {
+ idempotencyKey: nanoid(24),
+ inactiveIdempotencyKey: existingWaitpoint.idempotencyKey,
+ },
+ });
+
+ //let it fall through to create a new waitpoint
+ } else {
+ return { waitpoint: existingWaitpoint, isCached: true };
+ }
+ }
+
+ const waitpoint = await this.prisma.waitpoint.upsert({
+ where: {
+ environmentId_idempotencyKey: {
+ environmentId,
+ idempotencyKey: idempotencyKey ?? nanoid(24),
+ },
+ },
+ create: {
+ ...WaitpointId.generate(),
+ type: "MANUAL",
+ idempotencyKey: idempotencyKey ?? nanoid(24),
+ idempotencyKeyExpiresAt,
+ userProvidedIdempotencyKey: !!idempotencyKey,
+ environmentId,
+ projectId,
+ completedAfter: timeout,
+ },
+ update: {},
+ });
+
+ //schedule the timeout
+ if (timeout) {
+ await this.worker.enqueue({
+ id: `finishWaitpoint.${waitpoint.id}`,
+ job: "finishWaitpoint",
+ payload: {
+ waitpointId: waitpoint.id,
+ error: JSON.stringify(timeoutError(timeout)),
+ },
+ availableAt: timeout,
+ });
+ }
+
+ return { waitpoint, isCached: false };
+ }
+
+ /** This blocks a run with a BATCH waitpoint.
+ * The waitpoint will be created, and it will block the parent run.
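+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the ids are made up):
+ *
+ *   const waitpoint = await engine.blockRunWithCreatedBatch({
+ *     runId: "run_123",
+ *     batchId: "batch_123",
+ *     environmentId: "env_123",
+ *     projectId: "project_123",
+ *     organizationId: "org_123",
+ *   });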
+ */
+ async blockRunWithCreatedBatch({
+ runId,
+ batchId,
+ environmentId,
+ projectId,
+ organizationId,
+ tx,
+ }: {
+ runId: string;
+ batchId: string;
+ environmentId: string;
+ projectId: string;
+ organizationId: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<Waitpoint | null> {
+ const prisma = tx ?? this.prisma;
+
+ try {
+ const waitpoint = await prisma.waitpoint.create({
+ data: {
+ ...WaitpointId.generate(),
+ type: "BATCH",
+ idempotencyKey: batchId,
+ userProvidedIdempotencyKey: false,
+ completedByBatchId: batchId,
+ environmentId,
+ projectId,
+ },
+ });
+
+ await this.blockRunWithWaitpoint({
+ runId,
+ waitpoints: waitpoint.id,
+ environmentId,
+ projectId,
+ organizationId,
+ batch: { id: batchId },
+ tx: prisma,
+ });
+
+ return waitpoint;
+ } catch (error) {
+ if (error instanceof Prisma.PrismaClientKnownRequestError) {
+ // duplicate idempotency key
+ if (error.code === "P2002") {
+ return null;
+ } else {
+ throw error;
+ }
+ }
+ throw error;
+ }
+ }
+
+ /**
+ * This is called when all the runs for a batch have been created.
+ * This does NOT mean that all the runs for the batch are completed.
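+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the ids are made up):
+ *
+ *   await engine.unblockRunForCreatedBatch({
+ *     runId: "run_123",
+ *     batchId: "batch_123",
+ *     environmentId: "env_123",
+ *     projectId: "project_123",
+ *   });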
+ */
+ async unblockRunForCreatedBatch({
+ runId,
+ batchId,
+ environmentId,
+ projectId,
+ tx,
+ }: {
+ runId: string;
+ batchId: string;
+ environmentId: string;
+ projectId: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<void> {
+ const prisma = tx ?? this.prisma;
+
+ const waitpoint = await prisma.waitpoint.findFirst({
+ where: {
+ completedByBatchId: batchId,
+ },
+ });
+
+ if (!waitpoint) {
+ this.logger.error("RunEngine.unblockRunForBatch(): Waitpoint not found", {
+ runId,
+ batchId,
+ });
+ throw new ServiceValidationError("Waitpoint not found for batch", 404);
+ }
+
+ await this.completeWaitpoint({
+ id: waitpoint.id,
+ output: { value: "Batch waitpoint completed", isError: false },
+ });
+ }
+
+ async tryCompleteBatch({ batchId }: { batchId: string }): Promise<void> {
+ await this.worker.enqueue({
+ //this will debounce the call
+ id: `tryCompleteBatch:${batchId}`,
+ job: "tryCompleteBatch",
+ payload: { batchId: batchId },
+ //2s in the future
+ availableAt: new Date(Date.now() + 2_000),
+ });
+ }
+
+ async getWaitpoint({
+ waitpointId,
+ environmentId,
+ projectId,
+ }: {
+ environmentId: string;
+ projectId: string;
+ waitpointId: string;
+ }): Promise<Waitpoint | null> {
+ const waitpoint = await this.prisma.waitpoint.findFirst({
+ where: { id: waitpointId },
+ include: {
+ blockingTaskRuns: {
+ select: {
+ taskRun: {
+ select: {
+ id: true,
+ friendlyId: true,
+ },
+ },
+ },
+ },
+ },
+ });
+
+ if (!waitpoint) return null;
+ if (waitpoint.environmentId !== environmentId) return null;
+
+ return waitpoint;
+ }
+
+ /**
+ * Prevents a run from continuing until the waitpoint is completed.
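+ *
+ * Illustrative usage (a sketch; `engine` is an assumed RunEngine instance, `waitpoint` comes
+ * from e.g. createManualWaitpoint, and the ids are made up):
+ *
+ *   await engine.blockRunWithWaitpoint({
+ *     runId: "run_123",
+ *     waitpoints: waitpoint.id,
+ *     environmentId: "env_123",
+ *     projectId: "project_123",
+ *     organizationId: "org_123",
+ *   });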
+ */
+ async blockRunWithWaitpoint({
+ runId,
+ waitpoints,
+ projectId,
+ organizationId,
+ releaseConcurrency,
+ timeout,
+ spanIdToComplete,
+ batch,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ runId: string;
+ waitpoints: string | string[];
+ environmentId: string;
+ projectId: string;
+ organizationId: string;
+ releaseConcurrency?: {
+ releaseQueue: boolean;
+ };
+ timeout?: Date;
+ spanIdToComplete?: string;
+ batch?: { id: string; index?: number };
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<TaskRunExecutionSnapshot> {
+ const prisma = tx ?? this.prisma;
+
+ let $waitpoints = typeof waitpoints === "string" ? [waitpoints] : waitpoints;
+
+ return await this.runLock.lock([runId], 5000, async (signal) => {
+ let snapshot: TaskRunExecutionSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ //block the run with the waitpoints, returning how many waitpoints are pending
+ const insert = await prisma.$queryRaw<{ pending_count: BigInt }[]>`
+ WITH inserted AS (
+ INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex")
+ SELECT
+ gen_random_uuid(),
+ ${runId},
+ w.id,
+ ${projectId},
+ NOW(),
+ NOW(),
+ ${spanIdToComplete ?? null},
+ ${batch?.id ?? null},
+ ${batch?.index ?? null}
+ FROM "Waitpoint" w
+ WHERE w.id IN (${Prisma.join($waitpoints)})
+ ON CONFLICT DO NOTHING
+ RETURNING "waitpointId"
+ )
+ SELECT COUNT(*) as pending_count
+ FROM inserted i
+ JOIN "Waitpoint" w ON w.id = i."waitpointId"
+ WHERE w.status = 'PENDING';`;
+
+ const pendingCount = Number(insert.at(0)?.pending_count ?? 0);
+
+ let newStatus: TaskRunExecutionStatus = "SUSPENDED";
+ if (
+ snapshot.executionStatus === "EXECUTING" ||
+ snapshot.executionStatus === "EXECUTING_WITH_WAITPOINTS"
+ ) {
+ newStatus = "EXECUTING_WITH_WAITPOINTS";
+ }
+
+ //if the state has changed, create a new snapshot
+ if (newStatus !== snapshot.executionStatus) {
+ snapshot = await this.#createExecutionSnapshot(prisma, {
+ run: {
+ id: snapshot.runId,
+ status: snapshot.runStatus,
+ attemptNumber: snapshot.attemptNumber,
+ },
+ snapshot: {
+ executionStatus: newStatus,
+ description: "Run was blocked by a waitpoint.",
+ },
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ batchId: batch?.id ?? snapshot.batchId ?? undefined,
+ workerId,
+ runnerId,
+ });
+
+ // Let the worker know immediately, so it can suspend the run
+ await this.#sendNotificationToWorker({ runId, snapshot });
+ }
+
+ if (timeout) {
+ for (const waitpoint of $waitpoints) {
+ await this.worker.enqueue({
+ id: `finishWaitpoint.${waitpoint}`,
+ job: "finishWaitpoint",
+ payload: {
+ waitpointId: waitpoint,
+ error: JSON.stringify(timeoutError(timeout)),
+ },
+ availableAt: timeout,
+ });
+ }
+ }
+
+ //no pending waitpoint, schedule unblocking the run
+ //debounce if we're rapidly adding waitpoints
+ if (pendingCount === 0) {
+ await this.worker.enqueue({
+ //this will debounce the call
+ id: `continueRunIfUnblocked:${runId}`,
+ job: "continueRunIfUnblocked",
+ payload: { runId: runId },
+ //in the near future
+ availableAt: new Date(Date.now() + 50),
+ });
+ } else {
+ if (releaseConcurrency) {
+ //release concurrency
+ await this.runQueue.releaseConcurrency(
+ organizationId,
+ runId,
+ releaseConcurrency.releaseQueue === true
+ );
+ }
+ }
+
+ return snapshot;
+ });
+ }
+
+ /** This completes a waitpoint and, for every run it was blocking, schedules a check to
+ * continue that run if it's no longer blocked by any other waitpoint. This doesn't suffer from race conditions. */
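+ // Illustrative usage (a sketch; `engine` is an assumed RunEngine instance and the output is made up):
+ //
+ //   await engine.completeWaitpoint({
+ //     id: waitpoint.id,
+ //     output: { value: JSON.stringify({ ok: true }), type: "application/json", isError: false },
+ //   });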
+ async completeWaitpoint({
+ id,
+ output,
+ }: {
+ id: string;
+ output?: {
+ value: string;
+ type?: string;
+ isError: boolean;
+ };
+ }): Promise<Waitpoint> {
+ const result = await $transaction(
+ this.prisma,
+ async (tx) => {
+ // 1. Find the TaskRuns blocked by this waitpoint
+ const affectedTaskRuns = await tx.taskRunWaitpoint.findMany({
+ where: { waitpointId: id },
+ select: { taskRunId: true, spanIdToComplete: true, createdAt: true },
+ });
+
+ if (affectedTaskRuns.length === 0) {
+ this.logger.warn(`completeWaitpoint: No TaskRunWaitpoints found for waitpoint`, {
+ waitpointId: id,
+ });
+ }
+
+ // 2. Update the waitpoint to completed (only if it's pending)
+ let waitpoint: Waitpoint | null = null;
+ try {
+ waitpoint = await tx.waitpoint.update({
+ where: { id, status: "PENDING" },
+ data: {
+ status: "COMPLETED",
+ completedAt: new Date(),
+ output: output?.value,
+ outputType: output?.type,
+ outputIsError: output?.isError,
+ },
+ });
+ } catch (error) {
+ if (error instanceof Prisma.PrismaClientKnownRequestError && error.code === "P2025") {
+ waitpoint = await tx.waitpoint.findFirst({
+ where: { id },
+ });
+ } else {
+ this.logger.log("completeWaitpoint: error updating waitpoint:", { error });
+ throw error;
+ }
+ }
+
+ return { waitpoint, affectedTaskRuns };
+ },
+ (error) => {
+ this.logger.error(`completeWaitpoint: Error completing waitpoint ${id}, retrying`, {
+ error,
+ });
+ throw error;
+ }
+ );
+
+ if (!result) {
+ throw new Error(`Waitpoint couldn't be updated`);
+ }
+
+ if (!result.waitpoint) {
+ throw new Error(`Waitpoint ${id} not found`);
+ }
+
+ //schedule trying to continue the runs
+ for (const run of result.affectedTaskRuns) {
+ await this.worker.enqueue({
+ //this will debounce the call
+ id: `continueRunIfUnblocked:${run.taskRunId}`,
+ job: "continueRunIfUnblocked",
+ payload: { runId: run.taskRunId },
+ //50ms in the future
+ availableAt: new Date(Date.now() + 50),
+ });
+
+ // emit an event to complete associated cached runs
+ if (run.spanIdToComplete) {
+ this.eventBus.emit("cachedRunCompleted", {
+ time: new Date(),
+ span: {
+ id: run.spanIdToComplete,
+ createdAt: run.createdAt,
+ },
+ blockedRunId: run.taskRunId,
+ hasError: output?.isError ?? false,
+ });
+ }
+ }
+
+ return result.waitpoint;
+ }
+
+ async createCheckpoint({
+ runId,
+ snapshotId,
+ checkpoint,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ runId: string;
+ snapshotId: string;
+ checkpoint: CheckpointInput;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+  }): Promise<CreateCheckpointResult> {
+ const prisma = tx ?? this.prisma;
+
+ return await this.runLock.lock([runId], 5_000, async (signal) => {
+ const snapshot = await getLatestExecutionSnapshot(prisma, runId);
+ if (snapshot.id !== snapshotId) {
+ this.eventBus.emit("incomingCheckpointDiscarded", {
+ time: new Date(),
+ run: {
+ id: runId,
+ },
+ checkpoint: {
+ discardReason: "Not the latest snapshot",
+ metadata: checkpoint,
+ },
+ snapshot: {
+ id: snapshot.id,
+ executionStatus: snapshot.executionStatus,
+ },
+ });
+
+ return {
+ ok: false as const,
+ error: "Not the latest snapshot",
+ };
+ }
+
+ if (!isCheckpointable(snapshot.executionStatus)) {
+ this.logger.error("Tried to createCheckpoint on a run in an invalid state", {
+ snapshot,
+ });
+
+ this.eventBus.emit("incomingCheckpointDiscarded", {
+ time: new Date(),
+ run: {
+ id: runId,
+ },
+ checkpoint: {
+ discardReason: `Status ${snapshot.executionStatus} is not checkpointable`,
+ metadata: checkpoint,
+ },
+ snapshot: {
+ id: snapshot.id,
+ executionStatus: snapshot.executionStatus,
+ },
+ });
+
+ return {
+ ok: false as const,
+ error: `Status ${snapshot.executionStatus} is not checkpointable`,
+ };
+ }
+
+ // Get the run and update the status
+ const run = await this.prisma.taskRun.update({
+ where: {
+ id: runId,
+ },
+ data: {
+ status: "WAITING_TO_RESUME",
+ },
+ select: {
+ id: true,
+ status: true,
+ attemptNumber: true,
+ runtimeEnvironment: {
+ select: {
+ id: true,
+ projectId: true,
+ },
+ },
+ },
+ });
+
+ if (!run) {
+ this.logger.error("Run not found for createCheckpoint", {
+ snapshot,
+ });
+
+ throw new ServiceValidationError("Run not found", 404);
+ }
+
+ // Create the checkpoint
+ const taskRunCheckpoint = await prisma.taskRunCheckpoint.create({
+ data: {
+ ...CheckpointId.generate(),
+ type: checkpoint.type,
+ location: checkpoint.location,
+ imageRef: checkpoint.imageRef,
+ reason: checkpoint.reason,
+ runtimeEnvironmentId: run.runtimeEnvironment.id,
+ projectId: run.runtimeEnvironment.projectId,
+ },
+ });
+
+ //create a new execution snapshot, with the checkpoint
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "SUSPENDED",
+ description: "Run was suspended after creating a checkpoint.",
+ },
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ checkpointId: taskRunCheckpoint.id,
+ workerId,
+ runnerId,
+ });
+
+ return {
+ ok: true as const,
+ ...executionResultFromSnapshot(newSnapshot),
+ checkpoint: taskRunCheckpoint,
+ } satisfies CreateCheckpointResult;
+ });
+ }
+
+ async continueRunExecution({
+ runId,
+ snapshotId,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ runId: string;
+ snapshotId: string;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+  }): Promise<ExecutionResult> {
+ const prisma = tx ?? this.prisma;
+
+ return await this.runLock.lock([runId], 5_000, async (signal) => {
+ const snapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ if (snapshot.id !== snapshotId) {
+ throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400);
+ }
+
+ if (!isPendingExecuting(snapshot.executionStatus)) {
+ throw new ServiceValidationError("Snapshot is not in a valid state to continue", 400);
+ }
+
+ // Get the run and update the status
+ const run = await this.prisma.taskRun.update({
+ where: {
+ id: runId,
+ },
+ data: {
+ status: "EXECUTING",
+ },
+ select: {
+ id: true,
+ status: true,
+ attemptNumber: true,
+ },
+ });
+
+ if (!run) {
+ this.logger.error("Run not found for createCheckpoint", {
+ snapshot,
+ });
+
+ throw new ServiceValidationError("Run not found", 404);
+ }
+
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "EXECUTING",
+ description: "Run was continued after being suspended",
+ },
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ completedWaitpoints: snapshot.completedWaitpoints,
+ workerId,
+ runnerId,
+ });
+
+ // Let worker know about the new snapshot so it can continue the run
+ await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot });
+
+ return {
+ ...executionResultFromSnapshot(newSnapshot),
+ } satisfies ExecutionResult;
+ });
+ }
+
+ /**
+  Send a heartbeat to signal that the run is still executing.
+  If a heartbeat isn't received, after a while the run is considered "stalled"
+  and some logic will be run to try to recover it.
+ @returns The ExecutionResult, which could be a different snapshot.
+ */
+ async heartbeatRun({
+ runId,
+ snapshotId,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ runId: string;
+ snapshotId: string;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+  }): Promise<ExecutionResult> {
+ const prisma = tx ?? this.prisma;
+
+ //we don't need to acquire a run lock for any of this, it's not critical if it happens on an older version
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+ if (latestSnapshot.id !== snapshotId) {
+ this.logger.log("heartbeatRun: no longer the latest snapshot, stopping the heartbeat.", {
+ runId,
+ snapshotId,
+ latestSnapshot,
+ workerId,
+ runnerId,
+ });
+
+ await this.worker.ack(`heartbeatSnapshot.${runId}`);
+ return executionResultFromSnapshot(latestSnapshot);
+ }
+
+ if (latestSnapshot.workerId !== workerId) {
+ this.logger.debug("heartbeatRun: worker ID does not match the latest snapshot", {
+ runId,
+ snapshotId,
+ latestSnapshot,
+ workerId,
+ runnerId,
+ });
+ }
+
+ //update the snapshot heartbeat time
+ await prisma.taskRunExecutionSnapshot.update({
+ where: { id: latestSnapshot.id },
+ data: {
+ lastHeartbeatAt: new Date(),
+ },
+ });
+
+ //extending is the same as creating a new heartbeat
+ await this.#setHeartbeatDeadline({ runId, snapshotId, status: latestSnapshot.executionStatus });
+
+ return executionResultFromSnapshot(latestSnapshot);
+ }
+
+ /** Get required data to execute the run */
+ async getRunExecutionData({
+ runId,
+ tx,
+ }: {
+ runId: string;
+ tx?: PrismaClientOrTransaction;
+  }): Promise<RunExecutionData | null> {
+ const prisma = tx ?? this.prisma;
+ try {
+ const snapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ const executionData: RunExecutionData = {
+ version: "1" as const,
+ snapshot: {
+ id: snapshot.id,
+ friendlyId: snapshot.friendlyId,
+ executionStatus: snapshot.executionStatus,
+ description: snapshot.description,
+ },
+ run: {
+ id: snapshot.runId,
+ friendlyId: snapshot.runFriendlyId,
+ status: snapshot.runStatus,
+ attemptNumber: snapshot.attemptNumber ?? undefined,
+ },
+ batch: snapshot.batchId
+ ? {
+ id: snapshot.batchId,
+ friendlyId: BatchId.toFriendlyId(snapshot.batchId),
+ }
+ : undefined,
+ checkpoint: snapshot.checkpoint
+ ? {
+ id: snapshot.checkpoint.id,
+ friendlyId: snapshot.checkpoint.friendlyId,
+ type: snapshot.checkpoint.type,
+ location: snapshot.checkpoint.location,
+ imageRef: snapshot.checkpoint.imageRef,
+ reason: snapshot.checkpoint.reason ?? undefined,
+ }
+ : undefined,
+ completedWaitpoints: snapshot.completedWaitpoints,
+ };
+
+ return executionData;
+ } catch (e) {
+ this.logger.error("Failed to getRunExecutionData", {
+ message: e instanceof Error ? e.message : e,
+ });
+ return null;
+ }
+ }
+
+ async quit() {
+ try {
+ //stop the run queue
+ await this.runQueue.quit();
+ await this.worker.stop();
+ await this.runLock.quit();
+
+      // This is just a failsafe; the connection may already be closed
+ await this.runLockRedis.quit();
+ } catch (error) {
+      // Swallow the error: the failsafe quit above is expected to throw if the connection is already closed
+ }
+ }
+
+ async #systemFailure({
+ runId,
+ error,
+ tx,
+ }: {
+ runId: string;
+ error: TaskRunInternalError;
+ tx?: PrismaClientOrTransaction;
+  }): Promise<CompleteRunAttemptResult> {
+ const prisma = tx ?? this.prisma;
+ return this.#trace("#systemFailure", { runId }, async (span) => {
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ //already finished
+ if (latestSnapshot.executionStatus === "FINISHED") {
+ //todo check run is in the correct state
+ return {
+ attemptStatus: "RUN_FINISHED",
+ snapshot: latestSnapshot,
+ run: {
+ id: runId,
+ friendlyId: latestSnapshot.runFriendlyId,
+ status: latestSnapshot.runStatus,
+ attemptNumber: latestSnapshot.attemptNumber,
+ },
+ };
+ }
+
+ const result = await this.#attemptFailed({
+ runId,
+ snapshotId: latestSnapshot.id,
+ completion: {
+ ok: false,
+ id: runId,
+ error,
+ },
+ tx: prisma,
+ });
+
+ return result;
+ });
+ }
+
+ async #expireRun({ runId, tx }: { runId: string; tx?: PrismaClientOrTransaction }) {
+ const prisma = tx ?? this.prisma;
+ await this.runLock.lock([runId], 5_000, async (signal) => {
+ const snapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ //if we're executing then we won't expire the run
+ if (isExecuting(snapshot.executionStatus)) {
+ return;
+ }
+
+ //only expire "PENDING" runs
+ const run = await prisma.taskRun.findUnique({ where: { id: runId } });
+
+ if (!run) {
+ this.logger.debug("Could not find enqueued run to expire", {
+ runId,
+ });
+ return;
+ }
+
+ if (run.status !== "PENDING") {
+ this.logger.debug("Run cannot be expired because it's not in PENDING status", {
+ run,
+ });
+ return;
+ }
+
+ if (run.lockedAt) {
+ this.logger.debug("Run cannot be expired because it's locked, so will run", {
+ run,
+ });
+ return;
+ }
+
+ const error: TaskRunError = {
+ type: "STRING_ERROR",
+ raw: `Run expired because the TTL (${run.ttl}) was reached`,
+ };
+
+ const updatedRun = await prisma.taskRun.update({
+ where: { id: runId },
+ data: {
+ status: "EXPIRED",
+ completedAt: new Date(),
+ expiredAt: new Date(),
+ error,
+ executionSnapshots: {
+ create: {
+ engine: "V2",
+ executionStatus: "FINISHED",
+ description: "Run was expired because the TTL was reached",
+ runStatus: "EXPIRED",
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ },
+ },
+ },
+ select: {
+ id: true,
+ spanId: true,
+ ttl: true,
+ associatedWaitpoint: {
+ select: {
+ id: true,
+ },
+ },
+ runtimeEnvironment: {
+ select: {
+ organizationId: true,
+ },
+ },
+ createdAt: true,
+ completedAt: true,
+ taskEventStore: true,
+ },
+ });
+
+ await this.runQueue.acknowledgeMessage(updatedRun.runtimeEnvironment.organizationId, runId);
+
+ if (!updatedRun.associatedWaitpoint) {
+ throw new ServiceValidationError("No associated waitpoint found", 400);
+ }
+
+ await this.completeWaitpoint({
+ id: updatedRun.associatedWaitpoint.id,
+ output: { value: JSON.stringify(error), isError: true },
+ });
+
+ this.eventBus.emit("runExpired", { run: updatedRun, time: new Date() });
+ });
+ }
+
+ async #waitingForDeploy({
+ orgId,
+ runId,
+ workerId,
+ runnerId,
+ reason,
+ tx,
+ }: {
+ orgId: string;
+ runId: string;
+ workerId?: string;
+ runnerId?: string;
+ reason?: string;
+ tx?: PrismaClientOrTransaction;
+ }) {
+ const prisma = tx ?? this.prisma;
+
+ return this.#trace("#waitingForDeploy", { runId }, async (span) => {
+ return this.runLock.lock([runId], 5_000, async (signal) => {
+ //mark run as waiting for deploy
+ const run = await prisma.taskRun.update({
+ where: { id: runId },
+ data: {
+ status: "WAITING_FOR_DEPLOY",
+ },
+ select: {
+ id: true,
+ status: true,
+ attemptNumber: true,
+ runtimeEnvironment: {
+ select: { id: true, type: true },
+ },
+ },
+ });
+
+ await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "RUN_CREATED",
+ description:
+ reason ??
+ "The run doesn't have a background worker, so we're going to ack it for now.",
+ },
+ environmentId: run.runtimeEnvironment.id,
+ environmentType: run.runtimeEnvironment.type,
+ workerId,
+ runnerId,
+ });
+
+ //we ack because when it's deployed it will be requeued
+ await this.runQueue.acknowledgeMessage(orgId, runId);
+ });
+ });
+ }
+
+ async #attemptSucceeded({
+ runId,
+ snapshotId,
+ completion,
+ tx,
+ workerId,
+ runnerId,
+ }: {
+ runId: string;
+ snapshotId: string;
+ completion: TaskRunSuccessfulExecutionResult;
+ tx: PrismaClientOrTransaction;
+ workerId?: string;
+ runnerId?: string;
+  }): Promise<CompleteRunAttemptResult> {
+ const prisma = tx ?? this.prisma;
+ return this.#trace("#completeRunAttemptSuccess", { runId, snapshotId }, async (span) => {
+ return this.runLock.lock([runId], 5_000, async (signal) => {
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ if (latestSnapshot.id !== snapshotId) {
+ throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400);
+ }
+
+ span.setAttribute("completionStatus", completion.ok);
+
+ const completedAt = new Date();
+
+ const run = await prisma.taskRun.update({
+ where: { id: runId },
+ data: {
+ status: "COMPLETED_SUCCESSFULLY",
+ completedAt,
+ output: completion.output,
+ outputType: completion.outputType,
+ executionSnapshots: {
+ create: {
+ executionStatus: "FINISHED",
+ description: "Task completed successfully",
+ runStatus: "COMPLETED_SUCCESSFULLY",
+ attemptNumber: latestSnapshot.attemptNumber,
+ environmentId: latestSnapshot.environmentId,
+ environmentType: latestSnapshot.environmentType,
+ workerId,
+ runnerId,
+ },
+ },
+ },
+ select: {
+ id: true,
+ friendlyId: true,
+ status: true,
+ attemptNumber: true,
+ spanId: true,
+ associatedWaitpoint: {
+ select: {
+ id: true,
+ },
+ },
+ project: {
+ select: {
+ organizationId: true,
+ },
+ },
+ batchId: true,
+ createdAt: true,
+ completedAt: true,
+ taskEventStore: true,
+ },
+ });
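+        //fetch the final snapshot we just created via the nested write above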
+ const newSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+ await this.runQueue.acknowledgeMessage(run.project.organizationId, runId);
+
+ // We need to manually emit this as we created the final snapshot as part of the task run update
+ this.eventBus.emit("executionSnapshotCreated", {
+ time: newSnapshot.createdAt,
+ run: {
+ id: newSnapshot.runId,
+ },
+ snapshot: {
+ ...newSnapshot,
+ completedWaitpointIds: newSnapshot.completedWaitpoints.map((wp) => wp.id),
+ },
+ });
+
+ if (!run.associatedWaitpoint) {
+ throw new ServiceValidationError("No associated waitpoint found", 400);
+ }
+
+ await this.completeWaitpoint({
+ id: run.associatedWaitpoint.id,
+ output: completion.output
+ ? { value: completion.output, type: completion.outputType, isError: false }
+ : undefined,
+ });
+
+ this.eventBus.emit("runSucceeded", {
+ time: completedAt,
+ run: {
+ id: runId,
+ spanId: run.spanId,
+ output: completion.output,
+ outputType: completion.outputType,
+ createdAt: run.createdAt,
+ completedAt: run.completedAt,
+ taskEventStore: run.taskEventStore,
+ },
+ });
+
+ await this.#finalizeRun(run);
+
+ return {
+ attemptStatus: "RUN_FINISHED",
+ snapshot: newSnapshot,
+ run,
+ };
+ });
+ });
+ }
+
+ async #attemptFailed({
+ runId,
+ snapshotId,
+ workerId,
+ runnerId,
+ completion,
+ forceRequeue,
+ tx,
+ }: {
+ runId: string;
+ snapshotId: string;
+ workerId?: string;
+ runnerId?: string;
+ completion: TaskRunFailedExecutionResult;
+ forceRequeue?: boolean;
+ tx: PrismaClientOrTransaction;
+  }): Promise<CompleteRunAttemptResult> {
+ const prisma = this.prisma;
+
+ return this.#trace("completeRunAttemptFailure", { runId, snapshotId }, async (span) => {
+ return this.runLock.lock([runId], 5_000, async (signal) => {
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+
+ if (latestSnapshot.id !== snapshotId) {
+ throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400);
+ }
+
+ span.setAttribute("completionStatus", completion.ok);
+
+ //remove waitpoints blocking the run
+ const deletedCount = await this.#clearBlockingWaitpoints({ runId, tx });
+ if (deletedCount > 0) {
+ this.logger.debug("Cleared blocking waitpoints", { runId, deletedCount });
+ }
+
+ const failedAt = new Date();
+
+ if (
+ completion.error.type === "INTERNAL_ERROR" &&
+ completion.error.code === "TASK_RUN_CANCELLED"
+ ) {
+ // We need to cancel the task run instead of fail it
+ const result = await this.cancelRun({
+ runId,
+ completedAt: failedAt,
+ reason: completion.error.message,
+ finalizeRun: true,
+ tx: prisma,
+ });
+ return {
+ attemptStatus:
+ result.snapshot.executionStatus === "PENDING_CANCEL"
+ ? "RUN_PENDING_CANCEL"
+ : "RUN_FINISHED",
+ ...result,
+ };
+ }
+
+ const error = sanitizeError(completion.error);
+ const retriableError = shouldRetryError(taskRunErrorEnhancer(completion.error));
+
+ const permanentlyFailRun = async (run?: {
+ status: TaskRunStatus;
+ spanId: string;
+ createdAt: Date;
+ completedAt: Date | null;
+ taskEventStore: string;
+ }) => {
+ // Emit an event so we can complete any spans of stalled executions
+ if (forceRequeue && run) {
+ this.eventBus.emit("runAttemptFailed", {
+ time: failedAt,
+ run: {
+ id: runId,
+ status: run.status,
+ spanId: run.spanId,
+ error,
+ attemptNumber: latestSnapshot.attemptNumber ?? 0,
+ createdAt: run.createdAt,
+ completedAt: run.completedAt,
+ taskEventStore: run.taskEventStore,
+ },
+ });
+ }
+
+ return await this.#permanentlyFailRun({
+ runId,
+ snapshotId,
+ failedAt,
+ error,
+ workerId,
+ runnerId,
+ });
+ };
+
+ // Error is not retriable, fail the run
+ if (!retriableError) {
+ return await permanentlyFailRun();
+ }
+
+ // No retry config attached to completion, fail the run
+ if (completion.retry === undefined) {
+ return await permanentlyFailRun();
+ }
+
+ // Run attempts have reached the global maximum, fail the run
+ if (
+ latestSnapshot.attemptNumber !== null &&
+ latestSnapshot.attemptNumber >= MAX_TASK_RUN_ATTEMPTS
+ ) {
+ return await permanentlyFailRun();
+ }
+
+ const minimalRun = await prisma.taskRun.findFirst({
+ where: {
+ id: runId,
+ },
+ select: {
+ status: true,
+ spanId: true,
+ maxAttempts: true,
+ runtimeEnvironment: {
+ select: {
+ organizationId: true,
+ },
+ },
+ taskEventStore: true,
+ createdAt: true,
+ completedAt: true,
+ },
+ });
+
+ if (!minimalRun) {
+ throw new ServiceValidationError("Run not found", 404);
+ }
+
+ // Run doesn't have any max attempts set which is required for retrying, fail the run
+ if (!minimalRun.maxAttempts) {
+ return await permanentlyFailRun(minimalRun);
+ }
+
+ // Run has reached the maximum configured number of attempts, fail the run
+ if (
+ latestSnapshot.attemptNumber !== null &&
+ latestSnapshot.attemptNumber >= minimalRun.maxAttempts
+ ) {
+ return await permanentlyFailRun(minimalRun);
+ }
+
+ // This error didn't come from user code, so we need to emit an event to complete any spans
+ if (forceRequeue) {
+ this.eventBus.emit("runAttemptFailed", {
+ time: failedAt,
+ run: {
+ id: runId,
+ status: minimalRun.status,
+ spanId: minimalRun.spanId,
+ error,
+ attemptNumber: latestSnapshot.attemptNumber ?? 0,
+ taskEventStore: minimalRun.taskEventStore,
+ createdAt: minimalRun.createdAt,
+ completedAt: minimalRun.completedAt,
+ },
+ });
+ }
+
+ const retryAt = new Date(completion.retry.timestamp);
+
+ const run = await prisma.taskRun.update({
+ where: {
+ id: runId,
+ },
+ data: {
+ status: "RETRYING_AFTER_FAILURE",
+ },
+ include: {
+ runtimeEnvironment: {
+ include: {
+ project: true,
+ organization: true,
+ orgMember: true,
+ },
+ },
+ },
+ });
+
+ const nextAttemptNumber =
+ latestSnapshot.attemptNumber === null ? 1 : latestSnapshot.attemptNumber + 1;
+
+ this.eventBus.emit("runRetryScheduled", {
+ time: failedAt,
+ run: {
+ id: run.id,
+ friendlyId: run.friendlyId,
+ attemptNumber: nextAttemptNumber,
+ queue: run.queue,
+ taskIdentifier: run.taskIdentifier,
+          traceContext: run.traceContext as Record<string, unknown>,
+ baseCostInCents: run.baseCostInCents,
+ spanId: run.spanId,
+ },
+ organization: {
+ id: run.runtimeEnvironment.organizationId,
+ },
+ environment: run.runtimeEnvironment,
+ retryAt,
+ });
+
+ //todo anything special for DEV? Ideally not.
+
+ //if it's a long delay and we support checkpointing, put it back in the queue
+ if (
+ forceRequeue ||
+ (this.options.retryWarmStartThresholdMs !== undefined &&
+ completion.retry.delay >= this.options.retryWarmStartThresholdMs)
+ ) {
+ //we nack the message, requeuing it for later
+ const nackResult = await this.#tryNackAndRequeue({
+ run,
+ environment: run.runtimeEnvironment,
+ orgId: run.runtimeEnvironment.organizationId,
+ timestamp: retryAt.getTime(),
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_RUN_DEQUEUED_MAX_RETRIES",
+ message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`,
+ },
+ tx: prisma,
+ });
+
+ if (!nackResult.wasRequeued) {
+ return {
+ attemptStatus: "RUN_FINISHED",
+ ...nackResult,
+ };
+ } else {
+ return { attemptStatus: "RETRY_QUEUED", ...nackResult };
+ }
+ }
+
+ //it will continue running because the retry delay is short
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "PENDING_EXECUTING",
+ description: "Attempt failed with a short delay, starting a new attempt",
+ },
+ environmentId: latestSnapshot.environmentId,
+ environmentType: latestSnapshot.environmentType,
+ workerId,
+ runnerId,
+ });
+ //the worker can fetch the latest snapshot and should create a new attempt
+ await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot });
+
+ return {
+ attemptStatus: "RETRY_IMMEDIATELY",
+ ...executionResultFromSnapshot(newSnapshot),
+ };
+ });
+ });
+ }
+
+ async #permanentlyFailRun({
+ runId,
+ snapshotId,
+ failedAt,
+ error,
+ workerId,
+ runnerId,
+ }: {
+ runId: string;
+ snapshotId: string;
+ failedAt: Date;
+ error: TaskRunError;
+ workerId?: string;
+ runnerId?: string;
+  }): Promise<CompleteRunAttemptResult> {
+ const prisma = this.prisma;
+
+ return this.#trace("permanentlyFailRun", { runId, snapshotId }, async (span) => {
+ const status = runStatusFromError(error);
+
+ //run permanently failed
+ const run = await prisma.taskRun.update({
+ where: {
+ id: runId,
+ },
+ data: {
+ status,
+ completedAt: failedAt,
+ error,
+ },
+ select: {
+ id: true,
+ friendlyId: true,
+ status: true,
+ attemptNumber: true,
+ spanId: true,
+ batchId: true,
+ associatedWaitpoint: {
+ select: {
+ id: true,
+ },
+ },
+ runtimeEnvironment: {
+ select: {
+ id: true,
+ type: true,
+ organizationId: true,
+ },
+ },
+ taskEventStore: true,
+ createdAt: true,
+ completedAt: true,
+ },
+ });
+
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run,
+ snapshot: {
+ executionStatus: "FINISHED",
+ description: "Run failed",
+ },
+ environmentId: run.runtimeEnvironment.id,
+ environmentType: run.runtimeEnvironment.type,
+ workerId,
+ runnerId,
+ });
+
+ if (!run.associatedWaitpoint) {
+ throw new ServiceValidationError("No associated waitpoint found", 400);
+ }
+
+ await this.completeWaitpoint({
+ id: run.associatedWaitpoint.id,
+ output: { value: JSON.stringify(error), isError: true },
+ });
+
+ await this.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId);
+
+ this.eventBus.emit("runFailed", {
+ time: failedAt,
+ run: {
+ id: runId,
+ status: run.status,
+ spanId: run.spanId,
+ error,
+ taskEventStore: run.taskEventStore,
+ createdAt: run.createdAt,
+ completedAt: run.completedAt,
+ },
+ });
+
+ await this.#finalizeRun(run);
+
+ return {
+ attemptStatus: "RUN_FINISHED",
+ snapshot: newSnapshot,
+ run,
+ };
+ });
+ }
+
+ //MARK: RunQueue
+
+ /** The run can be added to the queue. When it's pulled from the queue it will be executed. */
+ async #enqueueRun({
+ run,
+ env,
+ timestamp,
+ tx,
+ snapshot,
+ batchId,
+ checkpointId,
+ completedWaitpoints,
+ workerId,
+ runnerId,
+ }: {
+ run: TaskRun;
+ env: MinimalAuthenticatedEnvironment;
+ timestamp: number;
+ tx?: PrismaClientOrTransaction;
+ snapshot?: {
+ description?: string;
+ };
+ batchId?: string;
+ checkpointId?: string;
+ completedWaitpoints?: {
+ id: string;
+ index?: number;
+ }[];
+ workerId?: string;
+ runnerId?: string;
+ }) {
+ const prisma = tx ?? this.prisma;
+
+ await this.runLock.lock([run.id], 5000, async (signal) => {
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run: run,
+ snapshot: {
+ executionStatus: "QUEUED",
+ description: snapshot?.description ?? "Run was QUEUED",
+ },
+ batchId,
+ environmentId: env.id,
+ environmentType: env.type,
+ checkpointId,
+ completedWaitpoints,
+ workerId,
+ runnerId,
+ });
+
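+      //make the run available on its master queue and, if set, its secondary master queue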
+ const masterQueues = [run.masterQueue];
+ if (run.secondaryMasterQueue) {
+ masterQueues.push(run.secondaryMasterQueue);
+ }
+
+ await this.runQueue.enqueueMessage({
+ env,
+ masterQueues,
+ message: {
+ runId: run.id,
+ taskIdentifier: run.taskIdentifier,
+ orgId: env.organization.id,
+ projectId: env.project.id,
+ environmentId: env.id,
+ environmentType: env.type,
+ queue: run.queue,
+ concurrencyKey: run.concurrencyKey ?? undefined,
+ timestamp,
+ attempt: 0,
+ },
+ });
+ });
+ }
+
+ async #tryNackAndRequeue({
+ run,
+ environment,
+ orgId,
+ timestamp,
+ error,
+ workerId,
+ runnerId,
+ tx,
+ }: {
+ run: TaskRun;
+ environment: {
+ id: string;
+ type: RuntimeEnvironmentType;
+ };
+ orgId: string;
+ timestamp?: number;
+ error: TaskRunInternalError;
+ workerId?: string;
+ runnerId?: string;
+ tx?: PrismaClientOrTransaction;
+ }): Promise<{ wasRequeued: boolean } & ExecutionResult> {
+ const prisma = tx ?? this.prisma;
+
+ return await this.runLock.lock([run.id], 5000, async (signal) => {
+      //we nack the message, this allows another worker to pick up the run
+ const gotRequeued = await this.runQueue.nackMessage({
+ orgId,
+ messageId: run.id,
+ retryAt: timestamp,
+ });
+
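+      //if the message couldn't be requeued, fail the run with the provided error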
+ if (!gotRequeued) {
+ const result = await this.#systemFailure({
+ runId: run.id,
+ error,
+ tx: prisma,
+ });
+ return { wasRequeued: false, ...result };
+ }
+
+ const newSnapshot = await this.#createExecutionSnapshot(prisma, {
+ run: run,
+ snapshot: {
+ executionStatus: "QUEUED",
+ description: "Requeued the run after a failure",
+ },
+ environmentId: environment.id,
+ environmentType: environment.type,
+ workerId,
+ runnerId,
+ });
+
+ return {
+ wasRequeued: true,
+ snapshot: {
+ id: newSnapshot.id,
+ friendlyId: newSnapshot.friendlyId,
+ executionStatus: newSnapshot.executionStatus,
+ description: newSnapshot.description,
+ },
+ run: {
+ id: newSnapshot.runId,
+ friendlyId: newSnapshot.runFriendlyId,
+ status: newSnapshot.runStatus,
+ attemptNumber: newSnapshot.attemptNumber,
+ },
+ };
+ });
+ }
+
+ async #continueRunIfUnblocked({ runId }: { runId: string }) {
+    // 1. Get any blocking waitpoints
+ const blockingWaitpoints = await this.prisma.taskRunWaitpoint.findMany({
+ where: { taskRunId: runId },
+ select: {
+ batchId: true,
+ batchIndex: true,
+ waitpoint: {
+ select: { id: true, status: true },
+ },
+ },
+ });
+
+ // 2. There are blockers still, so do nothing
+ if (blockingWaitpoints.some((w) => w.waitpoint.status !== "COMPLETED")) {
+ return;
+ }
+
+ // 3. Get the run with environment
+ const run = await this.prisma.taskRun.findFirst({
+ where: {
+ id: runId,
+ },
+ include: {
+ runtimeEnvironment: {
+ select: {
+ id: true,
+ type: true,
+ maximumConcurrencyLimit: true,
+ project: { select: { id: true } },
+ organization: { select: { id: true } },
+ },
+ },
+ },
+ });
+
+ if (!run) {
+ throw new Error(`#continueRunIfUnblocked: run not found: ${runId}`);
+ }
+
+ //4. Continue the run whether it's executing or not
+ await this.runLock.lock([runId], 5000, async (signal) => {
+ const snapshot = await getLatestExecutionSnapshot(this.prisma, runId);
+
+ //run is still executing, send a message to the worker
+ if (isExecuting(snapshot.executionStatus)) {
+ const newSnapshot = await this.#createExecutionSnapshot(this.prisma, {
+ run: {
+ id: runId,
+ status: snapshot.runStatus,
+ attemptNumber: snapshot.attemptNumber,
+ },
+ snapshot: {
+ executionStatus: "EXECUTING",
+ description: "Run was continued, whilst still executing.",
+ },
+ environmentId: snapshot.environmentId,
+ environmentType: snapshot.environmentType,
+ batchId: snapshot.batchId ?? undefined,
+ completedWaitpoints: blockingWaitpoints.map((b) => ({
+ id: b.waitpoint.id,
+ index: b.batchIndex ?? undefined,
+ })),
+ });
+
+        //reacquire concurrency because the run is still executing and won't go through dequeuing (which would normally reacquire it)
+ await this.runQueue.reacquireConcurrency(run.runtimeEnvironment.organization.id, runId);
+
+ await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot });
+ } else {
+ if (snapshot.executionStatus !== "RUN_CREATED" && !snapshot.checkpointId) {
+ // TODO: We're screwed, should probably fail the run immediately
+ throw new Error(`#continueRunIfUnblocked: run has no checkpoint: ${run.id}`);
+ }
+
+ //put it back in the queue, with the original timestamp (w/ priority)
+ //this prioritizes dequeuing waiting runs over new runs
+ await this.#enqueueRun({
+ run,
+ env: run.runtimeEnvironment,
+ timestamp: run.createdAt.getTime() - run.priorityMs,
+ snapshot: {
+ description: "Run was QUEUED, because all waitpoints are completed",
+ },
+ batchId: snapshot.batchId ?? undefined,
+ completedWaitpoints: blockingWaitpoints.map((b) => ({
+ id: b.waitpoint.id,
+ index: b.batchIndex ?? undefined,
+ })),
+ checkpointId: snapshot.checkpointId ?? undefined,
+ });
+ }
+ });
+
+ //5. Remove the blocking waitpoints
+ await this.prisma.taskRunWaitpoint.deleteMany({
+ where: {
+ taskRunId: runId,
+ },
+ });
+ }
+
+ async #queueRunsWaitingForWorker({ backgroundWorkerId }: { backgroundWorkerId: string }) {
+ //It could be a lot of runs, so we will process them in a batch
+ //if there are still more to process we will enqueue this function again
+ const maxCount = this.options.queueRunsWaitingForWorkerBatchSize ?? 200;
+
+ const backgroundWorker = await this.prisma.backgroundWorker.findFirst({
+ where: {
+ id: backgroundWorkerId,
+ },
+ include: {
+ runtimeEnvironment: {
+ include: {
+ project: true,
+ organization: true,
+ },
+ },
+ tasks: true,
+ },
+ });
+
+ if (!backgroundWorker) {
+ this.logger.error("#queueRunsWaitingForWorker: background worker not found", {
+ id: backgroundWorkerId,
+ });
+ return;
+ }
+
+ const runsWaitingForDeploy = await this.prisma.taskRun.findMany({
+ where: {
+ runtimeEnvironmentId: backgroundWorker.runtimeEnvironmentId,
+ projectId: backgroundWorker.projectId,
+ status: "WAITING_FOR_DEPLOY",
+ taskIdentifier: {
+ in: backgroundWorker.tasks.map((task) => task.slug),
+ },
+ },
+ orderBy: {
+ createdAt: "asc",
+ },
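+      //take one extra so we know whether more remain to process after this batch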
+ take: maxCount + 1,
+ });
+
+ //none to process
+ if (!runsWaitingForDeploy.length) return;
+
+ for (const run of runsWaitingForDeploy) {
+ await this.prisma.$transaction(async (tx) => {
+ const updatedRun = await tx.taskRun.update({
+ where: {
+ id: run.id,
+ },
+ data: {
+ status: "PENDING",
+ },
+ });
+ await this.#enqueueRun({
+ run: updatedRun,
+ env: backgroundWorker.runtimeEnvironment,
+ //add to the queue using the original run created time
+ //this should ensure they're in the correct order in the queue
+ timestamp: updatedRun.createdAt.getTime() - updatedRun.priorityMs,
+ tx,
+ });
+ });
+ }
+
+ //enqueue more if needed
+ if (runsWaitingForDeploy.length > maxCount) {
+ await this.queueRunsWaitingForWorker({ backgroundWorkerId });
+ }
+ }
+
+ //MARK: - Waitpoints
+ async #createRunAssociatedWaitpoint(
+ tx: PrismaClientOrTransaction,
+ {
+ projectId,
+ environmentId,
+ completedByTaskRunId,
+ }: { projectId: string; environmentId: string; completedByTaskRunId: string }
+ ) {
+ return tx.waitpoint.create({
+ data: {
+ ...WaitpointId.generate(),
+ type: "RUN",
+ status: "PENDING",
+ idempotencyKey: nanoid(24),
+ userProvidedIdempotencyKey: false,
+ projectId,
+ environmentId,
+ completedByTaskRunId,
+ },
+ });
+ }
+
+ async #rescheduleDateTimeWaitpoint(
+ tx: PrismaClientOrTransaction,
+ waitpointId: string,
+ completedAfter: Date
+ ): Promise<{ success: true } | { success: false; error: string }> {
+ try {
+ const updatedWaitpoint = await tx.waitpoint.update({
+ where: { id: waitpointId, status: "PENDING" },
+ data: {
+ completedAfter,
+ },
+ });
+ } catch (error) {
+ if (error instanceof Prisma.PrismaClientKnownRequestError && error.code === "P2025") {
+ return {
+ success: false,
+ error: "Waitpoint doesn't exist or is already completed",
+ };
+ }
+
+ this.logger.error("Error rescheduling waitpoint", { error });
+
+ return {
+ success: false,
+ error: "An unknown error occurred",
+ };
+ }
+
+ //reschedule completion
+ await this.worker.enqueue({
+ id: `finishWaitpoint.${waitpointId}`,
+ job: "finishWaitpoint",
+ payload: { waitpointId: waitpointId },
+ availableAt: completedAfter,
+ });
+
+ return {
+ success: true,
+ };
+ }
+
+ async #clearBlockingWaitpoints({ runId, tx }: { runId: string; tx?: PrismaClientOrTransaction }) {
+ const prisma = tx ?? this.prisma;
+ const deleted = await prisma.taskRunWaitpoint.deleteMany({
+ where: {
+ taskRunId: runId,
+ },
+ });
+
+ return deleted.count;
+ }
+
+ //#region TaskRunExecutionSnapshots
+ async #createExecutionSnapshot(
+ prisma: PrismaClientOrTransaction,
+ {
+ run,
+ snapshot,
+ batchId,
+ environmentId,
+ environmentType,
+ checkpointId,
+ workerId,
+ runnerId,
+ completedWaitpoints,
+ error,
+ }: {
+ run: { id: string; status: TaskRunStatus; attemptNumber?: number | null };
+ snapshot: {
+ executionStatus: TaskRunExecutionStatus;
+ description: string;
+ };
+ batchId?: string;
+ environmentId: string;
+ environmentType: RuntimeEnvironmentType;
+ checkpointId?: string;
+ workerId?: string;
+ runnerId?: string;
+ completedWaitpoints?: {
+ id: string;
+ index?: number;
+ }[];
+ error?: string;
+ }
+ ) {
+ const newSnapshot = await prisma.taskRunExecutionSnapshot.create({
+ data: {
+ engine: "V2",
+ executionStatus: snapshot.executionStatus,
+ description: snapshot.description,
+ runId: run.id,
+ runStatus: run.status,
+ attemptNumber: run.attemptNumber ?? undefined,
+ batchId,
+ environmentId,
+ environmentType,
+ checkpointId,
+ workerId,
+ runnerId,
+ completedWaitpoints: {
+ connect: completedWaitpoints?.map((w) => ({ id: w.id })),
+ },
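+        // Persist the explicit ordering (e.g. batch index) of completed waitpoints that have one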
+ completedWaitpointOrder: completedWaitpoints
+ ?.filter((c) => c.index !== undefined)
+ .sort((a, b) => a.index! - b.index!)
+ .map((w) => w.id),
+ isValid: error ? false : true,
+ error,
+ },
+ include: {
+ checkpoint: true,
+ },
+ });
+
+ if (!error) {
+ //set heartbeat (if relevant)
+ await this.#setHeartbeatDeadline({
+ status: newSnapshot.executionStatus,
+ runId: run.id,
+ snapshotId: newSnapshot.id,
+ });
+ }
+
+ this.eventBus.emit("executionSnapshotCreated", {
+ time: newSnapshot.createdAt,
+ run: {
+ id: newSnapshot.runId,
+ },
+ snapshot: {
+ ...newSnapshot,
+ completedWaitpointIds: completedWaitpoints?.map((w) => w.id) ?? [],
+ },
+ });
+
+ return {
+ ...newSnapshot,
+ friendlyId: SnapshotId.toFriendlyId(newSnapshot.id),
+ runFriendlyId: RunId.toFriendlyId(newSnapshot.runId),
+ };
+ }
+
+ #getHeartbeatIntervalMs(status: TaskRunExecutionStatus): number | null {
+ switch (status) {
+ case "PENDING_EXECUTING": {
+ return this.heartbeatTimeouts.PENDING_EXECUTING;
+ }
+ case "PENDING_CANCEL": {
+ return this.heartbeatTimeouts.PENDING_CANCEL;
+ }
+ case "EXECUTING": {
+ return this.heartbeatTimeouts.EXECUTING;
+ }
+ case "EXECUTING_WITH_WAITPOINTS": {
+ return this.heartbeatTimeouts.EXECUTING_WITH_WAITPOINTS;
+ }
+ default: {
+ return null;
+ }
+ }
+ }
+
+ //#endregion
+
+ //#region Heartbeat
+ async #setHeartbeatDeadline({
+ runId,
+ snapshotId,
+ status,
+ }: {
+ runId: string;
+ snapshotId: string;
+ status: TaskRunExecutionStatus;
+ }) {
+ const intervalMs = this.#getHeartbeatIntervalMs(status);
+
+ if (intervalMs === null) {
+ return;
+ }
+
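+    // The job id is stable per run, so a later heartbeat re-enqueues and extends this same deadline check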
+ await this.worker.enqueue({
+ id: `heartbeatSnapshot.${runId}`,
+ job: "heartbeatSnapshot",
+ payload: { snapshotId, runId },
+ availableAt: new Date(Date.now() + intervalMs),
+ });
+ }
+
+ async #handleStalledSnapshot({
+ runId,
+ snapshotId,
+ tx,
+ }: {
+ runId: string;
+ snapshotId: string;
+ tx?: PrismaClientOrTransaction;
+ }) {
+ const prisma = tx ?? this.prisma;
+ return await this.runLock.lock([runId], 5_000, async (signal) => {
+ const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId);
+ if (latestSnapshot.id !== snapshotId) {
+ this.logger.log(
+ "RunEngine.#handleStalledSnapshot() no longer the latest snapshot, stopping the heartbeat.",
+ {
+ runId,
+ snapshotId,
+ latestSnapshot: latestSnapshot,
+ }
+ );
+
+ await this.worker.ack(`heartbeatSnapshot.${runId}`);
+ return;
+ }
+
+ this.logger.log("RunEngine.#handleStalledSnapshot() handling stalled snapshot", {
+ runId,
+ snapshot: latestSnapshot,
+ });
+
+ // For dev, we just cancel runs that are stuck
+ if (latestSnapshot.environmentType === "DEVELOPMENT") {
+ this.logger.log("RunEngine.#handleStalledSnapshot() cancelling DEV run", {
+ runId,
+ snapshot: latestSnapshot,
+ });
+
+ await this.cancelRun({
+ runId: latestSnapshot.runId,
+ finalizeRun: true,
+ reason:
+ "Run was disconnected, check you're running the CLI dev command and your network connection is healthy.",
+ tx,
+ });
+ return;
+ }
+
+ switch (latestSnapshot.executionStatus) {
+ case "RUN_CREATED": {
+ throw new NotImplementedError("There shouldn't be a heartbeat for RUN_CREATED");
+ }
+ case "QUEUED": {
+ throw new NotImplementedError("There shouldn't be a heartbeat for QUEUED");
+ }
+ case "PENDING_EXECUTING": {
+ //the run didn't start executing, we need to requeue it
+ const run = await prisma.taskRun.findFirst({
+ where: { id: runId },
+ include: {
+ runtimeEnvironment: {
+ include: {
+ organization: true,
+ },
+ },
+ },
+ });
+
+ if (!run) {
+ this.logger.error(
+ "RunEngine.#handleStalledSnapshot() PENDING_EXECUTING run not found",
+ {
+ runId,
+ snapshot: latestSnapshot,
+ }
+ );
+
+ throw new Error(`Run ${runId} not found`);
+ }
+
+ //it will automatically be requeued X times depending on the queue retry settings
+ const gotRequeued = await this.#tryNackAndRequeue({
+ run,
+ environment: {
+ id: latestSnapshot.environmentId,
+ type: latestSnapshot.environmentType,
+ },
+ orgId: run.runtimeEnvironment.organizationId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code: "TASK_RUN_DEQUEUED_MAX_RETRIES",
+ message: `Trying to create an attempt failed multiple times, exceeding how many times we retry.`,
+ },
+ tx: prisma,
+ });
+ break;
+ }
+ case "EXECUTING":
+ case "EXECUTING_WITH_WAITPOINTS": {
+ const retryDelay = 250;
+
+ //todo call attemptFailed and force requeuing
+ await this.#attemptFailed({
+ runId,
+ snapshotId: latestSnapshot.id,
+ completion: {
+ ok: false,
+ id: runId,
+ error: {
+ type: "INTERNAL_ERROR",
+ code:
+ latestSnapshot.executionStatus === "EXECUTING"
+ ? "TASK_RUN_STALLED_EXECUTING"
+ : "TASK_RUN_STALLED_EXECUTING_WITH_WAITPOINTS",
+ message: `Run stalled while executing. This can happen when the run becomes unresponsive, for example because the CPU is overloaded.`,
+ },
+ retry: {
+ //250ms in the future
+ timestamp: Date.now() + retryDelay,
+ delay: retryDelay,
+ },
+ },
+ forceRequeue: true,
+ tx: prisma,
+ });
+ break;
+ }
+ case "SUSPENDED": {
+ //todo should we do a periodic check here for whether waitpoints are actually still blocking?
+ //we could at least log some things out if a run has been in this state for a long time
+ throw new NotImplementedError("Not implemented SUSPENDED");
+ }
+ case "PENDING_CANCEL": {
+ //if the run is waiting to cancel but the worker hasn't confirmed that,
+ //we force the run to be cancelled
+ await this.cancelRun({
+ runId: latestSnapshot.runId,
+ finalizeRun: true,
+ tx,
+ });
+ break;
+ }
+ case "FINISHED": {
+ throw new NotImplementedError("There shouldn't be a heartbeat for FINISHED");
+ }
+ default: {
+ assertNever(latestSnapshot.executionStatus);
+ }
+ }
+ });
+ }
+
+ //#endregion
+
+ /**
+ * Sends a notification that a run has changed and we need to fetch the latest run state.
+ * The worker will call `getRunExecutionData` via the API and act accordingly.
+ */
+ async #sendNotificationToWorker({
+ runId,
+ snapshot,
+ }: {
+ runId: string;
+ snapshot: {
+ id: string;
+ executionStatus: TaskRunExecutionStatus;
+ };
+ }) {
+ this.eventBus.emit("workerNotification", {
+ time: new Date(),
+ run: {
+ id: runId,
+ },
+ snapshot: {
+ id: snapshot.id,
+ executionStatus: snapshot.executionStatus,
+ },
+ });
+ }
+
+ /*
+   * Whether the run succeeds, fails, or is cancelled, we need to run these operations.
+ */
+ async #finalizeRun({ id, batchId }: { id: string; batchId: string | null }) {
+ if (batchId) {
+ await this.tryCompleteBatch({ batchId });
+ }
+
+ //cancel the heartbeats
+ await this.worker.ack(`heartbeatSnapshot.${id}`);
+ }
+
+ /**
+   * Checks whether all runs for a BatchTaskRun are completed; if they are, updates the batch status.
+ * This isn't used operationally, but it's used for the Batches dashboard page.
+ */
+ async #tryCompleteBatch({ batchId }: { batchId: string }) {
+ return this.#trace(
+ "#tryCompleteBatch",
+ {
+ batchId,
+ },
+ async (span) => {
+ const batch = await this.prisma.batchTaskRun.findUnique({
+ select: {
+ status: true,
+ runtimeEnvironmentId: true,
+ },
+ where: {
+ id: batchId,
+ },
+ });
+
+ if (!batch) {
+ this.logger.error("#tryCompleteBatch batch doesn't exist", { batchId });
+ return;
+ }
+
+ if (batch.status === "COMPLETED") {
+ this.logger.debug("#tryCompleteBatch: Batch already completed", { batchId });
+ return;
+ }
+
+ const runs = await this.prisma.taskRun.findMany({
+ select: {
+ id: true,
+ status: true,
+ },
+ where: {
+ batchId,
+ runtimeEnvironmentId: batch.runtimeEnvironmentId,
+ },
+ });
+
+ if (runs.every((r) => isFinalRunStatus(r.status))) {
+ this.logger.debug("#tryCompleteBatch: All runs are completed", { batchId });
+ await this.prisma.batchTaskRun.update({
+ where: {
+ id: batchId,
+ },
+ data: {
+ status: "COMPLETED",
+ },
+ });
+ } else {
+ this.logger.debug("#tryCompleteBatch: Not all runs are completed", { batchId });
+ }
+ }
+ );
+ }
+
+ async #getAuthenticatedEnvironmentFromRun(runId: string, tx?: PrismaClientOrTransaction) {
+ const prisma = tx ?? this.prisma;
+ const taskRun = await prisma.taskRun.findUnique({
+ where: {
+ id: runId,
+ },
+ include: {
+ runtimeEnvironment: {
+ include: {
+ organization: true,
+ project: true,
+ },
+ },
+ },
+ });
+
+ if (!taskRun) {
+ return;
+ }
+
+ return taskRun?.runtimeEnvironment;
+ }
+
+ #environmentMasterQueueKey(environmentId: string) {
+ return `master-env:${environmentId}`;
+ }
+
+ #backgroundWorkerQueueKey(backgroundWorkerId: string) {
+ return `master-background-worker:${backgroundWorkerId}`;
+ }
+
+ async #trace(
+ trace: string,
+ attributes: Attributes | undefined,
+ fn: (span: Span) => Promise
+ ): Promise {
+ return this.tracer.startActiveSpan(
+ `${this.constructor.name}.${trace}`,
+ { attributes, kind: SpanKind.SERVER },
+ async (span) => {
+ try {
+ return await fn(span);
+ } catch (e) {
+ if (e instanceof ServiceValidationError) {
+ throw e;
+ }
+
+ if (e instanceof Error) {
+ span.recordException(e);
+ } else {
+ span.recordException(new Error(String(e)));
+ }
+
+ throw e;
+ } finally {
+ span.end();
+ }
+ }
+ );
+ }
+}
+
+export class ServiceValidationError extends Error {
+ constructor(
+ message: string,
+ public status?: number
+ ) {
+ super(message);
+ this.name = "ServiceValidationError";
+ }
+}
+
+class NotImplementedError extends Error {
+ constructor(message: string) {
+ console.error("This isn't implemented", { message });
+ super(message);
+ }
+}
+
+export class RunDuplicateIdempotencyKeyError extends Error {
+ constructor(message: string) {
+ super(message);
+ this.name = "RunDuplicateIdempotencyKeyError";
+ }
+}
diff --git a/internal-packages/run-engine/src/engine/locking.test.ts b/internal-packages/run-engine/src/engine/locking.test.ts
new file mode 100644
index 0000000000..f78cd79e86
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/locking.test.ts
@@ -0,0 +1,48 @@
+import { redisTest } from "@internal/testcontainers";
+import { expect } from "vitest";
+import { RunLocker } from "./locking.js";
+import Redis from "ioredis";
+
+describe("RunLocker", () => {
+ redisTest("Test acquiring a lock works", { timeout: 15_000 }, async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+ try {
+ const runLock = new RunLocker({ redis });
+
+ expect(runLock.isInsideLock()).toBe(false);
+
+ await runLock.lock(["test-1"], 5000, async (signal) => {
+ expect(signal).toBeDefined();
+ expect(runLock.isInsideLock()).toBe(true);
+ });
+
+ expect(runLock.isInsideLock()).toBe(false);
+ } finally {
+ await redis.quit();
+ }
+ });
+
+ redisTest("Test double locking works", { timeout: 15_000 }, async ({ redisOptions }) => {
+ const redis = new Redis(redisOptions);
+ try {
+ const runLock = new RunLocker({ redis });
+
+ expect(runLock.isInsideLock()).toBe(false);
+
+ await runLock.lock(["test-1"], 5000, async (signal) => {
+ expect(signal).toBeDefined();
+ expect(runLock.isInsideLock()).toBe(true);
+
+ //should be able to "lock it again"
+ await runLock.lock(["test-1"], 5000, async (signal) => {
+ expect(signal).toBeDefined();
+ expect(runLock.isInsideLock()).toBe(true);
+ });
+ });
+
+ expect(runLock.isInsideLock()).toBe(false);
+ } finally {
+ await redis.quit();
+ }
+ });
+});
diff --git a/internal-packages/run-engine/src/engine/locking.ts b/internal-packages/run-engine/src/engine/locking.ts
new file mode 100644
index 0000000000..cd3aecc7c6
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/locking.ts
@@ -0,0 +1,60 @@
+import Redis from "ioredis";
+import Redlock, { RedlockAbortSignal } from "redlock";
+import { AsyncLocalStorage } from "async_hooks";
+
+interface LockContext {
+ resources: string;
+ signal: RedlockAbortSignal;
+}
+
+export class RunLocker {
+ private redlock: Redlock;
+  private asyncLocalStorage: AsyncLocalStorage<LockContext>;
+
+ constructor(options: { redis: Redis }) {
+ this.redlock = new Redlock([options.redis], {
+ driftFactor: 0.01,
+ retryCount: 10,
+ retryDelay: 200, // time in ms
+ retryJitter: 200, // time in ms
+ automaticExtensionThreshold: 500, // time in ms
+ });
+ this.asyncLocalStorage = new AsyncLocalStorage();
+ }
+
+ /** Locks resources using RedLock. It won't lock again if we're already inside a lock with the same resources. */
+  async lock<T>(
+    resources: string[],
+    duration: number,
+    routine: (signal: RedlockAbortSignal) => Promise<T>
+  ): Promise<T> {
+ const currentContext = this.asyncLocalStorage.getStore();
+ const joinedResources = resources.sort().join(",");
+
+ if (currentContext && currentContext.resources === joinedResources) {
+ // We're already inside a lock with the same resources, just run the routine
+ return routine(currentContext.signal);
+ }
+
+ // Different resources or not in a lock, proceed with new lock
+ return this.redlock.using(resources, duration, async (signal) => {
+ const newContext: LockContext = { resources: joinedResources, signal };
+
+ return this.asyncLocalStorage.run(newContext, async () => {
+ return routine(signal);
+ });
+ });
+ }
+
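+  /** Returns true if the caller is currently executing inside a lock held by this RunLocker. */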
+ isInsideLock(): boolean {
+ return !!this.asyncLocalStorage.getStore();
+ }
+
+ getCurrentResources(): string | undefined {
+ return this.asyncLocalStorage.getStore()?.resources;
+ }
+
+ async quit() {
+ await this.redlock.quit();
+ }
+}
diff --git a/internal-packages/run-engine/src/engine/machinePresets.ts b/internal-packages/run-engine/src/engine/machinePresets.ts
new file mode 100644
index 0000000000..4c526942a7
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/machinePresets.ts
@@ -0,0 +1,74 @@
+import { MachineConfig, MachinePreset, MachinePresetName } from "@trigger.dev/core/v3";
+import { Logger } from "@trigger.dev/core/logger";
+
+const logger = new Logger("machinePresetFromConfig");
+
+export function getMachinePreset({
+ defaultMachine,
+ machines,
+ config,
+ run,
+}: {
+ defaultMachine: MachinePresetName;
+  machines: Record<string, MachinePreset>;
+ config: unknown;
+ run: { machinePreset: string | null };
+}): MachinePreset {
+ if (run.machinePreset) {
+ const preset = MachinePresetName.safeParse(run.machinePreset);
+ if (preset.error) {
+ logger.error("Failed to parse machine preset", { machinePreset: run.machinePreset });
+ return machinePresetFromName(machines, defaultMachine);
+ }
+ return machinePresetFromName(machines, preset.data);
+ }
+
+ const parsedConfig = MachineConfig.safeParse(config);
+
+ if (!parsedConfig.success) {
+ logger.error("Failed to parse machine config", { config });
+
+ return machinePresetFromName(machines, "small-1x");
+ }
+
+ if (parsedConfig.data.preset) {
+ return machinePresetFromName(machines, parsedConfig.data.preset);
+ }
+
+ if (parsedConfig.data.cpu && parsedConfig.data.memory) {
+ const name = derivePresetNameFromValues(
+ machines,
+ parsedConfig.data.cpu,
+ parsedConfig.data.memory
+ );
+ if (!name) {
+ return machinePresetFromName(machines, defaultMachine);
+ }
+
+ return machinePresetFromName(machines, name);
+ }
+
+ return machinePresetFromName(machines, "small-1x");
+}
+
+export function machinePresetFromName(
+  machines: Record<string, MachinePreset>,
+ name: MachinePresetName
+): MachinePreset {
+ return {
+ ...machines[name],
+ };
+}
+
+// Finds the smallest machine preset name that satisfies the given CPU and memory requirements
+function derivePresetNameFromValues(
+  machines: Record<string, MachinePreset>,
+ cpu: number,
+ memory: number
+): MachinePresetName | undefined {
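+  // Returns the first entry that satisfies both requirements, which assumes the
+  // machines record is ordered from smallest to largest preset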
+ for (const [name, preset] of Object.entries(machines)) {
+ if (preset.cpu >= cpu && preset.memory >= memory) {
+ return name as MachinePresetName;
+ }
+ }
+}
diff --git a/internal-packages/run-engine/src/engine/statuses.ts b/internal-packages/run-engine/src/engine/statuses.ts
new file mode 100644
index 0000000000..27ba540be1
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/statuses.ts
@@ -0,0 +1,46 @@
+import { TaskRunExecutionStatus, TaskRunStatus } from "@trigger.dev/database";
+
+export function isDequeueableExecutionStatus(status: TaskRunExecutionStatus): boolean {
+ const dequeuableExecutionStatuses: TaskRunExecutionStatus[] = ["QUEUED"];
+ return dequeuableExecutionStatuses.includes(status);
+}
+
+export function isExecuting(status: TaskRunExecutionStatus): boolean {
+ const executingExecutionStatuses: TaskRunExecutionStatus[] = [
+ "EXECUTING",
+ "EXECUTING_WITH_WAITPOINTS",
+ ];
+ return executingExecutionStatuses.includes(status);
+}
+
+export function isPendingExecuting(status: TaskRunExecutionStatus): boolean {
+ const pendingExecutionStatuses: TaskRunExecutionStatus[] = ["PENDING_EXECUTING"];
+ return pendingExecutionStatuses.includes(status);
+}
+
+export function isCheckpointable(status: TaskRunExecutionStatus): boolean {
+ const checkpointableStatuses: TaskRunExecutionStatus[] = [
+ //will allow checkpoint starts
+ "RUN_CREATED",
+ "QUEUED",
+ //executing
+ "EXECUTING",
+ "EXECUTING_WITH_WAITPOINTS",
+ ];
+ return checkpointableStatuses.includes(status);
+}
+
+export function isFinalRunStatus(status: TaskRunStatus): boolean {
+ const finalStatuses: TaskRunStatus[] = [
+ "CANCELED",
+ "INTERRUPTED",
+ "COMPLETED_SUCCESSFULLY",
+ "COMPLETED_WITH_ERRORS",
+ "SYSTEM_FAILURE",
+ "CRASHED",
+ "EXPIRED",
+ "TIMED_OUT",
+ ];
+
+ return finalStatuses.includes(status);
+}
diff --git a/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts b/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts
new file mode 100644
index 0000000000..c3b78f0086
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts
@@ -0,0 +1,182 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { generateFriendlyId } from "@trigger.dev/core/v3/apps";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "node:timers/promises";
+
+describe("RunEngine batchTrigger", () => {
+ containerTest(
+ "Batch trigger shares a batch",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0005,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ const batch = await prisma.batchTaskRun.create({
+ data: {
+ friendlyId: generateFriendlyId("batch"),
+ runtimeEnvironmentId: authenticatedEnvironment.id,
+ },
+ });
+
+ //trigger the runs
+ const run1 = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ batch: { id: batch.id, index: 0 },
+ },
+ prisma
+ );
+
+ const run2 = await engine.trigger(
+ {
+ number: 2,
+ friendlyId: "run_1235",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ batch: { id: batch.id, index: 1 },
+ },
+ prisma
+ );
+
+ expect(run1).toBeDefined();
+ expect(run1.friendlyId).toBe("run_1234");
+ expect(run1.batchId).toBe(batch.id);
+
+ expect(run2).toBeDefined();
+ expect(run2.friendlyId).toBe("run_1235");
+ expect(run2.batchId).toBe(batch.id);
+
+ //check the queue length
+ const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment);
+ expect(queueLength).toBe(2);
+
+ //dequeue
+ const [d1, d2] = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run1.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //attempts
+ const attempt1 = await engine.startRunAttempt({
+ runId: d1.run.id,
+ snapshotId: d1.snapshot.id,
+ });
+ const attempt2 = await engine.startRunAttempt({
+ runId: d2.run.id,
+ snapshotId: d2.snapshot.id,
+ });
+
+ //complete the runs
+ const result1 = await engine.completeRunAttempt({
+ runId: attempt1.run.id,
+ snapshotId: attempt1.snapshot.id,
+ completion: {
+ ok: true,
+ id: attempt1.run.id,
+ output: `{"foo":"bar"}`,
+ outputType: "application/json",
+ },
+ });
+ const result2 = await engine.completeRunAttempt({
+ runId: attempt2.run.id,
+ snapshotId: attempt2.snapshot.id,
+ completion: {
+ ok: true,
+ id: attempt2.run.id,
+ output: `{"baz":"qux"}`,
+ outputType: "application/json",
+ },
+ });
+
+ //the batch won't complete immediately
+ const batchAfter1 = await prisma.batchTaskRun.findUnique({
+ where: {
+ id: batch.id,
+ },
+ });
+ expect(batchAfter1?.status).toBe("PENDING");
+
+ await setTimeout(3_000);
+
+ //the batch should complete
+ const batchAfter2 = await prisma.batchTaskRun.findUnique({
+ where: {
+ id: batch.id,
+ },
+ });
+ expect(batchAfter2?.status).toBe("COMPLETED");
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts b/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts
new file mode 100644
index 0000000000..bbe5ab2de6
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts
@@ -0,0 +1,374 @@
+import {
+ assertNonNullable,
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "node:timers/promises";
+import { generateFriendlyId } from "@trigger.dev/core/v3/apps";
+
+describe("RunEngine batchTriggerAndWait", () => {
+ containerTest(
+ "batchTriggerAndWait (no idempotency)",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 20,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const parentTask = "parent-task";
+ const childTask = "child-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, [parentTask, childTask]);
+
+ //create a batch
+ const batch = await prisma.batchTaskRun.create({
+ data: {
+ friendlyId: generateFriendlyId("batch"),
+ runtimeEnvironmentId: authenticatedEnvironment.id,
+ },
+ });
+
+ //trigger the run
+ const parentRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: parentTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${parentTask}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue parent
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: parentRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const initialExecutionData = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(initialExecutionData);
+ const attemptResult = await engine.startRunAttempt({
+ runId: parentRun.id,
+ snapshotId: initialExecutionData.snapshot.id,
+ });
+
+ //block using the batch
+ await engine.blockRunWithCreatedBatch({
+ runId: parentRun.id,
+ batchId: batch.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ });
+
+ const afterBlockedByBatch = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(afterBlockedByBatch);
+ expect(afterBlockedByBatch.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ const child1 = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_c1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: childTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${childTask}`,
+ isTest: false,
+ tags: [],
+ resumeParentOnCompletion: true,
+ parentTaskRunId: parentRun.id,
+ batch: { id: batch.id, index: 0 },
+ },
+ prisma
+ );
+
+ const parentAfterChild1 = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(parentAfterChild1);
+ expect(parentAfterChild1.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ const child2 = await engine.trigger(
+ {
+ number: 2,
+ friendlyId: "run_c12345",
+ environment: authenticatedEnvironment,
+ taskIdentifier: childTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t123456",
+ spanId: "s123456",
+ masterQueue: "main",
+ queueName: `task/${childTask}`,
+ isTest: false,
+ tags: [],
+ resumeParentOnCompletion: true,
+ parentTaskRunId: parentRun.id,
+ batch: { id: batch.id, index: 1 },
+ },
+ prisma
+ );
+
+ const parentAfterChild2 = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(parentAfterChild2);
+ expect(parentAfterChild2.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check the waitpoint blocking the parent run
+ const runWaitpoints = await prisma.taskRunWaitpoint.findMany({
+ where: {
+ taskRunId: parentRun.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ orderBy: {
+ createdAt: "asc",
+ },
+ });
+ expect(runWaitpoints.length).toBe(3);
+ const child1Waitpoint = runWaitpoints.find(
+ (w) => w.waitpoint.completedByTaskRunId === child1.id
+ );
+ expect(child1Waitpoint?.waitpoint.type).toBe("RUN");
+ expect(child1Waitpoint?.waitpoint.completedByTaskRunId).toBe(child1.id);
+ expect(child1Waitpoint?.batchId).toBe(batch.id);
+ expect(child1Waitpoint?.batchIndex).toBe(0);
+ const child2Waitpoint = runWaitpoints.find(
+ (w) => w.waitpoint.completedByTaskRunId === child2.id
+ );
+ expect(child2Waitpoint?.waitpoint.type).toBe("RUN");
+ expect(child2Waitpoint?.waitpoint.completedByTaskRunId).toBe(child2.id);
+ expect(child2Waitpoint?.batchId).toBe(batch.id);
+ expect(child2Waitpoint?.batchIndex).toBe(1);
+ const batchWaitpoint = runWaitpoints.find((w) => w.waitpoint.type === "BATCH");
+ expect(batchWaitpoint?.waitpoint.type).toBe("BATCH");
+ expect(batchWaitpoint?.waitpoint.completedByBatchId).toBe(batch.id);
+
+ await engine.unblockRunForCreatedBatch({
+ runId: parentRun.id,
+ batchId: batch.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ });
+
+ //dequeue and start the 1st child
+ const dequeuedChild = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: child1.masterQueue,
+ maxRunCount: 1,
+ });
+
+ expect(dequeuedChild.length).toBe(1);
+
+ const childAttempt1 = await engine.startRunAttempt({
+ runId: dequeuedChild[0].run.id,
+ snapshotId: dequeuedChild[0].snapshot.id,
+ });
+
+ // complete the 1st child
+ await engine.completeRunAttempt({
+ runId: childAttempt1.run.id,
+ snapshotId: childAttempt1.snapshot.id,
+ completion: {
+ id: child1.id,
+ ok: true,
+ output: '{"foo":"bar"}',
+ outputType: "application/json",
+ },
+ });
+
+ //child snapshot
+ const childExecutionDataAfter = await engine.getRunExecutionData({
+ runId: childAttempt1.run.id,
+ });
+ assertNonNullable(childExecutionDataAfter);
+ expect(childExecutionDataAfter.snapshot.executionStatus).toBe("FINISHED");
+
+ const child1WaitpointAfter = await prisma.waitpoint.findFirst({
+ where: {
+ id: child1Waitpoint?.waitpointId,
+ },
+ });
+ expect(child1WaitpointAfter?.completedAt).not.toBeNull();
+ expect(child1WaitpointAfter?.status).toBe("COMPLETED");
+ expect(child1WaitpointAfter?.output).toBe('{"foo":"bar"}');
+
+ await setTimeout(500);
+
+ const runWaitpointsAfterFirstChild = await prisma.taskRunWaitpoint.findMany({
+ where: {
+ taskRunId: parentRun.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointsAfterFirstChild.length).toBe(3);
+
+ //parent snapshot
+ const parentExecutionDataAfterFirstChildComplete = await engine.getRunExecutionData({
+ runId: parentRun.id,
+ });
+ assertNonNullable(parentExecutionDataAfterFirstChildComplete);
+ expect(parentExecutionDataAfterFirstChildComplete.snapshot.executionStatus).toBe(
+ "EXECUTING_WITH_WAITPOINTS"
+ );
+ expect(parentExecutionDataAfterFirstChildComplete.batch?.id).toBe(batch.id);
+ expect(parentExecutionDataAfterFirstChildComplete.completedWaitpoints.length).toBe(0);
+
+ expect(await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment)).toBe(1);
+
+ //dequeue and start the 2nd child
+ const dequeuedChild2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: child2.masterQueue,
+ maxRunCount: 1,
+ });
+
+ expect(dequeuedChild2.length).toBe(1);
+
+ const childAttempt2 = await engine.startRunAttempt({
+ runId: child2.id,
+ snapshotId: dequeuedChild2[0].snapshot.id,
+ });
+ await engine.completeRunAttempt({
+ runId: child2.id,
+ snapshotId: childAttempt2.snapshot.id,
+ completion: {
+ id: child2.id,
+ ok: true,
+ output: '{"baz":"qux"}',
+ outputType: "application/json",
+ },
+ });
+
+ //child snapshot
+      const child2ExecutionDataAfter = await engine.getRunExecutionData({ runId: child2.id });
+ assertNonNullable(child2ExecutionDataAfter);
+ expect(child2ExecutionDataAfter.snapshot.executionStatus).toBe("FINISHED");
+
+ const child2WaitpointAfter = await prisma.waitpoint.findFirst({
+ where: {
+ id: child2Waitpoint?.waitpointId,
+ },
+ });
+ expect(child2WaitpointAfter?.completedAt).not.toBeNull();
+ expect(child2WaitpointAfter?.status).toBe("COMPLETED");
+ expect(child2WaitpointAfter?.output).toBe('{"baz":"qux"}');
+
+ await setTimeout(500);
+
+ const runWaitpointsAfterSecondChild = await prisma.taskRunWaitpoint.findMany({
+ where: {
+ taskRunId: parentRun.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointsAfterSecondChild.length).toBe(0);
+
+ //parent snapshot
+ const parentExecutionDataAfterSecondChildComplete = await engine.getRunExecutionData({
+ runId: parentRun.id,
+ });
+ assertNonNullable(parentExecutionDataAfterSecondChildComplete);
+ expect(parentExecutionDataAfterSecondChildComplete.snapshot.executionStatus).toBe(
+ "EXECUTING"
+ );
+ expect(parentExecutionDataAfterSecondChildComplete.batch?.id).toBe(batch.id);
+ expect(parentExecutionDataAfterSecondChildComplete.completedWaitpoints.length).toBe(3);
+
+ const completedWaitpoint0 =
+ parentExecutionDataAfterSecondChildComplete.completedWaitpoints.find(
+ (w) => w.index === 0
+ );
+ assertNonNullable(completedWaitpoint0);
+ expect(completedWaitpoint0.id).toBe(child1Waitpoint!.waitpointId);
+ expect(completedWaitpoint0.completedByTaskRun?.id).toBe(child1.id);
+ expect(completedWaitpoint0.completedByTaskRun?.batch?.id).toBe(batch.id);
+ expect(completedWaitpoint0.output).toBe('{"foo":"bar"}');
+ expect(completedWaitpoint0.index).toBe(0);
+
+ const completedWaitpoint1 =
+ parentExecutionDataAfterSecondChildComplete.completedWaitpoints.find(
+ (w) => w.index === 1
+ );
+ assertNonNullable(completedWaitpoint1);
+ expect(completedWaitpoint1.id).toBe(child2Waitpoint!.waitpointId);
+ expect(completedWaitpoint1.completedByTaskRun?.id).toBe(child2.id);
+ expect(completedWaitpoint1.completedByTaskRun?.batch?.id).toBe(batch.id);
+ expect(completedWaitpoint1.index).toBe(1);
+ expect(completedWaitpoint1.output).toBe('{"baz":"qux"}');
+
+ const batchWaitpointAfter =
+ parentExecutionDataAfterSecondChildComplete.completedWaitpoints.find(
+ (w) => w.type === "BATCH"
+ );
+ expect(batchWaitpointAfter?.id).toBe(batchWaitpoint?.waitpointId);
+ expect(batchWaitpointAfter?.completedByBatch?.id).toBe(batch.id);
+ expect(batchWaitpointAfter?.index).toBeUndefined();
+
+ const batchAfter = await prisma.batchTaskRun.findUnique({
+ where: {
+ id: batch.id,
+ },
+ });
+      expect(batchAfter?.status).toBe("COMPLETED");
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/cancelling.test.ts b/internal-packages/run-engine/src/engine/tests/cancelling.test.ts
new file mode 100644
index 0000000000..e867b6a694
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/cancelling.test.ts
@@ -0,0 +1,336 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+ assertNonNullable,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+import { EventBusEventArgs } from "../eventBus.js";
+
+describe("RunEngine cancelling", () => {
+ containerTest(
+ "Cancelling a run with children (that is executing)",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const parentTask = "parent-task";
+ const childTask = "child-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, [parentTask, childTask]);
+
+ //trigger the run
+ const parentRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: parentTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${parentTask}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: parentRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ //start child run
+ const childRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_c1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: childTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${childTask}`,
+ isTest: false,
+ tags: [],
+ resumeParentOnCompletion: true,
+ parentTaskRunId: parentRun.id,
+ },
+ prisma
+ );
+
+ //dequeue the child run
+ const dequeuedChild = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: childRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //start the child run
+ const childAttempt = await engine.startRunAttempt({
+ runId: childRun.id,
+ snapshotId: dequeuedChild[0].snapshot.id,
+ });
+
+ let workerNotifications: EventBusEventArgs<"workerNotification">[0][] = [];
+ engine.eventBus.on("workerNotification", (result) => {
+ workerNotifications.push(result);
+ });
+
+ //cancel the parent run
+ const result = await engine.cancelRun({
+ runId: parentRun.id,
+ completedAt: new Date(),
+ reason: "Cancelled by the user",
+ });
+ expect(result.snapshot.executionStatus).toBe("PENDING_CANCEL");
+
+ //check a worker notification was sent for the running parent
+ expect(workerNotifications).toHaveLength(1);
+ expect(workerNotifications[0].run.id).toBe(parentRun.id);
+
+ const executionData = await engine.getRunExecutionData({ runId: parentRun.id });
+ expect(executionData?.snapshot.executionStatus).toBe("PENDING_CANCEL");
+ expect(executionData?.run.status).toBe("CANCELED");
+
+ let cancelledEventData: EventBusEventArgs<"runCancelled">[0][] = [];
+ engine.eventBus.on("runCancelled", (result) => {
+ cancelledEventData.push(result);
+ });
+
+      //call completeAttempt (this will happen from the worker)
+ const completeResult = await engine.completeRunAttempt({
+ runId: parentRun.id,
+ snapshotId: executionData!.snapshot.id,
+ completion: {
+ ok: false,
+ id: executionData!.run.id,
+ error: {
+ type: "INTERNAL_ERROR" as const,
+ code: "TASK_RUN_CANCELLED" as const,
+ },
+ },
+ });
+
+ //parent should now be fully cancelled
+ const executionDataAfter = await engine.getRunExecutionData({ runId: parentRun.id });
+ expect(executionDataAfter?.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionDataAfter?.run.status).toBe("CANCELED");
+
+ //check emitted event
+ expect(cancelledEventData.length).toBe(1);
+ const parentEvent = cancelledEventData.find((r) => r.run.id === parentRun.id);
+ assertNonNullable(parentEvent);
+ expect(parentEvent.run.spanId).toBe(parentRun.spanId);
+
+ //cancelling children is async, so we need to wait a brief moment
+ await setTimeout(200);
+
+      //check a worker notification was sent for the running child
+ expect(workerNotifications).toHaveLength(2);
+ expect(workerNotifications[1].run.id).toBe(childRun.id);
+
+ //child should now be pending cancel
+ const childExecutionDataAfter = await engine.getRunExecutionData({ runId: childRun.id });
+ expect(childExecutionDataAfter?.snapshot.executionStatus).toBe("PENDING_CANCEL");
+ expect(childExecutionDataAfter?.run.status).toBe("CANCELED");
+
+ //cancel the child (this will come from the worker)
+ const completeChildResult = await engine.completeRunAttempt({
+ runId: childRun.id,
+ snapshotId: childExecutionDataAfter!.snapshot.id,
+ completion: {
+ ok: false,
+ id: childRun.id,
+ error: {
+ type: "INTERNAL_ERROR" as const,
+ code: "TASK_RUN_CANCELLED" as const,
+ },
+ },
+ });
+ expect(completeChildResult.snapshot.executionStatus).toBe("FINISHED");
+ expect(completeChildResult.run.status).toBe("CANCELED");
+
+      //child should now be finished and cancelled
+ const childExecutionDataCancelled = await engine.getRunExecutionData({
+ runId: childRun.id,
+ });
+ expect(childExecutionDataCancelled?.snapshot.executionStatus).toBe("FINISHED");
+ expect(childExecutionDataCancelled?.run.status).toBe("CANCELED");
+
+ //check emitted event
+ expect(cancelledEventData.length).toBe(2);
+ const childEvent = cancelledEventData.find((r) => r.run.id === childRun.id);
+ assertNonNullable(childEvent);
+ expect(childEvent.run.spanId).toBe(childRun.spanId);
+
+ //concurrency should have been released
+ const envConcurrencyCompleted = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyCompleted).toBe(0);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Cancelling a run (not executing)",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const parentTask = "parent-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, [parentTask]);
+
+ //trigger the run
+ const parentRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: parentTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${parentTask}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: parentRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ let cancelledEventData: EventBusEventArgs<"runCancelled">[0][] = [];
+ engine.eventBus.on("runCancelled", (result) => {
+ cancelledEventData.push(result);
+ });
+
+ //cancel the parent run
+ const result = await engine.cancelRun({
+ runId: parentRun.id,
+ completedAt: new Date(),
+ reason: "Cancelled by the user",
+ });
+ expect(result.snapshot.executionStatus).toBe("FINISHED");
+
+ const executionData = await engine.getRunExecutionData({ runId: parentRun.id });
+ expect(executionData?.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData?.run.status).toBe("CANCELED");
+
+ //check emitted event
+ expect(cancelledEventData.length).toBe(1);
+ const parentEvent = cancelledEventData.find((r) => r.run.id === parentRun.id);
+ assertNonNullable(parentEvent);
+ expect(parentEvent.run.spanId).toBe(parentRun.spanId);
+
+ //concurrency should have been released
+ const envConcurrencyCompleted = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyCompleted).toBe(0);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ //todo bulk cancelling runs
+});
diff --git a/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts
new file mode 100644
index 0000000000..a62747ca0c
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts
@@ -0,0 +1,17 @@
+//todo checkpoint tests
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+ assertNonNullable,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+import { EventBusEventArgs } from "../eventBus.js";
+
+describe("RunEngine checkpoints", () => {
+ //todo checkpoint tests
+ test("empty test", async () => {});
+});
diff --git a/internal-packages/run-engine/src/engine/tests/delays.test.ts b/internal-packages/run-engine/src/engine/tests/delays.test.ts
new file mode 100644
index 0000000000..be937127e9
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/delays.test.ts
@@ -0,0 +1,192 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+ assertNonNullable,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+
+describe("RunEngine delays", () => {
+ containerTest("Run start delayed", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ delayUntil: new Date(Date.now() + 500),
+ },
+ prisma
+ );
+
+ //should be created but not queued yet
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("RUN_CREATED");
+
+      //wait for 1 second
+ await setTimeout(1_000);
+
+ //should now be queued
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("QUEUED");
+ } finally {
+ engine.quit();
+ }
+ });
+
+ containerTest(
+ "Rescheduling a delayed run",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ delayUntil: new Date(Date.now() + 200),
+ },
+ prisma
+ );
+
+ //should be created but not queued yet
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("RUN_CREATED");
+
+ const rescheduleTo = new Date(Date.now() + 1_500);
+ const updatedRun = await engine.rescheduleRun({ runId: run.id, delayUntil: rescheduleTo });
+ expect(updatedRun.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString());
+
+ //wait so the initial delay passes
+ await setTimeout(1_000);
+
+ //should still be created
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("RUN_CREATED");
+
+ //wait so the updated delay passes
+ await setTimeout(1_750);
+
+ //should now be queued
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("QUEUED");
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts b/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts
new file mode 100644
index 0000000000..3f2cfdd9dd
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts
@@ -0,0 +1,211 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { generateFriendlyId } from "@trigger.dev/core/v3/apps";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "node:timers/promises";
+import { MinimalAuthenticatedEnvironment } from "../../shared/index.js";
+import { PrismaClientOrTransaction } from "@trigger.dev/database";
+
+describe("RunEngine dequeuing", () => {
+ containerTest("Dequeues 5 runs", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0005,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the runs
+ const runs = await triggerRuns({
+ engine,
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ prisma,
+ count: 10,
+ });
+ expect(runs.length).toBe(10);
+
+ //check the queue length
+ const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment);
+ expect(queueLength).toBe(10);
+
+ //dequeue
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: "main",
+ maxRunCount: 5,
+ });
+
+ expect(dequeued.length).toBe(5);
+ } finally {
+ engine.quit();
+ }
+ });
+
+ containerTest(
+ "Dequeues runs within machine constraints",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0005,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, {
+ preset: "small-1x",
+ });
+
+ //trigger the runs
+ const runs = await triggerRuns({
+ engine,
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ prisma,
+ count: 20,
+ });
+ expect(runs.length).toBe(20);
+
+ //check the queue length
+ const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment);
+ expect(queueLength).toBe(20);
+
+ //dequeue
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: "main",
+ maxRunCount: 5,
+ maxResources: {
+ cpu: 1.1,
+ memory: 3.8,
+ },
+ });
+ expect(dequeued.length).toBe(2);
+
+ //check the queue length
+ const queueLength2 = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment);
+ expect(queueLength2).toBe(18);
+
+ const dequeued2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: "main",
+ maxRunCount: 10,
+ maxResources: {
+ cpu: 4.7,
+ memory: 3.0,
+ },
+ });
+ expect(dequeued2.length).toBe(6);
+
+ //check the queue length
+ const queueLength3 = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment);
+ expect(queueLength3).toBe(12);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
+
+async function triggerRuns({
+ engine,
+ environment,
+ taskIdentifier,
+ prisma,
+ count,
+}: {
+ engine: RunEngine;
+ environment: MinimalAuthenticatedEnvironment;
+ taskIdentifier: string;
+ prisma: PrismaClientOrTransaction;
+ count: number;
+}) {
+ const runs = [];
+ for (let i = 0; i < count; i++) {
+ runs[i] = await engine.trigger(
+ {
+ number: i,
+ friendlyId: generateFriendlyId("run"),
+ environment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${taskIdentifier}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+ }
+
+ return runs;
+}
diff --git a/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts b/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts
new file mode 100644
index 0000000000..c1fefde687
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts
@@ -0,0 +1,493 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+ assertNonNullable,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+
+describe("RunEngine heartbeats", () => {
+ containerTest(
+ "Attempt timeout then successfully attempted",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const pendingExecutingTimeout = 100;
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ retryOptions: {
+ maxTimeoutInMs: 50,
+ },
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ heartbeatTimeoutsMs: {
+ PENDING_EXECUTING: pendingExecutingTimeout,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+        //expect it to be pending executing
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+
+ await setTimeout(pendingExecutingTimeout * 2);
+
+        //the start attempt should have timed out, so the run should be queued again
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("QUEUED");
+
+ await setTimeout(1_000);
+
+ //have to dequeue again
+ const dequeued2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+ expect(dequeued2.length).toBe(1);
+
+ // create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued2[0].run.id,
+ snapshotId: dequeued2[0].snapshot.id,
+ });
+ expect(attemptResult.run.id).toBe(run.id);
+ expect(attemptResult.run.status).toBe("EXECUTING");
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+ } finally {
+ await engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "All start attempts timeout",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const pendingExecutingTimeout = 100;
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ retryOptions: {
+            //intentionally set max attempts to 2 with short timeouts
+ maxAttempts: 2,
+ minTimeoutInMs: 50,
+ maxTimeoutInMs: 50,
+ },
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ heartbeatTimeoutsMs: {
+ PENDING_EXECUTING: pendingExecutingTimeout,
+ },
+
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //expect it to be pending
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+
+ await setTimeout(500);
+
+        //the start attempt should have timed out, so the run should be queued again
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("QUEUED");
+
+ //have to dequeue again
+ const dequeued2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+ expect(dequeued2.length).toBe(1);
+
+ //expect it to be pending
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+
+ await setTimeout(pendingExecutingTimeout * 3);
+
+        //the second start attempt should also have timed out, so the run should have failed
+ const executionData4 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData4);
+ expect(executionData4.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData4.run.status).toBe("SYSTEM_FAILURE");
+ } finally {
+ await engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Execution timeout (worker doesn't heartbeat)",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const executingTimeout = 100;
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ retryOptions: {
+            //intentionally set max attempts to 2 with short timeouts
+ maxAttempts: 2,
+ minTimeoutInMs: 50,
+ maxTimeoutInMs: 50,
+ },
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ heartbeatTimeoutsMs: {
+ EXECUTING: executingTimeout,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+
+ //should be executing
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("EXECUTING");
+ expect(executionData.run.status).toBe("EXECUTING");
+
+ //wait long enough for the heartbeat to timeout
+ await setTimeout(1_000);
+
+ //expect it to be queued again
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("QUEUED");
+
+ //have to dequeue again
+ const dequeued2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+ expect(dequeued2.length).toBe(1);
+
+ //create an attempt
+ await engine.startRunAttempt({
+ runId: dequeued2[0].run.id,
+ snapshotId: dequeued2[0].snapshot.id,
+ });
+
+ //should be executing
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("EXECUTING");
+ expect(executionData3.run.status).toBe("EXECUTING");
+
+ //again wait long enough that the heartbeat fails
+ await setTimeout(1_000);
+
+        //the second attempt's heartbeat should also have timed out, so the run should have failed
+ const executionData4 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData4);
+ expect(executionData4.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData4.run.status).toBe("SYSTEM_FAILURE");
+ } finally {
+ await engine.quit();
+ }
+ }
+ );
+
+ containerTest("Pending cancel", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const heartbeatTimeout = 100;
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ heartbeatTimeoutsMs: {
+ PENDING_CANCEL: heartbeatTimeout,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+
+ //cancel run
+ await engine.cancelRun({ runId: dequeued[0].run.id });
+
+ //expect it to be pending_cancel
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("PENDING_CANCEL");
+
+ //wait long enough for the heartbeat to timeout
+ await setTimeout(1_000);
+
+      //expect the run to be finished and cancelled
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData3.run.status).toBe("CANCELED");
+ } finally {
+ await engine.quit();
+ }
+ });
+});
diff --git a/internal-packages/run-engine/src/engine/tests/notDeployed.test.ts b/internal-packages/run-engine/src/engine/tests/notDeployed.test.ts
new file mode 100644
index 0000000000..03da6c548e
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/notDeployed.test.ts
@@ -0,0 +1,153 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+ assertNonNullable,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+
+describe("RunEngine not deployed", () => {
+ containerTest("Not yet deployed", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ //set this so we have to requeue the runs in two batches
+ queueRunsWaitingForWorkerBatchSize: 1,
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //trigger another run
+ const run2 = await engine.trigger(
+ {
+ number: 2,
+ friendlyId: "run_1235",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12346",
+ spanId: "s12346",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //should be queued
+ const executionDataR1 = await engine.getRunExecutionData({ runId: run.id });
+ const executionDataR2 = await engine.getRunExecutionData({ runId: run2.id });
+ assertNonNullable(executionDataR1);
+ assertNonNullable(executionDataR2);
+ expect(executionDataR1.snapshot.executionStatus).toBe("QUEUED");
+ expect(executionDataR2.snapshot.executionStatus).toBe("QUEUED");
+
+      //dequeuing should return nothing because there's no deployed worker
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+ expect(dequeued.length).toBe(0);
+
+ //queue should be empty
+ const queueLength = await engine.runQueue.lengthOfQueue(authenticatedEnvironment, run.queue);
+ expect(queueLength).toBe(0);
+
+ //check the execution data now
+ const executionData2R1 = await engine.getRunExecutionData({ runId: run.id });
+ const executionData2R2 = await engine.getRunExecutionData({ runId: run2.id });
+ assertNonNullable(executionData2R1);
+ assertNonNullable(executionData2R2);
+ expect(executionData2R1.snapshot.executionStatus).toBe("RUN_CREATED");
+ expect(executionData2R2.snapshot.executionStatus).toBe("RUN_CREATED");
+ expect(executionData2R1.run.status).toBe("WAITING_FOR_DEPLOY");
+ expect(executionData2R2.run.status).toBe("WAITING_FOR_DEPLOY");
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //now we deploy the background worker
+ await engine.queueRunsWaitingForWorker({ backgroundWorkerId: backgroundWorker.worker.id });
+
+ //it's async so we wait
+ await setTimeout(500);
+
+ //should now be queued
+ const executionData3R1 = await engine.getRunExecutionData({ runId: run.id });
+ const executionData3R2 = await engine.getRunExecutionData({ runId: run2.id });
+ assertNonNullable(executionData3R1);
+ assertNonNullable(executionData3R2);
+ expect(executionData3R1.snapshot.executionStatus).toBe("QUEUED");
+ expect(executionData3R2.snapshot.executionStatus).toBe("QUEUED");
+ expect(executionData3R1.run.status).toBe("PENDING");
+ expect(executionData3R2.run.status).toBe("PENDING");
+
+      //the queue should contain both runs again
+ const queueLength2 = await engine.runQueue.lengthOfQueue(authenticatedEnvironment, run.queue);
+ expect(queueLength2).toBe(2);
+ } finally {
+ engine.quit();
+ }
+ });
+});
diff --git a/internal-packages/run-engine/src/engine/tests/priority.test.ts b/internal-packages/run-engine/src/engine/tests/priority.test.ts
new file mode 100644
index 0000000000..c0e0f4a459
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/priority.test.ts
@@ -0,0 +1,145 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { generateFriendlyId } from "@trigger.dev/core/v3/apps";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { PrismaClientOrTransaction } from "@trigger.dev/database";
+import { MinimalAuthenticatedEnvironment } from "../../shared/index.js";
+import { setTimeout } from "timers/promises";
+
+describe("RunEngine priority", () => {
+ containerTest(
+ "Two runs execute in the correct order",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0005,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //the order should be 4,3,1,0,2
+ // 0 1 2 3 4
+ const priorities = [undefined, 500, -1200, 1000, 4000];
+
+ //trigger the runs
+ const runs = await triggerRuns({
+ engine,
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ prisma,
+ priorities,
+ });
+ expect(runs.length).toBe(priorities.length);
+
+ //check the queue length
+ const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment);
+ expect(queueLength).toBe(priorities.length);
+
+ //dequeue (expect 4 items because of the negative priority)
+ const dequeue = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: "main",
+ maxRunCount: 20,
+ });
+ expect(dequeue.length).toBe(4);
+ expect(dequeue[0].run.friendlyId).toBe(runs[4].friendlyId);
+ expect(dequeue[1].run.friendlyId).toBe(runs[3].friendlyId);
+ expect(dequeue[2].run.friendlyId).toBe(runs[1].friendlyId);
+ expect(dequeue[3].run.friendlyId).toBe(runs[0].friendlyId);
+
+ //wait 2 seconds (because of the negative priority)
+ await setTimeout(2_000);
+ const dequeue2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: "main",
+ maxRunCount: 20,
+ });
+ expect(dequeue2.length).toBe(1);
+ expect(dequeue2[0].run.friendlyId).toBe(runs[2].friendlyId);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
+
+async function triggerRuns({
+ engine,
+ environment,
+ taskIdentifier,
+ priorities,
+ prisma,
+}: {
+ engine: RunEngine;
+ environment: MinimalAuthenticatedEnvironment;
+ taskIdentifier: string;
+ prisma: PrismaClientOrTransaction;
+ priorities: (number | undefined)[];
+}) {
+ const runs = [];
+ for (let i = 0; i < priorities.length; i++) {
+ runs[i] = await engine.trigger(
+ {
+ number: i,
+ friendlyId: generateFriendlyId("run"),
+ environment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${taskIdentifier}`,
+ isTest: false,
+ tags: [],
+ priorityMs: priorities[i],
+ },
+ prisma
+ );
+ }
+
+ return runs;
+}
diff --git a/internal-packages/run-engine/src/engine/tests/trigger.test.ts b/internal-packages/run-engine/src/engine/tests/trigger.test.ts
new file mode 100644
index 0000000000..69dbb66b08
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/trigger.test.ts
@@ -0,0 +1,490 @@
+import {
+ assertNonNullable,
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { EventBusEventArgs } from "../eventBus.js";
+import { RunEngine } from "../index.js";
+
+describe("RunEngine trigger()", () => {
+ containerTest("Single run (success)", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0005,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+ expect(run).toBeDefined();
+ expect(run.friendlyId).toBe("run_1234");
+
+ //check it's actually in the db
+ const runFromDb = await prisma.taskRun.findUnique({
+ where: {
+ friendlyId: "run_1234",
+ },
+ });
+ expect(runFromDb).toBeDefined();
+ expect(runFromDb?.id).toBe(run.id);
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("QUEUED");
+
+ //check the waitpoint is created
+ const runWaitpoint = await prisma.waitpoint.findMany({
+ where: {
+ completedByTaskRunId: run.id,
+ },
+ });
+ expect(runWaitpoint.length).toBe(1);
+ expect(runWaitpoint[0].type).toBe("RUN");
+
+ //check the queue length
+ const queueLength = await engine.runQueue.lengthOfQueue(authenticatedEnvironment, run.queue);
+ expect(queueLength).toBe(1);
+
+ //concurrency before
+ const envConcurrencyBefore = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyBefore).toBe(0);
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+ expect(dequeued.length).toBe(1);
+ expect(dequeued[0].run.id).toBe(run.id);
+ expect(dequeued[0].run.attemptNumber).toBe(1);
+
+ const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyAfter).toBe(1);
+
+ let attemptEvent: EventBusEventArgs<"runAttemptStarted">[0] | undefined = undefined;
+ engine.eventBus.on("runAttemptStarted", (result) => {
+ attemptEvent = result;
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.run.id).toBe(run.id);
+ expect(attemptResult.run.status).toBe("EXECUTING");
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ //attempt event
+ assertNonNullable(attemptEvent);
+ const attemptedEvent = attemptEvent as EventBusEventArgs<"runAttemptStarted">[0];
+ expect(attemptedEvent.run.id).toBe(run.id);
+ expect(attemptedEvent.run.baseCostInCents).toBe(0.0005);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("EXECUTING");
+ expect(executionData2.run.attemptNumber).toBe(1);
+ expect(executionData2.run.status).toBe("EXECUTING");
+
+ let successEvent: EventBusEventArgs<"runSucceeded">[0] | undefined = undefined;
+ engine.eventBus.on("runSucceeded", (result) => {
+ successEvent = result;
+ });
+
+ //complete the run
+ const result = await engine.completeRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: attemptResult.snapshot.id,
+ completion: {
+ ok: true,
+ id: dequeued[0].run.id,
+ output: `{"foo":"bar"}`,
+ outputType: "application/json",
+ },
+ });
+ expect(result.attemptStatus).toBe("RUN_FINISHED");
+ expect(result.snapshot.executionStatus).toBe("FINISHED");
+ expect(result.run.attemptNumber).toBe(1);
+ expect(result.run.status).toBe("COMPLETED_SUCCESSFULLY");
+
+ //state should be completed
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData3.run.attemptNumber).toBe(1);
+ expect(executionData3.run.status).toBe("COMPLETED_SUCCESSFULLY");
+
+ //success event
+ assertNonNullable(successEvent);
+ const completedEvent = successEvent as EventBusEventArgs<"runSucceeded">[0];
+ expect(completedEvent.run.spanId).toBe(run.spanId);
+ expect(completedEvent.run.output).toBe('{"foo":"bar"}');
+ expect(completedEvent.run.outputType).toBe("application/json");
+
+ //concurrency should have been released
+ const envConcurrencyCompleted = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyCompleted).toBe(0);
+
+ //waitpoint should have been completed, with the output
+ const runWaitpointAfter = await prisma.waitpoint.findMany({
+ where: {
+ completedByTaskRunId: run.id,
+ },
+ });
+ expect(runWaitpointAfter.length).toBe(1);
+ expect(runWaitpointAfter[0].type).toBe("RUN");
+ expect(runWaitpointAfter[0].output).toBe(`{"foo":"bar"}`);
+ } finally {
+ engine.quit();
+ }
+ });
+
+ containerTest("Single run (failed)", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+
+ //fail the attempt
+ const error = {
+ type: "BUILT_IN_ERROR" as const,
+ name: "UserError",
+ message: "This is a user error",
+ stackTrace: "Error: This is a user error\n at :1:1",
+ };
+ const result = await engine.completeRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: attemptResult.snapshot.id,
+ completion: {
+ ok: false,
+ id: dequeued[0].run.id,
+ error,
+ },
+ });
+ expect(result.attemptStatus).toBe("RUN_FINISHED");
+ expect(result.snapshot.executionStatus).toBe("FINISHED");
+ expect(result.run.attemptNumber).toBe(1);
+ expect(result.run.status).toBe("COMPLETED_WITH_ERRORS");
+
+ //state should be completed
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData3.run.attemptNumber).toBe(1);
+ expect(executionData3.run.status).toBe("COMPLETED_WITH_ERRORS");
+
+ //concurrency should have been released
+ const envConcurrencyCompleted = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyCompleted).toBe(0);
+
+ //waitpoint should have been completed, with the error as its output
+ const runWaitpointAfter = await prisma.waitpoint.findMany({
+ where: {
+ completedByTaskRunId: run.id,
+ },
+ });
+ expect(runWaitpointAfter.length).toBe(1);
+ expect(runWaitpointAfter[0].type).toBe("RUN");
+ const output = JSON.parse(runWaitpointAfter[0].output as string);
+ expect(output.type).toBe(error.type);
+ expect(runWaitpointAfter[0].outputIsError).toBe(true);
+ } finally {
+ engine.quit();
+ }
+ });
+
+ containerTest(
+ "Single run (retry attempt, then succeed)",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+
+ //fail the attempt
+ const error = {
+ type: "BUILT_IN_ERROR" as const,
+ name: "UserError",
+ message: "This is a user error",
+ stackTrace: "Error: This is a user error\n at :1:1",
+ };
+ const result = await engine.completeRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: attemptResult.snapshot.id,
+ completion: {
+ ok: false,
+ id: dequeued[0].run.id,
+ error,
+ retry: {
+ timestamp: Date.now(),
+ delay: 0,
+ },
+ },
+ });
+ expect(result.attemptStatus).toBe("RETRY_IMMEDIATELY");
+ expect(result.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+ expect(result.run.status).toBe("RETRYING_AFTER_FAILURE");
+
+ //the run should now be retrying, waiting for the next attempt to start
+ const executionData3 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData3);
+ expect(executionData3.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+ //the attempt number should only increase once the new attempt is created
+ expect(executionData3.run.attemptNumber).toBe(1);
+ expect(executionData3.run.status).toBe("RETRYING_AFTER_FAILURE");
+
+ //create a second attempt
+ const attemptResult2 = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: executionData3.snapshot.id,
+ });
+ expect(attemptResult2.run.attemptNumber).toBe(2);
+
+ //now complete it successfully
+ const result2 = await engine.completeRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: attemptResult2.snapshot.id,
+ completion: {
+ ok: true,
+ id: dequeued[0].run.id,
+ output: `{"foo":"bar"}`,
+ outputType: "application/json",
+ },
+ });
+ expect(result2.snapshot.executionStatus).toBe("FINISHED");
+ expect(result2.run.attemptNumber).toBe(2);
+ expect(result2.run.status).toBe("COMPLETED_SUCCESSFULLY");
+
+ //waitpoint should have been completed, with the output
+ const runWaitpointAfter = await prisma.waitpoint.findMany({
+ where: {
+ completedByTaskRunId: run.id,
+ },
+ });
+ expect(runWaitpointAfter.length).toBe(1);
+ expect(runWaitpointAfter[0].type).toBe("RUN");
+ expect(runWaitpointAfter[0].output).toBe(`{"foo":"bar"}`);
+ expect(runWaitpointAfter[0].outputIsError).toBe(false);
+
+ //state should be completed
+ const executionData4 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData4);
+ expect(executionData4.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData4.run.attemptNumber).toBe(2);
+ expect(executionData4.run.status).toBe("COMPLETED_SUCCESSFULLY");
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts b/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts
new file mode 100644
index 0000000000..92c2cb12bc
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts
@@ -0,0 +1,456 @@
+import {
+ assertNonNullable,
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "node:timers/promises";
+
+describe("RunEngine triggerAndWait", () => {
+ containerTest("triggerAndWait", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const parentTask = "parent-task";
+ const childTask = "child-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, [parentTask, childTask]);
+
+ //trigger the run
+ const parentRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: parentTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${parentTask}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue parent
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: parentRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const initialExecutionData = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(initialExecutionData);
+ const attemptResult = await engine.startRunAttempt({
+ runId: parentRun.id,
+ snapshotId: initialExecutionData.snapshot.id,
+ });
+
+ const childRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_c1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: childTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${childTask}`,
+ isTest: false,
+ tags: [],
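+ //resumeParentOnCompletion + parentTaskRunId make this a triggerAndWait: the parent stays blocked until this child completes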
+ resumeParentOnCompletion: true,
+ parentTaskRunId: parentRun.id,
+ },
+ prisma
+ );
+
+ const childExecutionData = await engine.getRunExecutionData({ runId: childRun.id });
+ assertNonNullable(childExecutionData);
+ expect(childExecutionData.snapshot.executionStatus).toBe("QUEUED");
+
+ const parentExecutionData = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(parentExecutionData);
+ expect(parentExecutionData.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check the waitpoint blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: parentRun.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ assertNonNullable(runWaitpoint);
+ expect(runWaitpoint.waitpoint.type).toBe("RUN");
+ expect(runWaitpoint.waitpoint.completedByTaskRunId).toBe(childRun.id);
+
+ //dequeue the child run
+ const dequeuedChild = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: childRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //start the child run
+ const childAttempt = await engine.startRunAttempt({
+ runId: childRun.id,
+ snapshotId: dequeuedChild[0].snapshot.id,
+ });
+
+ // complete the child run
+ await engine.completeRunAttempt({
+ runId: childRun.id,
+ snapshotId: childAttempt.snapshot.id,
+ completion: {
+ id: childRun.id,
+ ok: true,
+ output: '{"foo":"bar"}',
+ outputType: "application/json",
+ },
+ });
+
+ //child snapshot
+ const childExecutionDataAfter = await engine.getRunExecutionData({ runId: childRun.id });
+ assertNonNullable(childExecutionDataAfter);
+ expect(childExecutionDataAfter.snapshot.executionStatus).toBe("FINISHED");
+
+ const waitpointAfter = await prisma.waitpoint.findFirst({
+ where: {
+ id: runWaitpoint.waitpointId,
+ },
+ });
+ expect(waitpointAfter?.completedAt).not.toBeNull();
+ expect(waitpointAfter?.status).toBe("COMPLETED");
+ expect(waitpointAfter?.output).toBe('{"foo":"bar"}');
+
+ await setTimeout(500);
+
+ const runWaitpointAfter = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: parentRun.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointAfter).toBeNull();
+
+ //parent snapshot
+ const parentExecutionDataAfter = await engine.getRunExecutionData({ runId: parentRun.id });
+ assertNonNullable(parentExecutionDataAfter);
+ expect(parentExecutionDataAfter.snapshot.executionStatus).toBe("EXECUTING");
+ expect(parentExecutionDataAfter.completedWaitpoints?.length).toBe(1);
+ expect(parentExecutionDataAfter.completedWaitpoints![0].id).toBe(runWaitpoint.waitpointId);
+ expect(parentExecutionDataAfter.completedWaitpoints![0].completedByTaskRun?.id).toBe(
+ childRun.id
+ );
+ expect(parentExecutionDataAfter.completedWaitpoints![0].output).toBe('{"foo":"bar"}');
+ } finally {
+ engine.quit();
+ }
+ });
+
+ /** This happens if you `triggerAndWait` with an idempotencyKey and that run is already in progress */
+ containerTest(
+ "triggerAndWait two runs with shared awaited child",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const parentTask = "parent-task";
+ const childTask = "child-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, [parentTask, childTask]);
+
+ //trigger the run
+ const parentRun1 = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: parentTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${parentTask}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue parent and create the attempt
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: parentRun1.masterQueue,
+ maxRunCount: 10,
+ });
+ const attemptResult = await engine.startRunAttempt({
+ runId: parentRun1.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+
+ //trigger the child
+ const childRun = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_c1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier: childTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: `task/${childTask}`,
+ isTest: false,
+ tags: [],
+ resumeParentOnCompletion: true,
+ parentTaskRunId: parentRun1.id,
+ },
+ prisma
+ );
+
+ const childExecutionData = await engine.getRunExecutionData({ runId: childRun.id });
+ assertNonNullable(childExecutionData);
+ expect(childExecutionData.snapshot.executionStatus).toBe("QUEUED");
+
+ const parentExecutionData = await engine.getRunExecutionData({ runId: parentRun1.id });
+ assertNonNullable(parentExecutionData);
+ expect(parentExecutionData.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check the waitpoint blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: parentRun1.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ assertNonNullable(runWaitpoint);
+ expect(runWaitpoint.waitpoint.type).toBe("RUN");
+ expect(runWaitpoint.waitpoint.completedByTaskRunId).toBe(childRun.id);
+
+ //dequeue the child run
+ const dequeuedChild = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: childRun.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //start the child run
+ const childAttempt = await engine.startRunAttempt({
+ runId: childRun.id,
+ snapshotId: dequeuedChild[0].snapshot.id,
+ });
+
+ //trigger a second parent run
+ const parentRun2 = await engine.trigger(
+ {
+ number: 2,
+ friendlyId: "run_p1235",
+ environment: authenticatedEnvironment,
+ taskIdentifier: parentTask,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12346",
+ spanId: "s12346",
+ masterQueue: "main",
+ queueName: `task/${parentTask}`,
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+ //dequeue 2nd parent
+ const dequeued2 = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: parentRun2.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create the 2nd parent attempt
+ const attemptResultParent2 = await engine.startRunAttempt({
+ runId: parentRun2.id,
+ snapshotId: dequeued2[0].snapshot.id,
+ });
+
+ //block the 2nd parent run with the child
+ const childRunWithWaitpoint = await prisma.taskRun.findUniqueOrThrow({
+ where: { id: childRun.id },
+ include: {
+ associatedWaitpoint: true,
+ },
+ });
+ const blockedResult = await engine.blockRunWithWaitpoint({
+ runId: parentRun2.id,
+ waitpoints: childRunWithWaitpoint.associatedWaitpoint!.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.project.id,
+ organizationId: authenticatedEnvironment.organizationId,
+ tx: prisma,
+ });
+ expect(blockedResult.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+ const parent2ExecutionData = await engine.getRunExecutionData({ runId: parentRun2.id });
+ assertNonNullable(parent2ExecutionData);
+ expect(parent2ExecutionData.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ // complete the child run
+ await engine.completeRunAttempt({
+ runId: childRun.id,
+ snapshotId: childAttempt.snapshot.id,
+ completion: {
+ id: childRun.id,
+ ok: true,
+ output: '{"foo":"bar"}',
+ outputType: "application/json",
+ },
+ });
+
+ //child snapshot
+ const childExecutionDataAfter = await engine.getRunExecutionData({ runId: childRun.id });
+ assertNonNullable(childExecutionDataAfter);
+ expect(childExecutionDataAfter.snapshot.executionStatus).toBe("FINISHED");
+
+ const waitpointAfter = await prisma.waitpoint.findFirst({
+ where: {
+ id: runWaitpoint.waitpointId,
+ },
+ });
+ expect(waitpointAfter?.completedAt).not.toBeNull();
+ expect(waitpointAfter?.status).toBe("COMPLETED");
+ expect(waitpointAfter?.output).toBe('{"foo":"bar"}');
+
+ await setTimeout(500);
+
+ const parent1RunWaitpointAfter = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: parentRun1.id,
+ },
+ });
+ expect(parent1RunWaitpointAfter).toBeNull();
+
+ const parent2RunWaitpointAfter = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: parentRun2.id,
+ },
+ });
+ expect(parent2RunWaitpointAfter).toBeNull();
+
+ //parent snapshot
+ const parentExecutionDataAfter = await engine.getRunExecutionData({ runId: parentRun1.id });
+ assertNonNullable(parentExecutionDataAfter);
+ expect(parentExecutionDataAfter.snapshot.executionStatus).toBe("EXECUTING");
+ expect(parentExecutionDataAfter.completedWaitpoints?.length).toBe(1);
+ expect(parentExecutionDataAfter.completedWaitpoints![0].id).toBe(runWaitpoint.waitpointId);
+ expect(parentExecutionDataAfter.completedWaitpoints![0].completedByTaskRun?.id).toBe(
+ childRun.id
+ );
+ expect(parentExecutionDataAfter.completedWaitpoints![0].output).toBe('{"foo":"bar"}');
+
+ //parent 2 snapshot
+ const parent2ExecutionDataAfter = await engine.getRunExecutionData({
+ runId: parentRun2.id,
+ });
+ assertNonNullable(parent2ExecutionDataAfter);
+ expect(parent2ExecutionDataAfter.snapshot.executionStatus).toBe("EXECUTING");
+ expect(parent2ExecutionDataAfter.completedWaitpoints?.length).toBe(1);
+ expect(parent2ExecutionDataAfter.completedWaitpoints![0].id).toBe(
+ childRunWithWaitpoint.associatedWaitpoint!.id
+ );
+ expect(parentExecutionDataAfter.completedWaitpoints![0].completedByTaskRun?.id).toBe(
+ childRun.id
+ );
+ expect(parent2ExecutionDataAfter.completedWaitpoints![0].output).toBe('{"foo":"bar"}');
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/ttl.test.ts b/internal-packages/run-engine/src/engine/tests/ttl.test.ts
new file mode 100644
index 0000000000..11e3225038
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/ttl.test.ts
@@ -0,0 +1,110 @@
+import {
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+ assertNonNullable,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+import { EventBusEventArgs } from "../eventBus.js";
+
+describe("RunEngine ttl", () => {
+ containerTest("Run expiring (ttl)", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ const backgroundWorker = await setupBackgroundWorker(
+ prisma,
+ authenticatedEnvironment,
+ taskIdentifier
+ );
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
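+ //a short ttl so the run expires quickly if it isn't dequeued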
+ ttl: "1s",
+ },
+ prisma
+ );
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData);
+ expect(executionData.snapshot.executionStatus).toBe("QUEUED");
+
+ let expiredEventData: EventBusEventArgs<"runExpired">[0] | undefined = undefined;
+ engine.eventBus.on("runExpired", (result) => {
+ expiredEventData = result;
+ });
+
+ //wait 1.5 seconds for the 1s ttl to expire and be processed
+ await setTimeout(1_500);
+
+ assertNonNullable(expiredEventData);
+ const assertedExpiredEventData = expiredEventData as EventBusEventArgs<"runExpired">[0];
+ expect(assertedExpiredEventData.run.spanId).toBe(run.spanId);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("FINISHED");
+ expect(executionData2.run.attemptNumber).toBe(undefined);
+ expect(executionData2.run.status).toBe("EXPIRED");
+
+ //concurrency should have been released
+ const envConcurrencyCompleted = await engine.runQueue.currentConcurrencyOfEnvironment(
+ authenticatedEnvironment
+ );
+ expect(envConcurrencyCompleted).toBe(0);
+ } finally {
+ engine.quit();
+ }
+ });
+});
diff --git a/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts b/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts
new file mode 100644
index 0000000000..1d09e336d9
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts
@@ -0,0 +1,1163 @@
+import {
+ assertNonNullable,
+ containerTest,
+ setupAuthenticatedEnvironment,
+ setupBackgroundWorker,
+} from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { setTimeout } from "timers/promises";
+import { EventBusEventArgs } from "../eventBus.js";
+import { isWaitpointOutputTimeout } from "@trigger.dev/core/v3";
+
+describe("RunEngine Waitpoints", () => {
+ containerTest("waitForDuration", { timeout: 15_000 }, async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ const durationMs = 1_000;
+
+ //waitForDuration
+ const date = new Date(Date.now() + durationMs);
+ const { waitpoint } = await engine.createDateTimeWaitpoint({
+ projectId: authenticatedEnvironment.project.id,
+ environmentId: authenticatedEnvironment.id,
+ completedAfter: date,
+ });
+ expect(waitpoint.completedAfter!.toISOString()).toBe(date.toISOString());
+
+ const result = await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: [waitpoint.id],
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.project.id,
+ organizationId: authenticatedEnvironment.organization.id,
+ releaseConcurrency: {
+ releaseQueue: true,
+ },
+ });
+ expect(result.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+ expect(result.runStatus).toBe("EXECUTING");
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ await setTimeout(2_000);
+
+ const waitpoint2 = await prisma.waitpoint.findFirst({
+ where: {
+ id: waitpoint.id,
+ },
+ });
+ expect(waitpoint2?.status).toBe("COMPLETED");
+ expect(waitpoint2?.completedAt?.getTime()).toBeLessThanOrEqual(date.getTime() + 200);
+
+ const executionDataAfter = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionDataAfter?.snapshot.executionStatus).toBe("EXECUTING");
+ } finally {
+ engine.quit();
+ }
+ });
+
+ containerTest(
+ "Waitpoints cleared if attempt fails",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ //waitForDuration
+ const date = new Date(Date.now() + 60_000);
+ const { waitpoint } = await engine.createDateTimeWaitpoint({
+ projectId: authenticatedEnvironment.project.id,
+ environmentId: authenticatedEnvironment.id,
+ completedAfter: date,
+ });
+ expect(waitpoint.completedAfter!.toISOString()).toBe(date.toISOString());
+
+ const result = await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: [waitpoint.id],
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.project.id,
+ organizationId: authenticatedEnvironment.organization.id,
+ });
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //fail the attempt (user error)
+ const error = {
+ type: "BUILT_IN_ERROR" as const,
+ name: "UserError",
+ message: "This is a user error",
+ stackTrace: "Error: This is a user error\n at :1:1",
+ };
+ const failResult = await engine.completeRunAttempt({
+ runId: executionData!.run.id,
+ snapshotId: executionData!.snapshot.id,
+ completion: {
+ ok: false,
+ id: executionData!.run.id,
+ error,
+ retry: {
+ timestamp: Date.now(),
+ delay: 0,
+ },
+ },
+ });
+ expect(failResult.attemptStatus).toBe("RETRY_IMMEDIATELY");
+ expect(failResult.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+ expect(failResult.run.attemptNumber).toBe(1);
+ expect(failResult.run.status).toBe("RETRYING_AFTER_FAILURE");
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ assertNonNullable(executionData2);
+ expect(executionData2.snapshot.executionStatus).toBe("PENDING_EXECUTING");
+ expect(executionData2.run.attemptNumber).toBe(1);
+ expect(executionData2.run.status).toBe("RETRYING_AFTER_FAILURE");
+ expect(executionData2.completedWaitpoints.length).toBe(0);
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoint).toBeNull();
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Create, block, and complete a Manual waitpoint",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ //create a manual waitpoint
+ const result = await engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ });
+ expect(result.waitpoint.status).toBe("PENDING");
+
+ //block the run
+ await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: result.waitpoint.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ organizationId: authenticatedEnvironment.organizationId,
+ });
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check there is a waitpoint blocking the parent run
+ const runWaitpointBefore = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointBefore?.waitpointId).toBe(result.waitpoint.id);
+
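+ //capture the worker notification that should fire once the waitpoint completes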
+ let event: EventBusEventArgs<"workerNotification">[0] | undefined = undefined;
+ engine.eventBus.on("workerNotification", (result) => {
+ event = result;
+ });
+
+ //complete the waitpoint
+ await engine.completeWaitpoint({
+ id: result.waitpoint.id,
+ });
+
+ await setTimeout(200);
+
+ assertNonNullable(event);
+ const notificationEvent = event as EventBusEventArgs<"workerNotification">[0];
+ expect(notificationEvent.run.id).toBe(run.id);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING");
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoint).toBeNull();
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Manual waitpoint failAfter",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ //create a manual waitpoint
+ const result = await engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ //fail after 200ms
+ timeout: new Date(Date.now() + 200),
+ });
+
+ //block the run
+ await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: result.waitpoint.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ organizationId: authenticatedEnvironment.organizationId,
+ });
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ await setTimeout(750);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING");
+ expect(executionData2?.completedWaitpoints.length).toBe(1);
+ expect(executionData2?.completedWaitpoints[0].outputIsError).toBe(true);
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoint).toBeNull();
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Race condition with multiple waitpoints completing simultaneously",
+ { timeout: 60_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
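+ //repeatedly block and unblock the run to try to surface races between simultaneous waitpoint completions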
+ const iterationCount = 10;
+
+ for (let i = 0; i < iterationCount; i++) {
+ const waitpointCount = 5;
+
+ //create waitpoints
+ const results = await Promise.all(
+ Array.from({ length: waitpointCount }).map(() =>
+ engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ })
+ )
+ );
+
+ //block the run with them
+ await Promise.all(
+ results.map((result) =>
+ engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: result.waitpoint.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ organizationId: authenticatedEnvironment.organizationId,
+ })
+ )
+ );
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check there is a waitpoint blocking the parent run
+ const runWaitpointsBefore = await prisma.taskRunWaitpoint.findMany({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointsBefore.length).toBe(waitpointCount);
+
+ //complete the waitpoints
+ await Promise.all(
+ results.map((result) =>
+ engine.completeWaitpoint({
+ id: result.waitpoint.id,
+ })
+ )
+ );
+
+ await setTimeout(500);
+
+ //expect the run to be executing again
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING");
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoints = await prisma.taskRunWaitpoint.findMany({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoints.length).toBe(0);
+ }
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Create a Manual waitpoint and let it timeout",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ //create a manual waitpoint with timeout
+ const timeout = new Date(Date.now() + 1_000);
+ const result = await engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ timeout,
+ });
+ expect(result.waitpoint.status).toBe("PENDING");
+ expect(result.waitpoint.completedAfter).toStrictEqual(timeout);
+
+ //block the run
+ await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: result.waitpoint.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ organizationId: authenticatedEnvironment.organizationId,
+ });
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check there is a waitpoint blocking the parent run
+ const runWaitpointBefore = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointBefore?.waitpointId).toBe(result.waitpoint.id);
+
+ let event: EventBusEventArgs<"workerNotification">[0] | undefined = undefined;
+ engine.eventBus.on("workerNotification", (result) => {
+ event = result;
+ });
+
+ await setTimeout(1_250);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING");
+
+ assertNonNullable(event);
+ const notificationEvent = event as EventBusEventArgs<"workerNotification">[0];
+ expect(notificationEvent.run.id).toBe(run.id);
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoint).toBeNull();
+
+ const waitpoint2 = await prisma.waitpoint.findUnique({
+ where: {
+ id: result.waitpoint.id,
+ },
+ });
+ assertNonNullable(waitpoint2);
+ expect(waitpoint2.status).toBe("COMPLETED");
+ expect(waitpoint2.outputIsError).toBe(true);
+ assertNonNullable(waitpoint2.output);
+ const isTimeout = isWaitpointOutputTimeout(waitpoint2.output);
+ expect(isTimeout).toBe(true);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Manual waitpoint with idempotency",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ const idempotencyKey = "a-key";
+
+ //create a manual waitpoint with an idempotency key
+ const result = await engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ idempotencyKey,
+ });
+ expect(result.waitpoint.status).toBe("PENDING");
+ expect(result.waitpoint.idempotencyKey).toBe(idempotencyKey);
+ expect(result.waitpoint.userProvidedIdempotencyKey).toBe(true);
+
+ //block the run
+ await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: result.waitpoint.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ organizationId: authenticatedEnvironment.organizationId,
+ });
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check there is a waitpoint blocking the parent run
+ const runWaitpointBefore = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointBefore?.waitpointId).toBe(result.waitpoint.id);
+
+ let event: EventBusEventArgs<"workerNotification">[0] | undefined = undefined;
+ engine.eventBus.on("workerNotification", (result) => {
+ event = result;
+ });
+
+ //complete the waitpoint
+ await engine.completeWaitpoint({
+ id: result.waitpoint.id,
+ });
+
+ await setTimeout(200);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING");
+
+ assertNonNullable(event);
+ const notificationEvent = event as EventBusEventArgs<"workerNotification">[0];
+ expect(notificationEvent.run.id).toBe(run.id);
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoint).toBeNull();
+
+ const waitpoint2 = await prisma.waitpoint.findUnique({
+ where: {
+ id: result.waitpoint.id,
+ },
+ });
+ assertNonNullable(waitpoint2);
+ expect(waitpoint2.status).toBe("COMPLETED");
+ expect(waitpoint2.outputIsError).toBe(false);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+
+ containerTest(
+ "Manual waitpoint with idempotency and ttl",
+ { timeout: 15_000 },
+ async ({ prisma, redisOptions }) => {
+ //create environment
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+ const engine = new RunEngine({
+ prisma,
+ worker: {
+ redis: redisOptions,
+ workers: 1,
+ tasksPerWorker: 10,
+ pollIntervalMs: 100,
+ },
+ queue: {
+ redis: redisOptions,
+ },
+ runLock: {
+ redis: redisOptions,
+ },
+ machines: {
+ defaultMachine: "small-1x",
+ machines: {
+ "small-1x": {
+ name: "small-1x" as const,
+ cpu: 0.5,
+ memory: 0.5,
+ centsPerMs: 0.0001,
+ },
+ },
+ baseCostInCents: 0.0001,
+ },
+ tracer: trace.getTracer("test", "0.0.0"),
+ });
+
+ try {
+ const taskIdentifier = "test-task";
+
+ //create background worker
+ await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier);
+
+ //trigger the run
+ const run = await engine.trigger(
+ {
+ number: 1,
+ friendlyId: "run_p1234",
+ environment: authenticatedEnvironment,
+ taskIdentifier,
+ payload: "{}",
+ payloadType: "application/json",
+ context: {},
+ traceContext: {},
+ traceId: "t12345",
+ spanId: "s12345",
+ masterQueue: "main",
+ queueName: "task/test-task",
+ isTest: false,
+ tags: [],
+ },
+ prisma
+ );
+
+ //dequeue the run
+ const dequeued = await engine.dequeueFromMasterQueue({
+ consumerId: "test_12345",
+ masterQueue: run.masterQueue,
+ maxRunCount: 10,
+ });
+
+ //create an attempt
+ const attemptResult = await engine.startRunAttempt({
+ runId: dequeued[0].run.id,
+ snapshotId: dequeued[0].snapshot.id,
+ });
+ expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING");
+
+ const idempotencyKey = "a-key";
+
+ //create a manual waitpoint with an idempotency key that expires shortly
+ const result = await engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ idempotencyKey,
+ idempotencyKeyExpiresAt: new Date(Date.now() + 200),
+ });
+ expect(result.waitpoint.status).toBe("PENDING");
+ expect(result.waitpoint.idempotencyKey).toBe(idempotencyKey);
+ expect(result.waitpoint.userProvidedIdempotencyKey).toBe(true);
+
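+ //creating a waitpoint with the same idempotency key should return the existing waitpoint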
+ const sameWaitpointResult = await engine.createManualWaitpoint({
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ idempotencyKey,
+ idempotencyKeyExpiresAt: new Date(Date.now() + 200),
+ });
+ expect(sameWaitpointResult.waitpoint.id).toBe(result.waitpoint.id);
+
+ //block the run
+ await engine.blockRunWithWaitpoint({
+ runId: run.id,
+ waitpoints: result.waitpoint.id,
+ environmentId: authenticatedEnvironment.id,
+ projectId: authenticatedEnvironment.projectId,
+ organizationId: authenticatedEnvironment.organizationId,
+ });
+
+ const executionData = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");
+
+ //check there is a waitpoint blocking the parent run
+ const runWaitpointBefore = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpointBefore?.waitpointId).toBe(result.waitpoint.id);
+
+ let event: EventBusEventArgs<"workerNotification">[0] | undefined = undefined;
+ engine.eventBus.on("workerNotification", (result) => {
+ event = result;
+ });
+
+ //complete the waitpoint
+ await engine.completeWaitpoint({
+ id: result.waitpoint.id,
+ });
+
+ await setTimeout(200);
+
+ const executionData2 = await engine.getRunExecutionData({ runId: run.id });
+ expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING");
+
+ assertNonNullable(event);
+ const notificationEvent = event as EventBusEventArgs<"workerNotification">[0];
+ expect(notificationEvent.run.id).toBe(run.id);
+
+ //check there are no waitpoints blocking the parent run
+ const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({
+ where: {
+ taskRunId: run.id,
+ },
+ include: {
+ waitpoint: true,
+ },
+ });
+ expect(runWaitpoint).toBeNull();
+
+ const waitpoint2 = await prisma.waitpoint.findUnique({
+ where: {
+ id: result.waitpoint.id,
+ },
+ });
+ assertNonNullable(waitpoint2);
+ expect(waitpoint2.status).toBe("COMPLETED");
+ expect(waitpoint2.outputIsError).toBe(false);
+ } finally {
+ engine.quit();
+ }
+ }
+ );
+});
diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts
new file mode 100644
index 0000000000..a708fd1269
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/types.ts
@@ -0,0 +1,89 @@
+import { type WorkerConcurrencyOptions } from "@internal/redis-worker";
+import { Tracer } from "@opentelemetry/api";
+import { MachinePreset, MachinePresetName, QueueOptions, RetryOptions } from "@trigger.dev/core/v3";
+import { PrismaClient } from "@trigger.dev/database";
+import { type RedisOptions } from "ioredis";
+import { MinimalAuthenticatedEnvironment } from "../shared";
+
+export type RunEngineOptions = {
+ prisma: PrismaClient;
+ worker: WorkerConcurrencyOptions & {
+ redis: RedisOptions;
+ pollIntervalMs?: number;
+ immediatePollIntervalMs?: number;
+ };
+ machines: {
+ defaultMachine: MachinePresetName;
+ machines: Record<string, MachinePreset>;
+ baseCostInCents: number;
+ };
+ queue: {
+ redis: RedisOptions;
+ retryOptions?: RetryOptions;
+ defaultEnvConcurrency?: number;
+ };
+ runLock: {
+ redis: RedisOptions;
+ };
+ /** If not set then checkpoints won't ever be used */
+ retryWarmStartThresholdMs?: number;
+ heartbeatTimeoutsMs?: Partial<HeartbeatTimeouts>;
+ queueRunsWaitingForWorkerBatchSize?: number;
+ tracer: Tracer;
+};
+
+export type HeartbeatTimeouts = {
+ PENDING_EXECUTING: number;
+ PENDING_CANCEL: number;
+ EXECUTING: number;
+ EXECUTING_WITH_WAITPOINTS: number;
+};
+
+export type TriggerParams = {
+ friendlyId: string;
+ number: number;
+ environment: MinimalAuthenticatedEnvironment;
+ idempotencyKey?: string;
+ idempotencyKeyExpiresAt?: Date;
+ taskIdentifier: string;
+ payload: string;
+ payloadType: string;
+ context: any;
+ traceContext: Record<string, string | undefined>;
+ traceId: string;
+ spanId: string;
+ parentSpanId?: string;
+ lockedToVersionId?: string;
+ taskVersion?: string;
+ sdkVersion?: string;
+ cliVersion?: string;
+ concurrencyKey?: string;
+ masterQueue: string;
+ queueName: string;
+ queue?: QueueOptions;
+ isTest: boolean;
+ delayUntil?: Date;
+ queuedAt?: Date;
+ maxAttempts?: number;
+ taskEventStore: string;
+ priorityMs?: number;
+ ttl?: string;
+ tags: { id: string; name: string }[];
+ parentTaskRunId?: string;
+ rootTaskRunId?: string;
+ batch?: {
+ id: string;
+ index: number;
+ };
+ resumeParentOnCompletion?: boolean;
+ depth?: number;
+ metadata?: string;
+ metadataType?: string;
+ seedMetadata?: string;
+ seedMetadataType?: string;
+ oneTimeUseToken?: string;
+ maxDurationInSeconds?: number;
+ machine?: MachinePresetName;
+ workerId?: string;
+ runnerId?: string;
+};
diff --git a/internal-packages/run-engine/src/index.ts b/internal-packages/run-engine/src/index.ts
new file mode 100644
index 0000000000..c7cf00b11e
--- /dev/null
+++ b/internal-packages/run-engine/src/index.ts
@@ -0,0 +1,2 @@
+export { RunEngine, RunDuplicateIdempotencyKeyError } from "./engine/index";
+export type { EventBusEventArgs } from "./engine/eventBus";
diff --git a/internal-packages/run-engine/src/run-queue/index.test.ts b/internal-packages/run-engine/src/run-queue/index.test.ts
new file mode 100644
index 0000000000..0def021dd7
--- /dev/null
+++ b/internal-packages/run-engine/src/run-queue/index.test.ts
@@ -0,0 +1,891 @@
+import { redisTest } from "@internal/testcontainers";
+import { trace } from "@opentelemetry/api";
+import { Logger } from "@trigger.dev/core/logger";
+import Redis from "ioredis";
+import { describe } from "node:test";
+import { setTimeout } from "node:timers/promises";
+import { RunQueue } from "./index.js";
+import { SimpleWeightedChoiceStrategy } from "./simpleWeightedPriorityStrategy.js";
+import { InputPayload } from "./types.js";
+
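+ //shared RunQueue options used by every test in this file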
+const testOptions = {
+ name: "rq",
+ tracer: trace.getTracer("rq"),
+ queuePriorityStrategy: new SimpleWeightedChoiceStrategy({ queueSelectionCount: 36 }),
+ envQueuePriorityStrategy: new SimpleWeightedChoiceStrategy({ queueSelectionCount: 12 }),
+ workers: 1,
+ defaultEnvConcurrency: 25,
+ enableRebalancing: false,
+ logger: new Logger("RunQueue", "warn"),
+ retryOptions: {
+ maxAttempts: 5,
+ factor: 1.1,
+ minTimeoutInMs: 100,
+ maxTimeoutInMs: 1_000,
+ randomize: true,
+ },
+};
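+
+// Note: these retryOptions use a much shorter backoff (100ms-1s) than the queue's
+// defaults, so the nack and dead letter tests below only need to wait a few
+// hundred milliseconds between attempts.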
+
+const authenticatedEnvProd = {
+ id: "e1234",
+ type: "PRODUCTION" as const,
+ maximumConcurrencyLimit: 10,
+ project: { id: "p1234" },
+ organization: { id: "o1234" },
+};
+
+const authenticatedEnvDev = {
+ id: "e1234",
+ type: "DEVELOPMENT" as const,
+ maximumConcurrencyLimit: 10,
+ project: { id: "p1234" },
+ organization: { id: "o1234" },
+};
+
+const messageProd: InputPayload = {
+ runId: "r1234",
+ taskIdentifier: "task/my-task",
+ orgId: "o1234",
+ projectId: "p1234",
+ environmentId: "e1234",
+ environmentType: "PRODUCTION",
+ queue: "task/my-task",
+ timestamp: Date.now(),
+ attempt: 0,
+};
+
+const messageDev: InputPayload = {
+ runId: "r4321",
+ taskIdentifier: "task/my-task",
+ orgId: "o1234",
+ projectId: "p1234",
+ environmentId: "e4321",
+ environmentType: "DEVELOPMENT",
+ queue: "task/my-task",
+ timestamp: Date.now(),
+ attempt: 0,
+};
+
+describe("RunQueue", () => {
+ redisTest("Get/set Queue concurrency limit", { timeout: 15_000 }, async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ //initial value
+ const initial = await queue.getQueueConcurrencyLimit(authenticatedEnvProd, "task/my-task");
+ expect(initial).toBe(undefined);
+
+ //set 20
+ const result = await queue.updateQueueConcurrencyLimits(
+ authenticatedEnvProd,
+ "task/my-task",
+ 20
+ );
+ expect(result).toBe("OK");
+
+ //get 20
+ const updated = await queue.getQueueConcurrencyLimit(authenticatedEnvProd, "task/my-task");
+ expect(updated).toBe(20);
+
+ //remove
+ const result2 = await queue.removeQueueConcurrencyLimits(
+ authenticatedEnvProd,
+ "task/my-task"
+ );
+ expect(result2).toBe(1);
+
+ //get undefined
+ const removed = await queue.getQueueConcurrencyLimit(authenticatedEnvProd, "task/my-task");
+ expect(removed).toBe(undefined);
+ } finally {
+ await queue.quit();
+ }
+ });
+
+ redisTest("Update env concurrency limits", { timeout: 5_000 }, async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ //initial value
+ const initial = await queue.getEnvConcurrencyLimit(authenticatedEnvProd);
+ expect(initial).toBe(25);
+
+ //set 20
+ await queue.updateEnvConcurrencyLimits({
+ ...authenticatedEnvProd,
+ maximumConcurrencyLimit: 20,
+ });
+
+ //get 20
+ const updated = await queue.getEnvConcurrencyLimit(authenticatedEnvProd);
+ expect(updated).toBe(20);
+ } finally {
+ await queue.quit();
+ }
+ });
+
+ redisTest(
+ "Enqueue/Dequeue a message in env (DEV run, no concurrency key)",
+ { timeout: 5_000 },
+ async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ //initial queue length
+ const result = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue);
+ expect(result).toBe(0);
+ const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev);
+ expect(envQueueLength).toBe(0);
+
+ //initial oldest message
+ const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue);
+ expect(oldestScore).toBe(undefined);
+
+ const envMasterQueue = `env:${authenticatedEnvDev.id}`;
+
+ //enqueue message
+ await queue.enqueueMessage({
+ env: authenticatedEnvDev,
+ message: messageDev,
+ masterQueues: ["main", envMasterQueue],
+ });
+
+ //queue length
+ const result2 = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue);
+ expect(result2).toBe(1);
+ const envQueueLength2 = await queue.lengthOfEnvQueue(authenticatedEnvDev);
+ expect(envQueueLength2).toBe(1);
+
+ //oldest message
+ const oldestScore2 = await queue.oldestMessageInQueue(
+ authenticatedEnvDev,
+ messageDev.queue
+ );
+ expect(oldestScore2).toBe(messageDev.timestamp);
+
+ //concurrencies
+ const queueConcurrency = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvDev,
+ messageDev.queue
+ );
+ expect(queueConcurrency).toBe(0);
+ const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev);
+ expect(envConcurrency).toBe(0);
+ const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvDev);
+ expect(projectConcurrency).toBe(0);
+ const taskConcurrency = await queue.currentConcurrencyOfTask(
+ authenticatedEnvDev,
+ messageDev.taskIdentifier
+ );
+ expect(taskConcurrency).toBe(0);
+
+ const dequeued = await queue.dequeueMessageFromMasterQueue(
+ "test_12345",
+ envMasterQueue,
+ 10
+ );
+ expect(dequeued.length).toBe(1);
+ expect(dequeued[0].messageId).toEqual(messageDev.runId);
+ expect(dequeued[0].message.orgId).toEqual(messageDev.orgId);
+ expect(dequeued[0].message.version).toEqual("1");
+ expect(dequeued[0].message.masterQueues).toEqual(["main", envMasterQueue]);
+
+ //concurrencies
+ const queueConcurrency2 = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvDev,
+ messageDev.queue
+ );
+ expect(queueConcurrency2).toBe(1);
+ const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev);
+ expect(envConcurrency2).toBe(1);
+ const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvDev);
+ expect(projectConcurrency2).toBe(1);
+ const taskConcurrency2 = await queue.currentConcurrencyOfTask(
+ authenticatedEnvDev,
+ messageDev.taskIdentifier
+ );
+ expect(taskConcurrency2).toBe(1);
+
+ //queue lengths
+ const result3 = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue);
+ expect(result3).toBe(0);
+ const envQueueLength3 = await queue.lengthOfEnvQueue(authenticatedEnvDev);
+ expect(envQueueLength3).toBe(0);
+
+ const dequeued2 = await queue.dequeueMessageFromMasterQueue(
+ "test_12345",
+ envMasterQueue,
+ 10
+ );
+ expect(dequeued2.length).toBe(0);
+
+ const dequeued3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(dequeued3.length).toBe(0);
+ } finally {
+ await queue.quit();
+ }
+ }
+ );
+
+ redisTest(
+ "Enqueue/Dequeue a message from the main queue (PROD run, no concurrency key)",
+ { timeout: 5_000 },
+ async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ //initial queue length
+ const result = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(result).toBe(0);
+ const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength).toBe(0);
+
+ //initial oldest message
+ const oldestScore = await queue.oldestMessageInQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(oldestScore).toBe(undefined);
+
+        const envMasterQueue = `env:${authenticatedEnvProd.id}`;
+
+ //enqueue message
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: ["main", envMasterQueue],
+ });
+
+ //queue length
+ const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength).toBe(1);
+ const envLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envLength).toBe(1);
+
+ //oldest message
+ const oldestScore2 = await queue.oldestMessageInQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(oldestScore2).toBe(messageProd.timestamp);
+
+ //concurrencies
+ const queueConcurrency = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency).toBe(0);
+ const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency).toBe(0);
+ const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency).toBe(0);
+ const taskConcurrency = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency).toBe(0);
+
+ //dequeue
+ const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(dequeued.length).toBe(1);
+ expect(dequeued[0].messageId).toEqual(messageProd.runId);
+ expect(dequeued[0].message.orgId).toEqual(messageProd.orgId);
+ expect(dequeued[0].message.version).toEqual("1");
+ expect(dequeued[0].message.masterQueues).toEqual(["main", envMasterQueue]);
+
+ //concurrencies
+ const queueConcurrency2 = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency2).toBe(1);
+ const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency2).toBe(1);
+ const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency2).toBe(1);
+ const taskConcurrency2 = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency2).toBe(1);
+
+ //queue length
+ const length2 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(length2).toBe(0);
+ const envLength2 = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envLength2).toBe(0);
+
+ const dequeued2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(dequeued2.length).toBe(0);
+ } finally {
+ await queue.quit();
+ }
+ }
+ );
+
+ redisTest(
+ "Dequeue multiple messages from the queue",
+ { timeout: 5_000 },
+ async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ // Create 20 messages with different runIds and some with different queues
+ const messages = Array.from({ length: 20 }, (_, i) => ({
+ ...messageProd,
+ runId: `r${i + 1}`,
+ queue: i < 15 ? "task/my-task" : "task/other-task", // Mix up the queues
+ }));
+
+ // Enqueue all messages
+ for (const message of messages) {
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message,
+ masterQueues: "main",
+ });
+ }
+
+ // Check initial queue lengths
+ const initialLength1 = await queue.lengthOfQueue(authenticatedEnvProd, "task/my-task");
+ const initialLength2 = await queue.lengthOfQueue(authenticatedEnvProd, "task/other-task");
+ expect(initialLength1).toBe(15);
+ expect(initialLength2).toBe(5);
+ const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength).toBe(20);
+
+ // Dequeue first batch of 10 messages
+ const dequeued1 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(dequeued1.length).toBe(10);
+
+ // Dequeue second batch of 10 messages
+ const dequeued2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(dequeued2.length).toBe(10);
+
+ // Combine all dequeued message IDs
+ const dequeuedIds = [...dequeued1, ...dequeued2].map((m) => m.messageId);
+
+ // Check that all original messages were dequeued
+ const allOriginalIds = messages.map((m) => m.runId);
+ expect(dequeuedIds.sort()).toEqual(allOriginalIds.sort());
+
+ // Try to dequeue more - should get none
+ const dequeued3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(dequeued3.length).toBe(0);
+
+ // Check final queue lengths
+ const finalLength1 = await queue.lengthOfQueue(authenticatedEnvProd, "task/my-task");
+ const finalLength2 = await queue.lengthOfQueue(authenticatedEnvProd, "task/other-task");
+ expect(finalLength1).toBe(0);
+ expect(finalLength2).toBe(0);
+ const finalEnvQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(finalEnvQueueLength).toBe(0);
+ } finally {
+ await queue.quit();
+ }
+ }
+ );
+
+ redisTest("Get shared queue details", { timeout: 5_000 }, async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ const result = await queue.getSharedQueueDetails("main", 10);
+ expect(result.selectionId).toBe("getSharedQueueDetails");
+ expect(result.queueCount).toBe(0);
+ expect(result.queueChoice.choices).toStrictEqual({ abort: true });
+
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: "main",
+ });
+
+ const result2 = await queue.getSharedQueueDetails("main", 10);
+ expect(result2.selectionId).toBe("getSharedQueueDetails");
+ expect(result2.queueCount).toBe(1);
+ expect(result2.queues[0].score).toBe(messageProd.timestamp);
+ if (!Array.isArray(result2.queueChoice.choices)) {
+ throw new Error("Expected queueChoice.choices to be an array");
+ }
+ expect(result2.queueChoice.choices[0]).toBe(
+ "{org:o1234}:proj:p1234:env:e1234:queue:task/my-task"
+ );
+ } finally {
+ await queue.quit();
+ }
+ });
+
+ redisTest("Acking", { timeout: 5_000 }, async ({ redisContainer, redisOptions }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ const redis = new Redis({ ...redisOptions, keyPrefix: "runqueue:test:" });
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: "main",
+ });
+
+ const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength).toBe(1);
+ const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength).toBe(1);
+
+ const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages.length).toBe(1);
+
+ const queueLength2 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength2).toBe(0);
+ const envQueueLength2 = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength2).toBe(0);
+
+      //check the message payload is still stored (dequeuing doesn't delete the message key)
+ const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId);
+ const exists = await redis.exists(key);
+ expect(exists).toBe(1);
+
+ await queue.acknowledgeMessage(messages[0].message.orgId, messages[0].messageId);
+
+ //concurrencies
+ const queueConcurrency = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency).toBe(0);
+ const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency).toBe(0);
+ const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency).toBe(0);
+ const taskConcurrency = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency).toBe(0);
+
+ //queue lengths
+ const queueLength3 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength3).toBe(0);
+ const envQueueLength3 = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength3).toBe(0);
+
+ //check the message is gone
+ const exists2 = await redis.exists(key);
+ expect(exists2).toBe(0);
+
+ //dequeue
+ const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages2.length).toBe(0);
+ } finally {
+ try {
+ await queue.quit();
+ await redis.quit();
+ } catch (e) {}
+ }
+ });
+
+ redisTest("Ack (before dequeue)", { timeout: 5_000 }, async ({ redisContainer }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: "main",
+ });
+
+ const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength).toBe(1);
+ const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength).toBe(1);
+
+ await queue.acknowledgeMessage(messageProd.orgId, messageProd.runId);
+
+ //concurrencies
+ const queueConcurrency = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency).toBe(0);
+ const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency).toBe(0);
+ const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency).toBe(0);
+ const taskConcurrency = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency).toBe(0);
+
+ //queue lengths
+ const queueLength3 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength3).toBe(0);
+ const envQueueLength3 = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength3).toBe(0);
+
+ //dequeue
+ const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages2.length).toBe(0);
+ } finally {
+ await queue.quit();
+ }
+ });
+
+ redisTest("Nacking", { timeout: 15_000 }, async ({ redisContainer, redisOptions }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ const redis = new Redis({ ...redisOptions, keyPrefix: "runqueue:test:" });
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: "main2",
+ });
+
+ const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main2", 10);
+ expect(messages.length).toBe(1);
+
+ //check the message is there
+ const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId);
+ const exists = await redis.exists(key);
+ expect(exists).toBe(1);
+
+ //concurrencies
+ const queueConcurrency = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency).toBe(1);
+ const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency).toBe(1);
+ const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency).toBe(1);
+ const taskConcurrency = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency).toBe(1);
+
+ await queue.nackMessage({
+ orgId: messages[0].message.orgId,
+ messageId: messages[0].messageId,
+ });
+
+      //wait for the nack retry delay (around 100ms with these test retryOptions) to elapse
+ await setTimeout(300);
+
+ //concurrencies
+ const queueConcurrency2 = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency2).toBe(0);
+ const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency2).toBe(0);
+ const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency2).toBe(0);
+ const taskConcurrency2 = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency2).toBe(0);
+
+ //queue lengths
+ const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue);
+ expect(queueLength).toBe(1);
+ const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd);
+ expect(envQueueLength).toBe(1);
+
+ //check the message is there
+ const exists2 = await redis.exists(key);
+ expect(exists2).toBe(1);
+
+ //dequeue
+ const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main2", 10);
+ expect(messages2[0].messageId).toBe(messageProd.runId);
+ } finally {
+ try {
+ await queue.quit();
+ await redis.quit();
+ } catch (e) {}
+ }
+ });
+
+ redisTest(
+ "Releasing concurrency",
+ { timeout: 5_000 },
+ async ({ redisContainer, redisOptions }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ const redis = new Redis({ ...redisOptions, keyPrefix: "runqueue:test:" });
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: "main",
+ });
+
+ const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages.length).toBe(1);
+
+        //check the message payload is still stored
+ const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId);
+ const exists = await redis.exists(key);
+ expect(exists).toBe(1);
+
+ //concurrencies
+ expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe(
+ 1
+ );
+ expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1);
+ expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(1);
+ expect(
+ await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier)
+ ).toBe(1);
+
+ //release the concurrency (not the queue)
+ await queue.releaseConcurrency(
+ authenticatedEnvProd.organization.id,
+ messages[0].messageId,
+ false
+ );
+
+ //concurrencies
+ expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe(
+ 1
+ );
+ expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0);
+ expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(0);
+ expect(
+ await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier)
+ ).toBe(0);
+
+ //reacquire the concurrency
+ await queue.reacquireConcurrency(
+ authenticatedEnvProd.organization.id,
+ messages[0].messageId
+ );
+
+ //check concurrencies are back to what they were before
+ expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe(
+ 1
+ );
+ expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1);
+ expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(1);
+ expect(
+ await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier)
+ ).toBe(1);
+
+ //release the concurrency (with the queue this time)
+ await queue.releaseConcurrency(
+ authenticatedEnvProd.organization.id,
+ messages[0].messageId,
+ true
+ );
+
+ //concurrencies
+ expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe(
+ 0
+ );
+ expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0);
+ expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(0);
+ expect(
+ await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier)
+ ).toBe(0);
+
+ //reacquire the concurrency
+ await queue.reacquireConcurrency(
+ authenticatedEnvProd.organization.id,
+ messages[0].messageId
+ );
+
+ //check concurrencies are back to what they were before
+ expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe(
+ 1
+ );
+ expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1);
+ expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(1);
+ expect(
+ await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier)
+ ).toBe(1);
+ } finally {
+ try {
+ await queue.quit();
+ await redis.quit();
+ } catch (e) {}
+ }
+ }
+ );
+
+ redisTest("Dead Letter Queue", { timeout: 8_000 }, async ({ redisContainer, redisOptions }) => {
+ const queue = new RunQueue({
+ ...testOptions,
+ retryOptions: {
+ maxAttempts: 1,
+ },
+ redis: {
+ keyPrefix: "runqueue:test:",
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ },
+ });
+
+ const redis = new Redis({ ...redisOptions, keyPrefix: "runqueue:test:" });
+
+ try {
+ await queue.enqueueMessage({
+ env: authenticatedEnvProd,
+ message: messageProd,
+ masterQueues: "main",
+ });
+
+ const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages.length).toBe(1);
+
+ //check the message is there
+ const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId);
+ const exists = await redis.exists(key);
+ expect(exists).toBe(1);
+
+ //nack (we only have attempts set to 1)
+ await queue.nackMessage({
+ orgId: messages[0].message.orgId,
+ messageId: messages[0].messageId,
+ });
+
+ //dequeue
+ const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages2.length).toBe(0);
+
+ //concurrencies
+ const queueConcurrency2 = await queue.currentConcurrencyOfQueue(
+ authenticatedEnvProd,
+ messageProd.queue
+ );
+ expect(queueConcurrency2).toBe(0);
+ const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd);
+ expect(envConcurrency2).toBe(0);
+ const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvProd);
+ expect(projectConcurrency2).toBe(0);
+ const taskConcurrency2 = await queue.currentConcurrencyOfTask(
+ authenticatedEnvProd,
+ messageProd.taskIdentifier
+ );
+ expect(taskConcurrency2).toBe(0);
+
+ //check the message is still there
+ const exists2 = await redis.exists(key);
+ expect(exists2).toBe(1);
+
+ //check it's in the dlq
+ const dlqKey = "dlq";
+ const dlqExists = await redis.exists(dlqKey);
+ expect(dlqExists).toBe(1);
+ const dlqMembers = await redis.zrange(dlqKey, 0, -1);
+ expect(dlqMembers).toContain(messageProd.runId);
+
+ //redrive
+ const redisClient = new Redis({
+ host: redisContainer.getHost(),
+ port: redisContainer.getPort(),
+ password: redisContainer.getPassword(),
+ });
+
+ // Publish redrive message
+ await redisClient.publish(
+ "rq:redrive",
+ JSON.stringify({ runId: messageProd.runId, orgId: messageProd.orgId })
+ );
+
+ // Wait for the item to be redrived and processed
+ await setTimeout(5_000);
+ await redisClient.quit();
+
+ //shouldn't be in the dlq now
+ const dlqMembersAfter = await redis.zrange(dlqKey, 0, -1);
+ expect(dlqMembersAfter).not.toContain(messageProd.runId);
+
+ //dequeue
+ const messages3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10);
+ expect(messages3[0].messageId).toBe(messageProd.runId);
+ } finally {
+ try {
+ await queue.quit();
+ await redis.quit();
+ } catch (e) {}
+ }
+ });
+});
diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts
new file mode 100644
index 0000000000..4c58ff4bb3
--- /dev/null
+++ b/internal-packages/run-engine/src/run-queue/index.ts
@@ -0,0 +1,1679 @@
+import { context, propagation, Span, SpanKind, SpanOptions, Tracer } from "@opentelemetry/api";
+import {
+ SEMATTRS_MESSAGE_ID,
+ SEMATTRS_MESSAGING_OPERATION,
+ SEMATTRS_MESSAGING_SYSTEM,
+} from "@opentelemetry/semantic-conventions";
+import { Logger } from "@trigger.dev/core/logger";
+import { calculateNextRetryDelay, flattenAttributes } from "@trigger.dev/core/v3";
+import { type RetryOptions } from "@trigger.dev/core/v3/schemas";
+import { Redis, type Callback, type RedisOptions, type Result } from "ioredis";
+import {
+ attributesFromAuthenticatedEnv,
+ MinimalAuthenticatedEnvironment,
+} from "../shared/index.js";
+import { RunQueueShortKeyProducer } from "./keyProducer.js";
+import {
+ InputPayload,
+ OutputPayload,
+ QueueCapacities,
+ QueueRange,
+ RunQueueKeyProducer,
+ RunQueuePriorityStrategy,
+} from "./types.js";
+
+const SemanticAttributes = {
+ QUEUE: "runqueue.queue",
+ MASTER_QUEUES: "runqueue.masterQueues",
+ RUN_ID: "runqueue.runId",
+ RESULT_COUNT: "runqueue.resultCount",
+ CONCURRENCY_KEY: "runqueue.concurrencyKey",
+ ORG_ID: "runqueue.orgId",
+};
+
+export type RunQueueOptions = {
+ name: string;
+ tracer: Tracer;
+ redis: RedisOptions;
+ defaultEnvConcurrency: number;
+ windowSize?: number;
+ queuePriorityStrategy: RunQueuePriorityStrategy;
+ envQueuePriorityStrategy: RunQueuePriorityStrategy;
+ verbose?: boolean;
+ logger: Logger;
+ retryOptions?: RetryOptions;
+};
+
+type DequeuedMessage = {
+ messageId: string;
+ messageScore: string;
+ message: OutputPayload;
+};
+
+const defaultRetrySettings = {
+ maxAttempts: 12,
+ factor: 2,
+ minTimeoutInMs: 1_000,
+ maxTimeoutInMs: 3_600_000,
+ randomize: true,
+};
+
+/**
+ * RunQueue – the queue that's used to process runs
+ */
+export class RunQueue {
+ private retryOptions: RetryOptions;
+ private subscriber: Redis;
+ private logger: Logger;
+ private redis: Redis;
+ public keys: RunQueueKeyProducer;
+ private queuePriorityStrategy: RunQueuePriorityStrategy;
+
+ constructor(private readonly options: RunQueueOptions) {
+ this.retryOptions = options.retryOptions ?? defaultRetrySettings;
+ this.redis = new Redis(options.redis);
+ this.logger = options.logger;
+
+ this.keys = new RunQueueShortKeyProducer("rq:");
+ this.queuePriorityStrategy = options.queuePriorityStrategy;
+
+ this.subscriber = new Redis(options.redis);
+ this.#setupSubscriber();
+
+ this.#registerCommands();
+ }
+
+ get name() {
+ return this.options.name;
+ }
+
+ get tracer() {
+ return this.options.tracer;
+ }
+
+ public async updateQueueConcurrencyLimits(
+ env: MinimalAuthenticatedEnvironment,
+ queue: string,
+ concurrency: number
+ ) {
+ return this.redis.set(this.keys.queueConcurrencyLimitKey(env, queue), concurrency);
+ }
+
+ public async removeQueueConcurrencyLimits(env: MinimalAuthenticatedEnvironment, queue: string) {
+ return this.redis.del(this.keys.queueConcurrencyLimitKey(env, queue));
+ }
+
+ public async getQueueConcurrencyLimit(env: MinimalAuthenticatedEnvironment, queue: string) {
+ const result = await this.redis.get(this.keys.queueConcurrencyLimitKey(env, queue));
+
+ return result ? Number(result) : undefined;
+ }
+
+ public async updateEnvConcurrencyLimits(env: MinimalAuthenticatedEnvironment) {
+ await this.#callUpdateGlobalConcurrencyLimits({
+ envConcurrencyLimitKey: this.keys.envConcurrencyLimitKey(env),
+ envConcurrencyLimit: env.maximumConcurrencyLimit,
+ });
+ }
+
+ public async getEnvConcurrencyLimit(env: MinimalAuthenticatedEnvironment) {
+ const result = await this.redis.get(this.keys.envConcurrencyLimitKey(env));
+
+ return result ? Number(result) : this.options.defaultEnvConcurrency;
+ }
+
+ public async lengthOfQueue(
+ env: MinimalAuthenticatedEnvironment,
+ queue: string,
+ concurrencyKey?: string
+ ) {
+ return this.redis.zcard(this.keys.queueKey(env, queue, concurrencyKey));
+ }
+
+ public async lengthOfEnvQueue(env: MinimalAuthenticatedEnvironment) {
+ return this.redis.zcard(this.keys.envQueueKey(env));
+ }
+
+ public async oldestMessageInQueue(
+ env: MinimalAuthenticatedEnvironment,
+ queue: string,
+ concurrencyKey?: string
+ ) {
+ // Get the "score" of the sorted set to get the oldest message score
+ const result = await this.redis.zrange(
+ this.keys.queueKey(env, queue, concurrencyKey),
+ 0,
+ 0,
+ "WITHSCORES"
+ );
+
+ if (result.length === 0) {
+ return;
+ }
+
+ return Number(result[1]);
+ }
+
+ public async currentConcurrencyOfQueue(
+ env: MinimalAuthenticatedEnvironment,
+ queue: string,
+ concurrencyKey?: string
+ ) {
+ return this.redis.scard(this.keys.currentConcurrencyKey(env, queue, concurrencyKey));
+ }
+
+ public async currentConcurrencyOfEnvironment(env: MinimalAuthenticatedEnvironment) {
+ return this.redis.scard(this.keys.envCurrentConcurrencyKey(env));
+ }
+
+ public async currentConcurrencyOfProject(env: MinimalAuthenticatedEnvironment) {
+ return this.redis.scard(this.keys.projectCurrentConcurrencyKey(env));
+ }
+
+ public async currentConcurrencyOfTask(
+ env: MinimalAuthenticatedEnvironment,
+ taskIdentifier: string
+ ) {
+ return this.redis.scard(this.keys.taskIdentifierCurrentConcurrencyKey(env, taskIdentifier));
+ }
+
+ public async enqueueMessage({
+ env,
+ message,
+ masterQueues,
+ }: {
+ env: MinimalAuthenticatedEnvironment;
+ message: InputPayload;
+ masterQueues: string | string[];
+ }) {
+ return await this.#trace(
+ "enqueueMessage",
+ async (span) => {
+ const { runId, concurrencyKey } = message;
+
+ const queue = this.keys.queueKey(env, message.queue, concurrencyKey);
+
+ propagation.inject(context.active(), message);
+
+ const parentQueues = typeof masterQueues === "string" ? [masterQueues] : masterQueues;
+
+ span.setAttributes({
+ [SemanticAttributes.QUEUE]: queue,
+ [SemanticAttributes.RUN_ID]: runId,
+ [SemanticAttributes.CONCURRENCY_KEY]: concurrencyKey,
+ [SemanticAttributes.MASTER_QUEUES]: parentQueues.join(","),
+ });
+
+ const messagePayload: OutputPayload = {
+ ...message,
+ version: "1",
+ queue,
+ masterQueues: parentQueues,
+ attempt: 0,
+ };
+
+ await this.#callEnqueueMessage(messagePayload, parentQueues);
+ },
+ {
+ kind: SpanKind.PRODUCER,
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "publish",
+ [SEMATTRS_MESSAGE_ID]: message.runId,
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ ...attributesFromAuthenticatedEnv(env),
+ },
+ }
+ );
+ }
+
+ public async getSharedQueueDetails(masterQueue: string, maxCount: number) {
+ const { range } = await this.queuePriorityStrategy.nextCandidateSelection(
+ masterQueue,
+ "getSharedQueueDetails"
+ );
+ const queues = await this.#getChildQueuesWithScores(masterQueue, range);
+
+ const queuesWithScores = await this.#calculateQueueScores(queues, (queue) =>
+ this.#calculateMessageQueueCapacities(queue)
+ );
+
+ // We need to priority shuffle here to ensure all workers aren't just working on the highest priority queue
+ const result = this.queuePriorityStrategy.chooseQueues(
+ queuesWithScores,
+ masterQueue,
+ "getSharedQueueDetails",
+ range,
+ maxCount
+ );
+
+ return {
+ selectionId: "getSharedQueueDetails",
+ queues,
+ queuesWithScores,
+ nextRange: range,
+ queueCount: queues.length,
+ queueChoice: result,
+ };
+ }
+
+ /**
+ * Dequeue messages from the master queue
+ */
+ public async dequeueMessageFromMasterQueue(
+ consumerId: string,
+ masterQueue: string,
+ maxCount: number
+  ): Promise<DequeuedMessage[]> {
+ return this.#trace(
+ "dequeueMessageInSharedQueue",
+ async (span) => {
+ // Read the parent queue for matching queues
+ const selectedQueues = await this.#getRandomQueueFromParentQueue(
+ masterQueue,
+ this.options.queuePriorityStrategy,
+ (queue) => this.#calculateMessageQueueCapacities(queue, { checkForDisabled: true }),
+ consumerId,
+ maxCount
+ );
+
+ if (!selectedQueues || selectedQueues.length === 0) {
+ return [];
+ }
+
+ const messages: DequeuedMessage[] = [];
+ const remainingMessages = selectedQueues.map((q) => q.size);
+ let currentQueueIndex = 0;
+
+ while (messages.length < maxCount) {
+ let foundMessage = false;
+
+ // Try each queue once in this round
+ for (let i = 0; i < selectedQueues.length; i++) {
+ currentQueueIndex = (currentQueueIndex + i) % selectedQueues.length;
+
+ // Skip if this queue is empty
+ if (remainingMessages[currentQueueIndex] <= 0) continue;
+
+ const selectedQueue = selectedQueues[currentQueueIndex];
+ const queue = selectedQueue.queue;
+
+ const message = await this.#callDequeueMessage({
+ messageQueue: queue,
+ concurrencyLimitKey: this.keys.concurrencyLimitKeyFromQueue(queue),
+ currentConcurrencyKey: this.keys.currentConcurrencyKeyFromQueue(queue),
+ envConcurrencyLimitKey: this.keys.envConcurrencyLimitKeyFromQueue(queue),
+ envCurrentConcurrencyKey: this.keys.envCurrentConcurrencyKeyFromQueue(queue),
+ projectCurrentConcurrencyKey: this.keys.projectCurrentConcurrencyKeyFromQueue(queue),
+ messageKeyPrefix: this.keys.messageKeyPrefixFromQueue(queue),
+ envQueueKey: this.keys.envQueueKeyFromQueue(queue),
+ taskCurrentConcurrentKeyPrefix:
+ this.keys.taskIdentifierCurrentConcurrencyKeyPrefixFromQueue(queue),
+ });
+
+ if (message) {
+ messages.push(message);
+ remainingMessages[currentQueueIndex]--;
+ foundMessage = true;
+ break;
+ } else {
+ // If we failed to get a message, mark this queue as empty
+ remainingMessages[currentQueueIndex] = 0;
+ }
+ }
+
+ // If we couldn't get a message from any queue, break
+ if (!foundMessage) break;
+ }
+
+ span.setAttributes({
+ [SemanticAttributes.RESULT_COUNT]: messages.length,
+ [SemanticAttributes.MASTER_QUEUES]: masterQueue,
+ });
+
+ return messages;
+ },
+ {
+ kind: SpanKind.CONSUMER,
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "receive",
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ },
+ }
+ );
+ }
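+
+  // For example (mirroring the dequeue tests), a worker pulls up to `maxCount`
+  // messages in one call, drawn round-robin from the selected queues. The
+  // consumer id and master queue name below are placeholders:
+  //
+  //   const batch = await runQueue.dequeueMessageFromMasterQueue("worker-1", "main", 10);
+  //   // batch is an array of { messageId, messageScore, message }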
+
+ /**
+ * Acknowledge a message, which will:
+ * - remove all data from the queue
+ * - release all concurrency
+ * This is done when the run is in a final state.
+ * @param messageId
+ */
+ public async acknowledgeMessage(orgId: string, messageId: string) {
+ return this.#trace(
+ "acknowledgeMessage",
+ async (span) => {
+ const message = await this.#readMessage(orgId, messageId);
+
+ if (!message) {
+ this.logger.log(`[${this.name}].acknowledgeMessage() message not found`, {
+ messageId,
+ service: this.name,
+ });
+ return;
+ }
+
+ span.setAttributes({
+ [SemanticAttributes.QUEUE]: message.queue,
+ [SemanticAttributes.ORG_ID]: message.orgId,
+ [SemanticAttributes.RUN_ID]: messageId,
+ [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey,
+ });
+
+ await this.#callAcknowledgeMessage({
+ messageId,
+ messageQueue: message.queue,
+ masterQueues: message.masterQueues,
+ messageKey: this.keys.messageKey(orgId, messageId),
+ concurrencyKey: this.keys.currentConcurrencyKeyFromQueue(message.queue),
+ envConcurrencyKey: this.keys.envCurrentConcurrencyKeyFromQueue(message.queue),
+ taskConcurrencyKey: this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue(
+ message.queue,
+ message.taskIdentifier
+ ),
+ envQueueKey: this.keys.envQueueKeyFromQueue(message.queue),
+ projectConcurrencyKey: this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue),
+ });
+ },
+ {
+ kind: SpanKind.CONSUMER,
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "ack",
+ [SEMATTRS_MESSAGE_ID]: messageId,
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ },
+ }
+ );
+ }
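+
+  // Typical happy-path lifecycle, as exercised by the "Acking" test (the instance
+  // and consumer names here are placeholders):
+  //
+  //   await runQueue.enqueueMessage({ env, message, masterQueues: "main" });
+  //   const [dequeued] = await runQueue.dequeueMessageFromMasterQueue("worker-1", "main", 1);
+  //   await runQueue.acknowledgeMessage(dequeued.message.orgId, dequeued.messageId);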
+
+ /**
+ * Negative acknowledge a message, which will requeue the message (with an optional future date).
+   * If you pass no date it will get reattempted with exponential backoff.
+ */
+ public async nackMessage({
+ orgId,
+ messageId,
+ retryAt,
+ incrementAttemptCount = true,
+ }: {
+ orgId: string;
+ messageId: string;
+ retryAt?: number;
+ incrementAttemptCount?: boolean;
+ }) {
+ return this.#trace(
+ "nackMessage",
+ async (span) => {
+ const maxAttempts = this.retryOptions.maxAttempts ?? defaultRetrySettings.maxAttempts;
+
+ const message = await this.#readMessage(orgId, messageId);
+ if (!message) {
+ this.logger.log(`[${this.name}].nackMessage() message not found`, {
+ orgId,
+ messageId,
+ maxAttempts,
+ retryAt,
+ service: this.name,
+ });
+ return;
+ }
+
+ span.setAttributes({
+ [SemanticAttributes.QUEUE]: message.queue,
+ [SemanticAttributes.RUN_ID]: messageId,
+ [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey,
+ [SemanticAttributes.MASTER_QUEUES]: message.masterQueues.join(","),
+ });
+
+ const messageKey = this.keys.messageKey(orgId, messageId);
+ const messageQueue = message.queue;
+ const concurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue);
+ const envConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue);
+ const taskConcurrencyKey = this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue(
+ message.queue,
+ message.taskIdentifier
+ );
+ const projectConcurrencyKey = this.keys.projectCurrentConcurrencyKeyFromQueue(
+ message.queue
+ );
+ const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue);
+
+ if (incrementAttemptCount) {
+ message.attempt = message.attempt + 1;
+ if (message.attempt >= maxAttempts) {
+ await this.redis.moveToDeadLetterQueue(
+ messageKey,
+ messageQueue,
+ concurrencyKey,
+ envConcurrencyKey,
+ projectConcurrencyKey,
+ envQueueKey,
+ taskConcurrencyKey,
+ "dlq",
+ messageId,
+ JSON.stringify(message.masterQueues),
+ this.options.redis.keyPrefix ?? ""
+ );
+ return false;
+ }
+ }
+
+ const nextRetryDelay = calculateNextRetryDelay(this.retryOptions, message.attempt);
+ const messageScore = retryAt ?? (nextRetryDelay ? Date.now() + nextRetryDelay : Date.now());
+
+ this.logger.debug("Calling nackMessage", {
+ messageKey,
+ messageQueue,
+ masterQueues: message.masterQueues,
+ concurrencyKey,
+ envConcurrencyKey,
+ projectConcurrencyKey,
+ envQueueKey,
+ taskConcurrencyKey,
+ messageId,
+ messageScore,
+ attempt: message.attempt,
+ service: this.name,
+ });
+
+ await this.redis.nackMessage(
+ //keys
+ messageKey,
+ messageQueue,
+ concurrencyKey,
+ envConcurrencyKey,
+ projectConcurrencyKey,
+ envQueueKey,
+ taskConcurrencyKey,
+ //args
+ messageId,
+ JSON.stringify(message),
+ String(messageScore),
+ JSON.stringify(message.masterQueues),
+ this.options.redis.keyPrefix ?? ""
+ );
+ return true;
+ },
+ {
+ kind: SpanKind.CONSUMER,
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "nack",
+ [SEMATTRS_MESSAGE_ID]: messageId,
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ },
+ }
+ );
+ }
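+
+  // Examples (argument names are placeholders):
+  //
+  //   await runQueue.nackMessage({ orgId, messageId });                               // backoff from attempt count
+  //   await runQueue.nackMessage({ orgId, messageId, retryAt: Date.now() + 60_000 }); // explicit retry time
+  //
+  // With the default retry settings (factor 2, min 1s, max 1h), repeated nacks are
+  // expected to be rescheduled with roughly exponential spacing (about 1s, 2s, 4s, ...
+  // capped at 1h, plus jitter when `randomize` is set); the exact delay comes from
+  // calculateNextRetryDelay. Once the attempt count reaches `maxAttempts`, the message
+  // is moved to the dead letter queue instead.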
+
+ public async releaseConcurrency(
+ orgId: string,
+ messageId: string,
+ releaseForRun: boolean = false
+ ) {
+ return this.#trace(
+ "releaseConcurrency",
+ async (span) => {
+ const message = await this.#readMessage(orgId, messageId);
+
+ if (!message) {
+          this.logger.log(`[${this.name}].releaseConcurrency() message not found`, {
+ messageId,
+ service: this.name,
+ });
+ return;
+ }
+
+ span.setAttributes({
+ [SemanticAttributes.QUEUE]: message.queue,
+ [SemanticAttributes.ORG_ID]: message.orgId,
+ [SemanticAttributes.RUN_ID]: messageId,
+ [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey,
+ });
+
+ return this.redis.releaseConcurrency(
+ this.keys.messageKey(orgId, messageId),
+ message.queue,
+ releaseForRun ? this.keys.currentConcurrencyKeyFromQueue(message.queue) : "",
+ this.keys.envCurrentConcurrencyKeyFromQueue(message.queue),
+ this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue),
+ this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue(
+ message.queue,
+ message.taskIdentifier
+ ),
+ messageId,
+ JSON.stringify(message.masterQueues)
+ );
+ },
+ {
+ kind: SpanKind.CONSUMER,
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "releaseConcurrency",
+ [SEMATTRS_MESSAGE_ID]: messageId,
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ },
+ }
+ );
+ }
+
+ public async reacquireConcurrency(orgId: string, messageId: string) {
+ return this.#trace(
+ "reacquireConcurrency",
+ async (span) => {
+ const message = await this.#readMessage(orgId, messageId);
+
+ if (!message) {
+          this.logger.log(`[${this.name}].reacquireConcurrency() message not found`, {
+ messageId,
+ service: this.name,
+ });
+ return;
+ }
+
+ span.setAttributes({
+ [SemanticAttributes.QUEUE]: message.queue,
+ [SemanticAttributes.ORG_ID]: message.orgId,
+ [SemanticAttributes.RUN_ID]: messageId,
+ [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey,
+ });
+
+ return this.redis.reacquireConcurrency(
+ this.keys.messageKey(orgId, messageId),
+ message.queue,
+ this.keys.currentConcurrencyKeyFromQueue(message.queue),
+ this.keys.envCurrentConcurrencyKeyFromQueue(message.queue),
+ this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue),
+ this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue(
+ message.queue,
+ message.taskIdentifier
+ ),
+ messageId,
+ JSON.stringify(message.masterQueues)
+ );
+ },
+ {
+ kind: SpanKind.CONSUMER,
+ attributes: {
+          [SEMATTRS_MESSAGING_OPERATION]: "reacquireConcurrency",
+ [SEMATTRS_MESSAGE_ID]: messageId,
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ },
+ }
+ );
+ }
+
+ queueConcurrencyScanStream(
+ count: number = 100,
+ onEndCallback?: () => void,
+ onErrorCallback?: (error: Error) => void
+ ) {
+ const pattern = this.keys.queueCurrentConcurrencyScanPattern();
+
+ this.logger.debug("Starting queue concurrency scan stream", {
+ pattern,
+ component: "runqueue",
+ operation: "queueConcurrencyScanStream",
+ service: this.name,
+ count,
+ });
+
+ const redis = this.redis.duplicate();
+
+ const stream = redis.scanStream({
+ match: pattern,
+ type: "set",
+ count,
+ });
+
+ stream.on("end", () => {
+ onEndCallback?.();
+ redis.quit();
+ });
+
+ stream.on("error", (error) => {
+ onErrorCallback?.(error);
+ redis.quit();
+ });
+
+ return { stream, redis };
+ }
+
+ async quit() {
+ await this.subscriber.unsubscribe();
+ await this.subscriber.quit();
+ await this.redis.quit();
+ }
+
+ private async handleRedriveMessage(channel: string, message: string) {
+ try {
+ const { runId, orgId } = JSON.parse(message) as any;
+ if (typeof orgId !== "string" || typeof runId !== "string") {
+ this.logger.error(
+ "handleRedriveMessage: invalid message format: runId and orgId must be strings",
+ { message, channel }
+ );
+ return;
+ }
+
+ const data = await this.#readMessage(orgId, runId);
+
+ if (!data) {
+ this.logger.error(`handleRedriveMessage: couldn't read message`, { orgId, runId, channel });
+ return;
+ }
+
+ await this.enqueueMessage({
+ env: {
+ id: data.environmentId,
+ type: data.environmentType,
+ //this isn't used in enqueueMessage
+ maximumConcurrencyLimit: -1,
+ project: {
+ id: data.projectId,
+ },
+ organization: {
+ id: data.orgId,
+ },
+ },
+ message: {
+ ...data,
+ attempt: 0,
+ },
+ masterQueues: data.masterQueues,
+ });
+
+ //remove from the dlq
+ const result = await this.redis.zrem("dlq", runId);
+
+ if (result === 0) {
+ this.logger.error(`handleRedriveMessage: couldn't remove message from dlq`, {
+ orgId,
+ runId,
+ channel,
+ });
+ return;
+ }
+
+ this.logger.log(`handleRedriveMessage: redrived item ${runId} from Dead Letter Queue`);
+ } catch (error) {
+ this.logger.error("Error processing redrive message", { error, message });
+ }
+ }
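+
+  // Redrive is triggered by publishing to the `<queue name>:redrive` channel, as the
+  // Dead Letter Queue test does (ids below are placeholders; "rq" is the queue name
+  // used in the tests):
+  //
+  //   await redis.publish("rq:redrive", JSON.stringify({ runId: "r1234", orgId: "o1234" }));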
+
+  async #trace<T>(
+    name: string,
+    fn: (span: Span) => Promise<T>,
+    options?: SpanOptions & { sampleRate?: number }
+  ): Promise<T> {
+ return this.tracer.startActiveSpan(
+ name,
+ {
+ ...options,
+ attributes: {
+ ...options?.attributes,
+ },
+ },
+ async (span) => {
+ try {
+ return await fn(span);
+ } catch (e) {
+ if (e instanceof Error) {
+ span.recordException(e);
+ } else {
+ span.recordException(new Error(String(e)));
+ }
+
+ throw e;
+ } finally {
+ span.end();
+ }
+ }
+ );
+ }
+
+ async #setupSubscriber() {
+ const channel = `${this.options.name}:redrive`;
+ this.subscriber.subscribe(channel, (err) => {
+ if (err) {
+ this.logger.error(`Failed to subscribe to ${channel}`, { error: err });
+ } else {
+ this.logger.log(`Subscribed to ${channel}`);
+ }
+ });
+
+ this.subscriber.on("message", this.handleRedriveMessage.bind(this));
+ }
+
+ async #readMessage(orgId: string, messageId: string) {
+ return this.#trace(
+ "readMessage",
+ async (span) => {
+ const rawMessage = await this.redis.get(this.keys.messageKey(orgId, messageId));
+
+ if (!rawMessage) {
+ return;
+ }
+
+ const message = OutputPayload.safeParse(JSON.parse(rawMessage));
+
+ if (!message.success) {
+ this.logger.error(`[${this.name}] Failed to parse message`, {
+ messageId,
+ error: message.error,
+ service: this.name,
+ });
+
+ return;
+ }
+
+ return message.data;
+ },
+ {
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "receive",
+ [SEMATTRS_MESSAGE_ID]: messageId,
+          [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ [SemanticAttributes.RUN_ID]: messageId,
+ },
+ }
+ );
+ }
+
+ async #getRandomQueueFromParentQueue(
+ parentQueue: string,
+ queuePriorityStrategy: RunQueuePriorityStrategy,
+    calculateCapacities: (queue: string) => Promise<QueueCapacities>,
+ consumerId: string,
+ maxCount: number
+ ): Promise<
+ | {
+ queue: string;
+ capacities: QueueCapacities;
+ age: number;
+ size: number;
+ }[]
+ | undefined
+ > {
+ return this.#trace(
+ "getRandomQueueFromParentQueue",
+ async (span) => {
+ span.setAttribute("consumerId", consumerId);
+
+ const { range } = await queuePriorityStrategy.nextCandidateSelection(
+ parentQueue,
+ consumerId
+ );
+
+ const queues = await this.#getChildQueuesWithScores(parentQueue, range, span);
+ span.setAttribute("queueCount", queues.length);
+
+ const queuesWithScores = await this.#calculateQueueScores(queues, calculateCapacities);
+ span.setAttribute("queuesWithScoresCount", queuesWithScores.length);
+
+ // We need to priority shuffle here to ensure all workers aren't just working on the highest priority queue
+ const { choices, nextRange } = queuePriorityStrategy.chooseQueues(
+ queuesWithScores,
+ parentQueue,
+ consumerId,
+ range,
+ maxCount
+ );
+
+ span.setAttributes({
+ ...flattenAttributes(queues, "runqueue.queues"),
+ });
+ span.setAttributes({
+ ...flattenAttributes(queuesWithScores, "runqueue.queuesWithScores"),
+ });
+ span.setAttribute("range.offset", range.offset);
+ span.setAttribute("range.count", range.count);
+ span.setAttribute("nextRange.offset", nextRange.offset);
+ span.setAttribute("nextRange.count", nextRange.count);
+
+ if (this.options.verbose || nextRange.offset > 0) {
+ if (Array.isArray(choices)) {
+ this.logger.debug(`[${this.name}] getRandomQueueFromParentQueue`, {
+ queues,
+ queuesWithScores,
+ range,
+ nextRange,
+ queueCount: queues.length,
+ queuesWithScoresCount: queuesWithScores.length,
+ queueChoices: choices,
+ consumerId,
+ });
+ } else {
+ this.logger.debug(`[${this.name}] getRandomQueueFromParentQueue`, {
+ queues,
+ queuesWithScores,
+ range,
+ nextRange,
+ queueCount: queues.length,
+ queuesWithScoresCount: queuesWithScores.length,
+ noQueueChoice: true,
+ consumerId,
+ });
+ }
+ }
+
+ if (Array.isArray(choices)) {
+ span.setAttribute("queueChoices", choices);
+ return queuesWithScores.filter((queue) => choices.includes(queue.queue));
+ } else {
+ span.setAttribute("noQueueChoice", true);
+ return;
+ }
+ },
+ {
+ kind: SpanKind.CONSUMER,
+ attributes: {
+ [SEMATTRS_MESSAGING_OPERATION]: "receive",
+ [SEMATTRS_MESSAGING_SYSTEM]: "runqueue",
+ [SemanticAttributes.MASTER_QUEUES]: parentQueue,
+ },
+ }
+ );
+ }
+
+ // Calculate the weights of the queues based on the age and the capacity
+ async #calculateQueueScores(
+ queues: Array<{ value: string; score: number }>,
+    calculateCapacities: (queue: string) => Promise<QueueCapacities>
+ ) {
+ const now = Date.now();
+
+ const queueScores = await Promise.all(
+ queues.map(async (queue) => {
+ return {
+ queue: queue.value,
+ capacities: await calculateCapacities(queue.value),
+ age: now - queue.score,
+ size: await this.redis.zcard(queue.value),
+ };
+ })
+ );
+
+ return queueScores;
+ }
+
+ async #calculateMessageQueueCapacities(queue: string, options?: { checkForDisabled?: boolean }) {
+ return await this.#callCalculateMessageCapacities({
+ currentConcurrencyKey: this.keys.currentConcurrencyKeyFromQueue(queue),
+ currentEnvConcurrencyKey: this.keys.envCurrentConcurrencyKeyFromQueue(queue),
+ concurrencyLimitKey: this.keys.concurrencyLimitKeyFromQueue(queue),
+ envConcurrencyLimitKey: this.keys.envConcurrencyLimitKeyFromQueue(queue),
+ disabledConcurrencyLimitKey: options?.checkForDisabled
+ ? this.keys.disabledConcurrencyLimitKeyFromQueue(queue)
+ : undefined,
+ });
+ }
+
+ async #getChildQueuesWithScores(
+ key: string,
+ range: QueueRange,
+ span?: Span
+  ): Promise<Array<{ value: string; score: number }>> {
+ const valuesWithScores = await this.redis.zrangebyscore(
+ key,
+ "-inf",
+ Date.now(),
+ "WITHSCORES",
+ "LIMIT",
+ range.offset,
+ range.count
+ );
+
+ span?.setAttribute("zrangebyscore.valuesWithScores.rawLength", valuesWithScores.length);
+ span?.setAttributes({
+ ...flattenAttributes(valuesWithScores, "zrangebyscore.valuesWithScores.rawValues"),
+ });
+
+ const result: Array<{ value: string; score: number }> = [];
+
+ for (let i = 0; i < valuesWithScores.length; i += 2) {
+ result.push({
+ value: valuesWithScores[i],
+ score: Number(valuesWithScores[i + 1]),
+ });
+ }
+
+ return result;
+ }
+
+ async #callEnqueueMessage(message: OutputPayload, masterQueues: string[]) {
+ const concurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue);
+ const envConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue);
+ const taskConcurrencyKey = this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue(
+ message.queue,
+ message.taskIdentifier
+ );
+ const projectConcurrencyKey = this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue);
+
+ this.logger.debug("Calling enqueueMessage", {
+ messagePayload: message,
+ concurrencyKey,
+ envConcurrencyKey,
+ masterQueues,
+ service: this.name,
+ });
+
+ return this.redis.enqueueMessage(
+ message.queue,
+ this.keys.messageKey(message.orgId, message.runId),
+ concurrencyKey,
+ envConcurrencyKey,
+ taskConcurrencyKey,
+ projectConcurrencyKey,
+ this.keys.envQueueKeyFromQueue(message.queue),
+ message.queue,
+ message.runId,
+ JSON.stringify(message),
+ String(message.timestamp),
+ JSON.stringify(masterQueues),
+ this.options.redis.keyPrefix ?? ""
+ );
+ }
+
+ async #callDequeueMessage({
+ messageQueue,
+ concurrencyLimitKey,
+ envConcurrencyLimitKey,
+ currentConcurrencyKey,
+ envCurrentConcurrencyKey,
+ projectCurrentConcurrencyKey,
+ messageKeyPrefix,
+ envQueueKey,
+ taskCurrentConcurrentKeyPrefix,
+ }: {
+ messageQueue: string;
+ concurrencyLimitKey: string;
+ envConcurrencyLimitKey: string;
+ currentConcurrencyKey: string;
+ envCurrentConcurrencyKey: string;
+ projectCurrentConcurrencyKey: string;
+ messageKeyPrefix: string;
+ envQueueKey: string;
+ taskCurrentConcurrentKeyPrefix: string;
+  }): Promise<DequeuedMessage | undefined> {
+ const result = await this.redis.dequeueMessage(
+ //keys
+ messageQueue,
+ concurrencyLimitKey,
+ envConcurrencyLimitKey,
+ currentConcurrencyKey,
+ envCurrentConcurrencyKey,
+ projectCurrentConcurrencyKey,
+ messageKeyPrefix,
+ envQueueKey,
+ taskCurrentConcurrentKeyPrefix,
+ //args
+ messageQueue,
+ String(Date.now()),
+ String(this.options.defaultEnvConcurrency),
+ this.options.redis.keyPrefix ?? ""
+ );
+
+ if (!result) {
+ return;
+ }
+
+ this.logger.debug("Dequeue message result", {
+ result,
+ service: this.name,
+ });
+
+ if (result.length !== 3) {
+ this.logger.error("Invalid dequeue message result", {
+ result,
+ service: this.name,
+ });
+ return;
+ }
+
+ const [messageId, messageScore, rawMessage] = result;
+
+ //read message
+ const parsedMessage = OutputPayload.safeParse(JSON.parse(rawMessage));
+ if (!parsedMessage.success) {
+ this.logger.error(`[${this.name}] Failed to parse message`, {
+ messageId,
+ error: parsedMessage.error,
+ service: this.name,
+ });
+
+ return;
+ }
+
+ const message = parsedMessage.data;
+
+ return {
+ messageId,
+ messageScore,
+ message,
+ };
+ }
+
+ async #callAcknowledgeMessage({
+ messageId,
+ masterQueues,
+ messageKey,
+ messageQueue,
+ concurrencyKey,
+ envConcurrencyKey,
+ taskConcurrencyKey,
+ envQueueKey,
+ projectConcurrencyKey,
+ }: {
+ masterQueues: string[];
+ messageKey: string;
+ messageQueue: string;
+ concurrencyKey: string;
+ envConcurrencyKey: string;
+ taskConcurrencyKey: string;
+ envQueueKey: string;
+ projectConcurrencyKey: string;
+ messageId: string;
+ }) {
+ this.logger.debug("Calling acknowledgeMessage", {
+ messageKey,
+ messageQueue,
+ concurrencyKey,
+ envConcurrencyKey,
+ projectConcurrencyKey,
+ envQueueKey,
+ taskConcurrencyKey,
+ messageId,
+ masterQueues,
+ service: this.name,
+ });
+
+ return this.redis.acknowledgeMessage(
+ messageKey,
+ messageQueue,
+ concurrencyKey,
+ envConcurrencyKey,
+ projectConcurrencyKey,
+ envQueueKey,
+ taskConcurrencyKey,
+ messageId,
+ JSON.stringify(masterQueues),
+ this.options.redis.keyPrefix ?? ""
+ );
+ }
+
+ async #callCalculateMessageCapacities({
+ currentConcurrencyKey,
+ currentEnvConcurrencyKey,
+ concurrencyLimitKey,
+ envConcurrencyLimitKey,
+ disabledConcurrencyLimitKey,
+ }: {
+ currentConcurrencyKey: string;
+ currentEnvConcurrencyKey: string;
+ concurrencyLimitKey: string;
+ envConcurrencyLimitKey: string;
+ disabledConcurrencyLimitKey: string | undefined;
+  }): Promise<QueueCapacities> {
+ const capacities = disabledConcurrencyLimitKey
+ ? await this.redis.calculateMessageQueueCapacitiesWithDisabling(
+ currentConcurrencyKey,
+ currentEnvConcurrencyKey,
+ concurrencyLimitKey,
+ envConcurrencyLimitKey,
+ disabledConcurrencyLimitKey,
+ String(this.options.defaultEnvConcurrency)
+ )
+ : await this.redis.calculateMessageQueueCapacities(
+ currentConcurrencyKey,
+ currentEnvConcurrencyKey,
+ concurrencyLimitKey,
+ envConcurrencyLimitKey,
+ String(this.options.defaultEnvConcurrency)
+ );
+
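+    // The capacity scripts return a flat array, read positionally below:
+    // [0] queue current concurrency, [1] queue concurrency limit (may be unset),
+    // [2] env current concurrency, [3] env concurrency limit, [4] org-enabled flag.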
+ const queueCurrent = Number(capacities[0]);
+ const envLimit = Number(capacities[3]);
+ const isOrgEnabled = Boolean(capacities[4]);
+ const queueLimit = capacities[1]
+ ? Number(capacities[1])
+ : Math.min(envLimit, isOrgEnabled ? Infinity : 0);
+ const envCurrent = Number(capacities[2]);
+
+ return {
+ queue: { current: queueCurrent, limit: queueLimit },
+ env: { current: envCurrent, limit: envLimit },
+ };
+ }
+
+ #callUpdateGlobalConcurrencyLimits({
+ envConcurrencyLimitKey,
+ envConcurrencyLimit,
+ }: {
+ envConcurrencyLimitKey: string;
+ envConcurrencyLimit: number;
+ }) {
+ return this.redis.updateGlobalConcurrencyLimits(
+ envConcurrencyLimitKey,
+ String(envConcurrencyLimit)
+ );
+ }
+
+ #registerCommands() {
+ this.redis.defineCommand("enqueueMessage", {
+ numberOfKeys: 7,
+ lua: `
+local queue = KEYS[1]
+local messageKey = KEYS[2]
+local concurrencyKey = KEYS[3]
+local envConcurrencyKey = KEYS[4]
+local taskConcurrencyKey = KEYS[5]
+local projectConcurrencyKey = KEYS[6]
+local envQueueKey = KEYS[7]
+
+local queueName = ARGV[1]
+local messageId = ARGV[2]
+local messageData = ARGV[3]
+local messageScore = ARGV[4]
+local parentQueues = cjson.decode(ARGV[5])
+local keyPrefix = ARGV[6]
+
+-- Write the message to the message key
+redis.call('SET', messageKey, messageData)
+
+-- Add the message to the queue
+redis.call('ZADD', queue, messageScore, messageId)
+
+-- Add the message to the env queue
+redis.call('ZADD', envQueueKey, messageScore, messageId)
+
+-- Rebalance the parent queues
+local earliestMessage = redis.call('ZRANGE', queue, 0, 0, 'WITHSCORES')
+
+for _, parentQueue in ipairs(parentQueues) do
+ local prefixedParentQueue = keyPrefix .. parentQueue
+ if #earliestMessage == 0 then
+ redis.call('ZREM', prefixedParentQueue, queueName)
+ else
+ redis.call('ZADD', prefixedParentQueue, earliestMessage[2], queueName)
+ end
+end
+
+-- Update the concurrency keys
+redis.call('SREM', concurrencyKey, messageId)
+redis.call('SREM', envConcurrencyKey, messageId)
+redis.call('SREM', taskConcurrencyKey, messageId)
+redis.call('SREM', projectConcurrencyKey, messageId)
+ `,
+ });
+
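+    // Atomically checks the env and queue concurrency limits, pops the oldest message
+    // that is due, claims it in the concurrency sets, and rebalances the master queues.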
+ this.redis.defineCommand("dequeueMessage", {
+ numberOfKeys: 9,
+ lua: `
+local childQueue = KEYS[1]
+local concurrencyLimitKey = KEYS[2]
+local envConcurrencyLimitKey = KEYS[3]
+local currentConcurrencyKey = KEYS[4]
+local envCurrentConcurrencyKey = KEYS[5]
+local projectConcurrencyKey = KEYS[6]
+local messageKeyPrefix = KEYS[7]
+local envQueueKey = KEYS[8]
+local taskCurrentConcurrentKeyPrefix = KEYS[9]
+
+local childQueueName = ARGV[1]
+local currentTime = tonumber(ARGV[2])
+local defaultEnvConcurrencyLimit = ARGV[3]
+local keyPrefix = ARGV[4]
+
+-- Check current env concurrency against the limit
+local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0')
+local envConcurrencyLimit = tonumber(redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit)
+
+if envCurrentConcurrency >= envConcurrencyLimit then
+ return nil
+end
+
+-- Check current queue concurrency against the limit
+local currentConcurrency = tonumber(redis.call('SCARD', currentConcurrencyKey) or '0')
+local concurrencyLimit = tonumber(redis.call('GET', concurrencyLimitKey) or '1000000')
+
+-- The queue limit falls back to 1,000,000 when no explicit limit is set
+if currentConcurrency >= concurrencyLimit then
+ return nil
+end
+
+-- Attempt to dequeue the next message
+local messages = redis.call('ZRANGEBYSCORE', childQueue, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, 1)
+
+if #messages == 0 then
+ return nil
+end
+
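+-- ZRANGEBYSCORE ... WITHSCORES returns a flat {member, score} array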
+local messageId = messages[1]
+local messageScore = tonumber(messages[2])
+
+-- Get the message payload
+local messageKey = messageKeyPrefix .. messageId
+local messagePayload = redis.call('GET', messageKey)
+local decodedPayload = cjson.decode(messagePayload);
+
+-- Extract taskIdentifier
+local taskIdentifier = decodedPayload.taskIdentifier
+
+-- Build the per-task concurrency set key from the taskIdentifier
+local taskConcurrencyKey = taskCurrentConcurrentKeyPrefix .. taskIdentifier
+
+-- Update concurrency
+redis.call('ZREM', childQueue, messageId)
+redis.call('ZREM', envQueueKey, messageId)
+redis.call('SADD', currentConcurrencyKey, messageId)
+redis.call('SADD', envCurrentConcurrencyKey, messageId)
+redis.call('SADD', projectConcurrencyKey, messageId)
+redis.call('SADD', taskConcurrencyKey, messageId)
+
+-- Rebalance the parent queues
+local earliestMessage = redis.call('ZRANGE', childQueue, 0, 0, 'WITHSCORES')
+for _, parentQueue in ipairs(decodedPayload.masterQueues) do
+ local prefixedParentQueue = keyPrefix .. parentQueue
+ if #earliestMessage == 0 then
+ redis.call('ZREM', prefixedParentQueue, childQueue)
+ else
+ redis.call('ZADD', prefixedParentQueue, earliestMessage[2], childQueue)
+ end
+end
+
+return {messageId, messageScore, messagePayload} -- Return message details
+ `,
+ });
+
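+    // Fully removes an acknowledged message: payload, queue entries, master queue
+    // references, and all concurrency set membership.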
+ this.redis.defineCommand("acknowledgeMessage", {
+ numberOfKeys: 7,
+ lua: `
+-- Keys:
+local messageKey = KEYS[1]
+local messageQueue = KEYS[2]
+local concurrencyKey = KEYS[3]
+local envCurrentConcurrencyKey = KEYS[4]
+local projectCurrentConcurrencyKey = KEYS[5]
+local envQueueKey = KEYS[6]
+local taskCurrentConcurrencyKey = KEYS[7]
+
+-- Args:
+local messageId = ARGV[1]
+local parentQueues = cjson.decode(ARGV[2])
+local keyPrefix = ARGV[3]
+
+-- Remove the message from the message key
+redis.call('DEL', messageKey)
+
+-- Remove the message from the queue
+redis.call('ZREM', messageQueue, messageId)
+redis.call('ZREM', envQueueKey, messageId)
+
+-- Rebalance the parent queues
+local earliestMessage = redis.call('ZRANGE', messageQueue, 0, 0, 'WITHSCORES')
+for _, parentQueue in ipairs(parentQueues) do
+ local prefixedParentQueue = keyPrefix .. parentQueue
+ if #earliestMessage == 0 then
+ redis.call('ZREM', prefixedParentQueue, messageQueue)
+ else
+ redis.call('ZADD', prefixedParentQueue, earliestMessage[2], messageQueue)
+ end
+end
+
+-- Update the concurrency keys
+redis.call('SREM', concurrencyKey, messageId)
+redis.call('SREM', envCurrentConcurrencyKey, messageId)
+redis.call('SREM', projectCurrentConcurrencyKey, messageId)
+redis.call('SREM', taskCurrentConcurrencyKey, messageId)
+`,
+ });
+
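+    // Returns a message to its queue: rewrites the payload, releases its concurrency
+    // claims, re-enqueues it with the provided score, and rebalances the master queues.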
+ this.redis.defineCommand("nackMessage", {
+ numberOfKeys: 7,
+ lua: `
+-- Keys:
+local messageKey = KEYS[1]
+local messageQueueKey = KEYS[2]
+local concurrencyKey = KEYS[3]
+local envConcurrencyKey = KEYS[4]
+local projectConcurrencyKey = KEYS[5]
+local envQueueKey = KEYS[6]
+local taskConcurrencyKey = KEYS[7]
+
+-- Args:
+local messageId = ARGV[1]
+local messageData = ARGV[2]
+local messageScore = tonumber(ARGV[3])
+local parentQueues = cjson.decode(ARGV[4])
+local keyPrefix = ARGV[5]
+
+-- Update the message data
+redis.call('SET', messageKey, messageData)
+
+-- Update the concurrency keys
+redis.call('SREM', concurrencyKey, messageId)
+redis.call('SREM', envConcurrencyKey, messageId)
+redis.call('SREM', projectConcurrencyKey, messageId)
+redis.call('SREM', taskConcurrencyKey, messageId)
+
+-- Enqueue the message into the queue
+redis.call('ZADD', messageQueueKey, messageScore, messageId)
+redis.call('ZADD', envQueueKey, messageScore, messageId)
+
+-- Rebalance the parent queues
+local earliestMessage = redis.call('ZRANGE', messageQueueKey, 0, 0, 'WITHSCORES')
+for _, parentQueue in ipairs(parentQueues) do
+ local prefixedParentQueue = keyPrefix .. parentQueue
+ if #earliestMessage == 0 then
+ redis.call('ZREM', prefixedParentQueue, messageQueueKey)
+ else
+ redis.call('ZADD', prefixedParentQueue, earliestMessage[2], messageQueueKey)
+ end
+end
+`,
+ });
+
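+    // Moves a message out of its queue into the dead letter queue, scored by the
+    // current Redis server time, and releases its concurrency claims.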
+ this.redis.defineCommand("moveToDeadLetterQueue", {
+ numberOfKeys: 8,
+ lua: `
+-- Keys:
+local messageKey = KEYS[1]
+local messageQueue = KEYS[2]
+local concurrencyKey = KEYS[3]
+local envCurrentConcurrencyKey = KEYS[4]
+local projectCurrentConcurrencyKey = KEYS[5]
+local envQueueKey = KEYS[6]
+local taskCurrentConcurrencyKey = KEYS[7]
+local deadLetterQueueKey = KEYS[8]
+
+-- Args:
+local messageId = ARGV[1]
+local parentQueues = cjson.decode(ARGV[2])
+local keyPrefix = ARGV[3]
+
+-- Remove the message from the queue
+redis.call('ZREM', messageQueue, messageId)
+redis.call('ZREM', envQueueKey, messageId)
+
+-- Rebalance the parent queues
+local earliestMessage = redis.call('ZRANGE', messageQueue, 0, 0, 'WITHSCORES')
+for _, parentQueue in ipairs(parentQueues) do
+ local prefixedParentQueue = keyPrefix .. parentQueue
+ if #earliestMessage == 0 then
+ redis.call('ZREM', prefixedParentQueue, messageQueue)
+ else
+ redis.call('ZADD', prefixedParentQueue, earliestMessage[2], messageQueue)
+ end
+end
+
+-- Add the message to the dead letter queue
+redis.call('ZADD', deadLetterQueueKey, tonumber(redis.call('TIME')[1]), messageId)
+
+-- Update the concurrency keys
+redis.call('SREM', concurrencyKey, messageId)
+redis.call('SREM', envCurrentConcurrencyKey, messageId)
+redis.call('SREM', projectCurrentConcurrencyKey, messageId)
+redis.call('SREM', taskCurrentConcurrencyKey, messageId)
+`,
+ });
+
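+    // Releases a message's concurrency claims without dequeuing it; the queue-level
+    // set is skipped when an empty key is passed.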
+ this.redis.defineCommand("releaseConcurrency", {
+ numberOfKeys: 6,
+ lua: `
+-- Keys:
+local messageKey = KEYS[1]
+local messageQueue = KEYS[2]
+local concurrencyKey = KEYS[3]
+local envCurrentConcurrencyKey = KEYS[4]
+local projectCurrentConcurrencyKey = KEYS[5]
+local taskCurrentConcurrencyKey = KEYS[6]
+
+-- Args:
+local messageId = ARGV[1]
+
+-- Update the concurrency keys
+if concurrencyKey ~= "" then
+ redis.call('SREM', concurrencyKey, messageId)
+end
+redis.call('SREM', envCurrentConcurrencyKey, messageId)
+redis.call('SREM', projectCurrentConcurrencyKey, messageId)
+redis.call('SREM', taskCurrentConcurrencyKey, messageId)
+`,
+ });
+
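+    // Re-adds a message id to the queue, env, project, and task concurrency sets.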
+ this.redis.defineCommand("reacquireConcurrency", {
+ numberOfKeys: 6,
+ lua: `
+-- Keys:
+local messageKey = KEYS[1]
+local messageQueue = KEYS[2]
+local concurrencyKey = KEYS[3]
+local envCurrentConcurrencyKey = KEYS[4]
+local projectCurrentConcurrencyKey = KEYS[5]
+local taskCurrentConcurrencyKey = KEYS[6]
+
+-- Args:
+local messageId = ARGV[1]
+
+-- Update the concurrency keys
+redis.call('SADD', concurrencyKey, messageId)
+redis.call('SADD', envCurrentConcurrencyKey, messageId)
+redis.call('SADD', projectCurrentConcurrencyKey, messageId)
+redis.call('SADD', taskCurrentConcurrencyKey, messageId)
+`,
+ });
+
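+    // Reports current concurrency and limits for a queue and its env, plus whether
+    // the org is enabled (based on the absence of the disabled-limit key).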
+ this.redis.defineCommand("calculateMessageQueueCapacitiesWithDisabling", {
+ numberOfKeys: 5,
+ lua: `
+-- Keys
+local currentConcurrencyKey = KEYS[1]
+local currentEnvConcurrencyKey = KEYS[2]
+local concurrencyLimitKey = KEYS[3]
+local envConcurrencyLimitKey = KEYS[4]
+local disabledConcurrencyLimitKey = KEYS[5]
+
+-- Args
+local defaultEnvConcurrencyLimit = tonumber(ARGV[1])
+
+-- Check if disabledConcurrencyLimitKey exists
+local orgIsEnabled
+if redis.call('EXISTS', disabledConcurrencyLimitKey) == 1 then
+ orgIsEnabled = false
+else
+ orgIsEnabled = true
+end
+
+local currentEnvConcurrency = tonumber(redis.call('SCARD', currentEnvConcurrencyKey) or '0')
+local envConcurrencyLimit = tonumber(redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit)
+
+local currentConcurrency = tonumber(redis.call('SCARD', currentConcurrencyKey) or '0')
+local concurrencyLimit = redis.call('GET', concurrencyLimitKey)
+
+-- Return current capacity and concurrency limits for the queue, env, org
+return { currentConcurrency, concurrencyLimit, currentEnvConcurrency, envConcurrencyLimit, orgIsEnabled }
+ `,
+ });
+
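+    // Same reads as above without the org disable check; the final element is always true.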
+ this.redis.defineCommand("calculateMessageQueueCapacities", {
+ numberOfKeys: 4,
+ lua: `
+-- Keys:
+local currentConcurrencyKey = KEYS[1]
+local currentEnvConcurrencyKey = KEYS[2]
+local concurrencyLimitKey = KEYS[3]
+local envConcurrencyLimitKey = KEYS[4]
+
+-- Args
+local defaultEnvConcurrencyLimit = tonumber(ARGV[1])
+
+local currentEnvConcurrency = tonumber(redis.call('SCARD', currentEnvConcurrencyKey) or '0')
+local envConcurrencyLimit = tonumber(redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit)
+
+local currentConcurrency = tonumber(redis.call('SCARD', currentConcurrencyKey) or '0')
+local concurrencyLimit = redis.call('GET', concurrencyLimitKey)
+
+-- Return current capacity and concurrency limits for the queue, env, org
+return { currentConcurrency, concurrencyLimit, currentEnvConcurrency, envConcurrencyLimit, true }
+ `,
+ });
+
+ this.redis.defineCommand("updateGlobalConcurrencyLimits", {
+ numberOfKeys: 1,
+ lua: `
+-- Keys: envConcurrencyLimitKey
+local envConcurrencyLimitKey = KEYS[1]
+
+-- Args: envConcurrencyLimit
+local envConcurrencyLimit = ARGV[1]
+
+redis.call('SET', envConcurrencyLimitKey, envConcurrencyLimit)
+ `,
+ });
+ }
+}
+
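+// Module augmentation so the custom commands registered above are available as typed
+// methods on the ioredis client.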
+declare module "ioredis" {
+  interface RedisCommander<Context> {
+ enqueueMessage(
+ //keys
+ queue: string,
+ messageKey: string,
+ concurrencyKey: string,
+ envConcurrencyKey: string,
+ taskConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ envQueueKey: string,
+ //args
+ queueName: string,
+ messageId: string,
+ messageData: string,
+ messageScore: string,
+ parentQueues: string,
+ keyPrefix: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
+ dequeueMessage(
+ //keys
+ childQueue: string,
+ concurrencyLimitKey: string,
+ envConcurrencyLimitKey: string,
+ currentConcurrencyKey: string,
+ envConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ messageKeyPrefix: string,
+ envQueueKey: string,
+ taskCurrentConcurrentKeyPrefix: string,
+ //args
+ childQueueName: string,
+ currentTime: string,
+ defaultEnvConcurrencyLimit: string,
+ keyPrefix: string,
+      callback?: Callback<[string, string, string] | null>
+ ): Result<[string, string, string] | null, Context>;
+
+ acknowledgeMessage(
+ messageKey: string,
+ messageQueue: string,
+ concurrencyKey: string,
+ envConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ envQueueKey: string,
+ taskConcurrencyKey: string,
+ messageId: string,
+ masterQueues: string,
+ keyPrefix: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
+ nackMessage(
+ messageKey: string,
+ messageQueue: string,
+ concurrencyKey: string,
+ envConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ envQueueKey: string,
+ taskConcurrencyKey: string,
+ messageId: string,
+ messageData: string,
+ messageScore: string,
+ masterQueues: string,
+ keyPrefix: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
+ moveToDeadLetterQueue(
+ messageKey: string,
+ messageQueue: string,
+ concurrencyKey: string,
+ envConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ envQueueKey: string,
+ taskConcurrencyKey: string,
+ deadLetterQueueKey: string,
+ messageId: string,
+ masterQueues: string,
+ keyPrefix: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
+ releaseConcurrency(
+ messageKey: string,
+ messageQueue: string,
+ concurrencyKey: string,
+ envConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ taskConcurrencyKey: string,
+ messageId: string,
+ masterQueues: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
+ reacquireConcurrency(
+ messageKey: string,
+ messageQueue: string,
+ concurrencyKey: string,
+ envConcurrencyKey: string,
+ projectConcurrencyKey: string,
+ taskConcurrencyKey: string,
+ messageId: string,
+ masterQueues: string,
+      callback?: Callback<void>
+    ): Result<void, Context>;
+
+ calculateMessageQueueCapacities(
+ currentConcurrencyKey: string,
+ currentEnvConcurrencyKey: string,
+ concurrencyLimitKey: string,
+ envConcurrencyLimitKey: string,
+ defaultEnvConcurrencyLimit: string,
+      callback?: Callback<number[]>