cloud/openenv.yaml at main · jitheender-ops/cloud · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Manifest identity and discovery metadata.
name: CustomerSupportTriage-v0
# Kept quoted so the value is always loaded as a string.
version: '1.0.0'
# Folded scalar: the line breaks below are joined with single spaces on load.
description: >
  A real-world AI agent benchmark for customer support triage. The agent
  must read a queue of support tickets and for each one: assign a priority
  level, route to the correct department, draft a professional customer
  reply, and flag tickets requiring human review.

author: support-triage-team
# Free-form labels attached to this environment.
tags: [openenv, customer-support, triage, nlp, real-world]

# Task tiers. In each entry, max_steps equals the ticket count named in the
# description (5, 10, 15), and success_threshold decreases as the difficulty
# increases (0.6, 0.55, 0.5).
tasks:
  - name: easy
    description: '5 unambiguous tickets with clear signals'
    difficulty: easy
    max_steps: 5
    success_threshold: 0.6

  - name: medium
    description: '10 tickets with multi-issue bodies and ambiguous routing'
    difficulty: medium
    max_steps: 10
    success_threshold: 0.55

  - name: hard
    # Folded with strip chomping so the value is one line without a
    # trailing newline.
    description: >-
      15 adversarial tickets: misleading sentiment, legal edge-cases,
      downplayed urgency
    difficulty: hard
    max_steps: 15
    success_threshold: 0.5

# Observation schema: the ticket queue plus scalar progress fields.
observation_space:
  type: object
  fields:
    # Array of ticket objects; each entry carries the fields listed below.
    queue:
      type: array
      items:
        type: object
        fields:
          ticket_id:
            type: string
          subject:
            type: string
          body:
            type: string
          customer_name:
            type: string
          customer_tier:
            type: string
            enum: [free, pro, enterprise]
          created_at:
            type: string
            format: 'ISO 8601'
          sentiment:
            type: string
            enum: [positive, neutral, negative, angry]
          tags:
            type: array
            items:
              type: string
    # Scalar fields reported alongside the queue.
    processed:
      type: integer
    total_tickets:
      type: integer
    task_name:
      type: string
    step_number:
      type: integer
    time_remaining:
      type: integer

# Action schema: an object whose `actions` array holds one triage decision
# object per entry.
action_space:
  type: object
  fields:
    actions:
      type: array
      items:
        type: object
        fields:
          # Which ticket the decision applies to.
          ticket_id:
            type: string
            description: 'ID of the ticket being triaged'
          # Triage decisions; both are restricted to the listed enum values.
          priority:
            type: string
            enum: [low, medium, high, urgent]
            description: 'Assigned priority level'
          department:
            type: string
            enum: [billing, technical, shipping, returns, general, escalation]
            description: 'Department to route this ticket to'
          # Customer-facing draft, bounded to 10-500 characters by the
          # min_length / max_length constraints.
          response:
            type: string
            min_length: 10
            max_length: 500
            description: 'Draft reply to the customer'
          needs_human:
            type: boolean
            description: 'Flag for human review'
          # Double-quoted because the text contains an apostrophe.
          reasoning:
            type: string
            description: "Agent's chain of thought (not scored)"

# Reward signal: a float within the closed range given by `range`.
reward:
  type: float
  range: [0.0, 1.0]
  # The four component weights listed below sum to 100%.
  description: >
    Mean score across graded tickets in this step. Composite of: priority
    accuracy (30%), routing accuracy (30%), response quality/keyword
    coverage (25%), escalation correctness (15%). Partial credit awarded
    for near-correct priority and routing.

# HTTP method and path for each named endpoint.
endpoints:
  reset:
    method: POST
    path: /reset
  step:
    method: POST
    path: /step
  state:
    method: GET
    path: /state
  health:
    method: GET
    path: /health